In [11]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import ElasticNet, Lasso, RidgeCV, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_predict, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [2]:
# Load the train and test splits; both files are semicolon-separated CSVs.
trainDF, testDF = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('../app/data/02_train.csv', '../app/data/02_test.csv')
)

In [3]:
# Feature names: every training column except the two excluded below.
# `list.remove` raises ValueError if one of them is missing from the frame.
columns = trainDF.columns.tolist()
for excluded_name in ("communityAverage", "totalPrice"):
    columns.remove(excluded_name)
print(columns)

['Lng', 'Lat', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType', 'renovationCondition', 'buildingStructure', 'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district', 'placeRank', 'town', 'districtPopulation', 'districtArea', 'tradeYear', 'tradeMonth', 'tradeDay', 'floorType', 'constructionTimePeriod']


In [4]:
# Split each frame into features (everything but the target) and the
# 'totalPrice' target column.
X_train, y_train = trainDF.drop(columns=['totalPrice']), trainDF['totalPrice']
X_test, y_test = testDF.drop(columns=['totalPrice']), testDF['totalPrice']

In [5]:
# Keep only the columns listed in `columns`, in each frame's own column order.
train_feature_names = X_train.columns.intersection(columns)
test_feature_names = X_test.columns.intersection(columns)
X_train = X_train[train_feature_names]
X_test = X_test[test_feature_names]

In [6]:
# Short aliases for the training features/target used by the cross-validation cell.
X, y = X_train, y_train

In [7]:
# Shared seed for the base learners so that repeated runs of the notebook
# produce the same fitted models and therefore the same reported metrics.
RANDOM_STATE = 42

# (name, estimator) pairs used as the first level of the stack; the same list
# is reused later to evaluate each base learner on its own.
estimators = [
    ("xgboost regressor", XGBRegressor(
        booster="dart",
        eta=0.2,
        max_depth=9,
        min_child_weight=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )),
    ("RandomFR", RandomForestRegressor(
        n_estimators=200,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )),
    ("Gradient Boosting", HistGradientBoostingRegressor(random_state=RANDOM_STATE)),
]

# RidgeCV combines the base learners' predictions into the final prediction.
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), n_jobs=-1)

In [8]:
# Fit the stacking regressor on the training split and score it on the test split.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments during the (long) fit.
start = time.perf_counter()
stacking_regressor.fit(X_train, y_train)
end = time.perf_counter()
print(f"Time to fit: {end - start}")

y_pred = stacking_regressor.predict(X_test)
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"R2 score: {r2_score(y_test, y_pred)}")


Time to fit: 439.93285179138184
Mean squared error: 3102.360368436635
R2 score: 0.9404414346047367


In [16]:
# Predictions of the fitted stack on both splits (test first, then train),
# used to compare held-out error against training error.
y_test_pred, y_train_pred = (
    stacking_regressor.predict(features) for features in (X_test, X_train)
)

In [21]:
def _regression_metrics_row(dataset_name, y_true, y_pred):
    """Return one metrics-table row for a set of predictions.

    The row is ``[dataset_name, R2, MAE, MSE, RMSE, max absolute error]``,
    in the same order as the columns of ``predDf``.
    """
    mse = mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    return [
        dataset_name,
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        np.max(np.abs(y_pred - y_true)),
    ]


# Same metrics for the held-out and the training split, so the gap between
# the two rows shows how much the stack overfits.
rows = [
    _regression_metrics_row("test", y_test, y_test_pred),
    _regression_metrics_row("train", y_train, y_train_pred),
]
predDf = pd.DataFrame(rows, columns=["Dataset", "R2", "MAE", "MSE", "RMSE", "Max_Error"])

In [22]:
# Display the metrics table (one row per dataset split).
predDf.head(n=5)

Unnamed: 0,Dataset,R2,MAE,MSE,RMSE,Max_Error
0,test,0.940441,30.294753,3102.360368,55.698836,3557.456938
1,train,0.969647,25.331448,1646.255511,40.574074,1937.043498


In [None]:
def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Draw a predicted-versus-measured scatter plot on ``ax``.

    Parameters
    ----------
    ax
        Matplotlib axes to draw on; it is modified in place.
    y_true
        Measured target values. Must provide ``.min()`` and ``.max()``.
    y_pred
        Predicted values, plotted against ``y_true``.
    title
        First line of the axes title.
    scores
        Text shown in the legend box (e.g. a formatted metrics summary).
    elapsed_time
        Duration in seconds, appended to the title with two decimals.

    Notes
    -----
    Both axes are limited to the range of ``y_true``, so predictions that
    fall outside that range are not visible.
    """
    lowest, highest = y_true.min(), y_true.max()

    # Dashed red diagonal = perfect prediction; points are drawn on top of it.
    ax.plot([lowest, highest], [lowest, highest], "--r", linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Keep only the left/bottom spines and push them slightly outward.
    for hidden_side in ("top", "right"):
        ax.spines[hidden_side].set_visible(False)
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    for shifted_side in ("left", "bottom"):
        ax.spines[shifted_side].set_position(("outward", 10))

    ax.set_xlim([lowest, highest])
    ax.set_ylim([lowest, highest])
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")

    # Zero-size, unfilled rectangle used as the only legend handle, so the
    # legend box shows just the `scores` text.
    invisible_handle = plt.Rectangle(
        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
    )
    ax.legend([invisible_handle], [scores], loc="upper left")

    ax.set_title(f"{title}\n Evaluation in {elapsed_time:.2f} seconds")


# Five contiguous, unshuffled folds: the same split scikit-learn uses for a
# regressor when no `cv` is given, made explicit so that the fold boundaries
# can be reused below to compute per-fold scores.
N_FOLDS = 5
cv_splitter = KFold(n_splits=N_FOLDS)

fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(
    axs, estimators + [("Stacking Regressor", stacking_regressor)]
):
    # A single cross-validation pass per model: the out-of-fold predictions are
    # used both for the scatter plot and for the per-fold scores, so the numbers
    # in the legend describe exactly the points that are drawn (and every model
    # is fitted N_FOLDS times instead of 2 * N_FOLDS times).
    start_time = time.perf_counter()
    y_cv_pred = cross_val_predict(est, X, y, cv=cv_splitter, n_jobs=-1, verbose=0)
    elapsed_time = time.perf_counter() - start_time  # fit + held-out prediction

    # Score each fold on its own held-out rows to get mean and spread.
    fold_r2, fold_mae = [], []
    for _, test_idx in cv_splitter.split(X):
        fold_true = y.iloc[test_idx]
        fold_pred = y_cv_pred[test_idx]
        fold_r2.append(r2_score(fold_true, fold_pred))
        fold_mae.append(mean_absolute_error(fold_true, fold_pred))

    plot_regression_results(
        ax,
        y,
        y_cv_pred,
        name,
        (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
            np.mean(fold_r2),
            np.std(fold_r2),
            np.mean(fold_mae),
            np.std(fold_mae),
        ),
        elapsed_time,
    )

fig.suptitle("Single predictors versus stacked predictors")
fig.tight_layout()
# Leave room above the subplots for the figure-level title.
fig.subplots_adjust(top=0.9)
plt.show()