In [9]:
# data handling
import pandas as pd
import numpy as np
import time
# visualization
import matplotlib.pyplot as plt
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Read the pre-split train and test sets; both files are semicolon-delimited.
trainDF, testDF = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('../app/data/02_train.csv', '../app/data/02_test.csv')
)

In [11]:
# Named subset of feature columns.
# NOTE(review): not referenced by any of the visible cells -- confirm whether
# the models are meant to be restricted to this subset.
columns = [
    # size and room counts
    'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom',
    # building attributes
    'floor', 'buildingType', 'renovationCondition', 'buildingStructure', 'elevator',
    # ownership / location
    'fiveYearsProperty', 'subway', 'district',
    # derived encodings
    'floorType', 'constructionTimePeriod',
]

In [12]:
# Split each frame into the feature matrix and the `totalPrice` target.
#
# The loaded frame carries an "Unnamed: 0" column (visible when X_train is
# displayed), i.e. a row index that was serialized into the CSV. It is a row
# id, not a property of the listing, so it must not be fed to the models.
# errors='ignore' keeps the drop working if a file lacks that column; a missing
# 'totalPrice' still fails loudly on the target lookup right below.
NON_FEATURE_COLUMNS = ['totalPrice', 'Unnamed: 0']

X_train = trainDF.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = trainDF['totalPrice']
X_test = testDF.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = testDF['totalPrice']

In [13]:
# Short aliases for the training split; the cross-validation cell below reads X and y.
X, y = X_train, y_train

In [14]:
# Show the training feature frame (pandas truncates rows/columns) as a visual sanity check.
X_train

Unnamed: 0,Lng,Lat,square,livingRoom,drawingRoom,kitchen,bathRoom,floor,buildingType,renovationCondition,...,elevator,fiveYearsProperty,subway,district,communityAverage,tradeYear,tradeMonth,tradeDay,floorType,constructionTimePeriod
0,116.474283,39.930035,51.82,2,1,1,1,6,120872,54138,...,0.0,1.0,1.0,7,67222.0,2015,10,9,84113,16727
1,116.416156,40.082585,97.78,2,1,1,1,32,59076,83147,...,1.0,0.0,1.0,6,33820.0,2011,11,13,84113,139258
2,116.520858,39.918846,61.13,2,1,1,1,6,120872,54138,...,0.0,1.0,1.0,7,56752.0,2015,4,12,44810,13522
3,116.350909,39.745849,62.82,2,1,1,1,6,120872,54138,...,0.0,1.0,0.0,4,38492.0,2014,8,24,44810,13522
4,116.351933,39.887523,68.89,2,1,1,1,6,120872,54138,...,0.0,1.0,0.0,10,87885.0,2016,2,2,49078,139258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223166,116.416187,39.969551,119.16,2,2,1,1,13,41709,83147,...,1.0,1.0,0.0,1,92338.0,2013,1,4,84113,139258
223167,116.298972,39.943157,119.07,2,2,1,2,7,120872,54138,...,1.0,1.0,1.0,8,87420.0,2016,5,9,84113,139258
223168,116.440460,39.860650,120.14,3,1,1,2,13,120872,82146,...,1.0,1.0,0.0,2,57690.0,2016,4,30,26053,52909
223169,116.368316,39.892214,74.61,2,1,1,1,19,59076,82146,...,1.0,1.0,1.0,10,75806.0,2014,4,28,49078,139258


In [None]:
# Exploratory check: standardize the single 'subway' column and show the result.
# The double brackets keep the selection 2-D, as the scaler requires.
StandardScaler().fit_transform(X_train.loc[:, ['subway']])

In [None]:
# Linear baseline: standardize the features, then fit an L1-penalized SGD regressor.
# 'squared_error' is the current name of the ordinary least-squares loss; the
# former alias 'squared_loss' was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it is rejected as an invalid parameter.
pipe = make_pipeline(
    StandardScaler(),
    SGDRegressor(alpha=0.0001, loss='squared_error', penalty='l1', random_state=42),
)

In [None]:
# Base learners for the stack, as (name, estimator) pairs.
# The forest and the gradient booster are seeded with the same value the SGD
# pipeline already uses (42) so that repeated runs of the notebook give the
# same scores and plots; without a seed both models draw fresh randomness on
# every fit.
estimators = [
    ("pipe", pipe),
    ("RandomFR", RandomForestRegressor(n_jobs=-1, random_state=42)),
    ("Gradient Boosting", HistGradientBoostingRegressor(random_state=42)),
]

# Meta-model: ridge regression (with built-in CV over its regularization
# strength) fitted on the base learners' cross-validated predictions.
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), n_jobs=-1)

In [None]:
def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Draw predicted-vs-measured targets on ``ax`` with an identity reference line.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; it is modified in place.
    y_true : array-like exposing ``.min()`` and ``.max()``
        Measured targets (x-axis). Their range fixes both axis limits.
    y_pred : array-like
        Predicted targets (y-axis).
    title : str
        First line of the axes title.
    scores : str
        Text shown in the legend box.
    elapsed_time : float
        Duration in seconds, appended to the title with two decimals.
    """
    lo, hi = y_true.min(), y_true.max()

    # Dashed red diagonal marks a perfect prediction.
    ax.plot([lo, hi], [lo, hi], "--r", linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Keep only the left/bottom spines and push them slightly outward.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 10))

    # Both axes span the measured range, so predictions outside it are not shown.
    ax.set_xlim([lo, hi])
    ax.set_ylim([lo, hi])
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")

    # An invisible, zero-sized patch serves as the legend handle so that the
    # legend displays nothing but the score text.
    blank_handle = plt.Rectangle(
        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
    )
    ax.legend([blank_handle], [scores], loc="upper left")

    ax.set_title(title + "\n Evaluation in {:.2f} seconds".format(elapsed_time))


# 2x2 grid: one panel per base estimator plus one for the stacked ensemble.
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = axs.ravel()

for ax, (name, est) in zip(axs, [*estimators, ("Stacking Regressor", stacking_regressor)]):
    # Only the scoring pass is timed; the out-of-fold predictions used for the
    # scatter plot come from a second, untimed cross-validation run.
    start_time = time.time()
    score = cross_validate(
        est, X, y, scoring=["r2", "neg_mean_absolute_error"], n_jobs=-1, verbose=0
    )
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)

    # The scorer reports MAE negated, so the mean is flipped back for display;
    # the standard deviation is unaffected by the sign.
    plot_regression_results(
        ax,
        y,
        y_pred,
        name,
        (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
            score["test_r2"].mean(),
            score["test_r2"].std(),
            -score["test_neg_mean_absolute_error"].mean(),
            score["test_neg_mean_absolute_error"].std(),
        ),
        elapsed_time,
    )

fig.suptitle("Single predictors versus stacked predictors")
fig.tight_layout()
# Leave headroom so the figure title does not overlap the top row of panels.
fig.subplots_adjust(top=0.9)
plt.show()