In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
import time
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [3]:
# Read the pre-split train and test sets; both files are semicolon-delimited.
trainDF, testDF = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('../app/data/02_train.csv', '../app/data/02_test.csv')
)

In [4]:
# Whitelist of feature columns. The train/test frames are later restricted to
# the subset of these names that is actually present in the data.
columns = [
    'Lng', 'Lat',
    'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom',
    'floor', 'buildingType', 'renovationCondition', 'buildingStructure',
    'elevator', 'fiveYearsProperty', 'subway',
    'district', 'floorType', 'constructionTimePeriod', 'town',
    'placeRank', 'districtPopulation', 'districtArea',
]

In [5]:
# Separate the regression target ('totalPrice') from the predictors in each split.
y_train = trainDF['totalPrice']
X_train = trainDF.drop(columns=['totalPrice'])
y_test = testDF['totalPrice']
X_test = testDF.drop(columns=['totalPrice'])

In [6]:
# Keep only the whitelisted features that exist in each frame. Names listed in
# `columns` but absent from the data are ignored rather than raising.
train_feature_index = X_train.columns.intersection(columns)
test_feature_index = X_test.columns.intersection(columns)
X_train = X_train.loc[:, train_feature_index]
X_test = X_test.loc[:, test_feature_index]

In [7]:
# Short aliases for the training split, used by the cross-validation cell.
X, y = X_train, y_train

In [14]:
# Base learners for the stack, as (name, estimator) pairs.
# Every stochastic learner is seeded so that repeated runs give the same scores.
estimators = [
    (
        "sgd_regressor",
        # SGD is sensitive to feature scale, so it is standardised in a pipeline.
        make_pipeline(
            StandardScaler(),
            # 'squared_error' is the current name of the least-squares loss.
            # The former alias 'squared_loss' was deprecated in scikit-learn 1.0
            # and removed in 1.2, where passing it raises an error.
            SGDRegressor(alpha=0.0001, loss='squared_error', penalty='l1', random_state=42),
        ),
    ),
    ('ENet', ElasticNet(alpha=0.001)),
    ("RandomFR", RandomForestRegressor(n_jobs=-1, random_state=42)),
    ("Gradient Boosting", HistGradientBoostingRegressor(random_state=42)),
]

# The final Lasso is fitted on the cross-validated predictions of the base learners.
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=Lasso(), n_jobs=-1)

In [15]:
# Fit the stacked ensemble on the training split (timing the fit), then score
# its predictions on the held-out test split.
start = time.time()
stacking_regressor.fit(X_train, y_train)
end = time.time()
fit_seconds = end - start
print(f"Time to fit: {fit_seconds}")

y_pred = stacking_regressor.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {test_mse}")
print(f"R2 score: {test_r2}")


  model = cd_fast.enet_coordinate_descent(


Time to fit: 87.90016889572144
Mean squared error: 13272.462986619523
R2 score: 0.7451976041251651


In [None]:
def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
    """Draw a predicted-vs-measured scatter plot on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    y_true : measured target values; must provide ``.min()`` and ``.max()``.
    y_pred : predicted values, plotted against ``y_true``.
    title : panel title; the evaluation time is appended on a second line.
    scores : text shown as the only legend entry.
    elapsed_time : evaluation time in seconds, formatted to two decimals.
    """
    # Both axes share the range of the measured values, so the dashed red
    # diagonal marks a perfect prediction.
    target_range = [y_true.min(), y_true.max()]
    ax.plot(target_range, target_range, "--r", linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Drop the top/right frame and push the remaining spines outward.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 10))

    ax.set_xlim(target_range)
    ax.set_ylim(target_range)
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")

    # Zero-size, unfilled rectangle: a legend handle that carries only the score text.
    score_handle = plt.Rectangle(
        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
    )
    ax.legend([score_handle], [scores], loc="upper left")

    ax.set_title(title + "\n Evaluation in {:.2f} seconds".format(elapsed_time))


# Cross-validate every base estimator and the stacked ensemble on the training
# data and draw one predicted-vs-measured panel per model.
models = estimators + [("Stacking Regressor", stacking_regressor)]

# Size the grid from the number of models: zip() stops at the shorter input, so
# a fixed 2x2 grid would silently drop the fifth entry (the stacking regressor).
n_cols = 3
n_rows = int(np.ceil(len(models) / n_cols))
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(4.5 * n_cols, 3.5 * n_rows), squeeze=False
)
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, models):
    # Only the scoring run is timed; the prediction pass below exists for the plot.
    start_time = time.time()
    score = cross_validate(
        est, X, y, scoring=["r2", "neg_mean_absolute_error"], n_jobs=-1, verbose=0
    )
    elapsed_time = time.time() - start_time

    # Out-of-fold predictions for the scatter plot.
    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)

    plot_regression_results(
        ax,
        y,
        y_pred,
        name,
        (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
            np.mean(score["test_r2"]),
            np.std(score["test_r2"]),
            # The scorer returns negated MAE; flip the sign for display.
            -np.mean(score["test_neg_mean_absolute_error"]),
            np.std(score["test_neg_mean_absolute_error"]),
        ),
        elapsed_time,
    )

# Hide any grid cells left over when the model count does not fill the grid.
for unused_ax in axs[len(models):]:
    unused_ax.set_visible(False)

plt.suptitle("Single predictors versus stacked predictors")
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()