In [62]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel (presumably for the local
# `regression_classifier` module imported further down — confirm).
%load_ext autoreload

# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")

In [64]:
from regression_classifier import *

In [65]:
# Column that every model below is trained to predict.
target_name = 'price'

# Read the prepared dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier `to_csv` — confirm).
df = pd.read_csv('./data/AB_NYC_2019_EDA.csv').drop(columns=['Unnamed: 0'])

In [66]:
# Separate the feature columns from the regression target.
X, y = df.drop(columns=[target_name]), df[target_name]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the hold-out set leaks into training.
scaler = StandardScaler()

# The scaler returns bare arrays. Rebuild the frames with the ORIGINAL row
# labels: without `index=` the frames get a fresh 0..n-1 RangeIndex while
# y_train / y_test keep the shuffled labels from the split, so any
# label-aligned pandas operation on (X, y) would pair the wrong rows.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [67]:
%%time
# One-level ensemble: 20 bins, bins_calc_method='equal', leaf_model=None.
# (Exact meaning of these options lives in regression_classifier — not visible here.)
class_reg = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='equal', leaf_model=None)
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Start a fresh results table, one row per metric.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# It is intentionally not assigned back, because the column assignment below
# relies on the default 0..3 index lining up with dataframe_metrics' output
# (assumed to be ordered MAE, RMSE, R2, MAPE — confirm).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 50.8 s


Unnamed: 0,error_type,class_reg
0,MAE,46.74999
1,RMSE,80.669638
2,R2,0.389429
3,MAPE,34.326945


In [68]:
%%time
# One-level ensemble: 20 bins, bins_calc_method='equal', with the
# LinearRegression class passed as leaf_model.
class_reg = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='equal', leaf_model=LinearRegression)
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Start a fresh results table, one row per metric.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# Not assigned back on purpose — the column assignment below depends on the
# default 0..3 index (dataframe_metrics' return type is not visible here).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 58.6 s


Unnamed: 0,error_type,class_reg
0,MAE,47.358477
1,RMSE,78.00797
2,R2,0.429056
3,MAPE,35.89947


In [69]:
%%time
# One-level ensemble: 20 bins, bins_calc_method='percentile';
# leaf_model is left at the constructor's default.
class_reg = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='percentile')
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Start a fresh results table, one row per metric.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# Not assigned back on purpose — the column assignment below depends on the
# default 0..3 index (dataframe_metrics' return type is not visible here).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 38.7 s


Unnamed: 0,error_type,class_reg
0,MAE,45.685017
1,RMSE,80.157457
2,R2,0.397158
3,MAPE,31.299079


In [70]:
%%time
# One-level ensemble: 20 bins, bins_calc_method='percentile', with the
# LinearRegression class passed as leaf_model.
class_reg = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='percentile', leaf_model=LinearRegression)
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Start a fresh results table, one row per metric.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# Not assigned back on purpose — the column assignment below depends on the
# default 0..3 index (dataframe_metrics' return type is not visible here).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 47.2 s


Unnamed: 0,error_type,class_reg
0,MAE,48.023921
1,RMSE,78.169529
2,R2,0.426688
3,MAPE,36.965812


In [71]:
%%time
# One-level ensemble with a finer target discretisation: 60 bins,
# bins_calc_method='equal'; leaf_model is left at the constructor's default.
class_reg = ClassRegressorOnelevelEnsemble(n_bins=60, bins_calc_method='equal')
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Start a fresh results table, one row per metric.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# Not assigned back on purpose — the column assignment below depends on the
# default 0..3 index (dataframe_metrics' return type is not visible here).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 1min 57s


Unnamed: 0,error_type,class_reg
0,MAE,45.63841
1,RMSE,80.260266
2,R2,0.39561
3,MAPE,31.541427


## Сравнение с основным алгоритмом (с оптимальными настройками)

In [72]:
%%time
# Re-fit the 20-bin 'percentile' one-level configuration (same arguments as the
# earlier percentile run) so the comparison table below starts from its scores.
class_reg = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='percentile')
class_reg.fit(X_train_scaled, y_train)

# pred_train is not read by any visible cell; kept in case later cells use it.
pred_train = class_reg.predict(X_train_scaled)
pred_test = class_reg.predict(X_test_scaled)

# Reset the results table; the following cells append one column per model.
# The former bare `measured_metrics.set_index("error_type")` call was dropped:
# set_index returns a new frame and the result was discarded, so it did nothing.
# Not assigned back on purpose — later column assignments depend on the
# default 0..3 index (dataframe_metrics' return type is not visible here).
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 38.7 s


Unnamed: 0,error_type,class_reg
0,MAE,45.685017
1,RMSE,80.157457
2,R2,0.397158
3,MAPE,31.299079


In [73]:
%%time
# Multi-level ensemble variant: 2 bins per level across 5 levels,
# bins_calc_method='equal', leaf_model=None.
class_reg = ClassRegressorEnsemble(
    n_bins=2,
    n_levels=5,
    bins_calc_method='equal',
    leaf_model=None,
)
class_reg.fit(X_train_scaled, y_train)

# Score the hold-out rows and add the result as a new column of the
# comparison table created by the preceding cell.
pred_test = class_reg.predict(X_test_scaled)
measured_metrics["class_reg_ver1"] = dataframe_metrics(y_test, pred_test)

measured_metrics

Wall time: 8.78 s


Unnamed: 0,error_type,class_reg,class_reg_ver1
0,MAE,45.685017,47.453174
1,RMSE,80.157457,84.152413
2,R2,0.397158,0.33557
3,MAPE,31.299079,31.147789


In [74]:
%%time
# Plain linear regression on the same scaled features, as a reference point
# for the ensemble scores already in the comparison table.
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
pred_test_lin = lin_reg.predict(X_test_scaled)

# Append the baseline's hold-out metrics as another column and display the table.
measured_metrics["lin_reg"] = dataframe_metrics(y_test, pred_test_lin)
measured_metrics

Wall time: 11.8 ms


Unnamed: 0,error_type,class_reg,class_reg_ver1,lin_reg
0,MAE,45.685017,47.453174,49.708612
1,RMSE,80.157457,84.152413,78.557245
2,R2,0.397158,0.33557,0.420987
3,MAPE,31.299079,31.147789,40.722833
