In [1]:
# Load IPython's autoreload extension for this kernel session.
%load_ext autoreload

# `%autoreload 2` picks up edits to imported .py modules live, without a kernel restart.
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor

import warnings
# Installs an "ignore" filter with no category/module restriction, i.e. every
# warning is silenced from this point on.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported above — consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
# NOTE(review): wildcard import from the project-local module. The cells in this
# notebook use ClassRegressorEnsemble, ClassRegressorOnelevelEnsemble and
# dataframe_metrics, which presumably come from here — confirm and consider
# importing them by name.
from regression_classifier import *

### Dataset link:
https://www.kaggle.com/camnugent/california-housing-prices?select=housing.csv

In [4]:
# Name of the regression target column.
target_name = 'median_house_value'

# Read the housing table, discard rows containing missing values, then one-hot
# encode `ocean_proximity` (columns prefixed with "ocean", first level dropped).
df = (
    pd.read_csv('./data/housing.csv')
    .dropna()
    .pipe(
        pd.get_dummies,
        columns=['ocean_proximity'],
        prefix='ocean',
        drop_first=True,
    )
)

In [5]:
# Separate the target series from the feature matrix.
y = df[target_name]
X = df.drop(columns=[target_name])

# Hold out 30% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training features only, then apply it to both splits.
# The scaled arrays are wrapped back into DataFrames with the original column
# names. No `index=` is passed, so both frames get a default RangeIndex while
# y_train / y_test keep the row labels produced by the split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Сравнение двух вариаций алгоритмов (с оптимальными настройками) и бейзлайнов

In [11]:
%%time
# Fit the multi-level ClassRegressorEnsemble on the scaled training data and
# predict on the held-out split.
class_reg = ClassRegressorEnsemble(
    n_bins=2,
    n_levels=5,
    bins_calc_method='equal',
    leaf_size=5000,
    leaf_model=LinearRegression,
)
class_reg.fit(X_train_scaled, y_train)
pred_test = class_reg.predict(X_test_scaled)

# Results table: one row per metric; each model cell adds its own column.
# The default integer index is kept on purpose: `DataFrame.set_index` returns a
# new frame, so calling it without assigning the result has no effect, and
# switching to a label index could break alignment if `dataframe_metrics`
# returns an integer-indexed Series.
measured_metrics = pd.DataFrame({"error_type": ["MAE", "RMSE", "R2", "MAPE"]})

# presumably dataframe_metrics returns the four scores in the row order above —
# verify against regression_classifier.
measured_metrics["class_reg"] = dataframe_metrics(y_test, pred_test)
measured_metrics

Wall time: 3.32 s


Unnamed: 0,error_type,class_reg
0,MAE,45727.14447
1,RMSE,68615.483049
2,R2,0.64683
3,MAPE,23.754482


In [7]:
%%time
# Single-level variant: 20 bins, same 'equal' bin calculation method.
class_reg_onelevel = ClassRegressorOnelevelEnsemble(
    n_bins=20,
    bins_calc_method='equal',
)
class_reg_onelevel.fit(X_train_scaled, y_train)

# Predictions for both splits; only the test predictions are scored in this cell.
pred_train = class_reg_onelevel.predict(X_train_scaled)
pred_test = class_reg_onelevel.predict(X_test_scaled)

# Add this model's test scores as a new column of the shared results table.
onelevel_scores = dataframe_metrics(y_test, pred_test)
measured_metrics["class_reg_onelevel"] = onelevel_scores
measured_metrics

Wall time: 16.1 s


Unnamed: 0,error_type,class_reg,class_reg_onelevel
0,MAE,45727.14447,43104.982936
1,RMSE,68615.483049,63118.988542
2,R2,0.64683,0.701146
3,MAPE,23.754482,23.423199


In [8]:
%%time
# Baseline: ordinary least squares on the scaled features.
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
pred_test_lin = lin_reg.predict(X_test_scaled)

# Add the baseline's test scores as a new column of the shared results table.
lin_reg_scores = dataframe_metrics(y_test, pred_test_lin)
measured_metrics["lin_reg"] = lin_reg_scores
measured_metrics

Wall time: 10 ms


Unnamed: 0,error_type,class_reg,class_reg_onelevel,lin_reg
0,MAE,45727.14447,43104.982936,49642.998352
1,RMSE,68615.483049,63118.988542,67927.638042
2,R2,0.64683,0.701146,0.653875
3,MAPE,23.754482,23.423199,28.734869


In [9]:
%%time
# Baseline: LightGBM regressor with its default hyper-parameters.
lgbm_reg = LGBMRegressor().fit(X_train_scaled, y_train)
pred_test_lgbm = lgbm_reg.predict(X_test_scaled)

# Add the baseline's test scores as a new column of the shared results table.
measured_metrics["lgbm_reg"] = dataframe_metrics(y_test, pred_test_lgbm)
measured_metrics

Wall time: 233 ms


Unnamed: 0,error_type,class_reg,class_reg_onelevel,lin_reg,lgbm_reg
0,MAE,45727.14447,43104.982936,49642.998352,31981.573729
1,RMSE,68615.483049,63118.988542,67927.638042,47538.012867
2,R2,0.64683,0.701146,0.653875,0.83048
3,MAPE,23.754482,23.423199,28.734869,18.058717
