In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
import xgboost
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from utils import serialize, deserialize

In [23]:
def show_results(preds, gt):
    """Print the root-mean-squared error between predictions and ground truth.

    Args:
        preds: predicted target values.
        gt: ground-truth target values, aligned element-wise with ``preds``.

    Returns:
        None. The score is written to stdout as ``RMSE: <value>``.
    """
    # scikit-learn's signature is mean_squared_error(y_true, y_pred). MSE is
    # symmetric, so the number is unchanged, but keeping the documented order
    # stays correct if the metric is ever replaced by an asymmetric one.
    rmse = np.sqrt(mean_squared_error(gt, preds))
    print('RMSE:', rmse)

In [5]:
# Load the raw training table and report its size.
ds = pd.read_csv("dataset/train.csv")
print(ds.shape)

# "Id" looks like a per-row identifier (it is discarded before modelling). If it
# is unique, comparing complete rows can never flag a duplicate, so the check
# is done on every column except "Id".
rows_before = len(ds)
ds = ds.drop_duplicates(subset=ds.columns.drop("Id").tolist())
rows_removed = rows_before - len(ds)

# Report what was actually found instead of always claiming "no duplicates".
if rows_removed == 0:
    print(ds.shape, "No duplicates as we can see")
else:
    print(ds.shape, "Removed {} duplicate rows".format(rows_removed))

# The identifier is not used as a model feature.
ds = ds.drop(columns=["Id"])

(1460, 81)
(1460, 81) No duplicates as we can see


In [6]:
# Missing-value count per column, worst columns first.
n_rows = len(ds)
total = ds.isna().sum().sort_values(ascending=False)

# The same counts as a percentage of all rows.
percent = total.div(n_rows).mul(100)

# Two-column summary; show only the columns missing in more than 5% of rows.
nan_stat_tbl = pd.DataFrame({'Total': total, '%': percent})
nan_stat_tbl[nan_stat_tbl['%'] > 5]

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageYrBlt,81,5.547945
GarageCond,81,5.547945
GarageType,81,5.547945
GarageFinish,81,5.547945


In [7]:
# Discard every column that is missing in more than 6% of rows.
# Note the cut-off here (6) is higher than the 5 used for the preview table.
too_sparse = nan_stat_tbl['%'].gt(6).to_numpy()
deleted_columns = nan_stat_tbl.index[too_sparse]
ds = ds.drop(columns=deleted_columns)

In [8]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_features = ds.select_dtypes(include="object").columns
numerical_features = ds.select_dtypes(exclude="object").columns
# NOTE: the target is not removed from numerical_features; the line that would
# do so is disabled:
#numerical_features = numerical_features.drop("SalePrice")


In [None]:
# x_numeric = ds[numerical_features]
# corr_matrix = x_numeric.corr()

# f = plt.figure(figsize=(19, 15))
# plt.matshow(corr_matrix, fignum=f.number)
# plt.xticks(range(x_numeric.shape[1]), x_numeric.columns, fontsize=14, rotation=90)
# plt.yticks(range(x_numeric.shape[1]), x_numeric.columns, fontsize=14)
# cb = plt.colorbar()
# cb.ax.tick_params(labelsize=14)
# plt.title('Correlation Matrix', fontsize=16);

# _x, _y = np.nonzero((corr_matrix.abs() > 0.8).to_numpy())
# mask = np.full_like(_x, True, dtype=bool)
# for i in range(_x.size):
#     if _x[i] == _y[i]:
#         mask[i] = False
# _x, _y = _x[mask], _y[mask]

# plt.scatter(_x, _y, c='r', marker='x');


# Fill NaN's

In [16]:
# Replace missing numeric values with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# ds[feature] operates on a temporary Series, which emits a FutureWarning in
# pandas 2.x and leaves ds untouched once Copy-on-Write is enabled.
# NOTE(review): the means are computed on the full table, before the
# train/test split performed later in the notebook.
for feature in numerical_features:
    ds[feature] = ds[feature].fillna(ds[feature].mean())

"Number of NaN in numerical features = {}".format(ds[numerical_features].isna().values.sum())

'Number of NaN in numerical features = 0'

# Convert categorical features to numeric

In [17]:
# Integer-encode each categorical column with its own freshly fitted
# LabelEncoder; the encoders themselves are not kept.
for column in categorical_features:
    ds[column] = LabelEncoder().fit_transform(ds[column])

# Sanity check: no object-typed columns should remain.
ds.select_dtypes(["object"]).shape

(1460, 0)

In [18]:
# Target: log(1 + SalePrice). Features: every remaining column.
y = np.log1p(ds["SalePrice"].to_numpy())
x = ds.drop(columns="SalePrice").to_numpy()

# Hold out 10% of the rows for the final evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=98987,
)

# RandomForest

In [242]:
# Hyper-parameter grid explored for the random forest (12 combinations).
parameters = {
    'criterion':('squared_error',), 
    'max_depth': (100, 1000, None),
    'max_features':('sqrt', 1/3),
    'n_estimators': (100, 1000)
}

# Seed the forest: bootstrap sampling and per-split feature sub-sampling are
# random, so without a fixed random_state each re-run may select different
# best_params_ and report a different test RMSE. The seed value matches the
# one used for the train/test split.
rforest = RandomForestRegressor(random_state=98987)
# GridSearchCV fits clones of `rforest`; with the default cv this is 5-fold.
rforest_gs = GridSearchCV(rforest, parameters, verbose=2)
rforest_gs.fit(x_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=100; total time=   0.6s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=100; total time=   0.5s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=100; total time=   0.5s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=1000; total time=   5.0s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=1000; total time=   5.0s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=1000; total time=   5.0s
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=1000; total tim

In [243]:
# Persist the fitted search through the project helper (format and location
# are defined in utils.serialize, which is not visible here).
serialize(rforest_gs, "rforest_gs")
# GridSearchCV trains clones of the estimator it is given, so `rforest` itself
# is never fitted. Store the refitted best model instead of the empty prototype.
serialize(rforest_gs.best_estimator_, "rforest")

In [19]:
# Reload the saved random-forest grid search via utils.deserialize.
# NOTE(review): presumably the counterpart of serialize(); if it unpickles,
# only load files produced by this notebook.
rforest_gs = deserialize("rforest_gs")

In [24]:
# Selected hyper-parameters, followed by the hold-out RMSE.
# y is log1p(SalePrice), so the error is measured on the log scale.
print(rforest_gs.best_params_)
show_results(preds=rforest_gs.predict(x_test), gt=y_test)

{'criterion': 'squared_error', 'max_depth': 1000, 'max_features': 0.3333333333333333, 'n_estimators': 1000}
RMSE: 0.15502400843980607


# XGBoost

In [236]:
# Hyper-parameter grid explored for XGBoost (72 combinations).
parameters = dict(
    learning_rate=(0.01, 0.1, 1,),
    max_depth=[2, 16],
    min_child_weight=[1, 10],
    gamma=[0.0, 0.1],
    n_estimators=[100, 1000, 5000],
)

# Exhaustive search with 3-fold cross-validation on the training split.
xgb = xgboost.XGBRegressor()
xgb_gs = GridSearchCV(estimator=xgb, param_grid=parameters, cv=3, verbose=2)
xgb_gs.fit(x_train, y_train);

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100; total time=   0.2s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100; total time=   0.1s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.6s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.6s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.6s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   8.2s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   8.0s
[CV] END gamma=0.0, learning_rate=0.01, max_depth=2, 

In [237]:
# Persist the fitted XGBoost grid search through the project helper
# (format and location are defined in utils.serialize, not visible here).
serialize(xgb_gs, "xgb_gs")

In [21]:
# Reload the saved XGBoost grid search via utils.deserialize.
# NOTE(review): presumably the counterpart of serialize(); if it unpickles,
# only load files produced by this notebook.
xgb_gs = deserialize("xgb_gs")

In [25]:
# Selected hyper-parameters, followed by the hold-out RMSE.
# y is log1p(SalePrice), so the error is measured on the log scale.
print(xgb_gs.best_params_)
show_results(preds=xgb_gs.predict(x_test), gt=y_test)

{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 5000}
RMSE: 0.12885472451412458


# LightGBM

In [247]:
# Hyper-parameter grid explored for LightGBM (81 combinations).
parameters = dict(
    num_leaves=(10, 31, 100),
    learning_rate=(0.1, 0.01, 0.001),
    max_depth=(-1, 2, 10),
    n_estimators=(10, 100, 1000),
)

# Exhaustive search with the default cross-validation on the training split.
# The fitted search object is the cell's displayed value.
lgbmr_gs = GridSearchCV(estimator=LGBMRegressor(), param_grid=parameters, verbose=2)
lgbmr_gs.fit(x_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=10; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=10; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=10; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=10; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=10; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=31; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=31; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=31; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=31; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10, num_leaves=31; total time=   0.0s
[CV] END

GridSearchCV(estimator=LGBMRegressor(),
             param_grid={'learning_rate': (0.1, 0.01, 0.001),
                         'max_depth': (-1, 2, 10),
                         'n_estimators': (10, 100, 1000),
                         'num_leaves': (10, 31, 100)},
             verbose=2)

In [248]:
# Persist the fitted LightGBM grid search through the project helper
# (format and location are defined in utils.serialize, not visible here).
serialize(lgbmr_gs, "lgbmr_gs")

In [26]:
# Reload the saved LightGBM grid search via utils.deserialize.
# NOTE(review): presumably the counterpart of serialize(); if it unpickles,
# only load files produced by this notebook.
lgbmr_gs = deserialize("lgbmr_gs")

In [27]:
# Selected hyper-parameters, followed by the hold-out RMSE.
# y is log1p(SalePrice), so the error is measured on the log scale.
print(lgbmr_gs.best_params_)
show_results(preds=lgbmr_gs.predict(x_test), gt=y_test)

{'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 10}
RMSE: 0.13893059119622414


Бустинг минимизирует как смещение (bias), так и разброс (variance).
Бэггинг уменьшает только разброс((((

# Cat Boost

In [252]:
# Hyper-parameter grid explored for CatBoost (8 combinations).
# logging_level has a single value, so it does not widen the search.
parameters = dict(
    learning_rate=[0.01, 0.1],
    depth=[2, 10],
    l2_leaf_reg=[1, 10],
    logging_level=['Silent'],
)

# Exhaustive search with 3-fold cross-validation on the training split.
# The fitted search object is the cell's displayed value.
catboost_gs = GridSearchCV(
    estimator=CatBoostRegressor(logging_level='Silent'),
    param_grid=parameters,
    cv=3,
    verbose=2,
)
catboost_gs.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.01, logging_level=Silent; total time=   7.3s
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.01, logging_level=Silent; total time=   2.5s
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.01, logging_level=Silent; total time=   3.0s
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.1, logging_level=Silent; total time=   2.1s
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.1, logging_level=Silent; total time=   3.5s
[CV] END depth=2, l2_leaf_reg=1, learning_rate=0.1, logging_level=Silent; total time=   1.8s
[CV] END depth=2, l2_leaf_reg=10, learning_rate=0.01, logging_level=Silent; total time=   1.6s
[CV] END depth=2, l2_leaf_reg=10, learning_rate=0.01, logging_level=Silent; total time=   2.2s
[CV] END depth=2, l2_leaf_reg=10, learning_rate=0.01, logging_level=Silent; total time=   1.7s
[CV] END depth=2, l2_leaf_reg=10, learning_rate=0.1, logging_level=Silent; total time=   1.5s


GridSearchCV(cv=3,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001DF07301310>,
             param_grid={'depth': [2, 10], 'l2_leaf_reg': [1, 10],
                         'learning_rate': [0.01, 0.1],
                         'logging_level': ['Silent']},
             verbose=2)

In [257]:
# Persist the fitted CatBoost grid search through the project helper
# (format and location are defined in utils.serialize, not visible here).
serialize(catboost_gs, "catboost_gs")

In [28]:
# Reload the saved CatBoost grid search via utils.deserialize.
# NOTE(review): presumably the counterpart of serialize(); if it unpickles,
# only load files produced by this notebook.
catboost_gs = deserialize("catboost_gs")

In [29]:
# Selected hyper-parameters, followed by the hold-out RMSE.
# y is log1p(SalePrice), so the error is measured on the log scale.
print(catboost_gs.best_params_)
show_results(preds=catboost_gs.predict(x_test), gt=y_test)

{'depth': 2, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'logging_level': 'Silent'}
RMSE: 0.1303749760218432
