In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
import xgboost
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import catboost

from utils import serialize, deserialize

In [2]:
def show_results(preds, gt):
    """Print and return the root-mean-squared error between two value arrays.

    Args:
        preds: predicted values.
        gt: ground-truth values to compare against.

    Returns:
        The square root of `mean_squared_error(preds, gt)`.
    """
    mse = mean_squared_error(preds, gt)
    rmse = np.sqrt(mse)
    print('RMSE:', rmse)
    return rmse

In [3]:
# Maps a model name to the RMSE returned by show_results on the hold-out split;
# filled in by the evaluation cells further down.
best_scores = dict()

In [None]:
# Load the training table and discard the "Id" column in one expression.
ds = pd.read_csv("dataset/train.csv").drop(columns=["Id"])

# Column names split by dtype: object columns are treated as categorical,
# every other column as numeric. Nothing is excluded here, so the target
# column `SalePrice` stays in whichever group its dtype puts it.
cat_features = ds.select_dtypes(include="object").columns
num_features = ds.select_dtypes(exclude="object").columns

In [5]:
def get_nan_stat_table(dataset):
    """Summarise missing values per column of a DataFrame.

    Args:
        dataset: pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns: 'Total' (number
        of missing entries) and '%' (share of rows that are missing, in
        percent). Rows are ordered by 'Total' descending and columns without
        any missing value are left out.
    """
    nan_counts = dataset.isnull().sum().sort_values(ascending=False)
    nan_share = nan_counts / len(dataset) * 100

    stats = pd.DataFrame({'Total': nan_counts, '%': nan_share})
    has_missing = stats['%'] > 0
    return stats.loc[has_missing]

# Missing-value summary restricted to the categorical (object-dtype) columns.
get_nan_stat_table(ds.loc[:, cat_features])

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
GarageType,81,5.547945
GarageCond,81,5.547945
GarageQual,81,5.547945
GarageFinish,81,5.547945
BsmtFinType2,38,2.60274


In [78]:
def prepare_dataset(dataset):
    """Return a cleaned copy of `dataset` with missing values filled in.

    The input frame is not modified. The "Id" column is removed when it is
    present; a frame that has already lost it (such as `ds` after the loading
    cell) is accepted as well instead of raising KeyError.

    Args:
        dataset: pandas DataFrame with raw features.

    Returns:
        A new DataFrame in which object-dtype columns have missing values
        replaced by the string 'None' and every other column has missing
        values replaced by that column's mean.
    """
    # errors="ignore" makes the drop a no-op when there is no "Id" column.
    ds_new = dataset.drop(columns=["Id"], errors="ignore")

    cat_features = ds_new.select_dtypes(include=["object"]).columns
    num_features = ds_new.select_dtypes(exclude=["object"]).columns

    # Missing categories become their own explicit level.
    ds_new[cat_features] = ds_new[cat_features].fillna('None')

    # Numeric gaps are imputed with the mean of the frame being prepared.
    for feature in num_features:
        ds_new[feature] = ds_new[feature].fillna(ds_new[feature].mean())

    return ds_new

# Convert categorical features to numeric

In [9]:
# Build a copy of `ds` in which every categorical column is replaced by the
# integer codes of a LabelEncoder fitted on that column alone. The encoders
# themselves are not kept.
ds_nocat = ds.assign(
    **{name: LabelEncoder().fit_transform(ds[name]) for name in cat_features}
)

(1460, 0)

In [20]:
# Target is log1p(SalePrice); features are all label-encoded columns except the target.
y = np.log1p(ds["SalePrice"].to_numpy())
x = ds_nocat.drop(columns="SalePrice").to_numpy()

# Hold out 10% of the rows; the seed keeps the split fixed between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=98987,
)

# RandomForest

In [71]:
# Search grid for the random forest. Only n_estimators and min_samples_leaf
# take more than one value, giving 2 x 3 = 6 candidates.
parameters = {
    'criterion':('squared_error',), 
    'max_depth': (1000,),
    'max_features':(1/3, ),
    'n_estimators': (100, 1000),
    'min_samples_leaf': (1, 2, 8)
}

# Seed the forest: without random_state the bootstrap samples and feature
# subsets differ on every run, so the selected parameters and the reported
# RMSE would not be reproducible.
# NOTE(review): with no `scoring` argument GridSearchCV ranks candidates by the
# estimator's default score (R^2 for regressors), while the hold-out result
# below is reported as RMSE.
rforest = RandomForestRegressor(random_state=98987)
rforest_gs = GridSearchCV(rforest, parameters, verbose=2)
rforest_gs.fit(x_train, y_train);

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   1.4s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   1.0s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   0.9s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   0.9s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   0.9s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=1000; total time=  10.6s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf

In [72]:
# Store the fitted search object.
serialize(rforest_gs, "rforest_gs")
# GridSearchCV fits clones of the estimator it is given, so `rforest` itself is
# never trained. Store the refitted best model instead of the unfitted template.
serialize(rforest_gs.best_estimator_, "rforest")

In [19]:
# Rebind rforest_gs to whatever utils.deserialize returns for "rforest_gs" --
# presumably the search object stored by the serialize cell above, so the
# evaluation can run without refitting (utils is not shown here; confirm).
# NOTE(review): if deserialize is pickle-based, only load files you created yourself.
rforest_gs = deserialize("rforest_gs")

In [73]:
# Show the selected hyper-parameters, then score the search object on the
# hold-out split and record the RMSE under the model's name.
print(rforest_gs.best_params_)
rforest_test_preds = rforest_gs.predict(x_test)
best_scores["RandomForest"] = show_results(rforest_test_preds, y_test)

{'criterion': 'squared_error', 'max_depth': 1000, 'max_features': 0.3333333333333333, 'min_samples_leaf': 1, 'n_estimators': 1000}
RMSE: 0.1553235616691357


# XGBoost

In [74]:
# Search grid for XGBoost: 2 learning rates x 2 depths x 2 child weights
# x 2 ensemble sizes = 16 candidates (gamma is fixed).
parameters = dict(
    learning_rate=(0.001, 0.01),
    max_depth=[2, 4],
    min_child_weight=[1, 10],
    gamma=[0.0],
    n_estimators=[1000, 5000],
)

# 3-fold cross-validated search over the grid above.
xgb = xgboost.XGBRegressor()
xgb_gs = GridSearchCV(estimator=xgb, param_grid=parameters, cv=3, verbose=2)
xgb_gs.fit(x_train, y_train);

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   3.3s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.9s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.7s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   8.9s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   9.0s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   8.9s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   1.5s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   1.6s
[CV] END gamma=0.0, learning_rate=0.001, 

In [76]:
# Hand the fitted XGBoost search object to utils.serialize under the name "xgb_gs".
serialize(xgb_gs, "xgb_gs")

In [18]:
# Rebind xgb_gs to the object utils.deserialize returns for "xgb_gs" --
# presumably the search saved above, to skip refitting (confirm against utils).
# NOTE(review): if deserialize is pickle-based, only load files you created yourself.
xgb_gs = deserialize("xgb_gs")

In [21]:
# Show the selected hyper-parameters, then score on the hold-out split and
# record the RMSE under the model's name.
print(xgb_gs.best_params_)
xgb_test_preds = xgb_gs.predict(x_test)
best_scores["XGBoost"] = show_results(xgb_test_preds, y_test)

{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 5000}
RMSE: 0.12810969890915763


# LightGBM

In [None]:
# Search grid for LightGBM: 3 leaf counts x 3 learning rates x 2 ensemble
# sizes = 18 candidates; max_depth=-1 is the only depth value tried.
parameters = dict(
    num_leaves=(40, 20, 10),
    learning_rate=(0.1, 0.01, 0.05),
    max_depth=(-1,),
    n_estimators=(1000, 10000),
)

lgbmr = LGBMRegressor()
lgbmr_gs = GridSearchCV(estimator=lgbmr, param_grid=parameters, verbose=2)
lgbmr_gs.fit(x_train, y_train);

In [80]:
# Hand the fitted LightGBM search object to utils.serialize under the name "lgbmr_gs".
serialize(lgbmr_gs, "lgbmr_gs")

In [26]:
# Rebind lgbmr_gs to the object utils.deserialize returns for "lgbmr_gs" --
# presumably the search saved above, to skip refitting (confirm against utils).
# NOTE(review): if deserialize is pickle-based, only load files you created yourself.
lgbmr_gs = deserialize("lgbmr_gs")

In [81]:
# Show the selected hyper-parameters, then score on the hold-out split and
# record the RMSE under the model's name.
print(lgbmr_gs.best_params_)
lgbm_test_preds = lgbmr_gs.predict(x_test)
best_scores["LightGBM"] = show_results(lgbm_test_preds, y_test)

{'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 10}
RMSE: 0.1409159071922352


Бустинг минимизирует как смещение (bias), так и разброс (variance).
Бэггинг уменьшает только разброс.

# Cat Boost

1) В качестве базового алгоритма используются небрежные решающие деревья (ODT - Oblivious Decision Tree)
2) В качестве метода ансамблирования используется модифицированный градиентный бустинг. Суть его следующая:
   * Перед обучением создаётся s перестановок обучающей выборки $X_1, X_2, ..., X_s$. 
   * Каждая из этих перестановок $X_\sigma$ делится на части, длина которых возрастает в геометрической прогрессии ($2^i$). 
   * Каждый новый алгоритм ансамбля $b_t$ строится на перестановке $X_\sigma$, где $\sigma$ выбирается случайным образом от 1 до s
   * $$b_t := \arg\min_b\sum_{i=1}^{l}{(b(x_i) + g_{ti})^2}$$
   * Градиент $g_{ti}$ вычисляется по модели $a^{\sigma j}_{t-1}$, обученной на подвыборке $X^{\sigma j}$ - части перестановки, в которой не участвовал объект $x_i$
   * $$g_{t i} = \mathcal{L}'(a^{\sigma j}_{t-1}(x_i), y_i) $$
   * $$j = \lfloor \log_2(i - 1) \rfloor$$
3) Следующая фича - работа с категориальными признаками. Категориальные признаки преобразуются в вещественные с помощью метода статистики по целевому признаку (TS).


In [37]:
# CatBoost is given the frame without label encoding. NOTE: this rebinds x, y
# and the train/test names that the earlier model sections used for the
# label-encoded numpy arrays; the split uses the same test_size and seed.
x = ds.drop(columns='SalePrice')
y = np.log1p(ds['SalePrice'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=98987
)

# Pools carry features, labels and the names of the categorical columns together.
cat_feature_names = cat_features.tolist()
train_pool = catboost.Pool(data=x_train, label=y_train, cat_features=cat_feature_names)
test_pool = catboost.Pool(data=x_test, label=y_test, cat_features=cat_feature_names)


In [63]:
# Search grid for CatBoost: 2 learning rates x 3 depths x 3 L2 strengths.
# NOTE(review): 0.9 is far from the other learning rate (0.07) -- confirm it is
# not meant to be 0.09.
p_grid = dict(
    learning_rate=[0.07, 0.9],
    depth=[1, 2, 3],
    l2_leaf_reg=[0.7, 1, 1.3],
)

catboost_cls = CatBoostRegressor(loss_function='RMSE', verbose=0)

# search_by_train_test_split=False: each candidate is scored by
# cross-validation on train_pool rather than on a single split; rows are not shuffled.
grid_search_results = catboost_cls.grid_search(
    p_grid,
    train_pool,
    shuffle=False,
    verbose=1,
    search_by_train_test_split=False,
)

Training on fold [0/3]

bestTest = 0.1426659313
bestIteration = 976

Training on fold [1/3]

bestTest = 0.1228953334
bestIteration = 997

Training on fold [2/3]

bestTest = 0.1428224128
bestIteration = 489

0:	loss: 0.1364070	best: 0.1364070 (0)	total: 24.9s	remaining: 7m 3s
Training on fold [0/3]

bestTest = 0.1468508741
bestIteration = 515

Training on fold [1/3]

bestTest = 0.1287590721
bestIteration = 330

Training on fold [2/3]

bestTest = 0.1426366063
bestIteration = 573

1:	loss: 0.1427930	best: 0.1364070 (0)	total: 45.7s	remaining: 6m 5s
Training on fold [0/3]

bestTest = 0.1433343056
bestIteration = 969

Training on fold [1/3]

bestTest = 0.1237657549
bestIteration = 980

Training on fold [2/3]

bestTest = 0.1428005309
bestIteration = 434

2:	loss: 0.1368443	best: 0.1364070 (0)	total: 1m 3s	remaining: 5m 16s
Training on fold [0/3]

bestTest = 0.1481568056
bestIteration = 385

Training on fold [1/3]

bestTest = 0.1292779143
bestIteration = 401

Training on fold [2/3]

bestTest 

In [64]:
# Hand the CatBoost model object and the value returned by grid_search to
# utils.serialize, each under its own variable name.
serialize(catboost_cls, "catboost_cls")
serialize(grid_search_results, "grid_search_results")

In [66]:
# Score CatBoost on the hold-out pool and record the RMSE in best_scores like
# the other three models (previously the value was only displayed, so the
# comparison dictionary had no CatBoost entry). show_results still prints it.
best_scores["CatBoost"] = show_results(catboost_cls.predict(test_pool), y_test)

RMSE: 0.1282228211514425


0.1282228211514425

# Make submission for Kaggle

In [85]:
# Load the Kaggle test set and clean it with prepare_dataset
# (drops "Id", fills missing values).
test_ds = prepare_dataset(pd.read_csv('dataset/test.csv'))

# `cat_features` is deliberately NOT rebound here: the old list version was
# never used, and replacing the Index with a list made the CatBoost pool cell
# (`cat_features.tolist()`) fail when re-run afterwards.

# The model was fitted on log1p(SalePrice); expm1 maps predictions back to prices.
y_pred = np.expm1(catboost_cls.predict(test_ds))

In [100]:
# Use the sample submission as a template: keep its columns and row order and
# overwrite SalePrice with the predictions (row order is taken as-is).
my_submission = pd.read_csv('dataset/sample_submission.csv').assign(SalePrice=y_pred)

my_submission.to_csv('my_submission.csv', index=False)