In [1]:
import pandas as pd
from matplotlib import pyplot as plt

from math import sqrt
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost import XGBRegressor

#### Loading the data

In [2]:
X_test = pd.read_csv("../out/t_x_test.csv")
X_train = pd.read_csv("../out/t_x_train.csv")
y_test = pd.read_csv("../out/t_y_test.csv")
y_train = pd.read_csv("../out/t_y_train.csv")

X_test_no_outlier = pd.read_csv("../out/no_outlier_t_x_test.csv")
X_train_no_outlier = pd.read_csv("../out/no_outlier_t_x_train.csv")
y_test_no_outlier = pd.read_csv("../out/no_outlier_t_y_test.csv")
y_train_no_outlier = pd.read_csv("../out/no_outlier_t_y_train.csv")

kaggle_train_X = pd.read_csv("../out/train_X.csv")
kaggle_train_y = pd.read_csv("../out/train_Y.csv")
kaggle_train_X_no_outlier = pd.read_csv("../out/no_outlier_train_X.csv")
kaggle_train_y_no_outlier = pd.read_csv("../out/no_outlier_train_Y.csv")

kaggle_test_X = pd.read_csv("../out/kaggle_test_X.csv", index_col='Id')

(X_test.shape, X_train.shape, y_test.shape, y_train.shape, X_test_no_outlier.shape, X_train_no_outlier.shape, y_test_no_outlier.shape, y_train_no_outlier.shape)


((292, 270),
 (1167, 270),
 (292, 1),
 (1167, 1),
 (224, 270),
 (892, 270),
 (224, 1),
 (892, 1))

In [15]:
class modelAndScore:
    def __init__(self, model, score=0):
        self.model = model
        self.score = score

    def set_score(self, score):
        self.score = score

In [16]:
models = (
    modelAndScore(CatBoostRegressor()),
    modelAndScore(LGBMRegressor()),
    modelAndScore(GaussianNB()),
    modelAndScore(DecisionTreeRegressor()),
    modelAndScore(LinearRegression()),
    modelAndScore(BayesianRidge()),
    modelAndScore(XGBRegressor())
)

[mns.model.fit(X_train, y_train) for mns in models]

Learning rate set to 0.041954
0:	learn: 77795.6309218	total: 8.82ms	remaining: 8.81s
1:	learn: 75906.5659077	total: 16.8ms	remaining: 8.4s
2:	learn: 74115.0953186	total: 23.8ms	remaining: 7.9s
3:	learn: 72098.1722432	total: 35.4ms	remaining: 8.82s
4:	learn: 70415.2188451	total: 42.6ms	remaining: 8.47s
5:	learn: 68833.7294267	total: 47.9ms	remaining: 7.93s
6:	learn: 67212.4809662	total: 52.7ms	remaining: 7.47s
7:	learn: 65558.1739601	total: 59.6ms	remaining: 7.39s
8:	learn: 64064.3098625	total: 65.3ms	remaining: 7.19s
9:	learn: 62531.2175171	total: 69.8ms	remaining: 6.91s
10:	learn: 61211.0772659	total: 75.3ms	remaining: 6.77s
11:	learn: 59818.4239896	total: 79.9ms	remaining: 6.58s
12:	learn: 58509.1197058	total: 84.3ms	remaining: 6.4s
13:	learn: 57257.7635328	total: 88.8ms	remaining: 6.26s
14:	learn: 55998.6098625	total: 93.6ms	remaining: 6.14s
15:	learn: 54784.8747485	total: 98.1ms	remaining: 6.04s
16:	learn: 53621.9883041	total: 103ms	remaining: 5.95s
17:	learn: 52476.9126687	total: 

  return f(*args, **kwargs)
  return f(*args, **kwargs)


[<catboost.core.CatBoostRegressor at 0x19baab385e0>,
 LGBMRegressor(),
 GaussianNB(),
 DecisionTreeRegressor(),
 LinearRegression(),
 BayesianRidge(),
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)]

In [17]:
[mns.set_score(sqrt(mean_squared_error(y_test, mns.model.predict(X_test)))) for mns in models]

[None, None, None, None, None, None, None]

In [26]:
[f"{mns.model.__class__.__name__}: {mns.score}" for mns in models]

['CatBoostRegressor: 23179.290034064165',
 'LGBMRegressor: 25066.00263066404',
 'GaussianNB: 68750.94542465759',
 'DecisionTreeRegressor: 37733.48490517462',
 'LinearRegression: 66343416839717.4',
 'BayesianRidge: 26357.10041307347',
 'XGBRegressor: 26311.475679162355']

#### Experimenting with CatBoostRegressor 🐱

In [3]:
model = CatBoostRegressor()
parameters = {
    'iterations': [10, 20, 30, 40],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'depth': [2, 4, 8],
}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=2, n_jobs=-1)
grid.fit(X_train, y_train)

0:	learn: 79745.5698163	total: 138ms	remaining: 1.24s
1:	learn: 79695.2480794	total: 139ms	remaining: 557ms
2:	learn: 79646.6241061	total: 141ms	remaining: 329ms
3:	learn: 79599.8318698	total: 142ms	remaining: 213ms
4:	learn: 79550.3662169	total: 144ms	remaining: 144ms
5:	learn: 79502.1670917	total: 145ms	remaining: 96.9ms
6:	learn: 79452.6037654	total: 147ms	remaining: 62.9ms
7:	learn: 79404.5205674	total: 148ms	remaining: 37.1ms
8:	learn: 79359.5766220	total: 150ms	remaining: 16.6ms
9:	learn: 79312.7768548	total: 151ms	remaining: 0us


GridSearchCV(cv=2,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001BF3141C0A0>,
             n_jobs=-1,
             param_grid={'depth': [2, 4, 8], 'iterations': [10, 20, 30, 40],
                         'learning_rate': [0.001, 0.01, 0.05, 0.1]})

In [7]:
grid.best_estimator_.fit(kaggle_train_X, kaggle_train_y)


0:	learn: 79393.8407048	total: 5.03ms	remaining: 45.3ms
1:	learn: 79346.4837416	total: 6.6ms	remaining: 26.4ms
2:	learn: 79298.1958009	total: 8.55ms	remaining: 20ms
3:	learn: 79246.8768764	total: 10.1ms	remaining: 15.1ms
4:	learn: 79193.0740401	total: 11.6ms	remaining: 11.6ms
5:	learn: 79143.6469431	total: 13.1ms	remaining: 8.76ms
6:	learn: 79093.2212830	total: 14.9ms	remaining: 6.37ms
7:	learn: 79042.7503942	total: 16.9ms	remaining: 4.22ms
8:	learn: 79001.3184464	total: 18.5ms	remaining: 2.05ms
9:	learn: 78956.6977041	total: 19.9ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1bf3141b2e0>

In [8]:
out = pd.DataFrame({
    'Id': kaggle_test_X.index,
    'SalePrice': grid.best_estimator_.predict(kaggle_test_X).flatten()
})
out.head()
out.to_csv("../out/submission_nns.csv", index=None)