In [3]:
# --- Project path -----------------------------------------------------------
# PROJ_ROOT is the absolute path of the parent of the current working
# directory; its `src` sub-directory is appended to sys.path so the
# project-local packages imported below can be resolved.
import os
import sys
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(os.path.join(PROJ_ROOT, 'src'))

# --- Inline plotting --------------------------------------------------------
# Render figures inside the notebook, using the high-DPI 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format ='retina'

# --- Live reloading of project code -----------------------------------------
# autoreload mode 2 re-imports modules before each cell execution, so edits
# to files under src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import the project packages through %aimport so autoreload tracks them.
%aimport data, model, features, plot

# --- Third-party libraries --------------------------------------------------
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# NOTE(review): the experimental enable_* import must precede the
# HistGradientBoostingRegressor import on scikit-learn releases where that
# estimator is still experimental; presumably this notebook targets such a
# release - confirm before removing it.
from sklearn.experimental import enable_hist_gradient_boosting
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import StandardScaler


# --- Project-local helpers (resolved via the sys.path entry added above) ----
from model.selection import estimate, grid_search_cv
from model.evaluation import score_rmse

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Names of the feature columns fed to every model in this notebook.
features_selected = [
    'traff_m1',
    'traff_mean',
    'traff_m5',
    'device_price',
    'imei_mean_day_announced',
]

In [5]:
from data.dataset import DataSet


def _unique_sorted_columns(frame):
    """Return a copy of ``frame`` with one column per label, labels sorted.

    Selecting ``frame[np.unique(frame.columns)]`` does not drop duplicate
    column labels: pandas returns *every* column that carries a requested
    label, so repeated labels survive the selection. Masking with
    ``columns.duplicated()`` keeps only the first occurrence of each label.
    The columns are then sorted so the resulting order matches what the
    ``np.unique``-based selection produced.
    """
    first_occurrence = ~frame.columns.duplicated()
    return frame.loc[:, first_occurrence].sort_index(axis=1)


# Train / test frames with unique, sorted column labels.
traindf = _unique_sorted_columns(DataSet.traindf)
testdf = _unique_sorted_columns(DataSet.testdf)

# Feature matrices (selected columns only) and the regression target.
X_train, y_train = traindf[features_selected], traindf.target
X_test, y_test = testdf[features_selected], testdf.target

In [6]:
# Report (rows, columns) for each split.
for split_name, split_frame in (("train", traindf), ("test", testdf)):
    print(f"{split_name}:\t{split_frame.shape}")

train:	(249876, 915)
test:	(242972, 915)


In [7]:
# Baseline: LightGBM with default hyper-parameters, scored on the test split.
baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)

base_predictions = baseregressor.predict(X_test)
base_rmse = score_rmse(y_test, base_predictions)
print(f"RMSE:\t{base_rmse: .3f}")

RMSE:	 8.692


In [8]:
# Build the frame used for the model comparison: selected features plus the
# target, with missing values replaced column by column.
comparison_columns = features_selected + ['target']
comparison_source = traindf[comparison_columns]

# NOTE(review): despite the name `mediandf`, the fill value is the column
# *mean* - confirm which statistic is intended.
mediandf = comparison_source.fillna(comparison_source.mean())

estimate(mediandf, rows=100000)

LinearRegression              		 RMSE: -560.308			 STD:  1103.30
SGDRegressor                  		 RMSE: -882142.257			 STD:  490855.08
Lasso                         		 RMSE: -8.733			 STD:  0.15
Ridge                         		 RMSE: -56.128			 STD:  94.94
LassoLars                     		 RMSE: -11.812			 STD:  0.10
ElasticNet                    		 RMSE: -8.989			 STD:  0.09
BayesianRidge                 		 RMSE: -558.939			 STD:  1100.56
HuberRegressor                		 RMSE: -369.869			 STD:  722.25
RANSACRegressor               		 RMSE: -73.352			 STD:  125.86
PoissonRegressor              		 RMSE: -183553.210			 STD:  367079.35
TweedieRegressor              		 RMSE: -9.217			 STD:  0.09
PassiveAggressiveRegressor    		 RMSE: -18.826			 STD:  17.22
DecisionTreeRegressor         		 RMSE: -12.129			 STD:  0.16
BaggingRegressor              		 RMSE: -9.048			 STD:  0.13
ExtraTreesRegressor           		 RMSE: -8.773			 STD:  0.10
GradientBoostingRegressor     		 RMSE: -8.404			 STD:  0.

In [9]:
# Histogram-based gradient boosting with default hyper-parameters.
# A fixed seed is passed because the estimator uses its random state for the
# binning subsample and, when early stopping is active, for the internal
# train/validation split; without it the RMSE below can differ between runs.
# The seed value matches the one used for the tuned LGBMRegressor.
hgbr = HistGradientBoostingRegressor(random_state=0)
hgbr.fit(X_train, y_train)
print(f"RMSE:\t{score_rmse(y_test, hgbr.predict(X_test)): .3f}")

RMSE:	 8.677


In [10]:
# Stack the two boosted-tree models (default final estimator).
# Both base learners get a fixed seed so the stacked score is reproducible
# across runs; the seed value matches the tuned LGBMRegressor.
st_regressor = StackingRegressor([
    ('lgbm', LGBMRegressor(random_state=0)),
    ('hgrb', HistGradientBoostingRegressor(random_state=0)),
])
# X_train already holds exactly the `features_selected` columns, so it is
# passed as-is - the same way X_test is passed to predict() below.
st_regressor.fit(X_train, y_train)
print(f"StackingRegressor RMSE:\t{score_rmse(y_test, st_regressor.predict(X_test)) : .3f}")

StackingRegressor RMSE:	 8.682


### LGBMRegressor is selected as the base regressor because it is fast and production-ready

In [11]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LGBMRegressor. Every entry holds a single value,
# so the search evaluates exactly one candidate; add more values to a list
# to widen the search. `objective` is not part of the grid, and
# `random_state` is fixed on the estimator instance passed below.
params = {
    'boosting_type': ['gbdt'],
    'colsample_bytree': [0.7],
    'learning_rate': [0.025],
    'max_depth': [4],
    'min_child_samples': [200],
    'min_child_weight': [0.001],
    'min_split_gain': [0.05],
    'n_estimators': [200],
    'num_leaves': [20],
    'reg_alpha': [0.0],
    'reg_lambda': [0.2],
    'subsample': [0.5],
    'subsample_for_bin': [100000],
    'subsample_freq': [0],
}

# `rows` is derived from the training frame instead of hard-coding its row
# count, so the call stays correct if the dataset size changes.
# grid_search_cv returns the fitted best estimator together with a scaler;
# the next cell applies that scaler to X_test before predicting.
best_estimator, scaler = grid_search_cv(
    traindf[features_selected + ['target']],
    LGBMRegressor(random_state=0),
    params,
    rows=len(traindf),
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best params: 
{'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_samples': 200, 'min_child_weight': 0.001, 'min_split_gain': 0.05, 'n_estimators': 200, 'num_leaves': 20, 'reg_alpha': 0.0, 'reg_lambda': 0.2, 'subsample': 0.5, 'subsample_for_bin': 100000, 'subsample_freq': 0}


In [12]:
# Score the tuned model on the test split; the features go through the
# scaler returned by the grid search before prediction.
X_test_scaled = scaler.transform(X_test)
tuned_predictions = best_estimator.predict(X_test_scaled)
score_rmse(y_test, tuned_predictions)

8.6697162084977