In [None]:
import os
import sys
# Resolve the parent of the current working directory and make its `src`
# sub-directory importable for the project-local imports used in this notebook.
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(os.path.join(PROJ_ROOT, 'src'))

# Render matplotlib figures inline, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format ='retina'

# Enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

# Import the project packages through %aimport
# (presumably they live under <PROJ_ROOT>/src -- confirm).
%aimport data, model, features, plot

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

from sklearn.experimental import enable_hist_gradient_boosting
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import StandardScaler


from model.selection import estimate, grid_search_cv
from model.evaluation import score_rmse

In [None]:
# Feature columns fed to every model in this notebook.
features_selected = [
    'traff_m1',
    'traff_mean',
    'traff_m5',
    'device_price',
    'imei_mean_day_announced',
]

In [None]:
from data.dataset import DataSet

# Re-index each split by the sorted, unique list of its column labels.
# NOTE(review): if a label occurs more than once in a frame, selecting it by
# name may still return every matching column -- confirm the labels are unique.
train_labels = np.unique(DataSet.traindf.columns)
traindf = DataSet.traindf[train_labels]
test_labels = np.unique(DataSet.testdf.columns)
testdf = DataSet.testdf[test_labels]

# Model inputs are the selected features; the label is the `target` column.
X_train = traindf[features_selected]
y_train = traindf.target
X_test = testdf[features_selected]
y_test = testdf.target

In [None]:
# Report the shape of the train and test frames, one per line.
print(f"train:\t{traindf.shape}", f"test:\t{testdf.shape}", sep="\n")

In [None]:
# Baseline: LightGBM with default hyper-parameters, scored on the test split.
baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)
base_predictions = baseregressor.predict(X_test)
base_rmse = score_rmse(y_test, base_predictions)
print(f"RMSE:\t{base_rmse: .3f}")

In [None]:
# Fill missing values in the selected features and the target with the
# per-column mean, then pass the filled frame to estimate().
# NOTE(review): the variable is named "mediandf" but the fill value is the
# mean -- confirm which statistic is intended.
estimation_frame = traindf[features_selected + ['target']]
mediandf = estimation_frame.fillna(estimation_frame.mean())
estimate(mediandf, rows=100000)

In [None]:
# Second candidate: scikit-learn's histogram-based gradient boosting with
# default hyper-parameters, scored on the same test split.
hgbr = HistGradientBoostingRegressor()
hgbr.fit(X_train, y_train)
hgbr_predictions = hgbr.predict(X_test)
hgbr_rmse = score_rmse(y_test, hgbr_predictions)
print(f"RMSE:\t{hgbr_rmse: .3f}")

In [None]:
# Third candidate: stack both boosters. No final_estimator is passed, so
# StackingRegressor falls back to its default.
stack_members = [
    ('lgbm', LGBMRegressor()),
    ('hgrb', HistGradientBoostingRegressor()),
]
st_regressor = StackingRegressor(stack_members)
# X_train was already built from features_selected, so this selection keeps
# the same columns.
st_regressor.fit(X_train[features_selected], y_train)
stacked_rmse = score_rmse(y_test, st_regressor.predict(X_test))
print(f"StackingRegressor RMSE:\t{stacked_rmse : .3f}")

### LGBMRegressor is selected as the base regressor because it is fast and production-ready

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LGBMRegressor. Every entry holds a single value,
# so the grid describes exactly one configuration.
# 'objective' is not part of the grid; 'random_state' is fixed on the
# estimator instance passed to grid_search_cv below.
# NOTE(review): subsample_freq is 0, which per the LightGBM docs appears to
# leave bagging disabled, so subsample=0.5 is likely inactive -- confirm intent.
params = dict(
    boosting_type=['gbdt'],
    colsample_bytree=[0.7],
    learning_rate=[0.025],
    max_depth=[4],
    min_child_samples=[200],
    min_child_weight=[0.001],
    min_split_gain=[0.05],
    n_estimators=[200],
    num_leaves=[20],
    reg_alpha=[0.0],
    reg_lambda=[0.2],
    subsample=[0.5],
    subsample_for_bin=[100000],
    subsample_freq=[0],
)

# Run the project's grid-search helper on the selected features plus target;
# it returns the best estimator together with a scaler.
tuning_frame = traindf[features_selected + ['target']]
best_estimator, scaler = grid_search_cv(
    tuning_frame,
    LGBMRegressor(random_state=0),
    params,
    rows=249876,
)

In [None]:
# Score the tuned estimator on the test split: apply the scaler returned by
# grid_search_cv before predicting. The last expression is the cell output.
X_test_scaled = scaler.transform(X_test)
tuned_predictions = best_estimator.predict(X_test_scaled)
score_rmse(y_test, tuned_predictions)