In [2]:
import os
import sys
# Project root = parent of the notebook's current working directory.
PROJ_ROOT = os.path.abspath(os.pardir)

# Make the packages under <project root>/src importable from the notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_dir = os.path.join(PROJ_ROOT, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

%matplotlib inline
%config InlineBackend.figure_format ='retina'

%load_ext autoreload
%autoreload 2

%aimport data, model, features, plot

import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plot.plot import scatter_against_target_fig

from model.selection import estimate
from model.evaluation import score_rmse
from model.training import train
from features.selection import boruta
import features.selection as feature_selection
import features.extraction as feature_extraction

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [111]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [13]:
# Columns used as model inputs by every regressor fitted in this notebook.
features_selected = [
    'traff_m1',
    'traff_mean',
    'traff_m5',
    'MV_ap_4G_d',
    'bs_succ_rate',
    'device_price',
]

In [12]:
from data.dataset import DataSet

# Re-select the columns of each split by their sorted, unique labels.
# NOTE(review): if a label occurs more than once, label-based selection
# presumably still returns every matching column -- confirm whether this
# step is meant to de-duplicate columns or only to order them.
train_columns = np.unique(DataSet.traindf.columns)
traindf = DataSet.traindf[train_columns]

test_columns = np.unique(DataSet.testdf.columns)
testdf = DataSet.testdf[test_columns]

# Model inputs (selected features) and targets for both splits.
X_train = traindf[features_selected]
y_train = traindf.target
X_test = testdf[features_selected]
y_test = testdf.target

In [4]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = traindf.shape, testdf.shape
print(f"train:\t{train_shape}")
print(f"test:\t{test_shape}")

train:	(249876, 915)
test:	(242972, 915)


In [119]:
# Baseline: LightGBM with default hyper-parameters on the selected features,
# scored on the test split.
baseregressor = LGBMRegressor().fit(X_train[features_selected], y_train)
base_predictions = baseregressor.predict(X_test[features_selected])
print(f"RMSE:\t{score_rmse(y_test, base_predictions): .3f}")

RMSE:	 8.700


In [109]:
# Impute missing values in the selected features (and the target) with the
# per-column mean, then pass the frame to the project's `estimate` helper.
# NOTE(review): the variable is called `mediandf` but the fill value is the
# column *mean*, and the target column is imputed as well -- confirm both
# are intended.
model_frame = traindf[features_selected + ['target']]
mediandf = model_frame.fillna(model_frame.mean())
estimate(mediandf, rows=100000)

LinearRegression              		 RMSE: -546.561			 STD:  1075.86
SGDRegressor                  		 RMSE: -807130.587			 STD:  444672.53
Lasso                         		 RMSE: -8.733			 STD:  0.15
Ridge                         		 RMSE: -54.609			 STD:  91.96
LassoLars                     		 RMSE: -11.812			 STD:  0.10
ElasticNet                    		 RMSE: -8.957			 STD:  0.09
BayesianRidge                 		 RMSE: -544.891			 STD:  1072.52
HuberRegressor                		 RMSE: -364.970			 STD:  712.48
RANSACRegressor               		 RMSE: -24.005			 STD:  27.86
PoissonRegressor              		 RMSE: -44214.815			 STD:  88406.49
TweedieRegressor              		 RMSE: -9.164			 STD:  0.09
PassiveAggressiveRegressor    		 RMSE: -26.123			 STD:  33.94
DecisionTreeRegressor         		 RMSE: -12.187			 STD:  0.14
BaggingRegressor              		 RMSE: -8.982			 STD:  0.12
ExtraTreesRegressor           		 RMSE: -8.640			 STD:  0.12
GradientBoostingRegressor     		 RMSE: -8.375			 STD:  0.12


In [105]:
# Candidate: scikit-learn histogram gradient boosting with default settings,
# scored on the test split.
hgrb_regressor = HistGradientBoostingRegressor().fit(X_train[features_selected], y_train)
hgrb_predictions = hgrb_regressor.predict(X_test[features_selected])
print(f"HistGradientBoostingRegressor RMSE:\t{score_rmse(y_test, hgrb_predictions): .3f}")

HistGradientBoostingRegressor RMSE:	 8.673


In [106]:
# Candidate: stack the two gradient-boosting models (both with default
# settings) and score the ensemble on the test split.
stack_members = [
    ('lgbm', LGBMRegressor()),
    ('hgrb', HistGradientBoostingRegressor()),
]
st_regressor = StackingRegressor(stack_members)
st_regressor.fit(X_train[features_selected], y_train)
stack_predictions = st_regressor.predict(X_test[features_selected])
print(f"StackingRegressor RMSE:\t{score_rmse(y_test, stack_predictions) : .3f}")

StackingRegressor RMSE:	 8.679


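### LGBMRegressor is selected as the base regressor because it is fast and production-ready, and its test RMSE (8.700) is on par with HistGradientBoostingRegressor (8.673) and the stacked ensemble (8.679)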

In [292]:
from sklearn.model_selection import GridSearchCV

# Search space for LGBMRegressor. Only learning_rate, max_depth and
# n_estimators have more than one candidate value; every other entry is
# pinned to a single value. class_weight, objective and random_state are not
# part of the grid, so the estimator's own defaults apply to them.
param_grid = dict(
    boosting_type=['gbdt'],
    colsample_bytree=[1.0],
    importance_type=['split'],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 6],
    min_child_samples=[20],
    min_child_weight=[0.001],
    min_split_gain=[0.0],
    n_estimators=[100, 200, 300, 500],
    n_jobs=[-1],
    num_leaves=[31],
    reg_alpha=[0.0],
    reg_lambda=[0.0],
    subsample=[1.0],
    subsample_for_bin=[200000],
    subsample_freq=[0],
)

# Exhaustive 5-fold cross-validated search scored by negative RMSE.
grid = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

grid.fit(X_train[features_selected], y_train)

GridSearchCV(cv=5, estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'boosting_type': ['gbdt'], 'colsample_bytree': [1.0],
                         'importance_type': ['split'],
                         'learning_rate': [0.05, 0.1], 'max_depth': [3, 6],
                         'min_child_samples': [20], 'min_child_weight': [0.001],
                         'min_split_gain': [0.0],
                         'n_estimators': [100, 200, 300, 500], 'n_jobs': [-1],
                         'num_leaves': [31], 'reg_alpha': [0.0],
                         'reg_lambda': [0.0], 'subsample': [1.0],
                         'subsample_for_bin': [200000], 'subsample_freq': [0]},
             scoring='neg_root_mean_squared_error')

In [293]:
# Display the parameter combination that the fitted grid search selected.
grid.best_params_

{'boosting_type': 'gbdt',
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 200,
 'n_jobs': -1,
 'num_leaves': 31,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [294]:
# RMSE on the test split for the grid-search object's predictions.
tuned_predictions = grid.predict(X_test[features_selected])
score_rmse(y_test, tuned_predictions)

8.667221697119745