In [None]:
# Resolve the parent of the current working directory and put its `src` folder on
# the import path so the project packages used below can be imported.
# NOTE(review): this assumes the kernel's working directory sits directly under the
# project root — confirm for your launch setup.
import os
import sys
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(os.path.join(PROJ_ROOT, 'src'))

# Show matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format ='retina'

# Enable IPython's autoreload extension (mode 2) so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Register the project packages with the autoreload extension.
%aimport data, model, features, plot

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px


from plot.plot import scatter_against_target_fig

import features.selection as feature_selection
import features.extraction as feature_extraction

from model.selection import estimate, grid_search_cv
from model.evaluation import score_rmse
from sklearn.metrics import r2_score, max_error, explained_variance_score

from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
from data.dataset import DataSet

# Grab the train/test frames exposed as attributes of the project's DataSet.
traindf, testdf = DataSet.traindf, DataSet.testdf

# Sanity check: report the dimensions of both frames.
print(f"train:\t{traindf.shape}\ntest:\t{testdf.shape}")

In [None]:
# Derive the per-'city' statistics and trends from the training frame only, then
# apply those same values to both the training and the test frame.
# NOTE(review): the return value of apply_cluster_statistics is discarded, so it
# presumably modifies the frame it receives in place — confirm in features.extraction.
CLUSTER_COL = 'city'

cluster_stats, cluster_median_trends, cluster_mean_trends = (
    feature_extraction.calculate_statistics_by_cluster(traindf, CLUSTER_COL)
)

for frame in (traindf, testdf):
    feature_extraction.apply_cluster_statistics(
        frame, CLUSTER_COL, cluster_stats, cluster_median_trends, cluster_mean_trends
    )


In [None]:
# Build the model inputs: drop the label and the 'station' column, then one-hot
# encode whatever categorical columns remain (dummy columns are named after the
# raw category value because prefix and separator are both empty).
# NOTE(review): train and test are encoded independently, so the two frames are
# not guaranteed to end up with identical columns — verify before relying on it.
NON_FEATURE_COLS = ['target', 'station']

X_train = pd.get_dummies(traindf.drop(columns=NON_FEATURE_COLS), prefix='', prefix_sep='')
y_train = traindf.target

X_test = pd.get_dummies(testdf.drop(columns=NON_FEATURE_COLS), prefix='', prefix_sep='')
y_test = testdf.target

## Baseline:

In [None]:
# Naive baseline: use the existing `traff_mean` column as the prediction and score
# it against the test target with the same metrics used for the models below.
# r2_score, max_error and explained_variance_score are already imported in the
# notebook's top import cell, so they are not re-imported here.
baseline_pred = X_test.traff_mean

print(f"Base line RMSE:\t\t{score_rmse(y_test, baseline_pred):.3f}")
print(f"R^2:\t{r2_score(y_test, baseline_pred): .3f}")
print(f"Max Error:\t{max_error(y_test, baseline_pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, baseline_pred): .3f}")

In [None]:
# `feat` was not defined in any of the visible cells above, so this cell raised a
# NameError on a fresh kernel. The label printed below says "All features", so
# train on every column present in both one-hot encoded frames (they are encoded
# independently and are therefore not guaranteed to share the same columns).
feat = X_train.columns.intersection(X_test.columns, sort=False)

# Default LightGBM regressor as the reference model on the full feature set.
baseregressor = LGBMRegressor()
baseregressor.fit(X_train[feat], y_train)
pred = baseregressor.predict(X_test[feat])

print(f"All features RMSE:\t{score_rmse(y_test, pred): .3f}")
print(f"R^2:\t{r2_score(y_test, pred): .3f}")
print(f"Max Error:\t{max_error(y_test, pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, pred): .3f}")

In [None]:
# Run the project's Boruta feature-selection helper on the training data.
# NOTE(review): no random seed is passed here; if the helper is stochastic the
# selected features may change between runs — confirm in features.selection.
BORUTA_ITERATIONS = 20

boruta_result = feature_selection.boruta(X_train, y_train, iterations=BORUTA_ITERATIONS)
boruta_appearance, boruta_importance = boruta_result

In [None]:
# Keep only the features whose appearance value is at least 2, and score each one
# by the sum of its appearance and importance values.
frequent_features = boruta_appearance.where(lambda hits: hits >= 2).dropna().index
combined_score = boruta_appearance + boruta_importance
boruta_features = combined_score[frequent_features]

# Bar chart of the retained features, highest combined score first.
fig = px.bar(boruta_features.sort_values(ascending=False))

boruta_layout = dict(
    title='Boruta feature importance',
    template='plotly_dark',
    plot_bgcolor='rgb(42, 56, 61)',
    paper_bgcolor='rgb(42, 56, 61)',
    font=dict(color="white", size=18),
    width=1100,
    height=600,
    showlegend=False,
)
fig.update_layout(**boruta_layout)

# Angle the feature names and drop the automatic axis titles.
fig.update_xaxes(tickangle=45, title_text='')
fig.update_yaxes(title_text='')
fig.show()

In [None]:
# Scatter every Boruta-retained feature against the target, ordered by descending
# combined score, using the project's plotting helper.
ranked_feature_cols = list(boruta_features.sort_values(ascending=False).index)
scatter_input = traindf[ranked_feature_cols + ['target']]

fig = scatter_against_target_fig(scatter_input, cols=3, sample=2000)

scatter_layout = dict(
    template='plotly_dark',
    plot_bgcolor='rgb(42, 56, 61)',
    paper_bgcolor='rgb(42, 56, 61)',
    font=dict(color="white", size=18),
    height=2000,
    showlegend=False,
)
fig.update_layout(**scatter_layout)
fig.show()

In [None]:
# Hard-coded feature subset used by every model from here on.
# NOTE(review): presumably frozen from an earlier Boruta run so later cells do not
# depend on re-running the selection — confirm.
features_selected = [
    'city_cluster_last_month_std',
    'non_accum_internet_vol_td_mnt3',
    'traff_m4',
    'city_cluster_mean_td',
    'non_accum_internet_vol_max_mnt1',
    'device_days_usage',
    'non_accum_internet_vol_std_mnt1',
    'bs_succ_rate',
    'city_cluster_last_month_median',
    'non_accum_internet_vol_min_mnt1',
    'imei_mean_day_announced',
    'MV_Traf_4G_d_Mb',
    'data_3g_tar_vol_std_mnt3',
    'dpi_events',
    'myvf_day_usage',
    'non_accum_internet_vol_max_mnt3',
    'non_accum_internet_vol_min_mnt3',
    'non_accum_internet_vol_mea_mnt1',
    'imei_mean_days_usage',
    'cnt_sms_category_microcred',
    'traff_td',
    'non_accum_internet_vol_mea_wk1',
    'dpi_upload',
    'traff_std',
    'dpi_download',
    'non_accum_internet_vol_mea_mnt3',
    'imei_max_price',
    'non_accum_internet_vol_std_mnt3',
    'traff_min',
    'city_cluster_last_month_min',
    'traff_max',
    'city_cluster_last_month_max',
    'imei_mean_long_days_usage',
    'traff_mean',
    'content_cnt_std_mnt3',
    'traff_m3',
    'traff_m1',
    'traff_m5',
    'loc_lon',
    'city_cluster_median_td',
    'traff_median',
    'device_price',
    'MV_ap_4G_d',
    'city_cluster_last_month_mean',
]

In [None]:
# Rebuild the model inputs using only the selected feature columns.
X_train = traindf[features_selected]
y_train = traindf.target

X_test = testdf[features_selected]
y_test = testdf.target

# NOTE(review): these report the shapes of the full frames, not of the
# feature-restricted X_train / X_test — confirm that is what was intended.
print(f"train:\t{traindf.shape}")
print(f"test:\t{testdf.shape}")

## Model selection:

In [None]:
# Reference score: default LightGBM regressor on the selected feature subset.
baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)

base_rmse = score_rmse(y_test, baseregressor.predict(X_test))
print(f"RMSE:\t{base_rmse: .3f}")

In [None]:
# Fill missing values in the selected columns (and the target) with each column's
# mean, then hand the frame to the project's `estimate` helper.
# NOTE(review): the frame is called `mediandf` but is filled with the column
# *mean* — confirm which statistic was intended.
selected_with_target = traindf[features_selected + ['target']]
mediandf = selected_with_target.fillna(selected_with_target.mean())

estimate(mediandf, rows=249876)

In [None]:
# Candidate model: scikit-learn's histogram gradient boosting with default settings.
hgbr = HistGradientBoostingRegressor()
hgbr.fit(X_train, y_train)
pred = hgbr.predict(X_test)

# Report the same three metrics for every candidate model.
for label, metric in (("RMSE", score_rmse), ("R^2", r2_score), ("Max Error", max_error)):
    print(f"{label}:\t{metric(y_test, pred): .3f}")

In [None]:
# Candidate model: LightGBM regressor with default settings.
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
pred = lgbm.predict(X_test)

# Report the same three metrics for every candidate model.
for label, metric in (("RMSE", score_rmse), ("R^2", r2_score), ("Max Error", max_error)):
    print(f"{label}:\t{metric(y_test, pred): .3f}")

## Hyperparameter tuning:

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid for the LightGBM regressor. Every entry currently holds a single
# value, so the "search" evaluates exactly one configuration.
# Not part of the grid (previously listed as commented-out entries):
#   min_child_weight=[0.001], objective=None, random_state=None,
#   reg_alpha=[0], subsample=[0.5], subsample_freq=[0]
params = dict(
    colsample_bytree=[0.8],
    learning_rate=[0.015],
    max_depth=[5],
    min_child_samples=[200],
    min_split_gain=[0.05],
    n_estimators=[200],
    num_leaves=[30],
    reg_lambda=[0.3],
    subsample_for_bin=[100000],
)

# The project helper returns the fitted estimator together with the scaler that
# must be applied to any data passed to it afterwards.
tuning_frame = traindf[features_selected + ['target']]
best_estimator, scaler = grid_search_cv(
    tuning_frame,
    LGBMRegressor(random_state=1),
    params,
    rows=249876,
)

In [None]:
# Score the tuned model on the test set; inputs go through the scaler returned
# alongside the estimator by the grid search.
X_test_scaled = scaler.transform(X_test)
pred = best_estimator.predict(X_test_scaled)

for label, metric in (("RMSE", score_rmse), ("R^2", r2_score), ("Max Error", max_error)):
    print(f"{label}:\t{metric(y_test, pred): .3f}")

In [None]:
# Attach the predictions to the test frame and keep only the rows whose absolute
# residual exceeds 100, for inspection in the plot below.
# NOTE(review): this writes a new column into the shared `testdf`; re-running the
# earlier one-hot encoding cell afterwards would pick up 'prediction' as a feature.
testdf['prediction'] = pred
residuals = y_test - pred

large_residual_idx = residuals.index[residuals.abs() > 100]
traff_plot_df = testdf.loc[testdf.index.isin(large_residual_idx)]

In [None]:
from plot.plot import traffic_parallel_coords

# Parallel-coordinates view of the rows with large residuals, built by the
# project's plotting helper.
fig = traffic_parallel_coords(traff_plot_df, max_value=200.0, range=[0.6, 0.8])

parallel_layout = dict(
    template='plotly_dark',
    plot_bgcolor='rgb(42, 56, 61)',
    paper_bgcolor='rgb(42, 56, 61)',
    font=dict(color="white", size=18),
    width=1100,
    height=600,
)
# Left as the cell's final expression so the returned figure is rendered.
fig.update_layout(**parallel_layout)