In [None]:
import os
import sys
# Resolve the parent of the current working directory and append its `src`
# sub-folder to the import search path used by the imports further down.
PROJ_ROOT = os.path.abspath(os.pardir)
src_dir = os.path.join(PROJ_ROOT, 'src')
sys.path.append(src_dir)

%matplotlib inline
%config InlineBackend.figure_format ='retina'

%load_ext autoreload
%autoreload 2

%aimport data, model, features, plot

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

from sklearn.experimental import enable_hist_gradient_boosting
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import StandardScaler

import features.extraction as feature_extraction
from model.selection import estimate, grid_search_cv
from model.evaluation import score_rmse
from sklearn.metrics import r2_score, max_error, explained_variance_score

In [None]:
# Compact five-column feature subset.
# NOTE(review): the following cell rebinds `features_selected`, so on a
# top-to-bottom run this list is superseded before it is used.
features_selected = [
    'traff_m1',
    'traff_mean',
    'traff_m5',
    'device_price',
    'imei_mean_day_announced',
]

In [None]:
# Full feature list used for every model below (44 columns).
# Column order is kept exactly as-is: it determines the column order of the
# feature matrices built from this list.
features_selected = [
    'city_cluster_last_month_std',
    'non_accum_internet_vol_td_mnt3',
    'traff_m4',
    'city_cluster_mean_td',
    'non_accum_internet_vol_max_mnt1',
    'device_days_usage',
    'non_accum_internet_vol_std_mnt1',
    'bs_succ_rate',
    'city_cluster_last_month_median',
    'non_accum_internet_vol_min_mnt1',
    'imei_mean_day_announced',
    'MV_Traf_4G_d_Mb',
    'data_3g_tar_vol_std_mnt3',
    'dpi_events',
    'myvf_day_usage',
    'non_accum_internet_vol_max_mnt3',
    'non_accum_internet_vol_min_mnt3',
    'non_accum_internet_vol_mea_mnt1',
    'imei_mean_days_usage',
    'cnt_sms_category_microcred',
    'traff_td',
    'non_accum_internet_vol_mea_wk1',
    'dpi_upload',
    'traff_std',
    'dpi_download',
    'non_accum_internet_vol_mea_mnt3',
    'imei_max_price',
    'non_accum_internet_vol_std_mnt3',
    'traff_min',
    'city_cluster_last_month_min',
    'traff_max',
    'city_cluster_last_month_max',
    'imei_mean_long_days_usage',
    'traff_mean',
    'content_cnt_std_mnt3',
    'traff_m3',
    'traff_m1',
    'traff_m5',
    'loc_lon',
    'city_cluster_median_td',
    'traff_median',
    'device_price',
    'MV_ap_4G_d',
    'city_cluster_last_month_mean',
]

In [None]:
from data.dataset import DataSet, build_and_save_report

# Bind the three frames exposed as attributes of the project's DataSet class
# to notebook-level names (read in the same order: train, test, geo).
traindf, testdf, geodf = DataSet.traindf, DataSet.testdf, DataSet.geodf

In [None]:
# Derive the 'city' cluster statistics and trends from the training frame only,
# then apply those same values to the training and the test frame.
# The return value of apply_cluster_statistics is discarded, so it presumably
# updates the frame it is given in place — TODO confirm.
cluster_stats, cluster_median_trends, cluster_mean_trends = (
    feature_extraction.calculate_statistics_by_cluster(traindf, 'city')
)
for frame in (traindf, testdf):
    feature_extraction.apply_cluster_statistics(
        frame, 'city', cluster_stats, cluster_median_trends, cluster_mean_trends
    )

In [None]:
# Feature matrices (selected columns only) and target vectors for both splits.
X_train = traindf[features_selected]
y_train = traindf.target
X_test = testdf[features_selected]
y_test = testdf.target

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(
    f"train:\t{traindf.shape}",
    f"test:\t{testdf.shape}",
    sep="\n",
)

In [None]:
# Baseline: LightGBM with default hyper-parameters, fitted on the feature
# matrix as it stands before the mean-imputation cell further down.
baseregressor = LGBMRegressor().fit(X_train, y_train)
print(f"RMSE:\t{score_rmse(y_test, baseregressor.predict(X_test)): .3f}")

In [None]:
# Training columns (features + target) with missing values replaced by the
# per-column mean. NOTE: despite the variable name, the fill value is the
# mean, not the median.
model_columns = features_selected + ['target']
mediandf = traindf[model_columns].fillna(traindf[model_columns].mean())
estimate(mediandf, rows=249876)

In [None]:
# Impute missing feature values with column means learned from the training
# split only. Re-using the training means for the test split keeps both
# matrices on the same fill values and avoids leaking test-set statistics into
# the evaluation (the test split was previously filled with its own means).
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [None]:
# Default-parameter histogram gradient boosting, scored on the test split.
hgbr = HistGradientBoostingRegressor().fit(X_train, y_train)
pred = hgbr.predict(X_test)
print(
    f"RMSE:\t{score_rmse(y_test, pred): .3f}",
    f"R^2:\t{r2_score(y_test, pred): .3f}",
    f"Max Error:\t{max_error(y_test, pred): .3f}",
    sep="\n",
)

In [None]:
# Default-parameter LightGBM on the imputed features, scored on the test split.
lgbm = LGBMRegressor().fit(X_train, y_train)
pred = lgbm.predict(X_test)
metric_lines = [
    f"RMSE:\t{score_rmse(y_test, pred): .3f}",
    f"R^2:\t{r2_score(y_test, pred): .3f}",
    f"Max Error:\t{max_error(y_test, pred): .3f}",
]
print("\n".join(metric_lines))

In [None]:
from sklearn.linear_model import Lasso

# Linear baseline: Lasso with default settings, scored on the test split.
ls = Lasso().fit(X_train, y_train)
pred = ls.predict(X_test)
for metric_line in (
    f"RMSE:\t{score_rmse(y_test, pred): .3f}",
    f"R^2:\t{r2_score(y_test, pred): .3f}",
    f"Max Error:\t{max_error(y_test, pred): .3f}",
):
    print(metric_line)

In [None]:
from sklearn.linear_model import ElasticNet

# Linear baseline: ElasticNet with default settings, scored on the test split.
# NOTE: this rebinds `ls`, which the Lasso cell also uses.
ls = ElasticNet().fit(X_train, y_train)
pred = ls.predict(X_test)
print(
    f"RMSE:\t{score_rmse(y_test, pred): .3f}",
    f"R^2:\t{r2_score(y_test, pred): .3f}",
    f"Max Error:\t{max_error(y_test, pred): .3f}",
    sep="\n",
)

In [None]:
# Stack the two default gradient-boosting models and score the ensemble.
base_estimators = [
    ('lgbm', LGBMRegressor()),
    ('hgrb', HistGradientBoostingRegressor()),
]
st_regressor = StackingRegressor(base_estimators)
st_regressor.fit(X_train[features_selected], y_train)
print(f"StackingRegressor RMSE:\t{score_rmse(y_test, st_regressor.predict(X_test)) : .3f}")

### LGBMRegressor is selected as the base regressor because it is fast and production-ready

In [None]:
from sklearn.model_selection import GridSearchCV

# Single-point grid for LightGBM: every list holds exactly one candidate value.
# Not part of the grid: min_child_weight, objective, random_state (fixed on the
# estimator below), reg_alpha, subsample, subsample_freq.
params = {
    'colsample_bytree': [0.8],
    'learning_rate': [0.015],
    'max_depth': [5],
    'min_child_samples': [200],
    'min_split_gain': [0.05],
    'n_estimators': [200],
    'num_leaves': [30],
    'reg_lambda': [0.3],
    'subsample_for_bin': [100000],
}

# Project helper; returns the fitted estimator together with a scaler.
best_estimator, scaler = grid_search_cv(
    traindf[features_selected + ['target']],
    LGBMRegressor(random_state=1),
    params,
    rows=249876,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Single-point grid for HistGradientBoostingRegressor: one candidate per key.
# NOTE(review): this rebinds `params`, `best_estimator` and `scaler` from the
# LightGBM search cell, so later cells evaluate this model on a full run.
params = {
    'loss': ['least_squares'],
    'learning_rate': [0.1],
    'max_iter': [100],
    'max_leaf_nodes': [31],
    'max_depth': [4],
    'min_samples_leaf': [1000],
    'l2_regularization': [0.0],
    'max_bins': [100],
    'categorical_features': [None],
    'monotonic_cst': [None],
    'warm_start': [False],
    'early_stopping': ['auto'],
    'scoring': ['loss'],
    'validation_fraction': [0.1],
    'n_iter_no_change': [10],
    'tol': [1e-07],
}

# Project helper; returns the fitted estimator together with a scaler.
best_estimator, scaler = grid_search_cv(
    traindf[features_selected + ['target']],
    HistGradientBoostingRegressor(random_state=1),
    params,
    rows=249876,
)

In [None]:
# Score the most recently tuned estimator on the test features, transformed
# with the scaler returned by the same grid search.
X_test_scaled = scaler.transform(X_test)
pred = best_estimator.predict(X_test_scaled)
print(
    f"RMSE:\t{score_rmse(y_test, pred): .3f}",
    f"R^2:\t{r2_score(y_test, pred): .3f}",
    f"Max Error:\t{max_error(y_test, pred): .3f}",
    sep="\n",
)

In [None]:
# Show the predictions ordered from largest to smallest.
pred_sorted = pd.Series(pred).sort_values(ascending=False)
pred_sorted

In [None]:
# Store the latest predictions on the test frame as a `prediction` column.
# This mutates `testdf` in place; the report cell below is handed this frame.
testdf['prediction'] = pred

In [None]:
df[df.group == 'C'].count()

In [None]:
df = build_and_save_report(testdf, './../../app/static/stations.json')

In [None]:
# Number of distinct values in the `station` column of the test frame.
testdf['station'].nunique()

In [None]:
# Ten report rows with the smallest `consuming_rate`, in ascending order.
df[['consuming_rate']].sort_values(by='consuming_rate').head(10)

In [None]:
# Bar chart with one bar per risk bucket.
# NOTE(review): the counts are hard-coded; presumably copied from an earlier
# run of the report — confirm they still match `df`.
risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']
risk_counts = [1548, 144, 0]

risk_bar = go.Bar(
    x=risk_labels,
    y=risk_counts,
    text=risk_counts,
    textposition='auto',
    marker_color=['green', 'yellow', 'red'],  # one color per bar
)
fig = go.Figure(data=[risk_bar])
fig.update_layout(
    template='plotly_dark',
    plot_bgcolor='rgb(42, 56, 61)',
    paper_bgcolor='rgb(42, 56, 61)',
    font={'color': "white", 'size': 26},
    width=1000,
    height=500,
)
fig.show()

In [None]:
# Per-row prediction error (actual minus predicted) for the latest `pred`.
residuals = y_test - pred

In [None]:
# Keep only the test rows whose absolute residual exceeds 100.
large_error_index = residuals[residuals.abs() > 100].index
traff_plot_df = testdf[testdf.index.isin(large_error_index)]

In [None]:
from plot.plot import traffic_parallel_coords

# Parallel-coordinates figure for the large-residual rows, drawn with the same
# dark theme colors as the risk bar chart.
dark_layout = {
    'template': 'plotly_dark',
    'plot_bgcolor': 'rgb(42, 56, 61)',
    'paper_bgcolor': 'rgb(42, 56, 61)',
    'font': {'color': "white", 'size': 18},
    'width': 1100,
    'height': 600,
}
fig = traffic_parallel_coords(traff_plot_df, max_value=200.0, range=[0.8, 1])
fig.update_layout(**dark_layout)