In [None]:
import os
import sys
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(os.path.join(PROJ_ROOT, 'src'))

%matplotlib inline
%config InlineBackend.figure_format ='retina'

%load_ext autoreload
%autoreload 2

%aimport data, model, features, plot

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

from lightgbm import LGBMRegressor


from plot.plot import scatter_against_target_fig

from model.evaluation import score_rmse
from model.regressor import ClusterRegressor
import features.selection as feature_selection
import features.extraction as feature_extraction

In [None]:
from data.dataset import DataSet

# Train/test frames are read from attributes of the project's DataSet class.
traindf, testdf = DataSet.traindf, DataSet.testdf

# Quick sanity check of both shapes.
print(f"train:\t{traindf.shape}\ntest:\t{testdf.shape}")

In [None]:
cluster_stats, cluster_median_trends, cluster_mean_trends = feature_extraction\
                                                                .calculate_statistics_by_cluster(traindf, 'city')

In [None]:
feature_extraction.apply_cluster_statistics(traindf, 'city', cluster_stats, cluster_median_trends, cluster_mean_trends)

In [None]:
traindf

In [None]:
feature_extraction.apply_cluster_statistics(testdf, 'city', cluster_stats, cluster_median_trends, cluster_mean_trends)

In [None]:
X_train, y_train = pd.get_dummies(traindf.drop(columns=['target', 'station']), prefix='', prefix_sep=''), traindf.target
X_test, y_test = pd.get_dummies(testdf.drop(columns=['target', 'station']), prefix='', prefix_sep=''), testdf.target

In [None]:
X_train, y_train = traindf.drop(columns=['target', 'station', 'city']), traindf.target
X_test, y_test = testdf.drop(columns=['target', 'station', 'city']), testdf.target

In [None]:
xtset = set(X_test.columns)
xtrset = set(X_train.columns)
xtset.difference(xtrset)

In [None]:
X_test.shape

In [None]:
xtset.difference(xtrset)

In [None]:
# Marker substrings that identify cluster-derived columns.
CLUSTER_POSTFIXES = ['city_cluster']


def cluster_columns(columns, postfixes=None, exclude=()):
    """Return the column labels that contain any of the cluster markers.

    Matching is by substring (``marker in label``), not strictly by suffix.

    Args:
        columns: iterable of column labels, e.g. ``DataFrame.columns``.
        postfixes: marker substring(s) to look for. A single string or an
            iterable of strings; defaults to ``CLUSTER_POSTFIXES``.
        exclude: labels to leave out even when they match a marker.

    Returns:
        list: matching labels in their original order. Non-string labels are
        skipped because a substring test is meaningless for them.
    """
    if postfixes is None:
        markers = tuple(CLUSTER_POSTFIXES)
    elif isinstance(postfixes, str):
        # Guard against a bare string being iterated character by character.
        markers = (postfixes,)
    else:
        markers = tuple(postfixes)
    excluded = set(exclude)  # built once, not once per column
    return [
        label for label in columns
        if isinstance(label, str)
        and label not in excluded
        and any(marker in label for marker in markers)
    ]

In [None]:
traindf.city.unique()

In [None]:
cluster_columns(traindf.columns)

In [None]:
ffeature = list(set.union(set(cluster_columns(traindf.columns)), boruta_features.index))

In [None]:
ffeature

In [None]:
feat = feature_selection.correlation(traindf.drop(columns=['station', 'city']), 'target',threshold=0.15).abs().sort_values(ascending=False).index


In [None]:
len(feat)

#### Note: for feature extraction, see the `features.extraction` package

In [None]:
from sklearn.metrics import r2_score, max_error, explained_variance_score

# Naive baseline: the `traff_mean` column itself is used as the prediction.
baseline_pred = X_test.traff_mean
print(f"Base line RMSE:\t\t{score_rmse(y_test, baseline_pred):.3f}")
print(f"R^2:\t{r2_score(y_test, baseline_pred): .3f}")
print(f"Max Error:\t{max_error(y_test, baseline_pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, baseline_pred): .3f}")

# LightGBM with default hyper-parameters, restricted to the columns in `feat`.
# NOTE(review): the label below says "All features" although only `feat` is used.
baseregressor = LGBMRegressor()
baseregressor.fit(X_train[feat], y_train)
pred = baseregressor.predict(X_test[feat])
print(f"All features RMSE:\t{score_rmse(y_test, pred): .3f}")
print(f"R^2:\t{r2_score(y_test, pred): .3f}")
print(f"Max Error:\t{max_error(y_test, pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, pred): .3f}")

In [None]:
boruta_features.sort_values(ascending=False).index[:6]

In [None]:
from sklearn.metrics import r2_score, max_error, explained_variance_score
print(f"All features RMSE:\t{r2_score(y_test, pred): .3f}")

In [None]:
from sklearn.metrics import r2_score, max_error, explained_variance_score
print(f"Base line RMSE:\t\t{max_error(y_test, X_test.traff_mean):.3f}")

In [None]:
# Segment 1: rows whose traff_std does not exceed the cut-off.
std = 10
train_low = traindf[traindf.traff_std <= std]
test_low = testdf[testdf.traff_std <= std]

# NOTE(review): only 'target' is dropped here, while other split cells in this
# notebook also drop 'station' and 'city' - confirm those columns are meant to
# reach the model.
X_train, y_train = train_low.drop(columns=['target']), train_low.target
X_test, y_test = test_low.drop(columns=['target']), test_low.target

baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)
pred1 = baseregressor.predict(X_test)
ytest1 = y_test
print(f"All features RMSE:\t{score_rmse(y_test, pred1): .3f}")

In [None]:
# Segment 2: rows whose traff_std is strictly above the cut-off.
std = 10
train_high = traindf[traindf.traff_std > std]
test_high = testdf[testdf.traff_std > std]

# NOTE(review): only 'target' is dropped here, while other split cells in this
# notebook also drop 'station' and 'city' - confirm those columns are meant to
# reach the model.
X_train, y_train = train_high.drop(columns=['target']), train_high.target
X_test, y_test = test_high.drop(columns=['target']), test_high.target

baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)

# Kept under separate names so later cells can compute residuals for this segment.
pred2 = baseregressor.predict(X_test)
y_test2 = y_test
print(f"All features RMSE:\t{score_rmse(y_test, pred2): .3f}")

In [None]:
# Scores a constant prediction of 27.392696 against the high-std test targets.
# NOTE(review): the printed label still says "All features", and the origin of
# the constant is not shown in this notebook.
constant_pred = np.full(len(y_test2), 27.392696)
print(f"All features RMSE:\t{score_rmse(y_test2, constant_pred): .3f}")

In [None]:
# Working copy of the high-volatility training rows (traff_std above 10).
df = traindf[traindf.traff_std > 10].copy()
# Impute missing values with per-column means. `numeric_only=True` is required
# because the frame still carries columns such as 'station' and 'city' (presumably
# non-numeric - other cells drop or dummy-encode them): without it DataFrame.mean()
# raises TypeError on pandas >= 2.0 and warns on 1.x. Non-numeric columns are left
# untouched by fillna.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
feat = feature_selection.correlation(df, 'target',threshold=0.01).abs().sort_values(ascending=False)[:20].index

In [None]:
from model.selection import estimate
estimate(df[np.append(feat ,['target', 'city_cluster_last_month_max','loc_lon', 
                             'city_cluster_last_month_min',
                             'city_cluster_last_month_median',
                            'city_cluster_last_month_mean','city_cluster_last_month_std', 'city_cluster_median_td', 'city_cluster_mean_td'])], rows=df.shape[0])

In [None]:
prediction = baseregressor.predict(X_test)

In [None]:
residuals = y_test2 - pred2

In [None]:
px.histogram(residuals)

In [None]:
testdf[testdf.index.isin(residuals.index.values)].traff_m1.median()

In [None]:
list1 = [1, 2,3]
list2 = [3,4,5]

In [None]:
np.append(list1, list2)

In [None]:
boruta_appearance, boruta_importance = feature_selection.boruta(X_train, y_train, iterations=20)

In [None]:
boruta_features = (boruta_appearance + boruta_importance)[boruta_appearance.where(lambda x: x >= 2).dropna().index]
fig = px.bar(boruta_features.sort_values(ascending=False))
fig.update_layout(title='Boruta feature importance',
    template='plotly_dark',plot_bgcolor='rgb(42, 56, 61)',paper_bgcolor='rgb(42, 56, 61)', 
                  font=dict(color="white", size=18, ), width=1100, height=600, showlegend=False)
fig.update_xaxes(tickangle=45, title_text='')
fig.update_yaxes(title_text='')
fig.show()

In [None]:
boruta_features.sort_values(ascending=False)

In [None]:
baseregressor.fit(X_train[list(boruta_features.index)], y_train)
dddf = X_test[list(boruta_features.index)]
pred = baseregressor.predict(dddf)
print(f"All features RMSE:\t{score_rmse(y_test, pred): .3f}")
print(f"R^2:\t{r2_score(y_test, pred): .3f}")
print(f"Max Error:\t{max_error(y_test, pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, pred): .3f}")

In [None]:
feat = feature_selection.correlation(traindf.drop(columns=['station', 'city']), 'target',threshold=0.2).abs().sort_values(ascending=False).index

In [None]:
xtset = set(feat)
xtrset = set(boruta_features.index)
final_features = set.union(xtset, xtrset)

In [None]:
len(final_features)

In [None]:
# `final_features` is a set of strings, whose iteration order depends on the
# per-process hash seed. Sort once so the training and prediction matrices use
# the same, reproducible column order across kernel restarts.
final_columns = sorted(final_features)
baseregressor.fit(X_train[final_columns], y_train)
dddf = X_test[final_columns]
pred = baseregressor.predict(dddf)
print(f"All features RMSE:\t{score_rmse(y_test, pred): .3f}")
print(f"R^2:\t{r2_score(y_test, pred): .3f}")
print(f"Max Error:\t{max_error(y_test, pred): .3f}")
print(f"Var:\t{explained_variance_score(y_test, pred): .3f}")

In [None]:
# Segment model for rows whose traff_std does not exceed 2.
std = 2
train_low = traindf[traindf.traff_std <= std]
test_low = testdf[testdf.traff_std <= std]

# 'target' must be removed from the feature matrices as well: leaving it in
# (as the previous version did) leaks the label into the model and makes the
# reported metrics meaningless.
non_feature_columns = ['target', 'station', 'city']
X_train, y_train = train_low.drop(columns=non_feature_columns), train_low.target
X_test, y_test = test_low.drop(columns=non_feature_columns), test_low.target


baseregressor = LGBMRegressor()
baseregressor.fit(X_train, y_train)
pred1 = baseregressor.predict(X_test)

print(f"All features RMSE:\t{score_rmse(y_test, pred1): .3f}")
print(f"R^2:\t{r2_score(y_test, pred1): .3f}")
print(f"Max Error:\t{max_error(y_test, pred1): .3f}")
print(f"Var:\t{explained_variance_score(y_test, pred1): .3f}")

In [None]:
fig = scatter_against_target_fig(
    traindf[list(boruta_features.sort_values(ascending=False).index) + ['target']], cols=3,
    sample=2000)
fig.update_layout(template='plotly_dark',plot_bgcolor='rgb(42, 56, 61)',paper_bgcolor='rgb(42, 56, 61)', 
                  font=dict(color="white", size=18, ), height=2000, showlegend=False)
# fig.show()
fig.write_html('scatter.html')

In [None]:
boruta_features.index

### Recursive feature elimination with cross-validation

In [None]:
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=LGBMRegressor(), step=1, cv=5,
              scoring='neg_root_mean_squared_error',
              min_features_to_select=1)
rfecv.fit(traindf[boruta_features.index], traindf.target);

In [None]:
boruta_features.index[rfecv.support_]

In [None]:
# `RFECV.grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the mean CV score per feature count now lives in `cv_results_`. Fall back to
# the old attribute only on releases that predate `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    rfecv_scores = rfecv.cv_results_['mean_test_score']
else:
    rfecv_scores = rfecv.grid_scores_
fig = px.line(rfecv_scores, title='Score vs Number of features',)
fig.update_layout(showlegend=False)

In [None]:
features_base = ['traff_m1', 'traff_mean', 'traff_m5']

In [None]:
baseregressor.fit(X_train[features_base], y_train)
print(f"Base features RMSE:\t{score_rmse(y_test, baseregressor.predict(X_test[features_base])): .3f}")

In [None]:
other_non_traffic_features = [x for x in boruta_features.index if not x.startswith('traff')]

In [None]:
result = feature_selection.grid_search(traindf, features_base, other_non_traffic_features,max_count_to_add=2)

In [None]:
result[0].sort_values(ascending=False)[:10]

In [None]:
result[1].sort_values(ascending=False)[:10]

In [None]:
pd.DataFrame((X_train[boruta_features.index].isnull().mean() * 100).apply(lambda x: round(x, 3))).sort_values(by=0)

In [None]:
features_selected = features_base + ['device_price', 'imei_mean_day_announced']

In [None]:
baseregressor.fit(X_train[features_selected], y_train)
print(f"Selected features RMSE:\t{score_rmse(y_test, baseregressor.predict(X_test[features_selected])): .3f}")