In [None]:
import pandas as pd
import numpy as np
import shared
import static_individual_estimates
import json

In [None]:
# Ask the shared helper which race type this run targets; the bare name on the
# last line echoes the value as the cell output.
# NOTE(review): the variable name suggests a two-way switch (presumably Venla vs. Jukola) — confirm in shared.race_type().
ve_or_ju = shared.race_type()
ve_or_ju

In [None]:
import time

# Wall-clock start of the run; the final cells subtract this from the end time
# to report the total runtime in minutes.
startTime = time.time()

In [None]:
# Load the modelling inputs from the project helper. x and y are split into
# train/test sets below; `features` is used further down to look up the
# "team_id" column position and to label coefficients and importances.
# NOTE(review): later cells apply np.exp to y and to predictions, so y is presumably log-transformed — confirm in static_individual_estimates.
(x, y, features) = static_individual_estimates.preprocess_countries_names_and_features()
features.head(10)

In [None]:
# Show the dimensions of the feature matrix and the target, one output each.
display(x.shape, y.shape)

In [None]:
import io

# DataFrame.info() prints its summary and returns None, so calling it inline
# would store a literal None in the report. Capture the text in a buffer,
# keep it for the report, and still print it so the cell output is unchanged.
features_info_buffer = io.StringIO()
features.info(buf=features_info_buffer)
features_info_text = features_info_buffer.getvalue()
print(features_info_text)

# Running list of report lines; later cells append to it and the final cell
# writes it out through shared.write_simple_text_report.
reports = [
    f'x.shape: {x.shape}',
    f'y.shape: {y.shape}',
    features_info_text,
]

In [None]:
import sklearn
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# Hold out 5% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs. The split sizes are also recorded in the report.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=2019)
reports.append(f'x_train: {x_train.shape}, x_test: {x_test.shape}')
x_train.shape

In [None]:
import matplotlib.pyplot as plt
# Position of the "team_id" column; used to pick the x-axis values for the
# diagnostic scatter plot. This relies on the columns of x being in the same
# order as features.columns.
index_of_team_id = list(features.columns).index("team_id")
def fit_and_test_model(model, x_train, x_test, y_train, y_test, fit_params=None):
    """Fit ``model`` on the training split and report its accuracy on the test split.

    Both the predictions and ``y_test`` are passed through ``np.exp`` before
    scoring, so every metric is computed on the exponentiated scale.
    NOTE(review): this presumes the targets are log-transformed — confirm upstream.

    Side effects: prints four metrics, appends the model parameters and the R²
    score to the module-level ``reports`` list, and shows a scatter plot of
    true (red) and predicted (blue) values against the ``team_id`` column.

    Args:
        model: estimator exposing ``fit``, ``predict`` and ``get_params``.
        x_train: training features.
        x_test: test features; indexed as ``x_test[:, index_of_team_id]``.
        y_train: training targets; flattened to 1-D before fitting.
        y_test: test targets.
        fit_params: optional dict of extra keyword arguments forwarded to
            ``model.fit``. ``None`` (the default) forwards nothing.
    """
    # None instead of a mutable ``{}`` default; np.ravel also accepts targets
    # that are not ndarrays.
    model.fit(x_train, np.ravel(y_train), **(fit_params or {}))

    # Back-transform once and reuse for every metric and for the plot.
    y_true = np.exp(y_test)
    y_pred = np.exp(model.predict(x_test))
    r2 = r2_score(y_true, y_pred)

    print(f"Shapes: y_test={y_true.shape} y_pred={y_pred.shape}")
    print("Mean absolute percetange error: %.3f" % mean_absolute_percentage_error(y_true, y_pred))
    print("Median absolute error: %.3f" % median_absolute_error(y_true, y_pred))
    print("Mean squared error: %.3f" % mean_squared_error(y_true, y_pred))
    # The label says "Explained variance" but the value is r2_score; the text
    # is kept unchanged so existing reports stay comparable.
    print('Explained variance score: %.3f' % r2)

    reports.append(f'{type(model)}: {model.get_params()}')
    # r2_score may return a built-in float, which has no .round() method, so
    # use the round() builtin on a plain float instead.
    reports.append(f'Explained variance score: {round(float(r2), 3)}')

    # Very low alpha so that dense regions show up as darker clouds.
    plt.scatter(x_test[:,index_of_team_id], y_true, color='red', alpha=0.01)
    plt.scatter(x_test[:,index_of_team_id], y_pred, color='blue', alpha=0.01)
    plt.ylim(4, 20)
    plt.show()

In [None]:
# Show the shape of the training target; the training helper flattens it to 1-D before fitting.
y_train.shape

In [None]:

# Baseline: plain linear regression fitted on the training split and scored on the held-out split.
linear = linear_model.LinearRegression()
fit_and_test_model(linear, x_train, x_test, y_train, y_test)

In [None]:
# Pair every feature column with its fitted linear coefficient and list them
# from the most negative to the most positive.
coefs = pd.DataFrame({'name': features.keys(), 'coef': linear.coef_})
coefs_sorted = coefs.sort_values(by="coef")
display(coefs_sorted.round(4))

In [None]:
# Ridge regression with regularisation strength alpha=0.5, evaluated the same way as the plain linear fit.
ridge = linear_model.Ridge(alpha=0.5)
fit_and_test_model(ridge, x_train, x_test, y_train, y_test)

In [None]:
# Read the stored gradient-boosting hyper-parameters for the current race id.
best_params_path = f"models/best_params_gbr_{shared.race_id_str()}.json"
with open(best_params_path) as params_file:
    best_params = json.load(params_file)

# Echo the loaded parameters as the cell output.
best_params

In [None]:
# Main model: gradient boosting configured with the hyper-parameters loaded into
# best_params; random_state=0 fixes the seed and verbose=1 prints training progress.
gbr = sklearn.ensemble.GradientBoostingRegressor(random_state=0, verbose=1, **best_params)
# Record the top-level constructor parameters in the report before fitting.
reports.append(f"GradientBoostingRegressor params: {gbr.get_params(deep=False)}")

fit_and_test_model(gbr, x_train, x_test, y_train, y_test)


In [None]:
# Careful, impurity-based feature importances can be misleading for high cardinality features (many unique values).
# One row per feature column, listed from the most to the least important.
gbr_features = pd.DataFrame({
    'feature': features.columns,
    'importance': gbr.feature_importances_,
})
gbr_features_ranked = gbr_features.sort_values(by="importance", ascending=False)
display(gbr_features_ranked)

In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance of every feature for the fitted gbr model on the
# held-out split: 20 repeats per feature, seeded for reproducibility, run as 2 parallel jobs.
result = permutation_importance(gbr, x_test, y_test, n_repeats=20,
                                random_state=2019, n_jobs=2)

In [None]:
# Attach the permutation-importance statistics to the impurity-importance table.
gbr_features['perm_importances_mean'] = result.importances_mean
gbr_features['perm_importances_std'] = result.importances_std

# Combined score: geometric mean of the impurity importance and the absolute
# permutation importance.
impurity_times_permutation = gbr_features['importance'] * gbr_features['perm_importances_mean'].abs()
gbr_features['importance_power'] = np.sqrt(impurity_times_permutation)

gbr_features_by_permutation = gbr_features.sort_values(by="perm_importances_mean", ascending=False)
display(gbr_features_by_permutation)

In [None]:
# Lower-bound model: same tuned hyper-parameters, but trained with quantile loss at alpha=0.159.
# NOTE(review): 0.159 here and 0.841 for the upper model look like the -1 sigma / +1 sigma quantiles of a normal distribution — confirm that is the intent.
# best_params must not itself contain 'loss' or 'alpha', otherwise this call raises TypeError for a duplicate keyword.
gbr_q_low = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.159, random_state=0, verbose=1, **best_params)
fit_and_test_model(gbr_q_low, x_train, x_test, y_train, y_test)


In [None]:
# Upper-bound model: same tuned hyper-parameters, but trained with quantile loss at alpha=0.841.
# NOTE(review): 0.841 here and 0.159 for the lower model look like the +1 sigma / -1 sigma quantiles of a normal distribution — confirm that is the intent.
# best_params must not itself contain 'loss' or 'alpha', otherwise this call raises TypeError for a duplicate keyword.
gbr_q_high = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.841, random_state=0, verbose=1, **best_params)
fit_and_test_model(gbr_q_high, x_train, x_test, y_train, y_test)



In [None]:

# Persist the three fitted models under models/, with the race id in each file name.
# The return value of the last call is shown as the cell output.
joblib.dump(gbr, f'models/gbr_{shared.race_id_str()}.sav')
joblib.dump(gbr_q_low, f'models/gbr_q_low_{shared.race_id_str()}.sav')
joblib.dump(gbr_q_high, f'models/gbr_q_high_{shared.race_id_str()}.sav')


In [None]:
# Wrap the held-out features in a frame once and feed the same frame to the
# central model and to both quantile models. Predictions are not exponentiated here.
x_test_frame = pd.DataFrame(x_test)
gbr_preds = gbr.predict(x_test_frame)
gbr_q_low_preds = gbr_q_low.predict(x_test_frame)
gbr_q_high_preds = gbr_q_high.predict(x_test_frame)

In [None]:
# Per-row comparison of the central prediction, the two quantile predictions
# and the true value, all exponentiated from the scale the models predict on.
gbr_q_pred_errors = pd.DataFrame({
    'q_low': np.exp(gbr_q_low_preds),
    'true': np.exp(y_test).ravel(),
    'predicted': np.exp(gbr_preds),
    'q_high': np.exp(gbr_q_high_preds),
})

# Flag rows whose true value falls outside the [q_low, q_high] interval.
gbr_q_pred_errors["q_low_error"] = gbr_q_pred_errors.true < gbr_q_pred_errors.q_low
gbr_q_pred_errors["q_high_error"] = gbr_q_pred_errors.true > gbr_q_pred_errors.q_high
gbr_q_pred_errors["q_error"] = np.logical_or(gbr_q_pred_errors.q_low_error, gbr_q_pred_errors.q_high_error)
gbr_q_pred_errors["q_interval"] = gbr_q_pred_errors.q_high - gbr_q_pred_errors.q_low

# Intentionally don't use log scale for calculation to get bigger std
# TODO IS this causing big std in Bayesian models? :(
gbr_q_pred_errors["std"] = (gbr_q_pred_errors.q_high - gbr_q_pred_errors.q_low) / 2

# Half-width of the interval on the prediction scale, then exponentiated.
gbr_q_pred_errors["std_correct"] = np.exp((gbr_q_high_preds - gbr_q_low_preds) / 2)
gbr_q_pred_errors["abs_error"] = np.abs(gbr_q_pred_errors.predicted - gbr_q_pred_errors.true)

# std_correct is already exponentiated above. The previous code applied np.exp
# to it a second time, which divided by exp(exp(...)) and deflated this ratio.
# NOTE(review): this still divides a linear-scale error by a multiplicative
# factor — confirm whether dividing by "std" or working on the log scale is wanted.
gbr_q_pred_errors["abs_error_in_stds"] = gbr_q_pred_errors.abs_error / gbr_q_pred_errors["std_correct"]

# Summary lines that a later cell appends to the main report.
gbr_reports = [
    f'q_low_error.mean {gbr_q_pred_errors.q_low_error.mean().round(4)}',
    f'q_high_error.mean {gbr_q_pred_errors.q_high_error.mean().round(4)}',
    f'q_error.mean {gbr_q_pred_errors.q_error.mean().round(4)}',

    f'q_interval.mean {gbr_q_pred_errors.q_interval.mean().round(4)}',
    f'q_interval.median {gbr_q_pred_errors.q_interval.median().round(4)}',

    f'std.mean {gbr_q_pred_errors["std"].mean().round(4)}',
    f'std_correct.mean {gbr_q_pred_errors["std_correct"].mean().round(4)}',
    f'abs_error_in_stds.mean {gbr_q_pred_errors["abs_error_in_stds"].mean().round(4)}',
    f'abs_error.mean {gbr_q_pred_errors["abs_error"].mean().round(4)}',
    f'abs_error.median {gbr_q_pred_errors["abs_error"].median().round(4)}'
]


display(gbr_q_pred_errors.tail(15).round(3))
display(gbr_reports)


In [None]:
# Add the quantile-interval summary lines to the main report list that the final cells write out.
reports.extend(gbr_reports)

In [None]:
# Median interval bounds, interval width, absolute error and std, split by
# whether the true value fell outside the predicted interval.
q_error_columns = ['q_error', "q_low", "q_high", "q_interval", "abs_error", "std"]
medians_by_q_error = gbr_q_pred_errors[q_error_columns].groupby('q_error').agg(["median"])
medians_by_q_error.round(2)

In [None]:
# Stop the clock, add the elapsed minutes to the report, and write the report
# out through the shared helper.
endTime = time.time()
runtime_minutes = round((endTime - startTime) / 60, 2)
reports.append(f"runtime {runtime_minutes} mins")
report_file_name = f'preprocess_priors_grouped_{shared.race_id_str()}.txt'
shared.write_simple_text_report(reports, report_file_name)

In [None]:
# Pass the race id and the total runtime in minutes to the shared logging helper.
shared.log_df(f"{shared.race_id_str()} runtime {round(((endTime - startTime)/ 60), 2)} mins")