In [None]:
import shared
import pandas as pd
import numpy as np
import logging
import json
import os
import joblib

import sklearn
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split

import prepare_run_features
#os.environ['FORECAST_YEAR'] = "2012"
#os.environ['RACE_TYPE'] = "ju"
#os.environ['UNKNOWN_OR_KNOWN'] = "unknown"
# Which runner population this run models is taken from the environment;
# anything other than "known" selects the runners without history.
unknown_or_known = os.getenv('UNKNOWN_OR_KNOWN', "unknown")
runners_with_history = (unknown_or_known == "known")


In [None]:
# Build the model inputs via the project-local prepare_run_features module.
# NOTE(review): later cells index x with x[:, j] (NumPy style) and treat `features`
# like a pandas DataFrame (.columns, a "runs" column) -- confirm against the helper.
x, y, features = prepare_run_features.prepare_run_features(runners_with_history)


In [None]:
# Show the shapes of the target and the design matrix, then of the feature table.
display(y.shape, x.shape)
features.shape

In [None]:
# Select the rows for the requested population: more than one run when modelling
# runners with history, exactly one run otherwise.
runners_row_indexer = (
    (features["runs"] > 1) if runners_with_history else (features["runs"] == 1)
)

# Apply the same boolean mask to the feature table, the inputs and the target.
features, x, y = (
    features[runners_row_indexer],
    x[runners_row_indexer],
    y[runners_row_indexer],
)
display(y.shape, x.shape)
features.shape

In [None]:

# Shape of x after the row filter applied in the previous cell.
x.shape

In [None]:
# Hold out 1% of the rows for evaluation, with a fixed seed for reproducibility.
# (An earlier configuration used test_size=0.05, random_state=2019.)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.01,
    random_state=2023,
)
x_train.shape

In [None]:
# Shape of the training target produced by the split above.
y_train.shape

In [None]:
import matplotlib.pyplot as plt
# Column position of "team_id" in `features`; fit_and_test_model uses it to pick
# the x-axis column out of x_test.
# NOTE(review): assumes the columns of x are ordered like features.columns -- confirm.
index_of_team_id = list(features.columns).index("team_id")
def fit_and_test_model(model, x_train, x_test, y_train, y_test, fit_params=None):
    """Fit ``model`` on the training split, print test metrics and plot predictions.

    The model is fitted against the flattened ``y_train``. Both ``y_test`` and the
    model's predictions are passed through ``np.exp`` before any metric is computed
    or plotted (presumably because the target is log-transformed upstream --
    confirm in ``prepare_run_features``).

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X)``.
    x_train, x_test
        Feature matrices; ``x_test`` must support ``x_test[:, j]`` indexing.
    y_train, y_test
        Targets on the scale the model is trained on.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``model.fit``. ``None`` (the
        default) forwards nothing.

    Returns
    -------
    None
        Results are printed and plotted; ``model`` is fitted in place.

    Notes
    -----
    Reads the notebook-level ``index_of_team_id`` to choose the x-axis column.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    fit_kwargs = {} if fit_params is None else fit_params
    # np.ravel also accepts pandas objects, unlike the ndarray-only .ravel() method.
    model.fit(x_train, np.ravel(y_train), **fit_kwargs)

    # Back-transform once and reuse for every metric and for the plot.
    y_true = np.exp(y_test)
    y_pred = np.exp(model.predict(x_test))

    print(f"Shapes: y_test={y_true.shape} y_pred={y_pred.shape}")
    print("Mean absolute percentage error: %.3f" % mean_absolute_percentage_error(y_true, y_pred))
    print("Median absolute error: %.3f" % median_absolute_error(y_true, y_pred))
    print("Mean squared error: %.3f" % mean_squared_error(y_true, y_pred))
    # r2_score is the coefficient of determination, not sklearn's
    # explained_variance_score, so label it accordingly.
    print("R^2 score: %.3f" % r2_score(y_true, y_pred))

    # Actual values in red, predictions in blue, both against the team_id column.
    plt.scatter(x_test[:, index_of_team_id], y_true, color='red', alpha=0.05)
    plt.scatter(x_test[:, index_of_team_id], y_pred, color='blue', alpha=0.05)
    # Fixed y-range: points outside 4..20 are not visible in the plot.
    plt.ylim(4, 20)
    plt.show()

In [None]:
# Baseline: ordinary least squares. np.nan_to_num replaces NaN with 0 (and
# infinities with large finite values) in the inputs before fitting/predicting.
linear = linear_model.LinearRegression()
fit_and_test_model(
    linear,
    np.nan_to_num(x_train),
    np.nan_to_num(x_test),
    y_train,
    y_test,
)

In [None]:
# Pair every feature name with its fitted linear coefficient and list them
# from most negative to most positive, rounded to four decimals.
coefs = pd.DataFrame(dict(name=features.keys(), coef=linear.coef_))
display(coefs.sort_values(by="coef", ascending=True).round(decimals=4))

In [None]:
#ridge = linear_model.Ridge(alpha=0.5)
#fit_and_test_model(ridge, x_train, x_test, y_train, y_test)

In [None]:
# Load the keyword arguments for HistGradientBoostingRegressor from a JSON file
# (presumably written by a separate hyper-parameter search -- confirm).
json_path = f"models/best_params_{unknown_or_known}_runs_hgbr_{shared.race_id_str()}.json"
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's locale default.
with open(json_path, encoding="utf-8") as infile:
    hgbr_params = json.load(infile)

hgbr_params

In [None]:
#best_params = {'max_depth': 6, 'n_estimators': 400, 'learning_rate': 0.08}
#best_params = {'max_depth': 4, 'n_estimators': 799, 'learning_rate': 0.11}

#best_params = {'max_depth': 7, 'n_estimators': 220, 'learning_rate': 0.15}
#best_params = {'max_depth': 6, 'n_estimators': 331, 'learning_rate': 0.1011956627512609}

#gbr = sklearn.ensemble.GradientBoostingRegressor(random_state=0, verbose=1, **best_params)
#fit_and_test_model(gbr, x_train, x_test, y_train, y_test)


In [None]:
#hgbr_params = {'max_depth': 6, 'max_iter': 331, 'learning_rate': 0.1011956627512609}


# Main model: histogram-based gradient boosting configured from hgbr_params.
hgbr = ensemble.HistGradientBoostingRegressor(
    random_state=0,
    verbose=1,
    **hgbr_params,
)
fit_and_test_model(hgbr, x_train, x_test, y_train, y_test)


In [None]:
# Careful: impurity-based feature importances can be misleading for
# high-cardinality features (many unique values).
# Start a per-feature table with one row per column of `features`; the
# permutation-importance columns are attached in a later cell.
gbr_features = pd.DataFrame({'feature':features.columns})
# NOTE(review): no "importance" column is created in the visible cells, so the
# sort below would fail if re-enabled as is.
#display(gbr_features.sort_values(by="importance", ascending=False))

In [None]:
%%time
from sklearn.inspection import permutation_importance
# Permutation importance of every feature for the fitted hgbr model,
# evaluated on the held-out split with 20 shuffles per feature.
result = permutation_importance(
    estimator=hgbr,
    X=x_test,
    y=y_test,
    n_repeats=20,
    random_state=2019,
    n_jobs=2,
)
#result

In [None]:
# Attach the mean and standard deviation of the permutation importances
# (mean column first), then list the features from most to least important.
gbr_features = gbr_features.assign(
    perm_importances_mean=result.importances_mean,
    perm_importances_std=result.importances_std,
)
#gbr_features['importance_power'] = np.sqrt(gbr_features['importance'] * gbr_features['perm_importances_mean'].abs())
display(gbr_features.sort_values(by="perm_importances_mean", ascending=False))

In [None]:
%%time
# Lower quantile model. 0.159 is approximately the normal CDF at -1, so this is
# presumably meant as a "minus one sigma" bound -- confirm.
hgbr_q_low = ensemble.HistGradientBoostingRegressor(
    loss='quantile',
    quantile=0.159,
    random_state=0,
    verbose=1,
    **hgbr_params,
)
#gbr_q_low = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.159, random_state=0, verbose=1, **best_params)
fit_and_test_model(hgbr_q_low, x_train, x_test, y_train, y_test)


In [None]:
%%time
# Upper quantile model. 0.841 is approximately the normal CDF at +1, so this is
# presumably meant as a "plus one sigma" bound -- confirm.
hgbr_q_high = ensemble.HistGradientBoostingRegressor(
    loss='quantile',
    quantile=0.841,
    random_state=0,
    verbose=1,
    **hgbr_params,
)

#gbr_q_high = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.841, random_state=0, verbose=1, **best_params)
fit_and_test_model(hgbr_q_high, x_train, x_test, y_train, y_test)


In [None]:
# Persist the three fitted models (central estimate, lower and upper quantile)
# under models/, keyed by runner population and race id.
joblib.dump(
    value=hgbr,
    filename=f'models/{unknown_or_known}_runs_hgbr_{shared.race_id_str()}.sav',
)
joblib.dump(
    value=hgbr_q_low,
    filename=f'models/{unknown_or_known}_runs_hgbr_q_low_{shared.race_id_str()}.sav',
)
# Kept as the cell's final expression so its return value is still echoed.
joblib.dump(
    value=hgbr_q_high,
    filename=f'models/{unknown_or_known}_runs_hgbr_q_high_{shared.race_id_str()}.sav',
)