In [None]:
import shared
import pandas as pd
import numpy as np
import logging
import json
import os

import sklearn
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split

import prepare_run_features
#os.environ['FORECAST_YEAR'] = "2019"
# Run mode comes from the UNKNOWN_OR_KNOWN environment variable and falls back
# to "unknown" when the variable is not set.
unknown_or_known = os.getenv('UNKNOWN_OR_KNOWN', "unknown")
# True only for the exact value "known"; every other value selects the
# "unknown" (no history) mode.
runners_with_history = (unknown_or_known == "known")

# Show the NumPy version in the cell output.
np.__version__

In [None]:
# Build the modelling inputs with the project helper, passing the run-mode flag.
# The helper returns three objects: `x` and `y` are later passed to
# train_test_split, and `features` is later indexed by column name ("runs",
# "team_id").
x, y, features = prepare_run_features.prepare_run_features(runners_with_history)
# Show the shape of the target as the cell output.
y.shape

In [None]:
# Row mask on the "runs" column: more than one run when modelling runners with
# history, exactly one run otherwise.
runners_row_indexer = (
    (features["runs"] > 1) if runners_with_history else (features["runs"] == 1)
)

# Apply the same mask to all three objects so their rows stay aligned.
features = features[runners_row_indexer]
x = x[runners_row_indexer]
y = y[runners_row_indexer]

# Show the filtered shapes: y, then x, then features (as the cell result).
display(y.shape, x.shape)
features.shape

In [None]:
# Earlier split configuration kept for reference (5% test share, seed 2019):
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=2019)
# Current split: 1% of the rows are held out for testing; the fixed
# random_state makes the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=2023)
# Show the training-feature shape as the cell output.
x_train.shape

In [None]:
# Show the training-target shape; compare with x_train.shape from the previous cell.
y_train.shape

In [None]:
import matplotlib.pyplot as plt
# Position of the "team_id" column within `features`. fit_and_test_model uses it
# to pick the x-axis column out of x_test.
# NOTE(review): this assumes the columns of x are in the same order as
# features.columns -- confirm against prepare_run_features.
index_of_team_id = list(features.columns).index("team_id")
def fit_and_test_model(model, x_train, x_test, y_train, y_test, fit_params=None):
    """Fit ``model`` on the training split, print test metrics and plot the result.

    Both the predictions and ``y_test`` are passed through ``np.exp`` before
    any metric is computed (presumably the targets are log-transformed
    upstream -- TODO confirm against prepare_run_features).

    Parameters
    ----------
    model : estimator exposing ``fit(X, y, **fit_params)`` and ``predict(X)``.
    x_train, x_test : feature arrays; ``x_test`` must support ``x_test[:, i]``
        indexing because one column is used as the x-axis of the plot.
    y_train, y_test : target arrays; ``y_train`` is flattened with ``ravel()``
        before fitting.
    fit_params : dict or None
        Extra keyword arguments forwarded to ``model.fit``. ``None`` (the
        default) means no extra arguments; a fresh dict is used per call so
        no mutable default is shared between calls.

    Returns
    -------
    None. The metrics are printed and the scatter plot is shown.
    """
    model.fit(x_train, y_train.ravel(), **(fit_params or {}))

    # Transform once and reuse; the original recomputed np.exp(y_test) per metric.
    y_true = np.exp(y_test)
    y_pred = np.exp(model.predict(x_test))

    print(f"Shapes: y_test={y_true.shape} y_pred={y_pred.shape}")
    print("Mean absolute percentage error: %.3f" % mean_absolute_percentage_error(y_true, y_pred))
    print("Median absolute error: %.3f" % median_absolute_error(y_true, y_pred))
    print("Mean squared error: %.3f" % mean_squared_error(y_true, y_pred))
    # r2_score is the coefficient of determination, which is not the same
    # metric as sklearn's explained_variance_score, so label it accordingly.
    print('R^2 score: %.3f' % r2_score(y_true, y_pred))

    # Actual values in red, predictions in blue, both against the team_id
    # column; the low alpha keeps dense regions readable.
    team_ids = x_test[:, index_of_team_id]
    plt.scatter(team_ids, y_true, color='red', alpha=0.01)
    plt.scatter(team_ids, y_pred, color='blue', alpha=0.01)
    plt.ylim(4, 20)
    plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

# Baseline number of boosting iterations: 200 for race type "ve", 300 otherwise.
max_iter = 200 if shared.race_type() == "ve" else 300

# Runners with history get twice the iteration budget.
if runners_with_history:
    max_iter = 2 * max_iter

# Data used by `objective` for cross-validation.
# NOTE: this rebinds `y`, which the data-preparation cells use for the full
# target. After running this cell, re-run the notebook from the top before
# re-running those earlier cells.
X = x_train
y = y_train

# The list of hyper-parameters we want to optimize. For each one we define the
# bounds and the corresponding scikit-learn parameter name. No `prior` is
# passed, so every dimension uses skopt's default prior (uniform -- confirm
# against the skopt docs); this includes the learning rate.
space = [
    Integer(5, 60, name='max_depth'),
    # Integer dimensions should get integer bounds: `max_iter // 2` equals the
    # former `max_iter * 0.5` for every value max_iter can take here (all are
    # even) but stays an int instead of becoming a float.
    Integer(max_iter // 2, max_iter * 2, name='max_iter'),
    Real(low=0.005, high=0.2, name='learning_rate'),
]

# Estimator whose parameters are overwritten on every call of `objective`.
reg = sklearn.ensemble.HistGradientBoostingRegressor(random_state=0)
# `use_named_args` lets the objective receive the sampled point as keyword
# arguments named after the dimensions in `space`, which is convenient for
# passing them straight to `set_params` of a scikit-learn estimator.
@use_named_args(space)
def objective(**params):
    """Return the mean 5-fold cross-validated MAPE of `reg` for `params`.

    The scorer is "neg_mean_absolute_percentage_error", so the mean of the
    fold scores is negated to give a positive value to be minimised.
    """
    reg.set_params(**params)
    fold_scores = cross_val_score(
        reg, X, y,
        cv=5,
        n_jobs=-1,
        verbose=1,
        scoring="neg_mean_absolute_percentage_error",
    )
    return -np.mean(fold_scores)

#n_features = X.shape[1]

In [None]:
%%time
from skopt import gp_minimize
# Compatibility shim: (re)define `np.int` as the built-in int. NumPy removed
# this alias in 1.24; presumably the installed skopt still refers to it --
# TODO confirm and drop once skopt no longer needs it. This must run before
# gp_minimize is called.
np.int = int
# Minimise `objective` over `space` with 40 evaluations and a fixed seed.
res_gp = gp_minimize(objective, space, n_calls=40, random_state=0)

# Cell output: the lowest objective value found (mean cross-validated MAPE).
"Best score=%.4f" % res_gp.fun

In [None]:
# Pair each search dimension with the best value found for it, keyed by the
# dimension's scikit-learn parameter name.
best_params = {
    dimension.name: best_value
    for best_value, dimension in zip(res_gp.x, space)
}
best_params

In [None]:
from skopt.plots import plot_convergence

# Plot the convergence trace of the optimisation result `res_gp`.
plot_convergence(res_gp)

In [None]:
# Record the chosen hyper-parameters in the log.
# NOTE(review): this goes through the root logger at INFO level, so it is only
# emitted if logging has been configured at INFO or lower; no such
# configuration is visible in this notebook -- confirm it happens elsewhere.
logging.info(f"best_params: {best_params}")

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Type checks use NumPy's abstract scalar classes (``np.integer``,
    ``np.floating``, ``np.complexfloating``) rather than listing concrete
    aliases. The aliases ``np.float_`` and ``np.complex_`` were removed in
    NumPy 2.0, so referring to them raises ``AttributeError`` there.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Conversions:
          * NumPy integer scalar          -> ``int``
          * NumPy floating scalar         -> ``float``
          * NumPy complex scalar          -> ``{'real': float, 'imag': float}``
          * ``np.ndarray``                -> nested ``list`` (``tolist()``)
          * ``np.bool_``                  -> ``bool``
          * ``np.void``                   -> ``None``

        Raises:
            TypeError: for any other type (raised by ``JSONEncoder.default``).
        """
        # np.timedelta64 subclasses np.signedinteger; exclude it so that it
        # still ends in TypeError instead of silently losing its unit.
        if isinstance(obj, np.integer) and not isinstance(obj, np.timedelta64):
            return int(obj)

        if isinstance(obj, np.floating):
            return float(obj)

        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        if isinstance(obj, np.bool_):
            return bool(obj)

        if isinstance(obj, np.void):
            return None

        # Unknown type: defer to the base class, which raises TypeError.
        return super().default(obj)
    
# Save the tuned hyper-parameters to a JSON file whose name includes the run
# mode and the race id, then load them back. After the round trip
# `best_params` holds only built-in Python types.
# Earlier file-name scheme, kept for reference:
#json_path = f"models/best_params_unk_runs_hgbr_{shared.race_id_str()}.json"
json_path = f"models/best_params_{unknown_or_known}_runs_hgbr_{shared.race_id_str()}.json"

with open(json_path, 'w') as outfile:
    outfile.write(json.dumps(best_params, cls=NumpyEncoder))

with open(json_path) as infile:
    best_params = json.loads(infile.read())
 