In [None]:
import pandas as pd
import numpy as np
import shared
import static_individual_estimates
import json
import logging 
# Configure the root logger: INFO level and above, each record prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


In [None]:
# Race-type flag supplied by the project's `shared` module. It is compared against
# "ve" further down to pick the size of the gradient-boosting search.
ve_or_ju = shared.race_type()
ve_or_ju

In [None]:
import time

# Wall-clock start of the run; the last cell subtracts this to log the total runtime.
startTime = time.time()

In [None]:
# Load the model inputs, the target and the feature frame in a single call.
# NOTE(review): y is presumably log-transformed, since later cells apply np.exp
# to targets and predictions before scoring -- confirm against the preprocessing module.
x, y, features = static_individual_estimates.preprocess_countries_names_and_features()
features.head(10)


In [None]:

# Show the shapes of the inputs and the target, one after the other.
display(x.shape, y.shape)

In [None]:
# Running list of human-readable report lines; later cells append to it.
#
# DataFrame.info() writes its summary to a stream and returns None, so calling it
# inline would store None in the list. Capture the text in a buffer instead, and
# echo it so the cell still shows the summary as before.
import io

_features_info_buffer = io.StringIO()
features.info(buf=_features_info_buffer)
_features_info_text = _features_info_buffer.getvalue()
print(_features_info_text, end='')

reports = [f'x.shape: {x.shape}', f'y.shape: {y.shape}', _features_info_text]

In [None]:
import sklearn
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2019,
)
reports.append(f'x_train: {x_train.shape}, x_test: {x_test.shape}')
x_train.shape

In [None]:
import matplotlib.pyplot as plt
# Position of the "team_id" column among the feature columns; the evaluation
# helper uses it to pick the x-axis values of its scatter plot.
index_of_team_id = features.columns.tolist().index("team_id")
def fit_and_test_model(model, x_train, x_test, y_train, y_test, fit_params=None):
    """Fit ``model`` on the training split, then score and plot it on the test split.

    Both the predictions and ``y_test`` are passed through ``np.exp`` before any
    metric is computed. Four metrics are printed, the model's parameters and its
    R^2 score are appended to the module-level ``reports`` list, and a scatter
    plot of actual (red) and predicted (blue) values against the ``team_id``
    column of ``x_test`` is shown.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``get_params``.
        x_train: Training inputs.
        x_test: Test inputs; read as ``x_test[:, index_of_team_id]``, so it must
            support 2-D positional indexing.
        y_train: Training target; flattened with ``ravel()`` before fitting.
        y_test: Test target.
        fit_params: Optional extra keyword arguments forwarded to ``model.fit``.

    Returns:
        None.
    """
    # A None default avoids sharing one mutable dict between calls.
    model.fit(x_train, y_train.ravel(), **(fit_params or {}))

    # Transform once and reuse for every metric and for the plot.
    y_true = np.exp(y_test)
    y_pred = np.exp(model.predict(x_test))
    r2 = r2_score(y_true, y_pred)

    print(f"Shapes: y_test={y_true.shape} y_pred={y_pred.shape}")
    print("Mean absolute percentage error: %.3f" % mean_absolute_percentage_error(y_true, y_pred))
    print("Median absolute error: %.3f" % median_absolute_error(y_true, y_pred))
    print("Mean squared error: %.3f" % mean_squared_error(y_true, y_pred))
    # The label is kept for continuity with earlier reports; the value is r2_score.
    print('Explained variance score: %.3f' % r2)

    reports.append(f'{type(model)}: {model.get_params()}')
    # Built-in round() works whether r2_score returns a NumPy scalar or a plain
    # Python float; calling .round() on a plain float raises AttributeError.
    reports.append(f'Explained variance score: {round(float(r2), 3)}')

    team_ids = x_test[:, index_of_team_id]
    plt.scatter(team_ids, y_true, color='red', alpha=0.01)
    plt.scatter(team_ids, y_pred, color='blue', alpha=0.01)
    plt.ylim(4, 20)
    plt.show()

In [None]:
# Quick look at the shape of the training target before fitting any model.
y_train.shape

In [None]:
# Ordinary least-squares baseline, fitted and evaluated with the shared helper.
linear = linear_model.LinearRegression()
fit_and_test_model(linear, x_train, x_test, y_train, y_test)

In [None]:
# Pair every feature name with its fitted linear coefficient and show the
# 20 largest coefficients, rounded to four decimals.
coefs = pd.DataFrame({
    'name': features.keys(),
    'coef': linear.coef_,
})
display(
    coefs
    .sort_values(by="coef", ascending=False)
    .head(20)
    .round(4)
)

In [None]:
from sklearn.model_selection import cross_val_score
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

# Baseline tree count that anchors the n_estimators search range below.
gbr_num_estimators = 200 if ve_or_ju == "ve" else 800

# Cross-validation inputs. The target gets its own name on purpose: rebinding `y`
# here would overwrite the full target returned by preprocessing, and re-running
# the train/test split cell would then pair `x` with a shorter `y`.
# It is flattened the same way fit_and_test_model flattens it before fitting.
X = x_train
y_cv = y_train.ravel()

# The list of hyper-parameters we want to optimize. For each one we define the
# bounds (or the candidate values) and the corresponding scikit-learn parameter name.
space = [
    Integer(2, 10, name='max_depth'),
    # The scaled bounds are floats; an Integer dimension should get integer bounds.
    Integer(round(gbr_num_estimators * 0.3), round(gbr_num_estimators * 1.25), name='n_estimators'),
    Categorical([0.05, 0.07, 0.08], name='learning_rate'),
]

# One estimator instance is reused across evaluations; objective() only swaps its parameters.
reg = sklearn.ensemble.GradientBoostingRegressor(random_state=0)


# This decorator lets the objective function receive the parameters as keyword
# arguments, which is convenient for setting scikit-learn estimator parameters.
@use_named_args(space)
def objective(**params):
    """Return the mean 5-fold cross-validated MAPE of ``reg`` configured with ``params``.

    Args:
        **params: Values for the dimensions declared in ``space``
            (``max_depth``, ``n_estimators``, ``learning_rate``).

    Returns:
        The mean absolute percentage error averaged over the folds (lower is better).
    """
    reg.set_params(**params)
    fold_scores = cross_val_score(reg, X, y_cv, cv=5, n_jobs=-1, verbose=1,
                                  scoring="neg_mean_absolute_percentage_error")
    # The scorer returns negated errors; flip the sign to get a loss to minimize.
    return -np.mean(fold_scores)

#n_features = X.shape[1]

In [None]:
from skopt import gp_minimize
# Minimize `objective` over `space` with 30 evaluations and a fixed seed.
# Every evaluation runs a full 5-fold cross-validation, so this cell is slow.
res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

# res_gp.fun is the lowest objective value found during the search.
"Best score=%.4f" % res_gp.fun

In [None]:
from skopt.plots import plot_convergence

# Plot the convergence trace of the hyper-parameter search result.
plot_convergence(res_gp)

In [None]:
# res_gp.x lists the best value per dimension in the same order as `space`;
# label each value with the name of its dimension.
best_params = {dimension.name: value for value, dimension in zip(res_gp.x, space)}
best_params

In [None]:
# Record the chosen hyper-parameters in the run log.
logging.info(f"best_params: {best_params}")

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in Python types.

    The checks use NumPy's abstract scalar base classes (``np.integer``,
    ``np.floating``, ``np.complexfloating``) instead of hand-listed concrete types.
    That covers every width and avoids the ``np.float_`` / ``np.complex_`` aliases,
    which were removed in NumPy 2.0 and raise ``AttributeError`` there.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: An object the standard encoder could not serialize.

        Returns:
            ``bool``, ``int`` or ``float`` for NumPy scalars, a
            ``{'real': ..., 'imag': ...}`` dict for complex scalars, a nested
            list for arrays, and ``None`` for ``np.void`` values.

        Raises:
            TypeError: If ``obj`` is none of the supported NumPy types
                (raised by the base-class implementation).
        """
        # np.bool_ is not an np.integer subclass, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.void):
            return None
        return super().default(obj)
    
    
# Persist the tuned hyper-parameters as JSON; NumpyEncoder takes care of any
# NumPy scalars among the values.
with open(f"models/best_params_gbr_{shared.race_id_str()}.json", 'w') as outfile:
    outfile.write(json.dumps(best_params, cls=NumpyEncoder))

# Read the file straight back so that best_params holds the JSON-decoded values
# from here on.
with open(f"models/best_params_gbr_{shared.race_id_str()}.json") as infile:
    best_params = json.loads(infile.read())
    

In [None]:
# Gradient-boosting regressor built from the tuned hyper-parameters that were
# read back from the JSON file, then fitted and evaluated with the shared helper.
gbr = sklearn.ensemble.GradientBoostingRegressor(random_state=0, verbose=1, **best_params)

fit_and_test_model(gbr, x_train, x_test, y_train, y_test)

In [None]:
# Careful: impurity-based feature importances can be misleading for
# high-cardinality features (many unique values).
gbr_features = pd.DataFrame({
    'feature': features.columns,
    'importance': gbr.feature_importances_,
})
display(
    gbr_features.sort_values(by="importance", ascending=False)
)

In [None]:
# Half of the baseline tree count.
# NOTE(review): no later visible cell reads this value; the quantile models
# below take n_estimators from best_params -- confirm whether it is still needed.
gbr_num_estimators_quantile = gbr_num_estimators // 2

In [None]:
# Quantile-loss model for the 0.159 quantile, using the same tuned hyper-parameters.
# NOTE(review): 0.159 / 0.841 presumably bracket about one standard deviation
# around the median of a normal distribution -- confirm the intent.
gbr_q_low = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.159, random_state=0, verbose=1, **best_params)
fit_and_test_model(gbr_q_low, x_train, x_test, y_train, y_test)


In [None]:
# Quantile-loss model for the 0.841 quantile, the upper counterpart of gbr_q_low.
gbr_q_high = sklearn.ensemble.GradientBoostingRegressor(loss='quantile', alpha=0.841, random_state=0, verbose=1, **best_params)
fit_and_test_model(gbr_q_high, x_train, x_test, y_train, y_test)



In [None]:
# Log the total wall-clock runtime of the notebook in minutes (two decimals),
# measured from startTime, through the project's shared logging helper.
endTime = time.time()
shared.log_df(f"{shared.race_id_str()} runtime {round(((endTime - startTime)/ 60), 2)} mins")