In [101]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [97]:
df = pd.read_pickle('../data/men_lead_lag7.pkl')
dummies = pd.get_dummies(df['country'], prefix='c_')
df = pd.concat([df, dummies], axis=1)
y = df['avg_points']
df = df.drop(columns=['ID', 'Rank', 'Last Name', 'First Name', 'Points', 
                    'country', 'avg_points'])
lag_cols = [f't-{i}' for i in range(1, 8)]
for col in lag_cols:
    df[col] = df[col].fillna(0)

# Impute missing values using the median
df['height'] = df['height'].fillna(df['height'].median())
df['weight'] = df['weight'].fillna(df['weight'].median())
df['age'] = df['age'].fillna(df['age'].median())
X = df.copy()
    
# # Drop the physical features
# X = df.drop(columns=['height', 'weight', 'age'])

X_train = X[X['Year'].isin(np.arange(1991, 2019))]#.to_numpy()
y_train = y[X['Year'].isin(np.arange(1991, 2019))]#.to_numpy()
X_test = X[X['Year'].isin(np.arange(2019, 2020))]#.to_numpy()
y_test = y[X['Year'].isin(np.arange(2019, 2020))]#.to_numpy()
X_train = X_train.drop(columns='Year')
X_test = X_test.drop(columns='Year')

In [None]:
# forest = RandomForestRegressor()
# forest.fit(X_train, y_train)
# preds = forest.predict(X_test)
# mse = mean_squared_error(y_test, preds)
# print(mse)

# plt.scatter(np.arange(len(preds)), preds, alpha=.7, label='Prediction')
# plt.scatter(np.arange(len(preds)), y_test.to_numpy(), alpha=.7, label='Truth')
# plt.gcf().set_size_inches(11, 5)
# plt.grid()
# plt.legend()

In [129]:
forest = RandomForestRegressor()
parameters = {'n_estimators': [10, 50, 100, 200],
             'max_depth': [2, 5, 10, None],
             'min_samples_leaf': [1, 2, 5, 10],
             'max_features': ['auto', 'sqrt']}
start = time()
clf = GridSearchCV(forest, parameters, verbose=1, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)
end = time()

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed:  3.0min finished


In [132]:
best_forest = RandomForestRegressor(**clf.best_params_)
best_forest.fit(X_train, y_train)
preds = best_forest.predict(X_test)
mse = mean_squared_error(y_test, preds)
print(mse)

389.9742495603846


In [134]:
forest = RandomForestRegressor()
forest.fit(X_train, y_train)
preds = forest.predict(X_test)
mse = mean_squared_error(y_test, preds)
print(mse)

345.5930410310842
