In [48]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV, ParameterGrid, KFold
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from tqdm import tqdm

In [52]:
def impute(df):
    countries = df.country.unique()
    #dataset averages
    global_h = np.mean(df.height)
    global_w = np.mean(df.weight)
    global_a = np.mean(df.age)

    heights = []
    weights = []
    ages = []
    #steps through each country
    for co in countries:
        group = df[df['country'] == co]
        # counting datapoints within country
        count_h = np.count_nonzero(~np.isnan(group.height))
        count_w = np.count_nonzero(~np.isnan(group.weight))
        count_a = np.count_nonzero(~np.isnan(group.age))
        # sets thresholds between accepting the countries average or using dataset average to fill in NaN's
        if count_h >= 5:
            avg_h = np.mean(group.height)
        else:
            avg_h = global_h
        if count_w >= 5:
            avg_w = np.mean(group.weight)
        else:
            avg_w = global_w
        if count_a >= 10:
            avg_a = np.mean(group.age)
        else:
            avg_a = global_a
        # steps through each person creating lists to replace current columns in dataframe
        for i in range(len(group)):
            if np.isnan(group.iloc[i].height):
                heights.append(avg_h)
            else:
                heights.append(group.iloc[i].height)
            if np.isnan(group.iloc[i].weight):
                weights.append(avg_w)
            else:
                weights.append(group.iloc[i].weight)
            if np.isnan(group.iloc[i].age) or group.iloc[i].age==0:
                ages.append(avg_a)
            else:
                ages.append(group.iloc[i].age)
    #replacing columns of dataframe 
    imputed = df.copy()
    imputed['height'] = heights
    imputed['weight'] = weights
    imputed['age'] = ages
    return imputed.fillna(0)

In [53]:
df = pd.read_csv('../data/men_lead_no_drop.csv')
df = impute(df)
df = df.drop(['id', 'last_name', 'first_name', 'points', 'rank', 
              'Unnamed: 20', 'Unnamed: 21', 'event_count'], axis=1)

# TODO: I need to drop the extra 0 columns
# TODO: try to drop some of the lag columns
df = pd.get_dummies(df)

test = df[df['year'] >= 2019]
X_test = test.drop(['avg_points'], axis=1)
y_test = test['avg_points']

train = df[df['year'] <= 2018]
X_train = train.drop(['avg_points'], axis=1)
y_train = train['avg_points']

In [58]:
param_grid = ParameterGrid({'alpha': [0, .5, 1],
                            'gamma': [0, .5, 1],
                            'lambda': [0, .5, 1],
                            'eta': [.1, .3, .6, .9],
                            'max_depth': [1, 2, 6, 10, None]})

best_params = None
best_error = np.inf

for params in tqdm(param_grid):
    kf = KFold(n_splits=5)
    for train_idx, test_idx in kf.split(X_train):
        X_fold_train, X_fold_test = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_fold_train, y_fold_test = y_train.iloc[train_idx], y_train.iloc[test_idx]
        
        clf = XGBRegressor(objective='reg:squarederror')
        clf.fit(X_fold_train, y_fold_train)
        preds = clf.predict(X_fold_test)
        error = mean_squared_error(y_fold_test, preds)
        if error < best_error:
            best_error = error
            best_params = params
print(error)

100%|██████████| 540/540 [09:43<00:00,  1.08s/it]

298.3211621172526





In [59]:
print(best_params)

{'alpha': 0, 'eta': 0.1, 'gamma': 0, 'lambda': 0, 'max_depth': 1}
