In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import lightgbm

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can
# happen when several imported libraries each bundle their own OpenMP copy -- confirm it is
# still needed in the current environment, since it silences a real safety check.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [65]:
# Locations / identifiers of the windowed training data to load.
datapath = '../data/clean_data/windowed_data/'
window_size = '9'
season = '2020-21/'

# Lookup from the `team` key to `team_name`, restricted to the modelled season.
all_teams = pd.read_csv('../data/clean_data/master_team_list.csv')
team_name_df = all_teams[all_teams.season == '2020-21']

# Fixture difficulties for the same season (this file labels the season as '20_21').
all_fixtures = pd.read_csv('../data/fixtures/full_fixture_difficulty.csv')
season_fixtures = all_fixtures[all_fixtures.season == '20_21']

# Resolve the away side's name first, then the home side's name, keeping only
# the two difficulty ratings and the two resolved team names.
fixtures_with_away_name = (
    season_fixtures
    .merge(team_name_df, how='left', left_on='team_a', right_on='team')
    [['team_a_difficulty', 'team_h_difficulty', 'team_h', 'team_name']]
    .rename(columns={'team_name': 'away_team_name'})
)
fixture_df = (
    fixtures_with_away_name
    .merge(team_name_df, how='left', left_on='team_h', right_on='team')
    [['team_a_difficulty', 'team_h_difficulty', 'away_team_name', 'team_name']]
    .rename(columns={'team_name': 'home_team_name'})
)

def attach_difficulty(row, fixtures=None):
    """Return the fixture-difficulty gap (own team minus opponent) for one player row.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``was_home``, ``team`` and ``opponent_team``.
    fixtures : pandas.DataFrame, optional
        Table with ``home_team_name``, ``away_team_name``, ``team_h_difficulty``
        and ``team_a_difficulty`` columns. Defaults to the module-level
        ``fixture_df`` so existing ``df.apply(attach_difficulty, axis=1)``
        calls keep working.

    Returns
    -------
    number
        ``own difficulty - opponent difficulty`` taken from the first matching
        fixture, or ``0`` when no fixture matches the (home, away) pair.
    """
    if fixtures is None:
        fixtures = fixture_df

    # Work out which side of the fixture the player's team is on.
    if row['was_home']:
        home_name, away_name = row.team, row.opponent_team
        own_col, opponent_col = 'team_h_difficulty', 'team_a_difficulty'
    else:
        home_name, away_name = row.opponent_team, row.team
        own_col, opponent_col = 'team_a_difficulty', 'team_h_difficulty'

    # Single lookup (the previous version ran the same full-table scan twice per row).
    match = fixtures[
        (fixtures.home_team_name == home_name) & (fixtures.away_team_name == away_name)
    ]

    # Explicit "no such fixture" fallback instead of a blanket `except KeyError`,
    # which would also have hidden genuinely missing columns.
    if match.empty:
        return 0

    return match[own_col].iloc[0] - match[opponent_col].iloc[0]

def read_data(pos, datapath, window_size, season, minimum_minutes=0):
    """Load one position's windowed CSV and add the modelling helper columns.

    Parameters
    ----------
    pos : str
        File-name prefix of the position, e.g. ``'GK_'``.
    datapath : str
        Root directory of the windowed data.
    window_size : str
        Window size as it appears in the file name (string, e.g. ``'9'``).
    season : str
        Season sub-directory (with or without a trailing slash).
    minimum_minutes : number, default 0
        Rows with ``minutes`` below this threshold are dropped.

    Returns
    -------
    pandas.DataFrame
        The filtered rows without ``team`` / ``opponent_team``, plus
        ``difficulty_gap`` (from ``attach_difficulty``) and ``total_points_bin``
        (``total_points`` cut into 5 equal-width bins, used for stratification).
    """
    # os.path.join handles the separator, so `season` no longer needs a trailing slash.
    csv_path = os.path.join(datapath, season, pos + window_size + '.csv')
    df = pd.read_csv(csv_path)

    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df[df.minutes >= minimum_minutes].copy()

    df['difficulty_gap'] = df.apply(attach_difficulty, axis=1)
    df = df.drop(['team', 'opponent_team'], axis=1)

    df['total_points_bin'] = pd.cut(df['total_points'], 5, labels=False)
    return df

# One frame per position; the list order fixes the indices used later
# (0 = GK_, 1 = DEF_, 2 = FWD_, 3 = MID_).
dfs = []
for position_prefix in ['GK_', 'DEF_', 'FWD_', 'MID_']:
    dfs.append(read_data(position_prefix, datapath, window_size, season, minimum_minutes=1))

# All positions stacked into a single frame.
full_df = pd.concat(dfs)

### Note: binning `total_points` with `pd.qcut` (quantile bins) instead of `pd.cut` (equal-width bins) reduces the spread of the MSE a bit more.

In [66]:
# Row count per stratification bin for dfs[0] (the 'GK_' frame).
dfs[0].total_points_bin.value_counts()

0    247
2    234
3    180
1    129
4     44
Name: total_points_bin, dtype: int64

In [4]:
# Columns left after read_data: 'team' / 'opponent_team' are dropped and
# 'difficulty_gap' / 'total_points_bin' are added.
dfs[0].columns

Index(['assists', 'clean_sheets', 'creativity', 'bps', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'minutes', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'threat',
       'yellow_cards', 'total_points', 'was_home', 'Target_Output',
       'difficulty_gap', 'total_points_bin'],
      dtype='object')

### Modeling

In [67]:
# Search space for the LightGBM regressor.
# Single-element lists pin a setting; the multi-element lists multiply out to
# 2 * 3 * 2 * 2 * 3 * 3 * 2 = 432 candidates.
# NOTE(review): `min_data_in_leaf` appears to be LightGBM's native name for the
# sklearn-wrapper argument `min_child_samples` -- confirm against the LightGBM docs.
param_grid = dict(
    boosting_type=['gbdt'],
    objective=['regression'],
    n_estimators=[50, 200],
    num_leaves=[7, 15, 31],
    reg_alpha=[0.01, 10],
    reg_lambda=[0.01, 10],
    min_data_in_leaf=[70, 210, 350],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    verbose=[-1],
)

In [68]:
def _print_cv_summary(search):
    """Print train/validation MSE (mean and std over folds) of the best candidate of a fitted search."""
    best = search.best_index_
    scores = search.cv_results_
    print(
        'CV Training MSE mean %f, CV Training MSE std %f\nCV Validation MSE mean %f, CV Validation MSE std %f' % (
            # Scoring is neg_mean_squared_error, so means are negated back to MSE.
            -scores['mean_train_score'][best],
            scores['std_train_score'][best],
            -scores['mean_test_score'][best],
            scores['std_test_score'][best],
        )
    )


def get_results(data):
    """Tune a LightGBM regressor and a Ridge regression on one position's frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Target_Output`` (the regression target), ``total_points``
        and ``total_points_bin`` (used only for stratification).

    Returns
    -------
    dict
        ``'lgbm_est'``    -- best LightGBM estimator found by the grid search.
        ``'lgbm_cv_mse'`` -- its mean cross-validated MSE.
        ``'lr_coef'``     -- two-column frame (both columns are labelled ``0``):
                             feature names, then the best Ridge model's coefficients.
        ``'lr_cv_mse'``   -- mean cross-validated MSE of the best Ridge model.
        ``'heuristic'``   -- MSE of predicting ``Target_Output`` with ``total_points``,
                             measured on the held-out test split (so it is not
                             computed on the same rows as the CV scores).

    Relies on the module-level ``param_grid`` for the LightGBM search space.
    """
    results = {}
    print('Starting...')

    data = pd.get_dummies(data)
    X = data.drop('Target_Output', axis=1)
    y = data.Target_Output

    # Test set is still held-out and won't be used for model fitting/selection.
    # Stratified split (by binned total_points).
    split_criteria_name = 'total_points_bin'
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=1,
        stratify=X[split_criteria_name]
    )
    split_criteria = X_train[split_criteria_name]
    # The bin label is only a stratification key, not a feature. Dropping it
    # without inplace avoids mutating a frame derived from the caller's data.
    X_train = X_train.drop(split_criteria_name, axis=1)

    # Materialise the stratified folds (by binned total_points) once so that both
    # searches are evaluated on exactly the same splits.
    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv_folds = list(splitter.split(X_train, split_criteria))

    # Grid search for lightgbm
    grid = GridSearchCV(
        lightgbm.LGBMRegressor(random_state=1),
        param_grid,
        scoring='neg_mean_squared_error',
        cv=cv_folds,
        return_train_score=True,
        verbose=1
    )
    grid.fit(X_train, y_train)
    results['lgbm_est'] = grid.best_estimator_
    results['lgbm_cv_mse'] = -grid.best_score_
    _print_cv_summary(grid)

    # Linear model
    lrgrid = GridSearchCV(
        linear_model.Ridge(),
        {'alpha': [0.001, 0.1, 1, 100]},
        scoring='neg_mean_squared_error',
        cv=cv_folds,
        return_train_score=True,
        verbose=1
    )
    lrgrid.fit(X_train, y_train)
    best_ridge = lrgrid.best_estimator_
    results['lr_coef'] = pd.concat(
        [pd.DataFrame(X_train.columns), pd.DataFrame(np.transpose(best_ridge.coef_))],
        axis=1
    )
    results['lr_cv_mse'] = -lrgrid.best_score_
    _print_cv_summary(lrgrid)

    # Baseline: use the current total_points as the prediction for Target_Output.
    results['heuristic'] = mean_squared_error(y_test, X_test.total_points)

    return results

In [72]:
# dfs[1] is the 'DEF_' frame (second entry of the position list used to build dfs).
def_results = get_results(dfs[1])

Starting...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
CV Training MSE mean 6.607060, CV Training MSE std 0.131487
CV Validation MSE mean 6.862781, CV Validation MSE std 0.513375
Fitting 5 folds for each of 4 candidates, totalling 20 fits
CV Training MSE mean 6.935034, CV Training MSE std 0.132859
CV Validation MSE mean 6.846728, CV Validation MSE std 0.530930


In [70]:
# dfs[0] is the 'GK_' frame (first entry of the position list used to build dfs).
gk_results = get_results(dfs[0])

Starting...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
CV Training MSE mean 5.852898, CV Training MSE std 0.254816
CV Validation MSE mean 6.927752, CV Validation MSE std 1.025895
Fitting 5 folds for each of 4 candidates, totalling 20 fits
CV Training MSE mean 6.647759, CV Training MSE std 0.243767
CV Validation MSE mean 7.138785, CV Validation MSE std 0.995339


In [73]:
# dfs[2] is the 'FWD_' frame (third entry of the position list used to build dfs).
fwd_results = get_results(dfs[2])

Starting...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
CV Training MSE mean 6.816966, CV Training MSE std 0.213665
CV Validation MSE mean 7.565874, CV Validation MSE std 0.856802
Fitting 5 folds for each of 4 candidates, totalling 20 fits
CV Training MSE mean 7.234238, CV Training MSE std 0.213408
CV Validation MSE mean 7.554370, CV Validation MSE std 0.866827


In [74]:
# dfs[3] is the 'MID_' frame (fourth entry of the position list used to build dfs).
mid_results = get_results(dfs[3])

Starting...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
CV Training MSE mean 5.936848, CV Training MSE std 0.141110
CV Validation MSE mean 6.077601, CV Validation MSE std 0.581978
Fitting 5 folds for each of 4 candidates, totalling 20 fits
CV Training MSE mean 6.182645, CV Training MSE std 0.150663
CV Validation MSE mean 6.064438, CV Validation MSE std 0.611849


In [75]:
# Hyper-parameters of the best LightGBM estimator the grid search selected for the 'DEF_' frame.
def_results['lgbm_est'].get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'num_leaves': 15,
 'objective': 'regression',
 'random_state': 1,
 'reg_alpha': 0.01,
 'reg_lambda': 0.01,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'min_data_in_leaf': 210,
 'verbose': -1}

In [77]:
# Baseline MSE on the held-out test split when total_points is used as the prediction
# for Target_Output (the CV MSEs printed by get_results come from the training split).
def_results['heuristic']

7.245041910717693

In [None]:
# FIXME(review): `names`, `lr_results` and `lgbm_results` are not defined in any cell
# visible in this notebook, and this cell has never been executed (In [None]) -- on a
# fresh kernel it raises NameError. Presumably left over from an earlier version of the
# analysis; define these inputs or rebuild the frame from the get_results outputs.
viz_df = pd.DataFrame(data={'position': names,
                         'lr_mse': [lr_results[n][1] for n in names], 
                         'lightgbm_mse': [lgbm_results[n][1] for n in names]})

In [None]:
# Reshape to long format (one row per position/model pair) for seaborn.
# The rename result is now kept: previously it was discarded, so the column
# was still called 'variable' and the legend was titled accordingly.
viz_df = (
    pd.melt(viz_df, id_vars='position')
    .rename(columns={'variable': 'model'})
)
sns.catplot(x='position', y='value', hue='model', data=viz_df, kind='bar')