In [937]:
import numpy as np
import pandas as pd
import pickle
import warnings

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV  #ordinary linear regression + w/ ridge regularization
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import feature_selection as f_select
from sklearn.metrics import r2_score
from sklearn.linear_model import ElasticNetCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore')

In [938]:
df = pd.read_pickle('/Users/georgevarelas/Downloads/scraper-whoscored-master MAIN/TRANSFERMARKT_WHOSCORED_MERGE.pkl')
fifa = pd.read_pickle('/Users/georgevarelas/fifa_final.pkl')

In [939]:
df.shape

(10724, 35)

In [940]:
df = df.dropna()

In [941]:
#reorder columns
df = df[['Name', 'Market_Value',  'Position','Season', 'Age', 'Team', 'Tournament','Games_Missed', 'Goals/90min', 'Assists/90min',
       'Yel/90min', 'Red/90min', 'SpG', 'PS%', 'Rating', 'Tackles', 'Inter',
       'Fouls (def)', 'Offsides', 'Clear', 'DrB (def)', 'Blocks', 'DrB (off)',
       'Fouled (off)', 'Off (off)', 'Disp (off)', 'KeyP', 'AvgP', 'Crosses',
       'LongB', 'ThrB', 'OutOfBox', 'SixYardBox', 'PenaltyArea', 'URL']]

In [942]:
#change Market_Value and Age to float:
"""df['Market_Value'] = df['Market_Value'].apply(lambda row: float(row))
df['Age'] = df['Age'].apply(lambda row: float(row))"""

In [943]:
df.shape

(10696, 35)

In [944]:
df = pd.merge(df, fifa, on = ['Name', 'Season'])

In [945]:
#get some weird age differences from fifa dataset and transfermarkt. Get rid of any rows with large age differences

In [946]:
df['Age_y'] = df['Age_y'].apply(lambda row: float(row))
df['Age_Diff'] = df['Age_x'] - df['Age_y']
#drop rows where player ages are very mismatched
df = df.drop(df[abs(df['Age_Diff']) >= 3.0 ].index)
df = df.drop(columns = ['Age_Diff', 'Age_x'])
df = df.rename(index=str, columns={"Age_y": "Age"})

In [948]:
#Make Years to Contract_Expiry
df['Contract_Expiry'] = df['Contract_Expiry'].apply(lambda row: float(row))
df['Seasons_Left_On_Contract'] = df['Contract_Expiry'] - df['Season']

In [949]:
df['Team'] = df['Team'].apply(lambda x: x.split(' U21')[0])

In [950]:
#pickle this so we can try to get google search trends
df.to_pickle('final_players_dataset.pkl')

In [951]:
#create columns specific to club/intl tournaments so we can groupby later and preserve information

In [952]:
international_club = ['UEL', 'UCL']
club_league = ['BSA', 'RPL','CSl','ISA','GB','FL1','EPL','NE','SLL','APD','PLN', 'TS', 'SA', 'UMLS','EC', 'GB2']
international = ['UNL','WC', 'UEC', 'ICA','ICC', 'ACoN','IEU']

In [953]:
#Ranking of most powerful clubs/international teams per season
forbes_2018 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich',\
              'Manchester City', 'Chelsea', 'Arsenal', 'Liverpool', 'Tottenham', \
              'Juventus', 'Paris Saint-Germain', 'Atletico Madrid', 'Borussia Dortmund', 'Schalke 04', \
              'Inter', 'Roma', 'West Ham', 'AC Milan', 'Everton', 'Newcastle United']

forbes_2017 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich','Manchester City', \
              'Chelsea', 'Arsenal', 'Liverpool', 'Juventus', 'Tottenham', \
               'Paris Saint-Germain','Borussia Dortmund', 'Atletico Madrid','West Ham','Schalke 04', \
              'Inter', 'Roma', 'AC Milan', 'Leicester', 'Napoli']

forbes_2016 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich','Manchester City',\
               'Chelsea', 'Arsenal', 'Liverpool', 'Juventus','Tottenham', \
               'Paris Saint-Germain', 'Borussia Dortmund',  'AC Milan', 'Atletico Madrid', 'West Ham', \
              'Schalke 04','Inter', 'Roma',  'Leicester', 'Napoli']

forbes_2015 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich', 'Arsenal',\
              'Manchester City', 'Chelsea',  'Liverpool', 'Juventus', 'Tottenham', \
               'Borussia Dortmund','AC Milan','Paris Saint-Germain',  'Schalke 04', 'Atletico Madrid',  \
              'Inter', 'Roma', 'West Ham',  'Newcastle United', 'Napoli']

forbes_2014 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich', 'Manchester City',\
               'Chelsea', 'Arsenal', 'Liverpool','Juventus', 'AC Milan', \
               'Borussia Dortmund','Paris Saint-Germain', 'Tottenham', 'Schalke 04', \
              'Inter', 'Atletico Madrid', 'Galatasaray', 'West Ham',  'Newcastle United', 'Napoli']

forbes_2013 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Bayern Munich','Arsenal',\
              'Chelsea', 'Manchester City', 'AC Milan', 'Juventus', 'Liverpool',\
               'Borussia Dortmund', 'Schalke 04','Tottenham', 'Inter', 'Paris Saint-Germain',\
               'Galatasaray', 'Atletico Madrid', 'Roma',  'Hamburger SV', 'Napoli']

forbes_2012 = ['Real Madrid', 'Manchester United', 'Barcelona', 'Arsenal', 'Bayern Munich'\
              'AC Milan', 'Chelsea', 'Juventus', 'Manchester City', 'Liverpool', \
               'Tottenham','Schalke 04', 'Borussia Dortmund', 'Inter', 'Lyon',  \
               'Corinthians', 'Napoli', 'Hamburger SV', 'Marseille', 'Newcastle United']

world_2018 = ['Belgium', 'France', 'Brazil', 'England', 'Croatia', \
              'Uruguay', 'Portugal', 'Switzerland', 'Spain', 'Denmark', \
              'Argentina', 'Colombia', 'Germany', 'Sweden', 'Chile', \
              'Netherlands', 'Italy', 'Mexico', 'Wales', 'Poland']

world_2017 = ['Belgium', 'France', 'Brazil','Croatia','England',\
             'Uruguay', 'Portugal', 'Switzerland', 'Spain', 'Denmark', \
             'Argentina', 'Colombia','Sweden', 'Chile','Netherlands',\
              'Germany', 'Italy', 'Mexico', 'Wales', 'Poland']

world_2016 =  ['Germany', 'Brazil', 'Portugal', 'Argentina','Belgium',\
              'Spain', 'Poland', 'Switzerland', 'France', 'Chile', \
              'Peru', 'Denmark', 'Colombia', 'Italy', 'England', \
              'Mexico', 'Croatia', 'Sweden', 'Wales', 'Netherlands']

world_2015 = ['Germany', 'Brazil', 'Argentina','Belgium', 'Chile',\
             'Colombia', 'Portugal','France','Uruguay', 'Spain', \
             'Switzerland', 'Wales', 'England', 'Croatia', 'Poland',\
             'Italy', 'Costa Rica', 'Mexico', 'Peru', 'Ecuador']

world_2014 = ['Germany', 'Spain', 'Argentina','Belgium', 'Chile', \
              'Brazil','Colombia', 'Portugal', 'England', 'Austria', \
              'Uruguay', 'Switzerland', 'Ecuador', 'Netherlands', 'Italy', \
              'Romania', 'Wales', 'Croatia', 'Ivory Coast', 'Hungary']

world_2013 = ['Germany', 'Colombia', 'Argentina','Belgium', 'Netherlands',\
              'Brazil','Portugal','France','Spain','Uruguay',\
              'England', 'Italy', 'Switzerland', 'Chile', 'Romania'\
              'Costa Rica', 'Czech Republic', 'Algeria', 'Croatia', 'Mexico']


world_2012 = ['Germany', 'Colombia', 'Argentina','Spain', 'Portugal',\
              'Uruguay', 'Italy', 'Switzerland', 'Netherlands', 'Brazil', \
              'Belgium', 'Greece', 'England', 'USA', 'Chile', \
              'Croatia', 'Ivory Coast', 'Ukraine', 'Bosnia and Herzegovina', 'France']               

In [954]:
def get_club_ranking(row, forbes_list):
    if row in forbes_list[:5]:
        return 5
    elif row in forbes_list[5:10]:
        return 4
    elif row in forbes_list[10:15]:
        return 3
    elif row in forbes_list[15:20]:
        return 2
    else:
        return 1

In [955]:
df['Forbes_Rank_18'] = df[df['Season'] == 2018]['Team'].apply(lambda x: get_club_ranking(x, forbes_2018) if x in forbes_2018 else get_club_ranking(x, world_2018))
df['Forbes_Rank_17'] = df[df['Season'] == 2017]['Team'].apply(lambda x: get_club_ranking(x, forbes_2017) if x in forbes_2017 else get_club_ranking(x, world_2017))
df['Forbes_Rank_16'] = df[df['Season'] == 2016]['Team'].apply(lambda x: get_club_ranking(x, forbes_2016) if x in forbes_2016 else get_club_ranking(x, world_2016))
df['Forbes_Rank_15'] = df[df['Season'] == 2015]['Team'].apply(lambda x: get_club_ranking(x, forbes_2015) if x in forbes_2015 else get_club_ranking(x, world_2015))
df['Forbes_Rank_14'] = df[df['Season'] == 2014]['Team'].apply(lambda x: get_club_ranking(x, forbes_2014) if x in forbes_2014 else get_club_ranking(x, world_2014))
df['Forbes_Rank_13'] = df[df['Season'] == 2013]['Team'].apply(lambda x: get_club_ranking(x, forbes_2013) if x in forbes_2013 else get_club_ranking(x, world_2013))
df['Forbes_Rank_12'] = df[df['Season'] == 2012]['Team'].apply(lambda x: get_club_ranking(x, forbes_2012) if x in forbes_2012 else get_club_ranking(x, world_2012))

df.iloc[:, -7:] = df.iloc[:, -7:].fillna(0)
df['Forbes_Rank'] = df['Forbes_Rank_18'] + df['Forbes_Rank_17'] + df['Forbes_Rank_16'] + df['Forbes_Rank_15'] \
+ df['Forbes_Rank_14'] + df['Forbes_Rank_13'] + df['Forbes_Rank_12']


In [956]:
df = df.iloc[:, np.r_[0:len(df.columns)-8, -1]]

In [957]:
df.columns

Index(['Name', 'Market_Value', 'Position', 'Season', 'Team', 'Tournament',
       'Games_Missed', 'Goals/90min', 'Assists/90min', 'Yel/90min',
       'Red/90min', 'SpG', 'PS%', 'Rating', 'Tackles', 'Inter', 'Fouls (def)',
       'Offsides', 'Clear', 'DrB (def)', 'Blocks', 'DrB (off)', 'Fouled (off)',
       'Off (off)', 'Disp (off)', 'KeyP', 'AvgP', 'Crosses', 'LongB', 'ThrB',
       'OutOfBox', 'SixYardBox', 'PenaltyArea', 'URL', 'url', 'Height',
       'Weight', 'Age', 'Wage', 'Contract_Expiry', 'Seasons_Left_On_Contract',
       'Forbes_Rank'],
      dtype='object')

In [958]:
grouped = df.groupby(['Name', 'Season', 'Position'])[['Market_Value', 
       'Games_Missed', 'Goals/90min', 'Assists/90min', 'Yel/90min',
       'Red/90min', 'SpG', 'PS%', 'Rating', 'Tackles', 'Inter', 'Fouls (def)',
       'Offsides', 'Clear', 'DrB (def)', 'Blocks', 'DrB (off)', 'Fouled (off)',
       'Off (off)', 'Disp (off)', 'KeyP', 'AvgP', 'Crosses', 'LongB', 'ThrB',
       'OutOfBox', 'SixYardBox', 'PenaltyArea', 'Height',
       'Weight', 'Age', 'Wage', 'Seasons_Left_On_Contract', 'Forbes_Rank']].mean().reset_index()

#add lag_value as a column
grouped['Lag_Value'] = grouped.groupby('Name')['Market_Value'].shift(1)

#reorder columns so we can see lagged value
cols = grouped.columns.tolist()
cols
cols = cols[:4] + cols[-1:] + cols[4:-1]
grouped = grouped[cols]

#drop whoscored rating
grouped = grouped.drop('Rating', axis = 1)

In [959]:
#Lag the game stats & other metrics also
lag_columns = list(grouped.columns[5:])
for i in lag_columns:
    grouped[i] =  grouped.groupby('Name')[i].shift(1)

#rename lagged columns to _Lag
newcols = list(grouped.columns[:5]) + [i + "_Lag" for i in lag_columns]
grouped.columns = newcols

# Feature Engineering

#### Dummies

In [960]:
Forward = ['Centre-Forward', 'Second Striker']
Winger = ['Left Winger', 'Right Winger']
Central_Midfielder = ['Central Midfield', 'Right Midfield', 'Left Midfield']
Defensive_Midfielder = ['Defensive Midfield']
Attacking_Midfielder = ['Attacking Midfield']
Defender = ['Centre-Back']
Wing_Back = ['Right-Back', 'Left-Back']

grouped["Forward"] = grouped['Position'].apply(lambda row: 1 if row in (Forward) else 0)
grouped["Winger"] = grouped['Position'].apply(lambda row: 1 if row in (Winger) else 0)
grouped["Central_Midfielder"] = grouped['Position'].apply(lambda row: 1 if row in (Central_Midfielder) else 0)
grouped["Defensive_Midfielder"] = grouped['Position'].apply(lambda row: 1 if row in (Defensive_Midfielder) else 0)
grouped["Attacking_Midfielder"] = grouped['Position'].apply(lambda row: 1 if row in (Attacking_Midfielder) else 0)
grouped["Defender"] = grouped['Position'].apply(lambda row: 1 if row in (Defender) else 0)
grouped['Wing_Back'] = grouped['Position'].apply(lambda row: 1 if row in (Wing_Back) else 0)

#### Interactions

In [961]:
forward_interactions = ['Goals/90min_Lag', 'Assists/90min_Lag', 'DrB (off)_Lag', 'Off (off)_Lag', 'Disp (off)_Lag']
winger_interactions = ['Goals/90min_Lag', 'Assists/90min_Lag', 'DrB (off)_Lag', 'Off (off)_Lag', 'Disp (off)_Lag', 'Crosses_Lag']
central_midfielder_interactions = ['PS%_Lag', 'KeyP_Lag', 'Tackles_Lag', 'DrB (off)_Lag', 'Disp (off)_Lag', 'Inter_Lag']
defensive_midfielder_interactions = ['Tackles_Lag', 'Inter_Lag', 'Blocks_Lag', 'PS%_Lag', 'Fouls (def)_Lag', 'DrB (def)_Lag']
attacking_midfielder_interactions = ['Goals/90min_Lag', 'Assists/90min_Lag', 'DrB (off)_Lag', 'KeyP_Lag', 'PS%_Lag']
defender_interactions = ['Tackles_Lag', 'Inter_Lag', 'Blocks_Lag', 'Fouls (def)_Lag', 'DrB (def)_Lag']
wing_back_interactions = ['Tackles_Lag', 'Inter_Lag', 'Blocks_Lag', 'Fouls (def)_Lag', 'DrB (def)_Lag', 'Assists/90min_Lag', 'DrB (off)_Lag',\
                         'Disp (off)_Lag', 'Crosses_Lag']

In [962]:
def generate_interactions(df, position, position_interactions):
    for i in position_interactions:
        df[position+'_'+i] = df[i] * df[position]
    return df

In [963]:
positions = ['Forward', 'Winger', 'Central_Midfielder', 'Defensive_Midfielder', 'Attacking_Midfielder', 'Defender', 'Wing_Back']
interactions = [forward_interactions, winger_interactions, central_midfielder_interactions , defensive_midfielder_interactions,\
               attacking_midfielder_interactions, defender_interactions, wing_back_interactions]

In [964]:
#generate interactions and add them to dataframe
for i, j in list(zip(positions, interactions)):
    grouped = generate_interactions(grouped, i, j)

#### Lagged Features

In [965]:
lag_3_columns = list(grouped.columns[4:])
for i in lag_3_columns:
    grouped[i + "_3"] =  grouped.groupby('Name')[i].rolling(window = 2).mean().reset_index(drop = True)

In [973]:
len(grouped[grouped['Season'] <= 2017])/len(grouped)

0.8102336825141015

# Modelling

In [999]:
#Drop NANs for Linear Regression

X_train_1 = grouped[grouped['Season'] <= 2017].dropna().iloc[:, np.r_[1, 4:len(grouped.columns)]]
X_test = grouped[grouped['Season'] == 2018].dropna().iloc[:, np.r_[1, 4:len(grouped.columns)]]
y_train_1 = grouped[grouped['Season'] <= 2017].dropna()['Market_Value']
y_test = grouped[grouped['Season'] == 2018].dropna()['Market_Value']

X_train, X_val, y_train, y_val = train_test_split(X_train_1, y_train_1, test_size=.15, random_state=5)

#FOR LASSO AND RIDGE
def mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

std = StandardScaler()
std.fit(X_train.values)

## Scale the Predictors on both the train and test set
X_tr = std.transform(X_train.values)
X_val_sc = std.transform(X_val.values)
X_test_sc = std.transform(X_test.values)

# Run the cross validation, find the best alpha, refit the model on all the data with that alpha
alphavec = 10**np.linspace(-2,2,200)


X_train_1_RF = grouped[grouped['Season'] <= 2017].fillna(-600).iloc[:, np.r_[1, 4:len(grouped.columns)]]
X_test_RF = grouped[grouped['Season'] == 2018].fillna(-600).iloc[:, np.r_[1, 4:len(grouped.columns)]]
y_train_1_RF = grouped[grouped['Season'] <= 2017].fillna(-600)['Market_Value']
y_test_RF = grouped[grouped['Season'] == 2018].fillna(-600)['Market_Value']

X_train_RF, X_val_RF, y_train_RF, y_val_RF = train_test_split(X_train_1_RF, y_train_1_RF, test_size=.15, random_state=5)


### Linear Regression

In [1000]:
lm = LinearRegression()
#fit the training set
lm.fit(X_train.iloc[:, 3:33], y_train)
#get score for training and validation set
print('Training set R^2 = {}'.format(lm.score(X_train.iloc[:, 3:33], y_train)))
print('Validation set R^2 = {}'.format(lm.score(X_val.iloc[:, 3:33], y_val)))

Training set R^2 = 0.6775012698479838
Validation set R^2 = 0.6497147696627377


In [1001]:
#Baseline with just old market values
#LR with all columns
lm = LinearRegression()
#fit the training set
lm.fit(X_train.iloc[:,0:2], y_train)
#get score for training and validation set
print('Training set R^2 = {}'.format(lm.score(X_train.iloc[:,0:2], y_train)))
print('Validation set R^2 = {}'.format(lm.score(X_val.iloc[:,0:2], y_val)))

Training set R^2 = 0.7842317859207976
Validation set R^2 = 0.8070220345543356


In [988]:
#LR with all columns
lm = LinearRegression()
#fit the training set
lm.fit(X_train, y_train)
#get score for training and validation set
print('Training set R^2 = {}'.format(lm.score(X_train, y_train)))
print('Validation set R^2 = {}'.format(lm.score(X_val, y_val)))

Training set R^2 = 0.8479023040401894
Validation set R^2 = 0.8240362722649393


#### Lasso CV

In [989]:
lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_tr, y_train)

print('Lasso model alpha = {}'.format(lasso_model.alpha_))

# Make predictions on the validation set using the new model
val_set_pred = lasso_model.predict(X_val_sc)

print('Lasso model MAE = {}'.format(mae(y_val, val_set_pred)))
print('Lasso Validation R^2 = {}'.format(r2_score(y_val, val_set_pred)))

Lasso model alpha = 100.0
Lasso model MAE = 5206500.848967589
Lasso Validation R^2 = 0.8239537029417433


#### Ridge CV

In [990]:
ridge_model = RidgeCV(alphas = alphavec, cv=5)
ridge_model.fit(X_tr, y_train)

print('Ridge model alpha = {}'.format(ridge_model.alpha_))

# Make predictions on the validation set using the new model
ridge_val_set_pred = ridge_model.predict(X_val_sc)

print('Ridge model MAE = {}'.format(mae(y_val, ridge_val_set_pred)))
print('Ridge Validation R^2 = {}'.format(r2_score(y_val, ridge_val_set_pred)))

Ridge model alpha = 79.34096665797492
Ridge model MAE = 5399728.077386578
Ridge Validation R^2 = 0.8140476824173942


In [799]:
resids = y_val - ridge_val_set_pred
residuals = pd.DataFrame({'Residuals':resids, 'Predicted Values': ridge_val_set_pred})

pd.set_option('display.float_format', '{:,.1f}'.format)

residuals.reset_index(inplace = True)
residuals['Name'] = residuals['index'].apply(lambda row: grouped.iloc[row, [0]])
residuals['Market_Value'] = residuals['index'].apply(lambda row: grouped.iloc[row, 3])
residuals['Season'] = residuals['index'].apply(lambda row: grouped.iloc[row, 1])
residuals['Position'] = residuals['index'].apply(lambda row: grouped.iloc[row, 2])

resid_cols = ['index', 'Name', 'Market_Value', 'Predicted Values', 'Residuals', 'Season', 'Position']
residuals = residuals[resid_cols]

In [851]:
#model coefficients
coefficients = list(zip(X_val.columns, ridge_model.coef_))

In [852]:
coefficients.sort(key = lambda t: t[1], reverse = True)
coefficients

[('Lag_Value', 19487305.32618952),
 ('Wage_Lag', 3897177.223867783),
 ('KeyP_Lag_3', 2872765.5947983153),
 ('SpG_Lag', 1795102.4173857307),
 ('Defensive_Midfielder_Tackles_Lag_3', 1685305.7336727846),
 ('Height_Lag', 1645839.7432962274),
 ('Season', 1490419.3099697114),
 ('Winger_DrB (off)_Lag_3', 1336841.006263082),
 ('Attacking_Midfielder_DrB (off)_Lag', 1221332.525560598),
 ('Winger_Disp (off)_Lag_3', 960082.079970971),
 ('Central_Midfielder_Disp (off)_Lag_3', 944933.7283626392),
 ('Winger_Goals/90min_Lag', 934925.5734637538),
 ('Central_Midfielder_Inter_Lag', 920919.5683118519),
 ('Defensive_Midfielder_DrB (def)_Lag', 916162.9788478839),
 ('Assists/90min_Lag', 911696.6465465155),
 ('Central_Midfielder_Tackles_Lag', 890569.7430791933),
 ('Winger_Crosses_Lag', 874505.0281126674),
 ('Winger_Assists/90min_Lag', 804924.6305799304),
 ('Central_Midfielder_DrB (off)_Lag', 710089.9352342748),
 ('Goals/90min_Lag', 689613.247017473),
 ('Wing_Back_DrB (off)_Lag', 653548.2400123562),
 ('Clear_L

In [None]:
"""with sns.axes_style('white'):
    plot=residuals.plot(kind='scatter',
                  x='Predicted Values',y='Residuals',alpha=0.3,figsize=(10,6));"""

#### ElasticNet CV

In [991]:
elastic = ElasticNetCV(alphas = alphavec, cv=5)
elastic.fit(X_tr, y_train) 

print('ElasticNet alpha = {}'.format(ridge_model.alpha_))

elastic_val_set_pred = elastic.predict(X_val_sc)

print('ElasticNet  MAE = {}'.format(mae(y_val, elastic_val_set_pred)))
print('ElasticNet Validation R^2 = {}'.format(r2_score(y_val, elastic_val_set_pred)))

ElasticNet alpha = 79.34096665797492
ElasticNet  MAE = 5395765.576209098
ElasticNet Validation R^2 = 0.8114205108055784


In [1002]:
elastic_test_set_pred = elastic.predict(X_test_sc)

print('ElasticNet  MAE = {}'.format(mae(y_test, elastic_test_set_pred)))
print('ElasticNet Test R^2 = {}'.format(r2_score(y_test, elastic_test_set_pred)))

ElasticNet  MAE = 6018379.162239792
ElasticNet Validation R^2 = 0.8466352251845464


In [1034]:
coefficients = list(zip(X_test.columns, elastic.coef_))

In [1035]:
coefficients.sort(key = lambda t: t[1], reverse = True)
coefficients

[('Lag_Value', 7698230.730297421),
 ('Lag_Value_3', 3614958.532451103),
 ('Season', 2204862.5661809966),
 ('Wage_Lag', 1976802.2008233129),
 ('Goals/90min_Lag', 945086.635606001),
 ('Off (off)_Lag', 727196.2154635297),
 ('SixYardBox_Lag', 702368.5719747306),
 ('DrB (off)_Lag', 693472.5923815841),
 ('Forward_Off (off)_Lag', 633787.698921266),
 ('KeyP_Lag_3', 610948.584461651),
 ('Wage_Lag_3', 608894.3040535962),
 ('Forward_DrB (off)_Lag', 586693.8960180504),
 ('Wing_Back_DrB (off)_Lag', 547409.3846240758),
 ('Defensive_Midfielder_Tackles_Lag_3', 546344.1378677944),
 ('Forward_Goals/90min_Lag', 537767.1569459417),
 ('Central_Midfielder_Tackles_Lag', 534885.6719663778),
 ('Forbes_Rank_Lag', 525004.908451921),
 ('Winger_Goals/90min_Lag', 523980.8893580111),
 ('Clear_Lag_3', 503562.8601236301),
 ('SpG_Lag', 491099.20092549897),
 ('Forward_Assists/90min_Lag_3', 490130.2775813583),
 ('ThrB_Lag', 473085.0510516987),
 ('Winger_Off (off)_Lag', 466050.9282251743),
 ('Clear_Lag', 446399.8749384775

In [1003]:
elastic_resids = y_test - elastic_test_set_pred
elastic_residuals = pd.DataFrame({'Residuals':elastic_resids, 'Predicted Values': elastic_test_set_pred})

pd.set_option('display.float_format', '{:,.1f}'.format)

elastic_residuals.reset_index(inplace = True)
elastic_residuals['Name'] = elastic_residuals['index'].apply(lambda row: grouped.iloc[row, [0]])
elastic_residuals['Market_Value'] = elastic_residuals['index'].apply(lambda row: grouped.iloc[row, 3])
elastic_residuals['Season'] = elastic_residuals['index'].apply(lambda row: grouped.iloc[row, 1])
elastic_residuals['Position'] = elastic_residuals['index'].apply(lambda row: grouped.iloc[row, 2])

resid_cols = ['index', 'Name', 'Market_Value', 'Predicted Values', 'Residuals', 'Season', 'Position']
elastic_residuals = elastic_residuals[resid_cols]

In [1032]:
elastic_residuals.sort_values(by = 'Residuals', ascending = True).head(20)

Unnamed: 0,index,Name,Market_Value,Predicted Values,Residuals,Season,Position
510,2875,Luis Suarez,60000000.0,100508272.8,-40508272.8,2018,Centre-Forward
171,968,Cristiano Ronaldo,100000000.0,130992350.5,-30992350.5,2018,Left Winger
45,255,Alexis Sanchez,50000000.0,80979243.8,-30979243.8,2018,Left Winger
318,1767,Gonzalo Higuain,50000000.0,79987025.3,-29987025.3,2018,Centre-Forward
721,4052,Robert Lewandowski,75000000.0,100570898.9,-25570898.9,2018,Centre-Forward
490,2768,Lionel Messi,160000000.0,182391263.4,-22391263.4,2018,Right Winger
215,1209,Diego Costa,40000000.0,60537948.9,-20537948.9,2018,Centre-Forward
93,528,Arturo Vidal,20000000.0,39603638.8,-19603638.8,2018,Central Midfield
775,4367,Sergio Ramos,35000000.0,54178329.8,-19178329.8,2018,Centre-Back
596,3361,Mesut Ozil,40000000.0,58447348.8,-18447348.8,2018,Attacking Midfield


## Random Forest Regressor

In [806]:
regr = RandomForestRegressor(max_depth=15, random_state=0, n_estimators=100, max_features = 70)
regr.fit(X_train_RF, y_train_RF) 

rf_predictions = regr.predict(X_val_RF)

print('Random Forest Validation R^2 = {}'.format(regr.score(X_val_RF, y_val_RF)))


random_resids = y_val_RF - rf_predictions
random_residuals = pd.DataFrame({'Residuals':random_resids, 'Predicted Values': rf_predictions})

Random Forest Validation R^2 = 0.7677899694965636


In [807]:
random_residuals.reset_index(inplace = True)
random_residuals['Name'] = random_residuals['index'].apply(lambda row: grouped.iloc[row, [0]])
random_residuals['Market_Value'] = random_residuals['index'].apply(lambda row: grouped.iloc[row, 3])
random_residuals['Season'] = random_residuals['index'].apply(lambda row: grouped.iloc[row, 1])
random_residuals['Position'] = random_residuals['index'].apply(lambda row: grouped.iloc[row, 2])

random_residuals = random_residuals[resid_cols]

In [809]:
"print(regr.feature_importances_)"

'print(regr.feature_importances_)'

## Random Forest with Random Search Cross Validation

In [811]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [812]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_RF, y_train_RF)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 26.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [813]:
rf_random.best_params_

{'n_estimators': 700,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': True}

In [814]:
best_random = rf_random.best_estimator_

In [816]:
print('Best Random Forest Validation R^2 = {}'.format(best_random.score(X_val_RF, y_val_RF)))

Best Random Forest Validation R^2 = 0.7779171673987559


In [817]:
best_random_predictions = best_random.predict(X_val_RF)
best_random_resids = y_val_RF - best_random_predictions

In [818]:
best_random_residuals = pd.DataFrame({'Residuals':best_random_resids, 'Predicted Values': best_random_predictions})
#with sns.axes_style('white'):
#    plot=residuals.plot(kind='scatter',
                 # x='Predicted Values',y='Residuals',alpha=0.3,figsize=(10,6));

In [819]:
best_random_residuals.reset_index(inplace = True)
best_random_residuals['Name'] = best_random_residuals['index'].apply(lambda row: grouped.iloc[row, [0]])
best_random_residuals['Market_Value'] = best_random_residuals['index'].apply(lambda row: grouped.iloc[row, 3])
best_random_residuals['Season'] = best_random_residuals['index'].apply(lambda row: grouped.iloc[row, 1])
best_random_residuals['Position'] = best_random_residuals['index'].apply(lambda row: grouped.iloc[row, 2])

In [820]:
resid_cols = ['index', 'Name', 'Market_Value', 'Predicted Values', 'Residuals', 'Season', 'Position']
best_random_residuals = best_random_residuals[resid_cols]

In [825]:
best_random_residuals[best_random_residuals['Season'] == 2016].sort_values(by = 'Residuals', ascending = True).head(4)


Unnamed: 0,index,Name,Market_Value,Predicted Values,Residuals,Season,Position
437,4808,Wayne Rooney,25000000.0,40560562.6,-15560562.6,2016,Centre-Forward
593,180,Aleksandar Mitrovic,10000000.0,21111416.7,-11111416.7,2016,Centre-Forward
4,1681,Geoffrey Kondogbia,19500000.0,28116710.3,-8616710.3,2016,Central Midfield
629,3911,Rafael,5000000.0,11308522.2,-6308522.2,2016,Right-Back


In [892]:
best_random_residuals[best_random_residuals['Name'] == 'Frenkie de Jong']

Unnamed: 0,index,Name,Market_Value,Predicted Values,Residuals,Season,Position


In [890]:
best_random_residuals.sort_values(by = 'Residuals', ascending = True).head(20)

Unnamed: 0,index,Name,Market_Value,Predicted Values,Residuals,Season,Position
538,4680,Toni Kroos,80000000.0,112037005.4,-32037005.4,2018,Central Midfield
46,1209,Diego Costa,40000000.0,64444088.7,-24444088.7,2018,Centre-Forward
309,1016,Daniel Ginczek,2875000.0,22932583.0,-20057583.0,2017,Centre-Forward
214,2827,Lucas Moura,28000000.0,45398716.0,-17398716.0,2017,Right Winger
437,4808,Wayne Rooney,25000000.0,40560562.6,-15560562.6,2016,Centre-Forward
313,4052,Robert Lewandowski,75000000.0,90086045.9,-15086045.9,2018,Centre-Forward
487,2841,Lucas Tousart,20000000.0,33934532.3,-13934532.3,2018,Defensive Midfield
89,2430,Juanmi,1750000.0,15199262.4,-13449262.4,2012,Right Winger
257,1833,Harry Kane,500000.0,12378623.2,-11878623.2,2012,Centre-Forward
165,2452,Jurgen Locadia,750000.0,12378623.2,-11628623.2,2012,Centre-Forward


In [831]:
best_random_residuals.sort_values(by = 'Residuals', ascending = False).head(10)

Unnamed: 0,index,Name,Market_Value,Predicted Values,Residuals,Season,Position
607,2762,Lionel Messi,120000000.0,15199262.4,104800737.6,2012,Right Winger
286,4804,Wayne Rooney,65000000.0,12378623.2,52621376.8,2012,Centre-Forward
269,1837,Harry Kane,120000000.0,68731563.8,51268436.2,2017,Centre-Forward
171,2767,Lionel Messi,180000000.0,128908937.1,51091062.9,2017,Right Winger
472,1167,Dele Alli,80000000.0,43034756.8,36965243.2,2017,Attacking Midfield
317,1109,David Silva,44000000.0,13093712.0,30906288.0,2012,Attacking Midfield
554,774,Casemiro,60000000.0,29912943.9,30087056.1,2017,Defensive Midfield
573,1547,Fernandinho,32000000.0,3551219.3,28448780.7,2013,Defensive Midfield
190,3407,Milan Skriniar,35000000.0,6602957.0,28397043.0,2017,Centre-Back
51,412,Angel Di Maria,40000000.0,15199262.4,24800737.6,2012,Right Winger


In [834]:
importances = best_random.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_random.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [848]:
for f in range(X_val_RF.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + X_val_RF.columns[f] )

1. feature 1 (0.809404)Season
2. feature 0 (0.016393)Lag_Value
3. feature 30 (0.013767)Games_Missed_Lag
4. feature 82 (0.006487)Goals/90min_Lag
5. feature 101 (0.005962)Assists/90min_Lag
6. feature 13 (0.005684)Yel/90min_Lag
7. feature 122 (0.005132)Red/90min_Lag
8. feature 16 (0.004231)SpG_Lag
9. feature 4 (0.003394)PS%_Lag
10. feature 85 (0.003372)Tackles_Lag
11. feature 8 (0.003357)Inter_Lag
12. feature 46 (0.003082)Fouls (def)_Lag
13. feature 31 (0.003055)Offsides_Lag
14. feature 89 (0.002969)Clear_Lag
15. feature 17 (0.002723)DrB (def)_Lag
16. feature 21 (0.002693)Blocks_Lag
17. feature 111 (0.002592)DrB (off)_Lag
18. feature 3 (0.002533)Fouled (off)_Lag
19. feature 5 (0.002517)Off (off)_Lag
20. feature 7 (0.002433)Disp (off)_Lag
21. feature 11 (0.002339)KeyP_Lag
22. feature 45 (0.002090)AvgP_Lag
23. feature 22 (0.002047)Crosses_Lag
24. feature 20 (0.001997)LongB_Lag
25. feature 25 (0.001928)ThrB_Lag
26. feature 23 (0.001927)OutOfBox_Lag
27. feature 112 (0.001897)SixYardBox_Lag
28