# Feature Selection Notebook Test

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged per-gameweek player data.
df = pd.read_csv('../data/merged_gw.csv')

# Quick clean-up carried over from the previous notebook.
# NOTE(review): this replaces every missing value, in every column, with the
# constant 18 - confirm that is the intended fill value.
df = df.fillna(18)

print(df.shape)
print(df.columns)

(22896, 41)
Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')


In [3]:
# Work on an independent copy holding only the identifier, raw-stat and target
# columns used in the rest of the notebook.
subset_cols = [
    'name', 'position', 'team', 'xP', 'GW',
    'minutes', 'bonus', 'goals_scored', 'assists', 'clean_sheets',
    'influence', 'creativity', 'threat',
    'value', 'total_points',
]
simpler_df = df.loc[:, subset_cols].copy()

In [4]:
# Columns that get rolling-average / running-total features.
# They are listed by name rather than sliced by position so that the selection
# does not depend on column order and stays the same if this cell is re-run
# after the derived columns have been appended to simpler_df.
to_lag = pd.Index([
    'minutes', 'bonus', 'goals_scored', 'assists', 'clean_sheets',
    'influence', 'creativity', 'threat',
])

Index(['minutes', 'bonus', 'goals_scored', 'assists', 'clean_sheets',
       'influence', 'creativity', 'threat'],
      dtype='object')

In [5]:
# Per-player rolling averages over the previous 1, 3 and 5 rows.
#
# The frame is sorted by GW once, with a stable algorithm, so that rows sharing
# the same GW value keep their original relative order (the default quicksort
# gives no such guarantee). The sorted frame and its groupby are loop-invariant,
# so they are built a single time instead of once per (column, lag) pair.
gw_sorted = simpler_df.sort_values('GW', kind='mergesort')
by_player = gw_sorted.groupby('name')

for col in to_lag:
    for lag in [1, 3, 5]:
        lagged = 'last_' + str(lag) + '_' + col
        # Sum a window of lag+1 rows (current row plus up to `lag` earlier ones)
        # and subtract the current row, leaving only earlier rows in the total.
        # The total is always divided by `lag`, so when fewer than `lag` earlier
        # rows exist the missing ones effectively count as zero.
        # The result keeps gw_sorted's index, so assignment aligns it back onto
        # simpler_df row by row.
        simpler_df[lagged] = by_player[col].transform(
            lambda x: (x.rolling(min_periods=1, window=lag + 1).sum() - x) / lag
        )

In [None]:
# Sanity check: show the minutes column next to its three rolling averages,
# ordered by player and gameweek.
check_cols = ['name', 'GW', 'minutes', 'last_1_minutes', 'last_3_minutes', 'last_5_minutes']
simpler_df.sort_values(['name', 'GW']).loc[:, check_cols].head(34)

In [6]:
# Running per-player totals of each stat, excluding the current row.
#
# A stable sort keeps rows that share a GW value in their original relative
# order (the default quicksort does not guarantee that), and the sorted frame
# and groupby are built once rather than once per column.
gw_sorted = simpler_df.sort_values('GW', kind='mergesort')
by_player = gw_sorted.groupby('name')

for col in to_lag:
    lagged = 'tally_' + col
    # cumsum() includes the current row, so subtract it to leave only the total
    # accumulated before this row. The result keeps gw_sorted's index and is
    # therefore aligned back onto simpler_df on assignment.
    simpler_df[lagged] = by_player[col].transform(lambda x: x.cumsum() - x)

In [7]:
# Lift the column display limit so every engineered feature is visible.
pd.set_option('display.max_columns', None)
simpler_df

Unnamed: 0,name,position,team,xP,GW,minutes,bonus,goals_scored,assists,clean_sheets,influence,creativity,threat,value,total_points,last_1_minutes,last_3_minutes,last_5_minutes,last_1_bonus,last_3_bonus,last_5_bonus,last_1_goals_scored,last_3_goals_scored,last_5_goals_scored,last_1_assists,last_3_assists,last_5_assists,last_1_clean_sheets,last_3_clean_sheets,last_5_clean_sheets,last_1_influence,last_3_influence,last_5_influence,last_1_creativity,last_3_creativity,last_5_creativity,last_1_threat,last_3_threat,last_5_threat,tally_minutes,tally_bonus,tally_goals_scored,tally_assists,tally_clean_sheets,tally_influence,tally_creativity,tally_threat
0,Nathan Redmond,MID,Southampton,1.5,1.0,1,0,0,0,0,0.0,0.0,0.0,55,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
1,Junior Stanislas,MID,Bournemouth,1.1,1.0,1,0,0,0,0,0.0,0.0,0.0,50,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
2,Armando Broja,FWD,Chelsea,2.0,1.0,15,0,0,0,0,5.2,0.3,19.0,55,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
3,Fabian Schär,DEF,Newcastle,2.4,1.0,90,3,1,0,1,66.0,14.6,25.0,45,15,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
4,Jonny Evans,DEF,Leicester,1.9,1.0,90,0,0,0,0,14.0,1.3,0.0,45,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22891,Oliver Skipp,MID,Spurs,1.8,33.0,90,0,0,0,0,0.0,0.0,0.0,43,2,90.0,84.333333,86.6,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,11.8,10.866667,11.32,13.2,9.366667,11.04,2.0,3.000000,1.8,988,1,1,0,3,151.4,132.3,46.0
22892,Ryan Sessegnon,DEF,Spurs,0.0,33.0,0,0,0,0,0,0.0,0.0,0.0,44,0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,832,0,2,1,1,149.6,103.4,290.0
22893,Ashley Young,DEF,Aston Villa,4.7,33.0,84,0,0,0,1,8.0,16.1,4.0,43,5,65.0,81.666667,83.8,0.0,0.666667,0.4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.666667,0.6,4.2,8.800000,11.12,0.5,1.500000,5.92,0.0,3.666667,2.4,1783,9,1,0,9,353.2,192.2,67.0
22894,Jeremy Sarmiento Morante,MID,Brighton,0.0,33.0,0,0,0,0,0,0.0,0.0,0.0,45,0,0.0,0.000000,0.4,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.08,0.0,0.000000,0.06,0.0,0.000000,0.0,197,0,0,2,1,46.6,42.7,59.0


In [9]:
# Removing players who have not played: discard rows with minutes < 1 and
# renumber the index from 0.
played = ~(simpler_df['minutes'] < 1)
simpler_df = simpler_df.loc[played].reset_index(drop=True)

# Dropping all true results: the raw same-row stats that the lag/tally features
# were derived from. They are dropped by name (to_lag) rather than by position,
# so re-running this cell raises a KeyError instead of silently removing a
# different set of columns.
simpler_df = simpler_df.drop(columns=list(to_lag))

simpler_df

Unnamed: 0,name,position,team,xP,GW,value,total_points,last_1_minutes,last_3_minutes,last_5_minutes,last_1_bonus,last_3_bonus,last_5_bonus,last_1_goals_scored,last_3_goals_scored,last_5_goals_scored,last_1_assists,last_3_assists,last_5_assists,last_1_clean_sheets,last_3_clean_sheets,last_5_clean_sheets,last_1_influence,last_3_influence,last_5_influence,last_1_creativity,last_3_creativity,last_5_creativity,last_1_threat,last_3_threat,last_5_threat,tally_minutes,tally_bonus,tally_goals_scored,tally_assists,tally_clean_sheets,tally_influence,tally_creativity,tally_threat
0,Nathan Redmond,MID,Southampton,1.5,1.0,55,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
1,Junior Stanislas,MID,Bournemouth,1.1,1.0,50,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
2,Armando Broja,FWD,Chelsea,2.0,1.0,55,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
3,Fabian Schär,DEF,Newcastle,2.4,1.0,45,15,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
4,Jonny Evans,DEF,Leicester,1.9,1.0,45,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,0.0,0.000000,0.00,0.0,0.000000,0.0,0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9951,Çaglar Söyüncü,DEF,Leicester,0.7,33.0,42,2,90.0,60.000000,36.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,30.6,17.333333,10.40,0.4,3.700000,2.22,25.0,10.000000,6.0,258,0,0,0,0,67.0,11.4,30.0
9952,Nick Pope,GK,Newcastle,4.8,33.0,54,3,90.0,90.000000,90.0,0.0,1.000000,0.6,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.2,17.2,33.266667,21.72,0.0,0.000000,2.00,0.0,0.000000,0.0,2811,12,0,0,14,577.4,10.0,0.0
9953,Oliver Skipp,MID,Spurs,1.8,33.0,43,2,90.0,84.333333,86.6,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,11.8,10.866667,11.32,13.2,9.366667,11.04,2.0,3.000000,1.8,988,1,1,0,3,151.4,132.3,46.0
9954,Ashley Young,DEF,Aston Villa,4.7,33.0,43,5,65.0,81.666667,83.8,0.0,0.666667,0.4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.666667,0.6,4.2,8.800000,11.12,0.5,1.500000,5.92,0.0,3.666667,2.4,1783,9,1,0,9,353.2,192.2,67.0


# Attempt to Fit Models with created data

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# One-hot encode the player position.
ohe = OneHotEncoder(sparse_output=False)

# Fit on a single-column frame (the encoder expects 2-D input) and transform.
pos = simpler_df[['position']]
encoded = ohe.fit_transform(pos)

# Take the column labels from the fitted encoder instead of hard-coding them,
# so each label always matches the column it names, and reuse simpler_df's index
# so the concat below lines rows up explicitly.
encoded_df = pd.DataFrame(
    encoded,
    columns=ohe.categories_[0],
    index=simpler_df.index,
    dtype=int,
)

# Attach the dummy columns to the original features.
model_df = pd.concat([simpler_df, encoded_df], axis=1)

# Drop the raw position column and the 'GK' dummy; the remaining three dummies
# still identify the position (all zeros means GK).
model_df = model_df.drop(columns=['position', 'GK'])

# Show
model_df.head()

Unnamed: 0,name,team,xP,GW,value,total_points,last_1_minutes,last_3_minutes,last_5_minutes,last_1_bonus,last_3_bonus,last_5_bonus,last_1_goals_scored,last_3_goals_scored,last_5_goals_scored,last_1_assists,last_3_assists,last_5_assists,last_1_clean_sheets,last_3_clean_sheets,last_5_clean_sheets,last_1_influence,last_3_influence,last_5_influence,last_1_creativity,last_3_creativity,last_5_creativity,last_1_threat,last_3_threat,last_5_threat,tally_minutes,tally_bonus,tally_goals_scored,tally_assists,tally_clean_sheets,tally_influence,tally_creativity,tally_threat,DEF,FWD,MID
0,Nathan Redmond,Southampton,1.5,1.0,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,1
1,Junior Stanislas,Bournemouth,1.1,1.0,50,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,1
2,Armando Broja,Chelsea,2.0,1.0,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,1,0
3,Fabian Schär,Newcastle,2.4,1.0,45,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,1,0,0
4,Jonny Evans,Leicester,1.9,1.0,45,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,1,0,0


In [11]:
# Separate the target (total_points) from the remaining columns.
y = model_df['total_points']
X = model_df.drop(columns='total_points')

In [12]:
# Chronological hold-out: rows with GW < 27 train the model, GW >= 27 test it.
# The same boolean masks select both features and target, so X and y rows stay
# paired even if the frame is not already sorted by GW (slicing y by row count
# only works when it is).
TEST_START_GW = 27
is_train = X['GW'] < TEST_START_GW
is_test = X['GW'] >= TEST_START_GW

X_train, X_test = X.loc[is_train], X.loc[is_test]
y_train, y_test = y.loc[is_train], y.loc[is_test]

# Sanity check to make sure shapes of train and test sets are consistent
print(f"X_train Shape: {X_train.shape}\ny_train Shape:{y_train.shape}\n\nX_test Shape:{X_test.shape}\ny_test Shape:{y_test.shape}")

X_train Shape: (7766, 40)
y_train Shape:(7766,)

X_test Shape:(2190, 40)
y_test Shape:(2190,)


In [13]:
# Keep the identifying / non-feature columns as plain lists before they are
# removed from the feature matrices, so predictions can be labelled later.

train_names = X_train['name'].tolist()
train_teams = X_train['team'].tolist()
train_xP = X_train['xP'].tolist()
train_GW = X_train['GW'].tolist()
train_value = X_train['value'].tolist()


test_names = X_test['name'].tolist()
test_teams = X_test['team'].tolist()
test_xP = X_test['xP'].tolist()
test_GW = X_test['GW'].tolist()
test_value = X_test['value'].tolist()

In [14]:
# Strip the identifier / non-feature columns from both feature matrices.
meta_cols = ['name', 'team', 'xP', 'GW', 'value']
X_train = X_train.drop(columns=meta_cols)
X_test = X_test.drop(columns=meta_cols)

In [15]:
# importing necessary libraries for modelling

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from tempfile import mkdtemp
from sklearn import metrics

In [16]:
# Baseline: plain linear regression fitted on the unscaled training features.
LR_model = LinearRegression()
LR_model.fit(X=X_train, y=y_train)

In [17]:
# R^2 of the baseline model on the training and held-out sets.
lr_train_r2 = LR_model.score(X_train, y_train)
lr_test_r2 = LR_model.score(X_test, y_test)

print(f"R^2 of the Linear Regression for train data: {lr_train_r2}")
print(f"R^2 of the Linear Regression for test data: {lr_test_r2}")

R^2 of the Linear Regression for train data: 0.0838690920961579
R^2 of the Linear Regression for test data: 0.07772385695169115


In [18]:
# Error metrics for the baseline linear regression on the test set.
y_pred = LR_model.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.044403341105471
Mean Squared Error: 7.392815263258342
Root Mean Squared Error: 2.718973200172878


In [19]:
# Time-ordered cross-validation over the training set.
# TimeSeriesSplit cuts on row position, so the folds follow gameweek order only
# because X_train is in chronological row order; fold edges are not guaranteed
# to coincide with gameweek boundaries. n_splits=26 was chosen to mirror the
# number of training gameweeks.
tsc = TimeSeriesSplit(n_splits=26)

# Cache fitted pipeline transformers in a temporary directory
cachedir = mkdtemp()

# placeholders for pipeline
estimators = [('normalise', StandardScaler()),
              ('model', LinearRegression())]

# Initializing pipeline
pipe = Pipeline(estimators, memory=cachedir)

# parameter grid: compare the scaling options for a plain linear regression
param_grid = [
    {'model': [LinearRegression()],
     'normalise': [StandardScaler(), MinMaxScaler(), None],
     }
]

# Hand the splitter itself to GridSearchCV (instead of a one-shot generator
# built from a NumPy copy of X_train); this also avoids overwriting the feature
# frame `X` defined earlier in the notebook.
grid = GridSearchCV(pipe, param_grid, cv=tsc)

fittedgrid = grid.fit(X_train, y_train)

In [20]:
# Display the pipeline (scaler choice + model) that the grid search selected.
fittedgrid.best_estimator_

In [21]:
# Learn the standardisation parameters from the training data only, then apply
# the same transform to both splits.
ss = StandardScaler().fit(X_train)

X_train_ss, X_test_ss = ss.transform(X_train), ss.transform(X_test)

In [23]:
# Linear regression on the standardised features.
LR_ss = LinearRegression()
LR_ss.fit(X=X_train_ss, y=y_train)

ss_train_r2 = LR_ss.score(X_train_ss, y_train)
ss_test_r2 = LR_ss.score(X_test_ss, y_test)

print(f"R^2 of the Linear Regression for train data: {ss_train_r2}")
print(f"R^2 of the Linear Regression for test data: {ss_test_r2}")

R^2 of the Linear Regression for train data: 0.08386909209615778
R^2 of the Linear Regression for test data: 0.07772385695169226


In [24]:
# Error metrics for the linear regression fitted on standardised features.
y_pred = LR_ss.predict(X_test_ss)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.044403341105472
Mean Squared Error: 7.392815263258333
Root Mean Squared Error: 2.7189732001728766


In [25]:
# Time-ordered cross-validation over the training set.
# TimeSeriesSplit cuts on row position, so the folds follow gameweek order only
# because X_train is in chronological row order; fold edges are not guaranteed
# to coincide with gameweek boundaries. n_splits=26 was chosen to mirror the
# number of training gameweeks.
tsc = TimeSeriesSplit(n_splits=26)

# Cache fitted pipeline transformers in a temporary directory
cachedir = mkdtemp()

# The repeated `cd_fast.enet_coordinate_descent` lines previously printed by
# this cell look like Lasso convergence warnings, so the iteration cap is
# raised above scikit-learn's default of 1000.
# NOTE(review): confirm the warnings disappear; tiny alphas on unscaled data
# may still need more iterations.
LASSO_MAX_ITER = 10_000

# placeholders for pipeline
estimators = [('normalise', StandardScaler()),
              ('model', Lasso(max_iter=LASSO_MAX_ITER))]

# Initializing pipeline
pipe = Pipeline(estimators, memory=cachedir)

# parameter grid: scaling option x regularisation strength
param_grid = [
    {'model': [Lasso(max_iter=LASSO_MAX_ITER)],
     'normalise': [StandardScaler(), MinMaxScaler(), None],
     'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10],
     }
]

# Hand the splitter itself to GridSearchCV (instead of a one-shot generator
# built from a NumPy copy of X_train); this also avoids overwriting the feature
# frame `X` defined earlier in the notebook.
grid = GridSearchCV(pipe, param_grid, cv=tsc)

fittedgrid = grid.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [26]:
# Display the pipeline (scaler choice + Lasso alpha) that the grid search selected.
fittedgrid.best_estimator_

In [27]:
# Min-max scale the features: fit the scaler on the training split and apply it
# to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_mms, X_test_mms = scaler.transform(X_train), scaler.transform(X_test)

# Refit a Lasso with the alpha chosen from the grid search results.
LM = Lasso(alpha=0.01).fit(X_train_mms, y_train)

lasso_train_r2 = LM.score(X_train_mms, y_train)
lasso_test_r2 = LM.score(X_test_mms, y_test)

print(f"R^2 of the Lasso Model for train data: {lasso_train_r2}")
print(f"R^2 of the Lasso Model for test data: {lasso_test_r2}")

R^2 of the Lasso Model for train data: 0.07424418214469086
R^2 of the Lasso Model for test data: 0.07440298294636272


In [28]:
# Error metrics for the tuned Lasso model on the test set.
y_pred = LM.predict(X_test_mms)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.0763132300970835
Mean Squared Error: 7.419434848096354
Root Mean Squared Error: 2.723863955504451


In [29]:
# Time-ordered cross-validation over the training set.
# TimeSeriesSplit cuts on row position, so the folds follow gameweek order only
# because X_train is in chronological row order; fold edges are not guaranteed
# to coincide with gameweek boundaries. n_splits=26 was chosen to mirror the
# number of training gameweeks.
tsc = TimeSeriesSplit(n_splits=26)

# Cache fitted pipeline transformers in a temporary directory
cachedir = mkdtemp()

# Fixed seed so the forest, and therefore the selected hyperparameters, are
# reproducible from one run to the next.
RF_SEED = 42

# placeholders for pipeline
estimators = [('normalise', StandardScaler()),
              ('model', RandomForestRegressor(random_state=RF_SEED))]

# Initializing pipeline
pipe = Pipeline(estimators, memory=cachedir)

# parameter grid: scaling option x forest size x tree depth
param_grid = [
    {'model': [RandomForestRegressor(random_state=RF_SEED)],
     'normalise': [StandardScaler(), MinMaxScaler(), None],
     'model__n_estimators': list(range(25, 100, 25)),
     'model__max_depth': list(range(2, 10, 2)),
     }
]

# Hand the splitter itself to GridSearchCV (instead of a one-shot generator
# built from a NumPy copy of X_train); this also avoids overwriting the feature
# frame `X` defined earlier in the notebook.
grid = GridSearchCV(pipe, param_grid, cv=tsc)

fittedgrid = grid.fit(X_train, y_train)

In [30]:
# Display the pipeline (scaler choice + forest hyperparameters) that the grid search selected.
fittedgrid.best_estimator_

In [31]:
# Fit a random forest with the hyperparameters taken from the best estimator.
# Scaling parameters are learned from the training split only.
ss = StandardScaler()
ss.fit(X_train)

X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

# random_state pins the bootstrap samples and feature choices, so the scores
# reported below can be reproduced on a fresh run.
RFR = RandomForestRegressor(max_depth=2, n_estimators=75, random_state=42)

RFR.fit(X_train_ss, y_train)

In [32]:
# R^2 of the random forest on the training and held-out sets.
rfr_train_r2 = RFR.score(X_train_ss, y_train)
rfr_test_r2 = RFR.score(X_test_ss, y_test)
print(f"Train:{rfr_train_r2}\nTest:{rfr_test_r2}")

Train:0.0729041507052377
Test:0.05813171046994492


In [33]:
# Error metrics for the tuned random forest on the test set.
y_pred = RFR.predict(X_test_ss)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.0446945485043844
Mean Squared Error: 7.5498627166073105
Root Mean Squared Error: 2.7477013514221866


Since the base linear regression model had the best $R^2$ and RMSE scores on the test set of the models tried, we will use it for the predictions table.

In [34]:
# Table of test-set predictions next to the actual points scored.
mypred = LR_model.predict(X_test)

# Build the frame column-by-column from a dict. Transposing a list of lists
# would force every column to object dtype; this keeps the numeric columns
# numeric so they can be sorted and aggregated directly.
pred_df = pd.DataFrame({
    'name': test_names,
    # Positions are read back from simpler_df for the test gameweeks; this
    # relies on simpler_df having the same row order as the test split.
    'position': simpler_df.loc[simpler_df['GW'] >= 27, 'position'].values,
    'team': test_teams,
    'GW': test_GW,
    'value': test_value,
    'xP': test_xP,
    'mypred': mypred,
    'actual_points': np.array(y_test),
})
pred_df

Unnamed: 0,name,position,team,GW,value,xP,mypred,actual_points
0,Fabian Schär,DEF,Newcastle,27.0,51,1.1,2.853227,2
1,Jonny Evans,DEF,Leicester,27.0,44,0.2,1.927015,1
2,Enzo Fernández,MID,Chelsea,27.0,50,3.0,2.873014,5
3,Brennan Johnson,FWD,Nott'm Forest,27.0,57,4.8,4.061066,2
4,Cheick Doucouré,MID,Crystal Palace,27.0,50,0.0,1.638224,0
...,...,...,...,...,...,...,...,...
2185,Çaglar Söyüncü,DEF,Leicester,33.0,42,0.7,2.746634,2
2186,Nick Pope,GK,Newcastle,33.0,54,4.8,3.991316,3
2187,Oliver Skipp,MID,Spurs,33.0,43,1.8,2.552383,2
2188,Ashley Young,DEF,Aston Villa,33.0,43,4.7,2.807029,5
