In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

## Goalkeeper Preprocessing

In [2]:
combined_gameweeks_gk_df = pd.DataFrame(columns=['name', 'clean_sheets', 'xP', 'expected_goals_conceded', 'goals_conceded', 'ict_index', 'influence', 'penalties_saved', 'starts', 'was_home', 'team_strength_difference', 'total_points'])
for gameweek in range(1, 38):
    if gameweek == 6 or gameweek == 7:
        continue
    current_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek}.csv', encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek + 1}.csv', encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'GK']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'clean_sheets', 'xP', 'expected_goals_conceded', 'goals_conceded', 'ict_index', 'influence', 'penalties_saved', 'starts']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'clean_sheets': 'mean',
        'xP': 'mean',
        'expected_goals_conceded': 'mean',
        'goals_conceded': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'penalties_saved': 'mean',
        'starts': 'mean',
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Southampton': 4, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Leicester': 4, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 12,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 12, 'Liverpool': 20,
                            'Leeds': 4, 'Fulham': 8, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP',
                                    'expected_goals_conceded',
                                    'goals_conceded',
                                    'ict_index',
                                    'influence',
                                    'penalties_saved',
                                    'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP',
                                                                                                                                'expected_goals_conceded',
                                                                                                                                'goals_conceded',
                                                                                                                                'ict_index',
                                                                                                                                'influence',
                                                                                                                                'penalties_saved',
                                                                                                                                'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name', 'clean_sheets', 'xP', 'expected_goals_conceded', 'goals_conceded', 'ict_index', 'influence', 'penalties_saved', 'starts', 'was_home', 'team_strength_difference', 'total_points']]
    combined_gameweeks_gk_df = pd.concat([current_and_next_gameweek_df, combined_gameweeks_gk_df], ignore_index=True)
combined_gameweeks_gk_df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['was_home'].replace({True: 1, False: 

Unnamed: 0,name,clean_sheets,xP,expected_goals_conceded,goals_conceded,ict_index,influence,penalties_saved,starts,was_home,team_strength_difference,total_points
0,Aaron Ramsdale,0.0,1.71014,0.255737,0.973278,0.467305,0.486004,0.0,1.0,1,2.72881,6
1,Adrián San Miguel del Castillo,0.0,-0.248112,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,0,2.290251,0
2,Alex McCarthy,0.0,-0.154862,2.970818,3.768333,2.531197,2.573024,0.0,1.0,1,-0.77966,1
3,Alex Smithies,0.0,-0.714362,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,1,0.243644,0
4,Alisson Ramses Becker,0.0,2.362891,1.401801,0.973278,0.943588,0.963948,0.0,1.0,0,2.290251,0
5,Alphonse Areola,0.0,-0.481237,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,0,0.097458,0
6,Andy Lonergan,0.0,-0.481237,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,1,-1.510591,0
7,Asmir Begović,0.0,-0.481237,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,1,-1.510591,0
8,Bernd Leno,0.0,1.756765,1.715604,2.370805,1.34049,1.330371,0.0,1.0,0,0.097458,3
9,Brandon Austin,0.0,-0.481237,-0.494663,-0.424249,-0.48526,-0.485814,0.0,0.0,0,0.828389,0


**Splitting Goalkeeper Data into Train and Test Set to Evaluate Classifier Performance**

In [3]:
# Assuming combined_game_week_df is your pandas DataFrame containing your dataset

# Select relevant features and the target variable
X = combined_gameweeks_gk_df.drop(columns=['total_points'])  # Features, including binary columns
y = combined_gameweeks_gk_df['total_points']  # Target variable

# Split the GK dataset into training and validation sets to analyze performance of different classifiers
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

## Evaluation of Various Classifiers

In [4]:
# Create and train the decision tree regressor
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train.drop(columns=['name']), y_train)

# Predict on the test set
y_name_pred = X_test['name'].to_frame()
y_pred = dt_regressor.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_decision_tree.csv', index=False)

# print("Results saved to predicted_vs_actual_decision_tree.csv")

Mean Squared Error: 5.663344835518749


In [5]:
linear_reg = LinearRegression()
linear_reg.fit(X_train.drop(columns=['name']), y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = linear_reg.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_linear_regression.csv', index=False)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 3.107477099955285


In [6]:
# Assuming combined_game_week_df is your pandas DataFrame containing your dataset

# Define the degree of polynomial features
degree = 3

# Create polynomial features
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train.drop(columns=['name']))
X_poly_test = poly.transform(X_test.drop(columns=['name']))

# Create and train the polynomial regression model
poly_reg = LinearRegression()
poly_reg.fit(X_poly_train, y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = poly_reg.predict(X_poly_test)
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_polynomial_regression.csv', index=False)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 71.02530644535616


In [7]:
# Create and train the Random Forest regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train.drop(columns=['name']), y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = rf_regressor.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_random_forest_gk.csv', index=False)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(No Hyperparameter Tuning):", mse)

Mean Squared Error(No Hyperparameter Tuning): 2.948732002782305


**Best Result**: ~2.95 with Random Forest Regression(and no hyperparameter tuning)

# Tuning Hyperparameters for Goalkeeper Model

In [8]:
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# Create the Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train.drop(columns=['name']), y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best score (negative mean squared error):", best_score)

# Predict on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.drop(columns=['name']))

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(Tuned Hyperparameters):", mse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best score (negative mean squared error): -3.067311077013211
Mean Squared Error(Tuned Hyperparameters): 2.7431141135847654


**Best Result**: ~2.74 (after tuning hyperparameters)

# Defender Preprocessing

In [9]:
combined_gameweeks_def_df = pd.DataFrame(columns=['name'
                                              ,'xP'
                                              ,'assists'
                                              ,'clean_sheets'
                                              ,'creativity'
                                              ,'expected_assists'
                                              ,'expected_goal_involvements'
                                              ,'expected_goals'
                                              ,'expected_goals_conceded'
                                              ,'goals_conceded'
                                              ,'goals_scored'
                                              ,'ict_index'
                                              ,'influence'
                                              ,'own_goals'
                                              ,'starts'
                                              ,'threat'
                                              ,'was_home'
                                              ,'team_strength_difference'
                                              ,'total_points'])
for gameweek in range(1, 38):
    if gameweek == 6 or gameweek == 7:
        continue
    current_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek}.csv', encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek + 1}.csv', encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'DEF']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'clean_sheets', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'expected_goals_conceded', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'own_goals', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'clean_sheets': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'expected_goals_conceded': 'mean',
        'goals_conceded': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'own_goals': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Southampton': 4, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Leicester': 4, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 12,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 12, 'Liverpool': 20,
                            'Leeds': 4, 'Fulham': 8, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'clean_sheets'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'expected_goals_conceded'
                                  ,'goals_conceded'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'own_goals'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                              ,'assists'
                                                                                                              ,'clean_sheets'
                                                                                                              ,'creativity'
                                                                                                              ,'expected_assists'
                                                                                                              ,'expected_goal_involvements'
                                                                                                              ,'expected_goals'
                                                                                                              ,'expected_goals_conceded'
                                                                                                              ,'goals_conceded'
                                                                                                              ,'goals_scored'
                                                                                                              ,'ict_index'
                                                                                                              ,'influence'
                                                                                                              ,'own_goals'
                                                                                                              ,'threat'
                                                                                                              ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'clean_sheets'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'expected_goals_conceded'
                                                                ,'goals_conceded'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'own_goals'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    combined_gameweeks_def_df = pd.concat([current_and_next_gameweek_df, combined_gameweeks_def_df], ignore_index=True)
# combined_gameweeks_def_df.to_csv('combined_gameweeks.csv', index=False)
combined_gameweeks_def_df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['was_home'].replace({True: 1, False: 

Unnamed: 0,name,xP,assists,clean_sheets,creativity,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,goals_conceded,goals_scored,ict_index,influence,own_goals,starts,threat,was_home,team_strength_difference,total_points
0,Aaron Cresswell,0.295005,-0.107833,-0.290021,-0.364949,-0.295517,-0.295263,-0.184982,-0.604482,-0.491138,-0.087875,-0.567329,-0.611656,0.0,0.0,-0.288144,0,0.0365,1
1,Aaron Hickey,1.302534,9.273618,-0.290021,1.506751,0.842932,0.281993,-0.184982,1.00872,0.830377,-0.087875,1.256861,1.398053,0.0,1.0,-0.288144,1,-0.103595,6
2,Aaron Wan-Bissaka,2.681257,-0.107833,1.812631,0.823649,0.178837,0.763039,0.855276,1.458301,0.16962,-0.087875,1.570394,1.300809,0.0,1.0,1.700466,1,0.456785,0
3,Adam Smith,-0.182245,-0.107833,-0.290021,5.646352,2.360863,1.051667,-0.184982,1.048389,0.830377,-0.087875,3.024045,0.965858,0.0,1.0,0.434987,0,-0.80407,1
4,Adam Webster,-0.182245,-0.107833,-0.290021,-0.364949,-0.295517,-0.295263,-0.184982,-0.604482,-0.491138,-0.087875,-0.567329,-0.611656,0.0,0.0,-0.288144,0,-1.08426,1
5,Alex Telles,-0.55344,-0.107833,-0.290021,-0.364949,-0.295517,-0.295263,-0.184982,-0.604482,-0.491138,-0.087875,-0.567329,-0.611656,0.0,0.0,-0.288144,1,0.456785,0
6,Alexandre Moreno Lopera,0.135922,-0.107833,-0.290021,0.195195,-0.105775,-0.199054,-0.184982,-0.564813,-0.491138,-0.087875,-0.054276,-0.049802,0.0,0.0,-0.288144,1,-0.663975,0
7,Alfie Gilchrist,0.401061,-0.107833,-0.290021,-0.364949,-0.295517,-0.295263,-0.184982,-0.604482,-0.491138,-0.087875,-0.567329,-0.611656,0.0,0.0,-0.288144,1,0.736975,0
8,Andrew Robertson,1.408589,-0.107833,-0.290021,1.83464,1.032673,0.378202,-0.184982,1.193842,0.830377,-0.087875,0.971831,0.317564,0.0,1.0,0.073421,0,2.137924,0
9,Angelo Ogbonna,0.613172,-0.107833,-0.290021,-0.337625,-0.295517,-0.295263,-0.184982,0.585585,0.830377,-0.087875,0.57279,1.527712,0.0,1.0,-0.288144,0,0.0365,0


In [10]:
# Select relevant features and the target variable
X = combined_gameweeks_def_df.drop(columns=['total_points'])  # Features, including binary columns
y = combined_gameweeks_def_df['total_points']  # Target variable

# Split the dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train.drop(columns=['name']), y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = rf_regressor.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_random_forest_def.csv', index=False)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(No Hyperparameter Tuning):", mse)


Mean Squared Error(No Hyperparameter Tuning): 4.114719656131092


**Best Result**: ~4.11(No hyperparameter tuning)

# Tuning Hyperparameters for Defender Model

In [11]:
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# Create the Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train.drop(columns=['name']), y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best score (negative mean squared error):", best_score)

# Predict on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.drop(columns=['name']))

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(Tuned Hyperparameters):", mse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Best score (negative mean squared error): -3.536205055655664
Mean Squared Error(Tuned Hyperparameters): 4.050450106429972


**Best Result**: ~4.05(after tuning hyperparameters)

# Midfielder Preprocessing

In [12]:
combined_gameweeks_mid_df = pd.DataFrame(columns=['name'
                                              ,'xP'
                                              ,'assists'
                                              ,'creativity'
                                              ,'expected_assists'
                                              ,'expected_goal_involvements'
                                              ,'expected_goals'
                                              ,'goals_scored'
                                              ,'ict_index'
                                              ,'influence'
                                              ,'starts'
                                              ,'threat'
                                              ,'was_home'
                                              ,'team_strength_difference'
                                              ,'total_points'])
for gameweek in range(1, 38):
    if gameweek == 6 or gameweek == 7:
        continue
    current_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek}.csv', encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek + 1}.csv', encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'MID']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'goals_scored', 'ict_index', 'influence', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Southampton': 4, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Leicester': 4, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 12,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 12, 'Liverpool': 20,
                            'Leeds': 4, 'Fulham': 8, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                                                ,'assists'
                                                                                                                                ,'creativity'
                                                                                                                                ,'expected_assists'
                                                                                                                                ,'expected_goal_involvements'
                                                                                                                                ,'expected_goals'
                                                                                                                                ,'goals_scored'
                                                                                                                                ,'ict_index'
                                                                                                                                ,'influence'
                                                                                                                                ,'threat'
                                                                                                                                ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    combined_gameweeks_mid_df = pd.concat([current_and_next_gameweek_df, combined_gameweeks_mid_df], ignore_index=True)
# combined_gameweeks_mid_df.to_csv('combined_gameweeks.csv', index=False)
combined_gameweeks_mid_df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['was_home'].replace({True: 1, False: 

Unnamed: 0,name,xP,assists,creativity,expected_assists,expected_goal_involvements,expected_goals,goals_scored,ict_index,influence,starts,threat,was_home,team_strength_difference,total_points
0,Abdoulaye Doucouré,1.458375,-0.236883,0.790807,3.885617,2.125349,0.301935,-0.19328,0.331941,-0.04416,1.0,0.108703,1,-1.477793,11
1,Adam Forshaw,0.025332,-0.236883,0.878576,0.2583,-0.134516,-0.374148,-0.19328,0.225007,0.097732,1.0,-0.459768,1,0.027284,2
2,Adam Lallana,-0.626052,-0.236883,-0.508171,-0.381814,-0.449846,-0.374148,-0.19328,-0.559176,-0.510377,0.0,-0.459768,0,-1.067318,0
3,Adama Traoré Diarra,0.155608,4.586925,1.449073,-0.168443,0.180814,0.377055,-0.19328,1.543861,0.68557,1.0,1.908861,0,-1.204143,2
4,Ainsley Maitland-Niles,-0.322073,-0.236883,-0.508171,-0.381814,-0.449846,-0.374148,-0.19328,-0.559176,-0.510377,0.0,-0.459768,1,-0.793667,0
5,Albert Sambi Lokonga,-0.626052,-0.236883,-0.508171,-0.381814,-0.449846,-0.374148,-0.19328,-0.559176,-0.510377,0.0,-0.459768,1,0.300934,0
6,Alejandro Garnacho,0.633289,-0.236883,0.119376,-0.221786,0.890306,1.428739,-0.19328,0.688388,-0.246863,0.0,1.908861,1,0.437759,2
7,Alex Iwobi,1.458375,-0.236883,1.124328,1.751901,1.337024,0.677536,-0.19328,1.365637,1.37476,1.0,1.1509,1,-1.477793,3
8,Alex Mighten,-0.626052,-0.236883,-0.508171,-0.381814,-0.449846,-0.374148,-0.19328,-0.559176,-0.510377,0.0,-0.459768,0,-0.930493,0
9,Alex Oxlade-Chamberlain,-0.408924,-0.236883,-0.508171,-0.381814,-0.449846,-0.374148,-0.19328,-0.559176,-0.510377,0.0,-0.459768,0,2.079661,0


In [13]:
# Select relevant features and the target variable
X = combined_gameweeks_mid_df.drop(columns=['total_points'])  # Features, including binary columns
y = combined_gameweeks_mid_df['total_points']  # Target variable

# Split the dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Create and train the Random Forest regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train.drop(columns=['name']), y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = rf_regressor.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_random_forest_mid.csv', index=False)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 3.545911796002


**Best Result**: ~3.55(No hyperparameter tuning)

In [14]:
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# Create the Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train.drop(columns=['name']), y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best score (negative mean squared error):", best_score)

# Predict on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.drop(columns=['name']))

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(Tuned Hyperparameters):", mse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score (negative mean squared error): -4.0253289028161
Mean Squared Error(Tuned Hyperparameters): 3.2495180760311597


**Best Result**: ~3.25(after tuning hyperparameters)

# Forward Preprocessing

In [15]:
combined_gameweeks_fwd_df = pd.DataFrame(columns=['name'
                                              ,'xP'
                                              ,'assists'
                                              ,'creativity'
                                              ,'expected_assists'
                                              ,'expected_goal_involvements'
                                              ,'expected_goals'
                                              ,'goals_scored'
                                              ,'ict_index'
                                              ,'influence'
                                              ,'starts'
                                              ,'threat'
                                              ,'was_home'
                                              ,'team_strength_difference'
                                              ,'total_points'])
for gameweek in range(1, 38):
    if gameweek == 6 or gameweek == 7:
        continue
    current_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek}.csv', encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=f'../2022_2023_gameweek_data/gw{gameweek + 1}.csv', encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'FWD']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'goals_scored', 'ict_index', 'influence', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Southampton': 4, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Leicester': 4, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 12,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 12, 'Liverpool': 20,
                            'Leeds': 4, 'Fulham': 8, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                              ,'assists'
                                                                                                              ,'creativity'
                                                                                                              ,'expected_assists'
                                                                                                              ,'expected_goal_involvements'
                                                                                                              ,'expected_goals'
                                                                                                              ,'goals_scored'
                                                                                                              ,'ict_index'
                                                                                                              ,'influence'
                                                                                                              ,'threat'
                                                                                                              ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    combined_gameweeks_fwd_df = pd.concat([current_and_next_gameweek_df, combined_gameweeks_fwd_df], ignore_index=True)
# combined_gameweeks_fwd_df.to_csv('combined_gameweeks.csv', index=False)
combined_gameweeks_fwd_df.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['was_home'].replace({True: 1, False: 

Unnamed: 0,name,xP,assists,creativity,expected_assists,expected_goal_involvements,expected_goals,goals_scored,ict_index,influence,starts,threat,was_home,team_strength_difference,total_points
0,Adam Armstrong,-0.459859,-0.175466,-0.508227,-0.31524,-0.490926,-0.451723,-0.320087,-0.543653,-0.396277,0.0,-0.520271,1,-0.722704,5
1,Aleksandar Mitrović,0.767364,-0.175466,0.126428,0.431698,5.408117,6.143748,5.09229,3.007205,4.269854,1.0,2.04561,0,0.181486,-1
2,Alexander Isak,0.87893,-0.175466,1.095113,0.431698,1.188662,1.221041,-0.320087,0.831904,-0.396277,1.0,1.556871,0,0.181486,2
3,Anthony Martial,2.068965,-0.175466,1.203673,-0.004016,1.598318,1.86625,1.033007,1.295755,1.216706,1.0,0.976493,1,0.633581,1
4,Antoine Semenyo,-0.571425,-0.175466,-0.508227,-0.31524,-0.490926,-0.451723,-0.320087,-0.543653,-0.396277,0.0,-0.520271,0,-0.722704,0
5,Armando Broja,-0.571425,-0.175466,-0.508227,-0.31524,-0.490926,-0.451723,-0.320087,-0.543653,-0.396277,0.0,-0.520271,1,0.934977,0
6,Brennan Johnson,0.35829,-0.175466,-0.508227,-0.31524,-0.490926,-0.451723,-0.320087,-0.543653,-0.396277,0.0,-0.520271,0,-0.873402,2
7,Bryan Mbeumo,2.031776,6.351855,2.030395,6.4072,3.5237,1.65118,5.09229,4.958577,6.055657,1.0,3.572921,1,0.030788,5
8,Callum Wilson,2.589605,-0.175466,-0.458123,-0.31524,1.188662,1.507801,-0.320087,0.288079,-0.396277,1.0,1.373594,0,0.181486,1
9,Cameron Archer,-0.571425,-0.175466,-0.508227,-0.31524,-0.490926,-0.451723,-0.320087,-0.543653,-0.396277,0.0,-0.520271,1,-0.572005,0


In [16]:
# Select relevant features and the target variable
X = combined_gameweeks_fwd_df.drop(columns=['total_points'])  # Features, including binary columns
y = combined_gameweeks_fwd_df['total_points']  # Target variable

# Split the dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41 )

# Create and train the Random Forest regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train.drop(columns=['name']), y_train)

# Predict on the validation set
y_name_pred = X_test['name'].to_frame()
y_pred = rf_regressor.predict(X_test.drop(columns=['name']))
y_name_pred['predicted_score'] = pd.Series(y_pred)

# Create a DataFrame to store predicted and actual values side by side
results_df = pd.DataFrame({'Name': y_name_pred['name'], 'Actual Total Points': y_test, 'Predicted Total Points': y_pred})
results_df.sort_values(by='Predicted Total Points', ascending=False, inplace=True)

# Write the DataFrame to a CSV file
# results_df.to_csv('predicted_vs_actual_random_forest_fwd.csv', index=False)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 7.9334362072066655


**Best Result**: 7.93(No hyperparameter tuning)

In [17]:
# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# Create the Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train.drop(columns=['name']), y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best score (negative mean squared error):", best_score)

# Predict on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test.drop(columns=['name']))

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error(Tuned Hyperparameters):", mse)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best score (negative mean squared error): -5.554923418451181
Mean Squared Error(Tuned Hyperparameters): 7.462077533188957


**Best Result**: ~7.46(after hyperparameter tuning)

# Hyperparameter Tuning Results
**GK Best Hyperparameters**: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300} \
**DEF Best Hyperparameters**: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300} \
**MID Best Hyperparameters**: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100} \
**FWD Best Hyperparameters**: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

# Final Model Creation and Training

In [18]:
# Create Final Models

# Select relevant features and the target variable
X_train_gk = combined_gameweeks_gk_df.drop(columns=['total_points'])
y_train_gk = combined_gameweeks_gk_df['total_points']

rf_gk_regressor = RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=300, random_state=42)
rf_gk_regressor.fit(X_train_gk.drop(columns=['name']), y_train_gk)

X_train_def = combined_gameweeks_def_df.drop(columns=['total_points'])
y_train_def = combined_gameweeks_def_df['total_points']

rf_def_regressor = RandomForestRegressor(max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300, random_state=42)
rf_def_regressor.fit(X_train_def.drop(columns=['name']), y_train_def)

X_train_mid = combined_gameweeks_mid_df.drop(columns=['total_points'])
y_train_mid = combined_gameweeks_mid_df['total_points']

rf_mid_regressor = RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=100, random_state=42)
rf_mid_regressor.fit(X_train_mid.drop(columns=['name']), y_train_mid)

X_train_fwd = combined_gameweeks_fwd_df.drop(columns=['total_points'])
y_train_fwd = combined_gameweeks_fwd_df['total_points']

rf_fwd_regressor = RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=42)
rf_fwd_regressor.fit(X_train_fwd.drop(columns=['name']), y_train_fwd)

In [19]:
def get_fwd_df(current_gw_filepath, next_gw_filepath):
    current_gameweek_df = pd.read_csv(filepath_or_buffer=current_gw_filepath, encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=next_gw_filepath, encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'FWD']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'goals_scored', 'ict_index', 'influence', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Luton': 8, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Burnley': 8, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 8,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 8, 'Liverpool': 20,
                            'Sheffield Utd': 8, 'Fulham': 12, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                              ,'assists'
                                                                                                              ,'creativity'
                                                                                                              ,'expected_assists'
                                                                                                              ,'expected_goal_involvements'
                                                                                                              ,'expected_goals'
                                                                                                              ,'goals_scored'
                                                                                                              ,'ict_index'
                                                                                                              ,'influence'
                                                                                                              ,'threat'
                                                                                                              ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    return current_and_next_gameweek_df

In [20]:
def get_mid_df(current_gw_filepath, next_gw_filepath):
    current_gameweek_df = pd.read_csv(filepath_or_buffer=current_gw_filepath, encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=next_gw_filepath, encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'MID']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'goals_scored', 'ict_index', 'influence', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Luton': 8, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Burnley': 8, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 8,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 8, 'Liverpool': 20,
                            'Sheffield Utd': 8, 'Fulham': 12, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                                                ,'assists'
                                                                                                                                ,'creativity'
                                                                                                                                ,'expected_assists'
                                                                                                                                ,'expected_goal_involvements'
                                                                                                                                ,'expected_goals'
                                                                                                                                ,'goals_scored'
                                                                                                                                ,'ict_index'
                                                                                                                                ,'influence'
                                                                                                                                ,'threat'
                                                                                                                                ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    return current_and_next_gameweek_df

In [21]:
def get_def_df(current_gw_filepath, next_gw_filepath):
    current_gameweek_df = pd.read_csv(filepath_or_buffer=current_gw_filepath, encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=next_gw_filepath, encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'DEF']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'xP', 'assists', 'clean_sheets', 'creativity', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 'expected_goals_conceded', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'own_goals', 'starts', 'threat']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'xP': 'mean',
        'assists': 'mean',
        'clean_sheets': 'mean',
        'creativity': 'mean',
        'expected_assists': 'mean',
        'expected_goal_involvements': 'mean',
        'expected_goals': 'mean',
        'expected_goals_conceded': 'mean',
        'goals_conceded': 'mean',
        'goals_scored': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'own_goals': 'mean',
        'starts': 'mean',
        'threat': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Luton': 8, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Burnley': 8, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 8,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 8, 'Liverpool': 20,
                            'Sheffield Utd': 8, 'Fulham': 12, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP'
                                  ,'assists'
                                  ,'clean_sheets'
                                  ,'creativity'
                                  ,'expected_assists'
                                  ,'expected_goal_involvements'
                                  ,'expected_goals'
                                  ,'expected_goals_conceded'
                                  ,'goals_conceded'
                                  ,'goals_scored'
                                  ,'ict_index'
                                  ,'influence'
                                  ,'own_goals'
                                  ,'threat'
                                  ,'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP'
                                                                                                              ,'assists'
                                                                                                              ,'clean_sheets'
                                                                                                              ,'creativity'
                                                                                                              ,'expected_assists'
                                                                                                              ,'expected_goal_involvements'
                                                                                                              ,'expected_goals'
                                                                                                              ,'expected_goals_conceded'
                                                                                                              ,'goals_conceded'
                                                                                                              ,'goals_scored'
                                                                                                              ,'ict_index'
                                                                                                              ,'influence'
                                                                                                              ,'own_goals'
                                                                                                              ,'threat'
                                                                                                              ,'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name'
                                                                ,'xP'
                                                                ,'assists'
                                                                ,'clean_sheets'
                                                                ,'creativity'
                                                                ,'expected_assists'
                                                                ,'expected_goal_involvements'
                                                                ,'expected_goals'
                                                                ,'expected_goals_conceded'
                                                                ,'goals_conceded'
                                                                ,'goals_scored'
                                                                ,'ict_index'
                                                                ,'influence'
                                                                ,'own_goals'
                                                                ,'starts'
                                                                ,'threat'
                                                                ,'was_home'
                                                                ,'team_strength_difference'
                                                                ,'total_points']]
    return current_and_next_gameweek_df

In [22]:
def get_gk_df(current_gw_filepath, next_gw_filepath):
    current_gameweek_df = pd.read_csv(filepath_or_buffer=current_gw_filepath, encoding='UTF-8')
    next_gameweek_df = pd.read_csv(filepath_or_buffer=next_gw_filepath, encoding='UTF-8')
    current_gameweek_df = current_gameweek_df[current_gameweek_df['position'] == 'GK']
    current_gameweek_filtered_df = current_gameweek_df[['name', 'team', 'clean_sheets', 'xP', 'expected_goals_conceded', 'goals_conceded', 'ict_index', 'influence', 'penalties_saved', 'starts']]
    current_gameweek_averaged_df = current_gameweek_filtered_df.groupby(['name', 'team']).agg({
        'clean_sheets': 'mean',
        'xP': 'mean',
        'expected_goals_conceded': 'mean',
        'goals_conceded': 'mean',
        'ict_index': 'mean',
        'influence': 'mean',
        'penalties_saved': 'mean',
        'starts': 'mean'
    }).reset_index()
    current_and_next_gameweek_df = current_gameweek_averaged_df.merge(next_gameweek_df[['name', 'kickoff_time', 'opponent_team', 'was_home', 'total_points']], how='inner', on=['name']).sort_values(['name', 'kickoff_time']).drop_duplicates(subset=['name'], keep='first')
    team_to_threat_rating = {'Luton': 8, 'Bournemouth': 8, 'Chelsea': 12, 'Newcastle': 16,
                            'Burnley': 8, 'Nott\'m Forest': 8, 'Crystal Palace': 8, 'Wolves': 8,
                            'Brentford': 8, 'Spurs': 16, 'West Ham': 8, 'Liverpool': 20,
                            'Sheffield Utd': 8, 'Fulham': 12, 'Brighton': 12, 'Man Utd': 16,
                            'Man City': 20, 'Arsenal': 20, 'Aston Villa': 12, 'Everton': 8}
    current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
    current_and_next_gameweek_df['was_home'].replace({True: 1, False: 0}, inplace=True)
    current_and_next_gameweek_df['team_strength_difference'] = current_and_next_gameweek_df['opponent_team'] - (20 - current_and_next_gameweek_df['team'])
    current_and_next_gameweek_df[['xP',
                                    'expected_goals_conceded',
                                    'goals_conceded',
                                    'ict_index',
                                    'influence',
                                    'penalties_saved',
                                    'team_strength_difference']] = StandardScaler().fit_transform(current_and_next_gameweek_df[['xP',
                                                                                                                                'expected_goals_conceded',
                                                                                                                                'goals_conceded',
                                                                                                                                'ict_index',
                                                                                                                                'influence',
                                                                                                                                'penalties_saved',
                                                                                                                                'team_strength_difference']])
    current_and_next_gameweek_df = current_and_next_gameweek_df[['name', 'clean_sheets', 'xP', 'expected_goals_conceded', 'goals_conceded', 'ict_index', 'influence', 'penalties_saved', 'starts', 'was_home', 'team_strength_difference', 'total_points']]
    return current_and_next_gameweek_df

In [23]:
# Load Test Data(2023-2024 Season)
gk_predicted_and_actual_df = pd.DataFrame(columns=['name', 'total_points', 'Predicted Total Points', 'gameweek'])
def_predicted_and_actual_df = pd.DataFrame(columns=['name', 'total_points', 'Predicted Total Points', 'gameweek'])
mid_predicted_and_actual_df = pd.DataFrame(columns=['name', 'total_points', 'Predicted Total Points', 'gameweek'])
fwd_predicted_and_actual_df = pd.DataFrame(columns=['name', 'total_points', 'Predicted Total Points', 'gameweek'])
with open('model_output.csv', 'w') as file:
    file.write('2023-2024 Season Weekly Team Prediction\n')
for gameweek in range(1, 31):
    current_gameweek_total_points = 0

    X_test_fwd = get_fwd_df(f'../2023_2024_gameweek_data/gw{gameweek}.csv', f'../2023_2024_gameweek_data/gw{gameweek + 1}.csv')
    y_pred_fwd_names = X_test_fwd[['name', 'total_points']]
    y_pred_fwd = rf_fwd_regressor.predict(X_test_fwd.drop(columns=['name', 'total_points']))
    y_pred_fwd_names['Predicted Total Points'] = y_pred_fwd
    # y_pred_fwd_names['Predicted Total Points'] = pd.Series(y_pred_fwd) # OLD NOT WORKING
    y_pred_fwd_names.sort_values(by='Predicted Total Points', ascending=False, inplace=True)
    with open('model_output.csv', 'a') as file:
        file.write(f'---------------------Week {gameweek + 1}--------------------------\nFWD\n')
    y_pred_fwd_names.head(3).to_csv('model_output.csv', mode='a',index=False)
    y_pred_fwd_names['gameweek'] = gameweek
    fwd_predicted_and_actual_df = pd.concat([y_pred_fwd_names, fwd_predicted_and_actual_df], ignore_index=True)

    X_test_mid = get_mid_df(f'../2023_2024_gameweek_data/gw{gameweek}.csv', f'../2023_2024_gameweek_data/gw{gameweek + 1}.csv')
    y_pred_mid_names = X_test_mid[['name', 'total_points']]
    y_pred_mid = rf_mid_regressor.predict(X_test_mid.drop(columns=['name', 'total_points']))
    y_pred_mid_names['Predicted Total Points'] = y_pred_mid
    # y_pred_mid_names['Predicted Total Points'] = pd.Series(y_pred_mid)
    y_pred_mid_names.sort_values(by='Predicted Total Points', ascending=False, inplace=True)
    with open('model_output.csv', 'a') as file:
        file.write('MID\n')
    y_pred_mid_names.head(5).to_csv('model_output.csv', mode='a', index=False)
    y_pred_mid_names['gameweek'] = gameweek
    mid_predicted_and_actual_df = pd.concat([y_pred_mid_names, mid_predicted_and_actual_df], ignore_index=True)

    X_test_def = get_def_df(f'../2023_2024_gameweek_data/gw{gameweek}.csv', f'../2023_2024_gameweek_data/gw{gameweek + 1}.csv')
    y_pred_def_names = X_test_def[['name', 'total_points']]
    y_pred_def = rf_def_regressor.predict(X_test_def.drop(columns=['name', 'total_points']))
    y_pred_def_names['Predicted Total Points'] = y_pred_def
    # y_pred_def_names['Predicted Total Points'] = pd.Series(y_pred_def)
    y_pred_def_names.sort_values(by='Predicted Total Points', ascending=False, inplace=True)
    with open('model_output.csv', 'a') as file:
        file.write('DEF\n')
    y_pred_def_names.head(5).to_csv('model_output.csv', mode='a', index=False)
    y_pred_def_names['gameweek'] = gameweek
    def_predicted_and_actual_df = pd.concat([y_pred_def_names, def_predicted_and_actual_df], ignore_index=True)

    X_test_gk = get_gk_df(f'../2023_2024_gameweek_data/gw{gameweek}.csv', f'../2023_2024_gameweek_data/gw{gameweek + 1}.csv')
    y_pred_gk_names = X_test_gk[['name', 'total_points']]
    y_pred_gk = rf_gk_regressor.predict(X_test_gk.drop(columns=['name', 'total_points']))
    y_pred_gk_names['Predicted Total Points'] = y_pred_gk
    # y_pred_gk_names['Predicted Total Points'] = pd.Series(y_pred_gk)
    y_pred_gk_names.sort_values(by='Predicted Total Points', ascending=False, inplace=True)
    with open('model_output.csv', 'a') as file:
        file.write('GK\n')
    y_pred_gk_names.head(2).to_csv('model_output.csv', mode='a', index=False)
    y_pred_gk_names['gameweek'] = gameweek
    gk_predicted_and_actual_df = pd.concat([y_pred_gk_names, gk_predicted_and_actual_df], ignore_index=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
  current_and_next_gameweek_df['team'].replace(team_to_threat_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  current_and_next_gameweek_df['was_home'].replace({True: 1, False: 

In [24]:
# PREDICTIONS TO CSV FILE
# fwd_predicted_and_actual_df.to_csv('fwd_predicted_and_actual_points.csv', mode='w', index=False)
# mid_predicted_and_actual_df.to_csv('mid_predicted_and_actual_points.csv', mode='w', index=False)
# def_predicted_and_actual_df.to_csv('def_predicted_and_actual_points.csv', mode='w', index=False)
# gk_predicted_and_actual_df.to_csv('gk_predicted_and_actual_points.csv', mode='w', index=False)

In [25]:
fwd_mse = mean_squared_error(fwd_predicted_and_actual_df['total_points'], fwd_predicted_and_actual_df['Predicted Total Points'])
print("FWD Mean Squared Error(Tuned Hyperparameters):", fwd_mse)
mid_mse = mean_squared_error(mid_predicted_and_actual_df['total_points'], mid_predicted_and_actual_df['Predicted Total Points'])
print("MID Mean Squared Error(Tuned Hyperparameters):", mid_mse)
def_mse = mean_squared_error(def_predicted_and_actual_df['total_points'], def_predicted_and_actual_df['Predicted Total Points'])
print("DEF Mean Squared Error(Tuned Hyperparameters):", def_mse)
gk_mse = mean_squared_error(gk_predicted_and_actual_df['total_points'], gk_predicted_and_actual_df['Predicted Total Points'])
print("GK Mean Squared Error(Tuned Hyperparameters):", gk_mse)

FWD Mean Squared Error(Tuned Hyperparameters): 4.738360452551122
MID Mean Squared Error(Tuned Hyperparameters): 4.1877809289295955
DEF Mean Squared Error(Tuned Hyperparameters): 4.147431893202921
GK Mean Squared Error(Tuned Hyperparameters): 2.1914065279162918
