In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Import Data

In [2]:
# Location of the raw player table (relative to the notebook)
RAW_DATA_FILE = 'timeseries_data.csv'

# Load the raw table once and keep it untouched for reference
timeseries_data = pd.read_csv(RAW_DATA_FILE)

# Work on a separate copy so the raw frame stays exactly as loaded
df = timeseries_data.copy()

Feature Engineering
- Create new features to investigate impact on model performance
- Convert categorical features if needed

In [3]:
# Derive 'GP' by dividing 'MP' by the length of one match.
# NOTE(review): assumes 'MP' is minutes played and game_duration is in
# minutes, so 'GP' is a count of full-match equivalents — confirm.
game_duration = 90

# Store the ratio both as a standalone Series and as the new 'GP' column
df['GP'] = average_games_played = df['MP'] / game_duration

In [4]:
# Inspect the distinct position labels prior to encoding
position_labels = df['Position'].unique()
print(position_labels)

['MID' 'FWD' 'DEF' 'GKP' ' DEF']


In [5]:
# Trim surrounding whitespace from every position label so that a padded
# variant such as ' DEF' collapses into 'DEF'
stripped_positions = df['Position'].str.strip()
df['Position'] = stripped_positions

In [6]:
# Confirm that only the clean position labels remain
position_labels = df['Position'].unique()
print(position_labels)

['MID' 'FWD' 'DEF' 'GKP']


In [7]:
# One-hot encode the player position: one indicator column per distinct label
positions_encoded = pd.get_dummies(df['Position'])

# Attach the indicator columns to the working frame. Indicator columns left
# over from an earlier run of this cell are dropped first, so re-running the
# cell does not append a second set of identically named columns (duplicate
# labels would make a later df[features] selection return repeated columns).
df = pd.concat(
    [df.drop(columns=positions_encoded.columns, errors='ignore'), positions_encoded],
    axis=1,
)

Split into training and test data
- Train data: All seasons before 23/24 season
- Test data: Current 23/24 season

In [9]:
# Temporal hold-out split on the 'Season' column.
# NOTE(review): assumes every Season value is a date string in the same
# YYYY-MM-DD format as the literal below, so string comparison orders the
# seasons chronologically — confirm against the data.
test_season = '2023-08-01'

# Train only on seasons strictly before the hold-out season. Using `<` instead
# of `!=` keeps any later season out of the training data.
df_train = df[df['Season'] < test_season].copy()

# Evaluate on the hold-out season only
df_test = df[df['Season'] == test_season].copy()

# Fail here, with a clear message, rather than later inside model fitting
assert not df_train.empty and not df_test.empty, "train/test split produced an empty frame"

In [None]:
# Column the model is asked to predict
target = 'Pts'

# Predictor columns: five stat columns followed by the four position
# indicators produced by the one-hot encoding step
features = ['GS', 'A', 'CS', 'BPS', 'MP', 'DEF', 'FWD', 'MID', 'GKP']

# Training inputs (X) and training response (y), both taken from df_train
X, y = df_train[features], df_train[target]

Model Fitting
- Using linear regression model
- MSE and R-squared as evaluation metrics
- Feature standardisation (optional)

In [63]:
# Feature standardisation is optional for this model and is left switched off.
# To enable it, fit a StandardScaler on X and apply that same fitted scaler to
# df_test[features] before predicting.

# Fit a linear regression on the historical (training) seasons
model = LinearRegression().fit(X, y)

# Hold-out inputs and the actual target values for the 2023 season
X_test_2023 = df_test[features]
y_actual_2023 = df_test[target]

# Predicted target value for every 2023 hold-out row
y_pred_2023 = model.predict(X_test_2023)


In [64]:
# Score the 2023 predictions against the actual values
r2 = r2_score(y_actual_2023, y_pred_2023)
mse = mean_squared_error(y_actual_2023, y_pred_2023)

# Report both metrics to four decimal places, one per line
print(f"Mean Squared Error: {mse:.4f}", f"R-squared: {r2:.4f}", sep="\n")

Mean Squared Error: 19.4878
R-squared: 0.9751


Display Results
- Select the players with the highest predicted points in each position

In [58]:
# Assemble one row per 2023 hold-out record: identifying columns, the actual
# points, and the model prediction rounded to a whole number.
results_2023_df = pd.DataFrame({
    'Player Name': df_test['Player Name'],
    'Position': df_test['Position'],
    'Team': df_test['Team'],
    'Actual Pts': df_test['Pts'],
    'Predicted Pts': y_pred_2023.round().astype(int),
})

# De-duplicate first and renumber afterwards, so the index shown below is
# contiguous. The key combines name, team and position: keying on the name
# alone would silently discard a different player who shares that name.
# NOTE(review): assumes 'Player Name' is not unique per player on its own —
# confirm against the data source.
results_2023_df = (
    results_2023_df
    .drop_duplicates(subset=['Player Name', 'Team', 'Position'], keep='first')
    .reset_index(drop=True)
)

results_2023_df

Unnamed: 0,Player Name,Position,Team,Actual Pts,Predicted Pts
0,Salah,MID,LIV,156,147
1,Son,MID,TOT,136,127
2,Watkins,FWD,AVL,121,127
3,Bowen,MID,WHU,113,101
4,Haaland,FWD,MCI,112,127
...,...,...,...,...,...
783,Griffiths,MID,WOL,0,1
784,Mosquera,DEF,WOL,0,-1
785,Bettinelli,GKP,CHE,-1,5
786,Rodák,GKP,FUL,-1,5


In [60]:
def _top_predicted(groups, position, count):
    """Return the `count` rows of `position` with the largest 'Predicted Pts'."""
    return groups.get_group(position).nlargest(count, 'Predicted Pts')


# Bucket the result rows by playing position
position_groups = results_2023_df.groupby('Position')

# How many players to pick for each position
num_defenders, num_midfielders, num_forwards, num_gkp = 4, 4, 2, 2

# Highest predicted scorers within each position
top_defenders = _top_predicted(position_groups, 'DEF', num_defenders)
top_midfielders = _top_predicted(position_groups, 'MID', num_midfielders)
top_forwards = _top_predicted(position_groups, 'FWD', num_forwards)
best_goalkeeper = _top_predicted(position_groups, 'GKP', num_gkp)  # holds num_gkp rows

# Stack the picks in DEF, MID, FWD, GKP order and renumber the rows from zero
selected_players = pd.concat(
    [top_defenders, top_midfielders, top_forwards, best_goalkeeper],
    ignore_index=True,
)

selected_players

Unnamed: 0,Player Name,Position,Team,Actual Pts,Predicted Pts
0,Pedro Porro,DEF,TOT,83,90
1,Trippier,DEF,NEW,86,88
2,Alexander-Arnold,DEF,LIV,97,87
3,Saliba,DEF,ARS,81,77
4,Salah,MID,LIV,156,147
5,Son,MID,TOT,136,127
6,Saka,MID,ARS,110,111
7,Gordon,MID,NEW,104,107
8,Watkins,FWD,AVL,121,127
9,Haaland,FWD,MCI,112,127
