In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [13]:
# Base path for the CSV files
dataframes = [] 
for i in ['2020-21', '2021-22', '2022-23']:
    dataframes.append(pd.read_csv(f'Fantasy-Premier-League/data/{i}/gws/' + 'merged_gw.csv'))
data = pd.concat(dataframes, keys=['2020-21', '2021-22', '2022-23'])
data = data.reset_index(level=0).rename(columns={'level_0': 'Season'})

In [15]:
data.columns

Index(['Season', 'name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'team_a_score',
       'team_h_score', 'threat', 'total_points', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
       'GW', 'expected_assists', 'expected_goal_involvements',
       'expected_goals', 'expected_goals_conceded', 'starts'],
      dtype='object')

## Feature engineering

Create variables for player's
- last score
- average score last 3 matches
- average XP for last 3 matches

In [19]:
data.groupby('name')['expected_assists'].shift(1)

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
26500    0.02
26501    0.00
26502    0.00
26503    0.00
26504    0.00
Name: expected_assists, Length: 76317, dtype: float64

In [20]:
features = ['xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'minutes',
       'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'round', 'saves', 'selected', 'team_a_score',
       'team_h_score', 'threat', 'transfers_balance',
       'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
       'GW', 'expected_assists', 'expected_goal_involvements',
       'expected_goals', 'expected_goals_conceded', 'starts']

not_features = ['Season', 'name', 'position', 'team', 'kickoff_time', 'total_points', 'large_haul']
for feature in features:
    try:
       data[f'last_{feature}'] = data.groupby('name')[feature].shift(1)
       data[f'last3_{feature}'] = data.groupby('name')[f'last_{feature}'].transform(lambda x: x.rolling(3, 1).mean())
    except Exception as e:
       print(feature)
       print(e)

In [21]:
data['large_haul'] = data['total_points'] > 14

In [22]:
data = data.dropna(how='any', axis=1)

# Data splitting

Split the data into training and test sets

In [23]:

X = data.drop(not_features, axis=1).drop(features, axis=1)  # Features (drop the target column and any other non-feature columns)
y = data['total_points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: "['expected_assists', 'expected_goal_involvements', 'expected_goals', 'expected_goals_conceded', 'starts'] not found in axis"

# Model selection and training

In [77]:
# Train the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.26302253371171824


In [90]:
X = data.drop(not_features + ['large_haul'], axis=1)  # Features (drop the target column and any other non-feature columns)
y = data['large_haul']

y_binary = (data['total_points'] > 15).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Train the classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9994103773584906


In [92]:
X.columns

Index(['xP', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
       'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW'],
      dtype='object')