In [1]:
#Import packages
import pandas as pd

In [2]:
#Read CSV
df = pd.read_csv("../data/processed/epl_matches_final.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,time,result,poss,sh,sot,dist,fk,pk,...,team_Newcastle Utd,team_Norwich City,team_Nottingham Forest,team_Sheffield United,team_Southampton,team_Tottenham,team_Watford,team_West Bromwich Albion,team_West Ham,team_Wolves
0,0,2019-08-09,20,1,57.0,15.0,7.0,17.1,1.0,0,...,False,False,False,False,False,False,False,False,False,False
1,1,2019-08-17,15,1,63.0,15.0,6.0,18.6,1.0,0,...,False,False,False,False,False,False,False,False,False,False
2,2,2019-08-24,17,1,52.0,24.0,4.0,18.8,0.0,1,...,False,False,False,False,False,False,False,False,False,False
3,3,2019-08-31,17,1,63.0,15.0,7.0,21.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
4,4,2019-09-14,12,1,74.0,21.0,8.0,13.6,0.0,0,...,False,False,False,False,False,False,False,False,False,False


In [4]:
#Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

In [5]:
# Sort data by date to ensure chronological order
df_sorted = df.sort_values(by='date')

# Define a split point; use 80% of the data for training and 20% for testing
split_date = df_sorted['date'].quantile(0.8)

# Split the data
train_df = df_sorted[df_sorted['date'] <= split_date]
test_df = df_sorted[df_sorted['date'] > split_date]

In [6]:
target_column = 'result'
feature_columns = df.drop(['date', 'Unnamed: 0', 'result'], axis=1).columns

# For training data
X_train = train_df[feature_columns]
y_train = train_df[target_column]

# For testing data
X_test = test_df[feature_columns]
y_test = test_df[target_column]

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

Model: RandomForest
Accuracy: 0.5907894736842105
              precision    recall  f1-score   support

          -1       0.62      0.68      0.65       298
           0       0.30      0.05      0.08       164
           1       0.59      0.80      0.68       298

    accuracy                           0.59       760
   macro avg       0.50      0.51      0.47       760
weighted avg       0.54      0.59      0.54       760



In [8]:
from sklearn.model_selection import GridSearchCV

# Define the model
model = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Train with GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and model performance
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Score: 0.5692664527876528
Accuracy: 0.6065789473684211
              precision    recall  f1-score   support

          -1       0.62      0.74      0.67       298
           0       0.50      0.01      0.01       164
           1       0.60      0.81      0.69       298

    accuracy                           0.61       760
   macro avg       0.57      0.52      0.46       760
weighted avg       0.58      0.61      0.54       760



In [10]:
#Read CSV
df_clean = pd.read_csv("../data/interim/epl_matches_cleaned.csv")

In [11]:
#Drop unncessary columns
df_clean.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
#Define a rolling average function
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(2, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group    

In [13]:
cols = ["sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [14]:
#Apply rolling averages to numerical columns in DataFrame
matches_rolling = df_clean.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = df_clean.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [None]:
matches_rolling = matches_rolling.droplevel('team')

In [15]:
#Correct index
matches_rolling.index = range(matches_rolling.shape[0])

In [16]:
# Encode categorical features
matches_rolling = pd.get_dummies(matches_rolling, columns=['round', 'day', 'venue', 'opponent', 'formation', 'team'])

#Keep only hour for time column
matches_rolling['time'] = matches_rolling['time'].str.split(':').str[0].astype('int')

# Convert result to numerical outcome: 1 for home win, -1 for away win, 0 for draw
matches_rolling['result'] = matches_rolling['result'].map({'W': 1, 'L': -1, 'D': 0})

In [17]:
import re

# Define a function to clean column names
def clean_column_names(columns):
    cleaned_columns = []
    for col in columns:
        # Remove unwanted characters using regex
        cleaned_col = re.sub(r'[^\w\s]', '', col)  # Removes all non-alphanumeric characters except whitespace
        cleaned_columns.append(cleaned_col)
    return cleaned_columns

# Apply the cleaning function
matches_rolling.columns = clean_column_names(matches_rolling.columns)

In [18]:
# Drop duplicated columns based on column names
matches_rolling = matches_rolling.loc[:, ~matches_rolling.columns.duplicated(keep='first')]

In [19]:
# Split the data
rolling_train_df = matches_rolling[matches_rolling['date'] <= '2023-08-01']
rolling_test_df = matches_rolling[matches_rolling['date'] > '2023-08-01']

In [20]:
target_column = 'result'
feature_columns = matches_rolling.drop(['result', 'date'], axis=1).columns

# For training data
rolling_X_train = rolling_train_df[feature_columns]
rolling_y_train = rolling_train_df[target_column]

# For testing data
rolling_X_test = rolling_test_df[feature_columns]
rolling_y_test = rolling_test_df[target_column]

In [21]:
# Initialize models
rolling_models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=42)
}

# Train and evaluate each model
for name, model in rolling_models.items():
    model.fit(rolling_X_train, rolling_y_train)
    rolling_y_pred = model.predict(rolling_X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(rolling_y_test, rolling_y_pred)}")
    print(classification_report(rolling_y_test, rolling_y_pred))

Model: RandomForest
Accuracy: 0.5989445910290238
              precision    recall  f1-score   support

          -1       0.63      0.70      0.66       296
           0       0.37      0.04      0.08       164
           1       0.59      0.81      0.68       298

    accuracy                           0.60       758
   macro avg       0.53      0.52      0.47       758
weighted avg       0.56      0.60      0.54       758



In [22]:
# Define the model
rolling_model = RandomForestClassifier(random_state=42)

# Define the parameter grid
rolling_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup GridSearchCV
rolling_grid_search = GridSearchCV(estimator=rolling_model, param_grid=rolling_param_grid, cv=5, scoring='accuracy', verbose=1)

# Train with GridSearchCV
rolling_grid_search.fit(rolling_X_train, rolling_y_train)

# Best parameters and model performance
print(f"Best Parameters: {rolling_grid_search.best_params_}")
print(f"Best Score: {rolling_grid_search.best_score_}")

# Evaluate the best model
rolling_best_model = rolling_grid_search.best_estimator_
rolling_y_pred = rolling_best_model.predict(rolling_X_test)
print(f"Accuracy: {accuracy_score(rolling_y_test, rolling_y_pred)}")
print(classification_report(rolling_y_test, rolling_y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.5162316599721013
Accuracy: 0.5989445910290238
              precision    recall  f1-score   support

          -1       0.63      0.71      0.67       296
           0       0.42      0.03      0.06       164
           1       0.58      0.80      0.67       298

    accuracy                           0.60       758
   macro avg       0.54      0.51      0.47       758
weighted avg       0.56      0.60      0.54       758

