## Data Preperation

In [None]:
# Loading necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [None]:
# Loading and inspecting the data
file_path = 'datasets_versions/EPL_dataset_9_20250305.csv' # change datasets as needed
data = pd.read_csv(file_path)

In [None]:
data['Date'] = pd.to_datetime(data['Date']) # converting date

  data['Date'] = pd.to_datetime(data['Date'])


In [None]:
# Inspecting the data
print(data.head()) # shows the very top of the dataset

        Date   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  \
0 2024-05-19    Arsenal        Everton     2     1    0     1     1    1  26   
1 2024-05-19  Brentford      Newcastle     2     4    2     0     3    2  10   
2 2024-05-19   Brighton     Man United     0     2    2     0     0    1  17   
3 2024-05-19    Burnley  Nott'm Forest     1     2    2     0     2    2  20   
4 2024-05-19    Chelsea    Bournemouth     2     1    0     1     0    0  16   

   ...       ADS  Home_Overall  Away_Overall  TravelDistance  HomeProb  \
0  ...  1.070707            82            77      288.384207  0.818569   
1  ...  1.077441            77            81      397.511501  0.326078   
2  ...  0.895623            77            82      326.069081  0.441779   
3  ...  0.545455            74            76      119.316643  0.330535   
4  ...  0.787879            81            74      141.818451  0.660229   

   DrawProb  AwayProb  HomeMomentum  AwayMomentum  ExpectedGoalDifference 

In [None]:
print(data.tail()) # shows the very bottom of the dataset

           Date        HomeTeam     AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  \
1895 2019-08-10         Burnley  Southampton     3     0    0     0     0   
1896 2019-08-10  Crystal Palace      Everton     0     0    1     0     0   
1897 2019-08-10         Watford     Brighton     0     3    2     0     1   
1898 2019-08-10       Tottenham  Aston Villa     3     1    0     0     1   
1899 2019-08-09       Liverpool      Norwich     4     1    0     4     0   

      HTR  HS  ...       ADS  Home_Overall  Away_Overall  TravelDistance  \
1895    1  10  ...  0.989899            76            76      325.623794   
1896    1   6  ...  1.070707            77            78      302.519907   
1897    2  11  ...  0.936027            77            76       90.480320   
1898    2  31  ...  0.936027            82            76      159.886179   
1899    0  15  ...  0.531987            85            74      299.321920   

      HomeProb  DrawProb  AwayProb  HomeMomentum  AwayMomentum  \
1895  0.357612

In [None]:
print(data.describe()) 

                                Date         FTHG         FTAG          FTR  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.890526   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     2.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     2.000000   
std                              NaN     1.344233     1.238833     0.872380   

              HTHG         HTAG          HTR           HS           AS  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.707895     0.586842     0.922632    13.868421    11.598421   
min       0.000000     0.000000     0.000000     1.000000     1.00

In [None]:
# Assigning Points Based on FTR
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [None]:
# Filter Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filter Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF',
            'HC','AC','HY','AY','HR','AR','AvgH','AvgD','AvgA', 
            'Home_Overall','Away_Overall','HAS','HDS','AAS','ADS',
            'TravelDistance','HomeProb','DrawProb','AwayProb',
            'HomeMomentum','AwayMomentum','ExpectedGoalDifference'] # this needs to change depending on the dataset and the features inside the chosen dataset

target = 'FTR' # our target is Full Time Result

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [39]:
# Optionally, scale features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building

In [None]:
# Training Logistic Regression
log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)


rf = RandomForestClassifier(
    n_estimators=100,  
    max_depth=10,      
    min_samples_split=5,  
    random_state=42
)
# Ensure the same scaled features are used for both models
rf.fit(X_train_scaled, y_train)

### Logistic Regression

In [None]:
# Getting probabilities from both models
lr_probs = log_reg.predict_proba(X_test_scaled)  # Logistic Regression probabilities
rf_probs = rf.predict_proba(X_test_scaled)      # Random Forest probabilities

# Averaging the probabilities
ensemble_probs = (lr_probs + rf_probs) / 2

# Converting averaged probabilities to predictions
ensemble_predictions = np.argmax(ensemble_probs, axis=1)

# Confusion Matrix with percentages
cm_ens = confusion_matrix(y_test, ensemble_predictions)

labels = ['Win', 'Draw', 'Loss']
cm_ens_df = pd.DataFrame(
    cm_ens,
    index=[f'Actual {l}' for l in labels],
    columns=[f'Predicted {l}' for l in labels]
)

cm_ens_percent = (cm_ens_df.div(cm_ens_df.sum(axis=1), axis=0) * 100).round(1)

cm_ens_formatted = cm_ens_df.astype(str) + " (" + cm_ens_percent.astype(str) + "%)"

# Printing the formatted confusion matrix
print("Ensemble Confusion Matrix with Counts and Row-wise Percentages:")
print(cm_ens_formatted.to_string())

# Printing the classification report and accuracy
print("\nEnsemble Classification Report:")
print(classification_report(y_test, ensemble_predictions))
print(f"Accuracy: {accuracy_score(y_test, ensemble_predictions):.2f}")

Ensemble Confusion Matrix with Counts and Row-wise Percentages:
            Predicted Win Predicted Draw Predicted Loss
Actual Win    142 (81.1%)     27 (15.4%)       6 (3.4%)
Actual Draw    24 (29.3%)     35 (42.7%)     23 (28.0%)
Actual Loss      8 (6.5%)     22 (17.9%)     93 (75.6%)

Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       175
           1       0.42      0.43      0.42        82
           2       0.76      0.76      0.76       123

    accuracy                           0.71       380
   macro avg       0.67      0.66      0.66       380
weighted avg       0.71      0.71      0.71       380

Accuracy: 0.71


## League Standings Simulation

### Ensemble model

In [None]:
# Assigning ensemble predictions to test data
test_data['Predicted_FTR'] = ensemble_predictions

# assigning points based on the predictions made
def assign_points(row):
    if row['Predicted_FTR'] == 0:  # Home win
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

# calculating points for each team
home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum()

# combining both home and away points to form league table
total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

league_table = total_points.sort_values(ascending=False).reset_index()
league_table.columns = ['Team', 'Points']

# displaying predicted league table for 2023/24 season
print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)


=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City      97
1            Arsenal      96
2          Liverpool      80
3            Chelsea      77
4          Newcastle      65
5          Tottenham      63
6        Aston Villa      62
7         Man United      59
8           Brighton      57
9     Crystal Palace      55
10       Bournemouth      49
11            Fulham      48
12           Everton      46
13          West Ham      42
14            Wolves      41
15     Nott'm Forest      36
16         Brentford      35
17             Luton      20
18           Burnley      16
19  Sheffield United      12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = ensemble_predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')


In [None]:
# carrying out a feature importance analysis
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)

                   Feature  Importance
29            AwayMomentum    0.102484
2                      HTR    0.095654
28            HomeMomentum    0.082992
5                      HST    0.054632
1                     HTAG    0.053300
0                     HTHG    0.051621
6                      AST    0.048431
25                HomeProb    0.037512
27                AwayProb    0.036299
15                    AvgH    0.031619
30  ExpectedGoalDifference    0.029606
17                    AvgA    0.028532
26                DrawProb    0.026693
4                       AS    0.026103
24          TravelDistance    0.025905
3                       HS    0.024884
16                    AvgD    0.024884
9                       HC    0.023220
20                     HAS    0.020143
10                      AC    0.019284
7                       HF    0.018708
21                     HDS    0.018683
8                       AF    0.018651
22                     AAS    0.017162
19            Away_Overal

## Table Level Analysis and Predictions

In [None]:
# necessary libaries
import pandas as pd
from itertools import combinations
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Loading actual and predicted league tables
actual_table = pd.read_csv('Tables/Actual_league_table.csv') 
predicted_table = league_table

# Ensuring both tables contain the same teams
common_teams = set(actual_table["Team"]).intersection(set(predicted_table["Team"]))
actual_table = actual_table[actual_table["Team"].isin(common_teams)]
predicted_table = predicted_table[predicted_table["Team"].isin(common_teams)]

# ranking teams based on points totalled
actual_table["Actual_Rank"] = actual_table["Points"].rank(ascending=False, method='dense')
predicted_table["Predicted_Rank"] = predicted_table["Points"].rank(ascending=False, method='dense')

# merging actual and predicted points
merged = actual_table.merge(predicted_table, on="Team", how="inner")

# Sorting teams by Points, then Goal Difference
actual_table = actual_table.sort_values(by=['Points', 'GD'], ascending=[False, False])

# Assigning ranks 
actual_table["Actual_Rank"] = range(1, len(actual_table) + 1)

# RMSE,MAE and MSE analysis
# Calculating correctly predicted relative positions
correct_pairs = 0
total_pairs = 0

for (team1, team2) in combinations(merged["Team"], 2):
    actual_diff = merged.loc[merged["Team"] == team1, "Actual_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Actual_Rank"].values[0]
    predicted_diff = merged.loc[merged["Team"] == team1, "Predicted_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Predicted_Rank"].values[0]
    
    if (actual_diff > 0 and predicted_diff > 0) or (actual_diff < 0 and predicted_diff < 0) or (actual_diff == 0 and predicted_diff == 0):
        correct_pairs += 1
    total_pairs += 1

percentage_correct_relative_positions = (correct_pairs / total_pairs) * 100
print(f"Percentage of Correctly Predicted Relative Positions: {percentage_correct_relative_positions:.2f}%")

# Compute MSE and MAE
y_actual = merged["Actual_Rank"]
y_pred = merged["Predicted_Rank"]

mse_positions = mean_squared_error(y_actual, y_pred)
mae_positions = mean_absolute_error(y_actual, y_pred)

print(f"Mean Squared Error (MSE) of League Positions: {mse_positions:.2f}")
print(f"Mean Absolute Error (MAE) of League Positions: {mae_positions:.2f}")

# Merging actual and predicted tables on "Team"
combined_table = actual_table.merge(predicted_table, on="Team", how="inner", suffixes=('_Actual', '_Predicted'))

# Sort by Actual Rank
combined_table = combined_table.sort_values(by=["Actual_Rank"], ascending=True)


combined_table["Rank_Difference"] = abs(combined_table["Actual_Rank"] - combined_table["Predicted_Rank"])
combined_table = combined_table.sort_values(by="Rank_Difference")

print(combined_table[["Team", "Actual_Rank", "Predicted_Rank", "Rank_Difference"]])



Percentage of Correctly Predicted Relative Positions: 92.11%
Mean Squared Error (MSE) of League Positions: 4.50
Mean Absolute Error (MAE) of League Positions: 1.60
                Team  Actual_Rank  Predicted_Rank  Rank_Difference
0           Man City            1             1.0              0.0
1            Arsenal            2             2.0              0.0
2          Liverpool            3             3.0              0.0
7         Man United            8             8.0              0.0
9     Crystal Palace           10            10.0              0.0
19  Sheffield United           20            20.0              0.0
18           Burnley           19            19.0              0.0
17             Luton           18            18.0              0.0
15         Brentford           16            17.0              1.0
12            Fulham           13            12.0              1.0
11       Bournemouth           12            11.0              1.0
13            Wolves           1