## Data Preparation

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [None]:
# Loading and inspecting the data
file_path = 'datasets_versions/EPL_dataset_9_20250305.csv' # change according to dataset chosen
data = pd.read_csv(file_path)

In [3]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [None]:
# inspecting start of the dataset
print(data.head())

        Date   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  \
0 2024-05-19    Arsenal        Everton     2     1    0     1     1    1  26   
1 2024-05-19  Brentford      Newcastle     2     4    2     0     3    2  10   
2 2024-05-19   Brighton     Man United     0     2    2     0     0    1  17   
3 2024-05-19    Burnley  Nott'm Forest     1     2    2     0     2    2  20   
4 2024-05-19    Chelsea    Bournemouth     2     1    0     1     0    0  16   

   ...       ADS  Home_Overall  Away_Overall  TravelDistance  HomeProb  \
0  ...  1.070707            82            77      288.384207  0.818569   
1  ...  1.077441            77            81      397.511501  0.326078   
2  ...  0.895623            77            82      326.069081  0.441779   
3  ...  0.545455            74            76      119.316643  0.330535   
4  ...  0.787879            81            74      141.818451  0.660229   

   DrawProb  AwayProb  HomeMomentum  AwayMomentum  ExpectedGoalDifference 

In [None]:
print(data.tail()) # inspecting the end of the dataset

           Date        HomeTeam     AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  \
1895 2019-08-10         Burnley  Southampton     3     0    0     0     0   
1896 2019-08-10  Crystal Palace      Everton     0     0    1     0     0   
1897 2019-08-10         Watford     Brighton     0     3    2     0     1   
1898 2019-08-10       Tottenham  Aston Villa     3     1    0     0     1   
1899 2019-08-09       Liverpool      Norwich     4     1    0     4     0   

      HTR  HS  ...       ADS  Home_Overall  Away_Overall  TravelDistance  \
1895    1  10  ...  0.989899            76            76      325.623794   
1896    1   6  ...  1.070707            77            78      302.519907   
1897    2  11  ...  0.936027            77            76       90.480320   
1898    2  31  ...  0.936027            82            76      159.886179   
1899    0  15  ...  0.531987            85            74      299.321920   

      HomeProb  DrawProb  AwayProb  HomeMomentum  AwayMomentum  \
1895  0.357612

In [6]:
print(data.describe())

                                Date         FTHG         FTAG          FTR  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.890526   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     2.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     2.000000   
std                              NaN     1.344233     1.238833     0.872380   

              HTHG         HTAG          HTR           HS           AS  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.707895     0.586842     0.922632    13.868421    11.598421   
min       0.000000     0.000000     0.000000     1.000000     1.00

In [None]:
# Assigning Points Based on FTR
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [None]:
# Filtering Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filtering Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF',
            'HC','AC','HY','AY','HR','AR','AvgH','AvgD','AvgA', 
            'Home_Overall','Away_Overall','HAS','HDS','AAS','ADS',
            'TravelDistance','HomeProb','DrawProb','AwayProb',
            'HomeMomentum','AwayMomentum','ExpectedGoalDifference']

target = 'FTR'

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [None]:
# scaling features for random forest
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building

### Random Forest

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,  
    max_depth=10,      
    min_samples_split=5,  
    random_state=42
)
rf.fit(X_train_scaled, y_train)

# Evaluating random forest
y_pred_rf = rf.predict(X_test_scaled)

# confusion matrix with percentages
cm_rf = confusion_matrix(y_test, y_pred_rf)

labels = ['Win', 'Draw', 'Loss']
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=[f'Actual {l}' for l in labels],
    columns=[f'Predicted {l}' for l in labels]
)

cm_rf_percent = (cm_rf_df.div(cm_rf_df.sum(axis=1), axis=0) * 100).round(1)

cm_rf_formatted = cm_rf_df.astype(str) + " (" + cm_rf_percent.astype(str) + "%)"

# print the formatted confusion matrix
print("Random Forest Confusion Matrix with Counts and Row-wise Percentages:")
print(cm_rf_formatted.to_string())

# Print the classification report and accuracy
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")

Random Forest Confusion Matrix with Counts and Row-wise Percentages:
            Predicted Win Predicted Draw Predicted Loss
Actual Win    154 (88.0%)      15 (8.6%)       6 (3.4%)
Actual Draw    41 (50.0%)     12 (14.6%)     29 (35.4%)
Actual Loss    19 (15.4%)       7 (5.7%)     97 (78.9%)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       175
           1       0.35      0.15      0.21        82
           2       0.73      0.79      0.76       123

    accuracy                           0.69       380
   macro avg       0.60      0.60      0.59       380
weighted avg       0.65      0.69      0.66       380

Accuracy: 0.69


## League Standings Simulation

### Random Forest

In [None]:
# Predicting match outcomes using Random Forest
test_data['Predicted_FTR'] = rf.predict(X_test_scaled)  # Assign only to test_data

# Assigning points based on predictions
def assign_points(row):
    if row['Predicted_FTR'] == 0:  # Home win
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

# Applying the function to assign predicted points
test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

# Calculating total points for each team
home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum() 

# Combining home and away points to get league standings
total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

# Creating league table
league_table = total_points.sort_values(ascending=False).reset_index()
league_table.columns = ['Team', 'Points']

# Displaying the final league table for 2023/24 season
print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)



=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City     102
1            Arsenal     101
2          Liverpool      89
3            Chelsea      72
4        Aston Villa      71
5          Tottenham      71
6          Newcastle      69
7         Man United      60
8           Brighton      59
9        Bournemouth      54
10    Crystal Palace      53
11            Wolves      51
12            Fulham      49
13           Everton      48
14          West Ham      43
15         Brentford      37
16     Nott'm Forest      29
17           Burnley      24
18             Luton      13
19  Sheffield United      11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = rf.predict(X_test_scaled)  # Assign only to test_data
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')


In [None]:
# feature importance analysis
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)

                   Feature  Importance
29            AwayMomentum    0.102484
2                      HTR    0.095654
28            HomeMomentum    0.082992
5                      HST    0.054632
1                     HTAG    0.053300
0                     HTHG    0.051621
6                      AST    0.048431
25                HomeProb    0.037512
27                AwayProb    0.036299
15                    AvgH    0.031619
30  ExpectedGoalDifference    0.029606
17                    AvgA    0.028532
26                DrawProb    0.026693
4                       AS    0.026103
24          TravelDistance    0.025905
3                       HS    0.024884
16                    AvgD    0.024884
9                       HC    0.023220
20                     HAS    0.020143
10                      AC    0.019284
7                       HF    0.018708
21                     HDS    0.018683
8                       AF    0.018651
22                     AAS    0.017162
19            Away_Overal

## League Table Level Analysis and Predictions

In [None]:
import pandas as pd
from itertools import combinations
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Loading actual and predicted league tables
actual_table = pd.read_csv('Tables/Actual_league_table.csv')  
predicted_table = league_table 

# Ensuring both tables contain the same teams
common_teams = set(actual_table["Team"]).intersection(set(predicted_table["Team"]))
actual_table = actual_table[actual_table["Team"].isin(common_teams)]
predicted_table = predicted_table[predicted_table["Team"].isin(common_teams)]

# Ranking teams based on points
actual_table["Actual_Rank"] = actual_table["Points"].rank(ascending=False, method='dense')
predicted_table["Predicted_Rank"] = predicted_table["Points"].rank(ascending=False, method='dense')

# Merging actual and predicted rankings
merged = actual_table.merge(predicted_table, on="Team", how="inner")

# Sort teams by Points, then Goal Difference
actual_table = actual_table.sort_values(by=['Points', 'GD'], ascending=[False, False])

# Assign ranks correctly
actual_table["Actual_Rank"] = range(1, len(actual_table) + 1)

# Calculate correctly predicted relative positions
correct_pairs = 0
total_pairs = 0

for (team1, team2) in combinations(merged["Team"], 2):
    actual_diff = merged.loc[merged["Team"] == team1, "Actual_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Actual_Rank"].values[0]
    predicted_diff = merged.loc[merged["Team"] == team1, "Predicted_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Predicted_Rank"].values[0]
    
    if (actual_diff > 0 and predicted_diff > 0) or (actual_diff < 0 and predicted_diff < 0) or (actual_diff == 0 and predicted_diff == 0):
        correct_pairs += 1
    total_pairs += 1

percentage_correct_relative_positions = (correct_pairs / total_pairs) * 100
print(f"Percentage of Correctly Predicted Relative Positions: {percentage_correct_relative_positions:.2f}%")

# Compute MSE and MAE
y_actual = merged["Actual_Rank"]
y_pred = merged["Predicted_Rank"]

mse_positions = mean_squared_error(y_actual, y_pred)
mae_positions = mean_absolute_error(y_actual, y_pred)

print(f"Mean Squared Error (MSE) of League Positions: {mse_positions:.2f}")
print(f"Mean Absolute Error (MAE) of League Positions: {mae_positions:.2f}")

# Merge actual and predicted tables on "Team"
combined_table = actual_table.merge(predicted_table, on="Team", how="inner", suffixes=('_Actual', '_Predicted'))

# Sort by Actual Rank
combined_table = combined_table.sort_values(by=["Actual_Rank"], ascending=True)


combined_table["Rank_Difference"] = abs(combined_table["Actual_Rank"] - combined_table["Predicted_Rank"])
combined_table = combined_table.sort_values(by="Rank_Difference")

print(combined_table[["Team", "Actual_Rank", "Predicted_Rank", "Rank_Difference"]])



Percentage of Correctly Predicted Relative Positions: 92.11%
Mean Squared Error (MSE) of League Positions: 2.85
Mean Absolute Error (MAE) of League Positions: 1.05
                Team  Actual_Rank  Predicted_Rank  Rank_Difference
0           Man City            1             1.0              0.0
1            Arsenal            2             2.0              0.0
2          Liverpool            3             3.0              0.0
4          Tottenham            5             5.0              0.0
9     Crystal Palace           10            10.0              0.0
17             Luton           18            18.0              0.0
7         Man United            8             7.0              1.0
3        Aston Villa            4             5.0              1.0
15         Brentford           16            15.0              1.0
12            Fulham           13            12.0              1.0
16     Nott'm Forest           17            16.0              1.0
6          Newcastle            

## Poisson Distribution for when teams are predicted in the same position

### Re-structure league table accoring to goal difference

In [None]:
# Creating per-match team-level dataset
home_df = data[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].copy()
home_df.columns = ['Team', 'Opponent', 'GoalsFor', 'GoalsAgainst']
home_df['Home'] = 1

away_df = data[['AwayTeam', 'HomeTeam', 'FTAG', 'FTHG']].copy()
away_df.columns = ['Team', 'Opponent', 'GoalsFor', 'GoalsAgainst']
away_df['Home'] = 0

team_match_df = pd.concat([home_df, away_df], ignore_index=True)

# Encoding team names for regression
from sklearn.preprocessing import LabelEncoder

le_team = LabelEncoder()
le_opp = LabelEncoder()

team_match_df['Team_enc'] = le_team.fit_transform(team_match_df['Team'])
team_match_df['Opponent_enc'] = le_opp.fit_transform(team_match_df['Opponent'])

# Fitting regression models for GF and GA
from sklearn.ensemble import RandomForestRegressor

X = team_match_df[['Team_enc', 'Opponent_enc', 'Home']]
y_gf = team_match_df['GoalsFor']
y_ga = team_match_df['GoalsAgainst']

# Poisson for Goals For
model_gf = RandomForestRegressor(random_state=42)
model_gf.fit(X, y_gf)
team_match_df['Predicted_GF'] = model_gf.predict(X)

# Poisson for Goals Against
model_ga = RandomForestRegressor(random_state=42)
model_ga.fit(X, y_ga)
team_match_df['Predicted_GA'] = model_ga.predict(X)

avg_preds = team_match_df.groupby('Team')[['Predicted_GF', 'Predicted_GA']].mean()
season_predictions = avg_preds * 38  # Simulate a 38-match season
season_predictions['Predicted_GD'] = season_predictions['Predicted_GF'] - season_predictions['Predicted_GA']

# Merge GD and re-rank
final_table = combined_table.merge(season_predictions[['Predicted_GD']], on='Team', how='left')
final_table.sort_values(by=['Predicted_Rank', 'Predicted_GD'], ascending=[True, False], inplace=True)
final_table['Adjusted_Rank'] = range(1, len(final_table) + 1)

# View final adjusted league table
print("\nAdjusted Table (with realistic GD tiebreaker):")
print(final_table[['Team', 'Actual_Rank', 'Predicted_Rank', 'Predicted_GD', 'Adjusted_Rank']])



Adjusted Table (with realistic GD tiebreaker):
                Team  Actual_Rank  Predicted_Rank  Predicted_GD  Adjusted_Rank
0           Man City            1             1.0     62.157689              1
1            Arsenal            2             2.0     27.660738              2
2          Liverpool            3             3.0     44.169705              3
14           Chelsea            6             4.0     17.244652              4
3          Tottenham            5             5.0     17.444366              5
7        Aston Villa            4             5.0      0.438283              6
11         Newcastle            7             6.0      0.510387              7
6         Man United            8             7.0     14.608738              8
18          Brighton           11             8.0     -2.516199              9
16       Bournemouth           12             9.0    -24.089564             10
4     Crystal Palace           10            10.0     -9.810557             11
17  

### Performing new league table analaysis (MAE, MSE, RMSE)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from itertools import combinations
import pandas as pd

# Calculate metrics
y_actual = final_table["Actual_Rank"]
y_pred = final_table["Adjusted_Rank"]

mse_positions = mean_squared_error(y_actual, y_pred)
mae_positions = mean_absolute_error(y_actual, y_pred)

# Calculate predicted relative positions
correct_pairs = 0
total_pairs = 0

for team1, team2 in combinations(final_table["Team"], 2):
    actual_diff = final_table.loc[final_table["Team"] == team1, "Actual_Rank"].values[0] - \
                  final_table.loc[final_table["Team"] == team2, "Actual_Rank"].values[0]
    predicted_diff = final_table.loc[final_table["Team"] == team1, "Adjusted_Rank"].values[0] - \
                     final_table.loc[final_table["Team"] == team2, "Adjusted_Rank"].values[0]

    if (actual_diff > 0 and predicted_diff > 0) or \
       (actual_diff < 0 and predicted_diff < 0) or \
       (actual_diff == 0 and predicted_diff == 0):
        correct_pairs += 1
    total_pairs += 1

percentage_correct_relative_positions = (correct_pairs / total_pairs) * 100

print(f"Percentage of Correctly Predicted Relative Positions: {percentage_correct_relative_positions:.2f}%")
print(f"Mean Squared Error (MSE) of League Positions: {mse_positions:.2f}")
print(f"Mean Absolute Error (MAE) of League Positions: {mae_positions:.2f}")

# Create and display detailed rank comparison
final_table['Rank_Difference'] = abs(final_table['Actual_Rank'] - final_table['Adjusted_Rank'])
final_table_sorted = final_table.sort_values(by='Rank_Difference')

print(final_table_sorted[['Team', 'Actual_Rank', 'Adjusted_Rank', 'Rank_Difference']])

Percentage of Correctly Predicted Relative Positions: 93.16%
Mean Squared Error (MSE) of League Positions: 3.00
Mean Absolute Error (MAE) of League Positions: 1.00
                Team  Actual_Rank  Adjusted_Rank  Rank_Difference
0           Man City            1              1                0
1            Arsenal            2              2                0
2          Liverpool            3              3                0
3          Tottenham            5              5                0
6         Man United            8              8                0
11         Newcastle            7              7                0
9             Fulham           13             13                0
8          Brentford           16             16                0
12  Sheffield United           20             20                0
10     Nott'm Forest           17             17                0
4     Crystal Palace           10             11                1
15           Everton           15           