## Data Preparation

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [None]:
# Loading and inspecting the data
file_path = 'datasets_versions/EPL_dataset_9_20250305.csv' # change according to dataset chosen
data = pd.read_csv(file_path)

In [16]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [None]:
# inspecting start of the dataset
print(data.head())

        Date   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  \
0 2024-05-19    Arsenal        Everton     2     1    0     1     1    1  26   
1 2024-05-19  Brentford      Newcastle     2     4    2     0     3    2  10   
2 2024-05-19   Brighton     Man United     0     2    2     0     0    1  17   
3 2024-05-19    Burnley  Nott'm Forest     1     2    2     0     2    2  20   
4 2024-05-19    Chelsea    Bournemouth     2     1    0     1     0    0  16   

   ...       ADS  Home_Overall  Away_Overall  TravelDistance  HomeProb  \
0  ...  1.070707            82            77      288.384207  0.818569   
1  ...  1.077441            77            81      397.511501  0.326078   
2  ...  0.895623            77            82      326.069081  0.441779   
3  ...  0.545455            74            76      119.316643  0.330535   
4  ...  0.787879            81            74      141.818451  0.660229   

   DrawProb  AwayProb  HomeMomentum  AwayMomentum  ExpectedGoalDifference 

In [None]:
print(data.tail()) # inspecting the end of the dataset

           Date        HomeTeam     AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  \
1895 2019-08-10         Burnley  Southampton     3     0    0     0     0   
1896 2019-08-10  Crystal Palace      Everton     0     0    1     0     0   
1897 2019-08-10         Watford     Brighton     0     3    2     0     1   
1898 2019-08-10       Tottenham  Aston Villa     3     1    0     0     1   
1899 2019-08-09       Liverpool      Norwich     4     1    0     4     0   

      HTR  HS  ...       ADS  Home_Overall  Away_Overall  TravelDistance  \
1895    1  10  ...  0.989899            76            76      325.623794   
1896    1   6  ...  1.070707            77            78      302.519907   
1897    2  11  ...  0.936027            77            76       90.480320   
1898    2  31  ...  0.936027            82            76      159.886179   
1899    0  15  ...  0.531987            85            74      299.321920   

      HomeProb  DrawProb  AwayProb  HomeMomentum  AwayMomentum  \
1895  0.357612

In [19]:
print(data.describe())

                                Date         FTHG         FTAG          FTR  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.890526   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     2.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     2.000000   
std                              NaN     1.344233     1.238833     0.872380   

              HTHG         HTAG          HTR           HS           AS  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.707895     0.586842     0.922632    13.868421    11.598421   
min       0.000000     0.000000     0.000000     1.000000     1.00

In [None]:
# Assigning Points Based on FTR
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [None]:
# Filtering Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filtering Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF',
            'HC','AC','HY','AY','HR','AR','AvgH','AvgD','AvgA', 
            'Home_Overall','Away_Overall','HAS','HDS','AAS','ADS',
            'TravelDistance','HomeProb','DrawProb','AwayProb',
            'HomeMomentum','AwayMomentum','ExpectedGoalDifference']
            

target = 'FTR'

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [None]:
# scaling features for logistic regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building

### Logistic Regression

In [None]:
# Training Logistic Regression
# log_reg = LogisticRegression()
# log_reg.fit(X_train_scaled, y_train)

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

# Evaluating the Logistic Regression model
y_pred_log = log_reg.predict(X_test_scaled)

# confusion matrix with percentages
cm = confusion_matrix(y_test, y_pred_log)

labels = ['Win', 'Draw', 'Loss']
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {l}' for l in labels],
    columns=[f'Predicted {l}' for l in labels]
)

cm_percent = (cm_df.div(cm_df.sum(axis=1), axis=0) * 100).round(1)
cm_formatted = cm_df.astype(str) + " (" + cm_percent.astype(str) + "%)"

# Print the formatted confusion matrix
print("Confusion Matrix with Counts and Row-wise Percentages:")
print(cm_formatted.to_string())

# Print the classification report and accuracy
print("\nClassification Report:")
print(classification_report(y_test, y_pred_log))
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.2f}")

Confusion Matrix with Counts and Row-wise Percentages:
            Predicted Win Predicted Draw Predicted Loss
Actual Win    132 (75.4%)     37 (21.1%)       6 (3.4%)
Actual Draw    19 (23.2%)     45 (54.9%)     18 (22.0%)
Actual Loss      6 (4.9%)     27 (22.0%)     90 (73.2%)

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.75      0.80       175
           1       0.41      0.55      0.47        82
           2       0.79      0.73      0.76       123

    accuracy                           0.70       380
   macro avg       0.68      0.68      0.68       380
weighted avg       0.73      0.70      0.71       380

Accuracy: 0.70


## League Standings Simulation

### Logistic Regression

In [None]:
# Scale the test features for 2023/24 Season
X_test_scaled = scaler.transform(X_test)

# Predicting match outcomes using Logistic Regression for 2023/24 matches
test_data['Predicted_FTR'] = log_reg.predict(X_test_scaled)

# Assigning points based on predictions for test dataset only
def assign_points(row):
    if row['Predicted_FTR'] == 0:  # Home win
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

# Calculating total points for each team (2023/24 season only)
home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum()

# Combining home and away points to get final league standings
total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

league_table = total_points.reset_index()
league_table.columns = ['Team', 'Points']

# Displaying the final league table for 2023/24 season
print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)


=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City      98
1            Arsenal      94
2          Liverpool      76
3            Chelsea      73
4          Newcastle      64
5        Aston Villa      60
6          Tottenham      59
7         Man United      56
8     Crystal Palace      53
9        Bournemouth      49
10            Fulham      49
11          Brighton      46
12           Everton      45
13     Nott'm Forest      42
14          West Ham      42
15            Wolves      38
16         Brentford      35
17             Luton      21
18           Burnley      19
19  Sheffield United      12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = log_reg.predict(X_test_scaled)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')


In [None]:
# feature coefficeint analysis
lr_coefficients = pd.DataFrame({
    'Feature': features,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print(lr_coefficients)

                   Feature  Coefficient
0                     HTHG     0.922716
28            HomeMomentum     0.836426
5                      HST     0.684150
16                    AvgD     0.451766
10                      AC     0.301596
2                      HTR     0.230781
26                DrawProb     0.208622
14                      AR     0.179918
19            Away_Overall     0.074518
8                       AF     0.074160
7                       HF     0.054803
22                     AAS     0.044055
24          TravelDistance     0.039258
21                     HDS     0.028156
18            Home_Overall    -0.015120
4                       AS    -0.021011
20                     HAS    -0.021541
27                AwayProb    -0.022041
30  ExpectedGoalDifference    -0.022419
23                     ADS    -0.030996
25                HomeProb    -0.032810
11                      HY    -0.069588
12                      AY    -0.087131
17                    AvgA    -0.118741


## League Table Level Analysis and Predictions

In [None]:
import pandas as pd
from itertools import combinations
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Loading actual and predicted league tables
actual_table = pd.read_csv('Tables/Actual_league_table.csv') 
predicted_table = league_table 

# Ensuring both tables contain the same teams
common_teams = set(actual_table["Team"]).intersection(set(predicted_table["Team"]))
actual_table = actual_table[actual_table["Team"].isin(common_teams)]
predicted_table = predicted_table[predicted_table["Team"].isin(common_teams)]

# Ranking teams based on points
actual_table["Actual_Rank"] = actual_table["Points"].rank(ascending=False, method='dense')
predicted_table["Predicted_Rank"] = predicted_table["Points"].rank(ascending=False, method='dense')

# Merging actual and predicted rankings
merged = actual_table.merge(predicted_table, on="Team", how="inner")

# Sort teams by Points, then Goal Difference
actual_table = actual_table.sort_values(by=['Points', 'GD'], ascending=[False, False])

# Assign ranks correctly
actual_table["Actual_Rank"] = range(1, len(actual_table) + 1)

# Calculate correctly predicted relative positions
correct_pairs = 0
total_pairs = 0

for (team1, team2) in combinations(merged["Team"], 2):
    actual_diff = merged.loc[merged["Team"] == team1, "Actual_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Actual_Rank"].values[0]
    predicted_diff = merged.loc[merged["Team"] == team1, "Predicted_Rank"].values[0] - merged.loc[merged["Team"] == team2, "Predicted_Rank"].values[0]
    
    if (actual_diff > 0 and predicted_diff > 0) or (actual_diff < 0 and predicted_diff < 0) or (actual_diff == 0 and predicted_diff == 0):
        correct_pairs += 1
    total_pairs += 1

percentage_correct_relative_positions = (correct_pairs / total_pairs) * 100
print(f"Percentage of Correctly Predicted Relative Positions: {percentage_correct_relative_positions:.2f}%")

# Compute MSE and MAE
y_actual = merged["Actual_Rank"]
y_pred = merged["Predicted_Rank"]

mse_positions = mean_squared_error(y_actual, y_pred)
mae_positions = mean_absolute_error(y_actual, y_pred)

print(f"Mean Squared Error (MSE) of League Positions: {mse_positions:.2f}")
print(f"Mean Absolute Error (MAE) of League Positions: {mae_positions:.2f}")

# Merge actual and predicted tables on "Team"
combined_table = actual_table.merge(predicted_table, on="Team", how="inner", suffixes=('_Actual', '_Predicted'))

# Sort by Actual Rank
combined_table = combined_table.sort_values(by=["Actual_Rank"])


combined_table["Rank_Difference"] = abs(combined_table["Actual_Rank"] - combined_table["Predicted_Rank"])
combined_table = combined_table.sort_values(by="Rank_Difference", ascending=True)

print(combined_table[["Team", "Actual_Rank", "Predicted_Rank", "Rank_Difference"]])



Percentage of Correctly Predicted Relative Positions: 91.05%
Mean Squared Error (MSE) of League Positions: 2.70
Mean Absolute Error (MAE) of League Positions: 1.10
                Team  Actual_Rank  Predicted_Rank  Rank_Difference
0           Man City            1             1.0              0.0
1            Arsenal            2             2.0              0.0
2          Liverpool            3             3.0              0.0
7         Man United            8             8.0              0.0
13            Wolves           14            14.0              0.0
10          Brighton           11            11.0              0.0
9     Crystal Palace           10             9.0              1.0
15         Brentford           16            15.0              1.0
6          Newcastle            7             5.0              2.0
5            Chelsea            6             4.0              2.0
4          Tottenham            5             7.0              2.0
3        Aston Villa            

## Poisson Distribution for when teams are predicted in the same position

#### Testing

In [27]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

print("All good! Ready for Poisson regression.")


All good! Ready for Poisson regression.


### Re-structure league table accoring to goal difference

In [None]:
# Create per-match team-level dataset
home_df = data[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].copy()
home_df.columns = ['Team', 'Opponent', 'GoalsFor', 'GoalsAgainst']
home_df['Home'] = 1

away_df = data[['AwayTeam', 'HomeTeam', 'FTAG', 'FTHG']].copy()
away_df.columns = ['Team', 'Opponent', 'GoalsFor', 'GoalsAgainst']
away_df['Home'] = 0

team_match_df = pd.concat([home_df, away_df], ignore_index=True)

# Encode team names for regression
from sklearn.preprocessing import LabelEncoder

le_team = LabelEncoder()
le_opp = LabelEncoder()

team_match_df['Team_enc'] = le_team.fit_transform(team_match_df['Team'])
team_match_df['Opponent_enc'] = le_opp.fit_transform(team_match_df['Opponent'])

# Fit Poisson regression models for GF and GA
# Import PoissonRegressor from scikit-learn
from sklearn.linear_model import PoissonRegressor

# Define features and target variables
X = pd.get_dummies(team_match_df[['Team', 'Opponent', 'Home']], columns=['Team', 'Opponent'], drop_first=True)
y_gf = team_match_df['GoalsFor']
y_ga = team_match_df['GoalsAgainst']

# Poisson for Goals For
model_gf = PoissonRegressor(alpha=1e-12, max_iter=1000)
model_gf.fit(X, y_gf)
team_match_df['Predicted_GF'] = model_gf.predict(X)

# Poisson for Goals Against
model_ga = PoissonRegressor(alpha=1e-12, max_iter=1000)
model_ga.fit(X, y_ga)
team_match_df['Predicted_GA'] = model_ga.predict(X)

avg_preds = team_match_df.groupby('Team')[['Predicted_GF', 'Predicted_GA']].mean()
season_predictions = avg_preds * 38  # Simulate a 38-match season
season_predictions['Predicted_GD'] = season_predictions['Predicted_GF'] - season_predictions['Predicted_GA']

# Merge GD and re-rank
final_table = combined_table.merge(season_predictions[['Predicted_GD']], on='Team', how='left')
final_table.sort_values(by=['Predicted_Rank', 'Predicted_GD'], ascending=[True, False], inplace=True)
final_table['Adjusted_Rank'] = range(1, len(final_table) + 1)

# View final adjusted league table
print("\nAdjusted Table (with realistic GD tiebreaker):")
print(final_table[['Team', 'Actual_Rank', 'Predicted_Rank', 'Predicted_GD', 'Adjusted_Rank']])



Adjusted Table (with realistic GD tiebreaker):
                Team  Actual_Rank  Predicted_Rank  Predicted_GD  Adjusted_Rank
0           Man City            1             1.0     62.816955              1
1            Arsenal            2             2.0     28.654346              2
2          Liverpool            3             3.0     43.798029              3
9            Chelsea            6             4.0     16.999924              4
8          Newcastle            7             5.0      0.798635              5
11       Aston Villa            4             6.0      0.196201              6
10         Tottenham            5             7.0     17.204750              7
3         Man United            8             8.0     14.600156              8
6     Crystal Palace           10             9.0     -9.994649              9
16            Fulham           13            10.0    -10.014170             10
12       Bournemouth           12            10.0    -23.981500             11
5   

### Performing new league table analaysis (MAE, MSE, RMSE)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from itertools import combinations
import pandas as pd

# Calculate metrics
y_actual = final_table["Actual_Rank"]
y_pred = final_table["Adjusted_Rank"]

mse_positions = mean_squared_error(y_actual, y_pred)
mae_positions = mean_absolute_error(y_actual, y_pred)

# Calculate  predicted relative positions
correct_pairs = 0
total_pairs = 0

for team1, team2 in combinations(final_table["Team"], 2):
    actual_diff = final_table.loc[final_table["Team"] == team1, "Actual_Rank"].values[0] - \
                  final_table.loc[final_table["Team"] == team2, "Actual_Rank"].values[0]
    predicted_diff = final_table.loc[final_table["Team"] == team1, "Adjusted_Rank"].values[0] - \
                     final_table.loc[final_table["Team"] == team2, "Adjusted_Rank"].values[0]

    if (actual_diff > 0 and predicted_diff > 0) or \
       (actual_diff < 0 and predicted_diff < 0) or \
       (actual_diff == 0 and predicted_diff == 0):
        correct_pairs += 1
    total_pairs += 1

percentage_correct_relative_positions = (correct_pairs / total_pairs) * 100

print(f"Percentage of Correctly Predicted Relative Positions: {percentage_correct_relative_positions:.2f}%")
print(f"Mean Squared Error (MSE) of League Positions: {mse_positions:.2f}")
print(f"Mean Absolute Error (MAE) of League Positions: {mae_positions:.2f}")

# Create and display detailed rank comparison
final_table['Rank_Difference'] = abs(final_table['Actual_Rank'] - final_table['Adjusted_Rank'])
final_table_sorted = final_table.sort_values(by='Rank_Difference')

print(final_table_sorted[['Team', 'Actual_Rank', 'Adjusted_Rank', 'Rank_Difference']])


Percentage of Correctly Predicted Relative Positions: 92.11%
Mean Squared Error (MSE) of League Positions: 3.30
Mean Absolute Error (MAE) of League Positions: 1.30
                Team  Actual_Rank  Adjusted_Rank  Rank_Difference
0           Man City            1              1                0
1            Arsenal            2              2                0
2          Liverpool            3              3                0
3         Man United            8              8                0
14           Burnley           19             19                0
13  Sheffield United           20             20                0
15             Luton           18             18                0
12       Bournemouth           12             11                1
7          Brentford           16             17                1
5           Brighton           11             12                1
6     Crystal Palace           10              9                1
17           Everton           15           