## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [2]:
# Loading and inspecting the data
file_path = 'datasets_versions\EPL_Full_Dataset_filled.csv'
data = pd.read_csv(file_path)

In [3]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [4]:
# Inspect the data
print(data.head())

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  HTAG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1   H     1     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4   A     0     3   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2   A     0     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2   A     0     2   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1   H     1     0   

   ... AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  MaxCAHA  \
0  ...     3.09 -2.00      2.04      1.89   2.02   1.89     2.05     1.91   
1  ...     3.24  0.25      1.93      2.00   1.94   1.98     1.97     2.02   
2  ...     3.46  0.00      2.06      1.87   2.04   1.89     2.12     1.89   
3  ...     2.50  0.25      1.85      2.08   1.85   2.08     1.90     2.09   
4  ...     3.54 -1.50      2.01      1.92   2.02   1.91     2.02     1.96   

   AvgCAHH  AvgCAHA  
0     1.98     1.89  
1     

In [5]:
print(data.tail())

     Div       Date   Time        HomeTeam     AwayTeam  FTHG  FTAG FTR  HTHG  \
1895  E0 2019-08-10  15:00         Burnley  Southampton     3     0   H     0   
1896  E0 2019-08-10  15:00  Crystal Palace      Everton     0     0   D     0   
1897  E0 2019-08-10  15:00         Watford     Brighton     0     3   A     0   
1898  E0 2019-08-10  17:30       Tottenham  Aston Villa     3     1   H     0   
1899  E0 2019-08-09  20:00       Liverpool      Norwich     4     1   H     4   

      HTAG  ... AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  \
1895     0  ...     1.71  0.00      1.87      2.03   1.89   2.03     1.90   
1896     0  ...     1.71  0.25      1.82      2.08   1.97   1.96     2.03   
1897     1  ...     1.73 -0.50      2.04      1.86   2.05   1.88     2.12   
1898     1  ...     2.40 -1.50      2.10      1.70   2.18   1.77     2.21   
1899     0  ...     3.43 -2.25      1.91      1.99   1.94   1.98     1.99   

      MaxCAHA  AvgCAHH  AvgCAHA  
1895     2.07   

In [6]:
print(data.describe())

                                Date         FTHG         FTAG         HTHG  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.707895   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     1.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     5.000000   
std                              NaN     1.344233     1.238833     0.859504   

              HTAG           HS           AS          HST          AST  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.586842    13.868421    11.598421     4.824737     4.109474   
min       0.000000     1.000000     1.000000     0.000000     0.00

In [None]:
# Encoding FTR (Full time result: H, D, A)
data['FTR'] = data['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Encode categorical columns (modify this list if necessary)
categorical_columns = ['HTR']  

for col in categorical_columns:
    data[col] = data[col].map({'H': 0, 'D': 1, 'A': 2}) 

print(data.head()) 

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1    0     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4    2     0   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2    2     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2    2     0   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1    0     1   

   HTAG  ...  AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  \
0     1  ...      3.09 -2.00      2.04      1.89   2.02   1.89     2.05   
1     3  ...      3.24  0.25      1.93      2.00   1.94   1.98     1.97   
2     0  ...      3.46  0.00      2.06      1.87   2.04   1.89     2.12   
3     2  ...      2.50  0.25      1.85      2.08   1.85   2.08     1.90   
4     0  ...      3.54 -1.50      2.01      1.92   2.02   1.91     2.02   

   MaxCAHA  AvgCAHH  AvgCAHA  
0     1.91     1.98     1.89  
1     2.02     1.92     1.94  

In [None]:
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [9]:
# Filter Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filter Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA','PSH','PSD',
        	'PSA','WHH','WHD','WHA','VCH','VCD','VCA','MaxH','MaxD','MaxA','AvgH','AvgD','AvgA','B365>2.5','B365<2.5','P>2.5',
        	'P<2.5','Max>2.5','Max<2.5','Avg>2.5','Avg<2.5','AHh','B365AHH','B365AHA','PAHH','PAHA','MaxAHH','MaxAHA',	
            'AvgAHH',	'AvgAHA',	'B365CH',	'B365CD',	'B365CA',	'BWCH',	'BWCD',	'BWCA',	'IWCH',	'IWCD',	'IWCA',	
            'PSCH',	'PSCD',	'PSCA',	'WHCH',	'WHCD'	,'WHCA'	,'VCCH',	'VCCD',	'VCCA',	'MaxCH',	'MaxCD'	,
            'MaxCA'	,'AvgCH'	,'AvgCD',	'AvgCA',	'B365C>2.5',	'B365C<2.5',	'PC>2.5'	,'PC<2.5',	'MaxC>2.5',	'MaxC<2.5'	,
            'AvgC>2.5',	'AvgC<2.5',	'AHCh'	,'B365CAHH',	'B365CAHA',	'PCAHH',	'PCAHA',	'MaxCAHH',	'MaxCAHA',	'AvgCAHH',	'AvgCAHA'
]

target = 'FTR'

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [10]:
# Optionally, scale features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building

### Random Forest

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,  
    max_depth=10,      
    min_samples_split=5,  
    random_state=42
)
rf.fit(X_train_scaled, y_train)

# Evaluate random forest
print("Random Forest Results with Odds:")
y_pred_rf = rf.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")

Random Forest Results with Odds:
[[146  17  12]
 [ 41  12  29]
 [ 22  10  91]]
              precision    recall  f1-score   support

           0       0.70      0.83      0.76       175
           1       0.31      0.15      0.20        82
           2       0.69      0.74      0.71       123

    accuracy                           0.66       380
   macro avg       0.57      0.57      0.56       380
weighted avg       0.61      0.66      0.62       380

Accuracy: 0.66


## League Standings Simulation

### Random Forest

In [None]:
test_data['Predicted_FTR'] = rf.predict(X_test_scaled)  

def assign_points(row):
    if row['Predicted_FTR'] == 0:  # Home win
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

# Apply the function to assign predicted points
test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

# Calculate total points for each team
home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum() 

# Combine home and away points to get league standings
total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

# Create league table
league_table = total_points.sort_values(ascending=False).reset_index()
league_table.columns = ['Team', 'Points']

# Display the final league table for 2023/24 season
print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)



=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City     103
1            Arsenal     101
2          Liverpool      84
3          Newcastle      82
4            Chelsea      79
5         Man United      72
6          Tottenham      69
7           Brighton      63
8        Aston Villa      61
9             Fulham      49
10       Bournemouth      47
11    Crystal Palace      45
12           Everton      44
13         Brentford      43
14     Nott'm Forest      39
15          West Ham      37
16            Wolves      30
17           Burnley      26
18             Luton      15
19  Sheffield United      12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = rf.predict(X_test_scaled)  # Assign only to test_data
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')


In [21]:
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

pd.set_option('display.max_rows', None)

print(feature_importances)

      Feature  Importance
2         HTR    0.107817
1        HTAG    0.037449
5         HST    0.035364
0        HTHG    0.032570
6         AST    0.031826
77      AvgCH    0.017494
79      AvgCA    0.016105
65       PSCH    0.015551
62       IWCH    0.014100
4          AS    0.013888
3          HS    0.012150
76      MaxCA    0.011978
33       MaxH    0.011336
36       AvgH    0.011285
48    B365AHH    0.011088
93    MaxCAHH    0.010809
9          HC    0.010685
71       VCCH    0.010653
67       PSCA    0.010592
50       PAHH    0.010503
64       IWCA    0.010404
38       AvgA    0.010326
94    MaxCAHA    0.010290
59       BWCH    0.010283
8          AF    0.010268
91      PCAHH    0.010264
7          HF    0.010183
95    AvgCAHH    0.010160
89   B365CAHH    0.010029
75      MaxCD    0.009945
83     PC<2.5    0.009930
74      MaxCH    0.009898
90   B365CAHA    0.009675
61       BWCA    0.009380
10         AC    0.009290
24        PSH    0.009221
70       WHCA    0.009159
51       PAH