## Data Preparation

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [18]:
# Loading and inspecting the data
file_path = 'datasets_versions\EPL_Full_Dataset_filled.csv'
data = pd.read_csv(file_path)

In [None]:
# Display number of missing values before filling
print("Missing values before filling:\n", data.isnull().sum())

# Fill all NaN or missing values with 0
data.fillna(0, inplace=True)

# Display number of missing values after filling
print("\nMissing values after filling:\n", data.isnull().sum())

# (Optional) Save the cleaned dataset if needed
data.to_csv('EPL_Full_Dataset_filled.csv', index=False)

Missing values before filling:
 Div          0
Date         0
Time         0
HomeTeam     0
AwayTeam     0
FTHG         0
FTAG         0
FTR          0
HTHG         0
HTAG         0
HTR          0
Referee      0
HS           0
AS           0
HST          0
AST          0
HF           0
AF           0
HC           0
AC           0
HY           0
AY           0
HR           0
AR           0
B365H        0
B365D        0
B365A        0
BWH          0
BWD          0
BWA          0
IWH          0
IWD          0
IWA          0
PSH          0
PSD          0
PSA          0
WHH          0
WHD          0
WHA          0
VCH          0
VCD          0
VCA          0
MaxH         0
MaxD         0
MaxA         0
AvgH         0
AvgD         0
AvgA         0
B365>2.5     0
B365<2.5     0
P>2.5        0
P<2.5        0
Max>2.5      0
Max<2.5      0
Avg>2.5      0
Avg<2.5      0
AHh          0
B365AHH      0
B365AHA      0
PAHH         0
PAHA         0
MaxAHH       0
MaxAHA       0
AvgAHH       0
AvgAHA  

In [20]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [21]:
# Inspect the data
print(data.head())

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  HTAG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1   H     1     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4   A     0     3   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2   A     0     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2   A     0     2   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1   H     1     0   

  HTR   Referee  HS  AS  HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  B365H  \
0   D  M Oliver  26   5    5    2   8  11   8   1   4   3   0   0   1.18   
1   A  S Hooper  10  12    5    7  15  11   3   0   4   4   0   0   2.90   
2   D  C Pawson  17  11    3    4  10   9   7   5   1   3   0   0   2.15   
3   A   G Scott  20  12    3    6  11   5   4   3   1   0   0   0   3.00   
4   H  A Taylor  16  22    6    5   5   9   6   5   2   3   0   0   1.45   

   B365D  B365A   BWH   BWD    BWA  IWH  IWD  IWA   PSH 

In [22]:
print(data.tail())

     Div       Date   Time        HomeTeam     AwayTeam  FTHG  FTAG FTR  HTHG  \
1895  E0 2019-08-10  15:00         Burnley  Southampton     3     0   H     0   
1896  E0 2019-08-10  15:00  Crystal Palace      Everton     0     0   D     0   
1897  E0 2019-08-10  15:00         Watford     Brighton     0     3   A     0   
1898  E0 2019-08-10  17:30       Tottenham  Aston Villa     3     1   H     0   
1899  E0 2019-08-09  20:00       Liverpool      Norwich     4     1   H     4   

      HTAG HTR     Referee  HS  AS  HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  \
1895     0   D     G Scott  10  11    4    3   6  12   2   7   0   0   0   0   
1896     0   D      J Moss   6  10    2    3  16  14   6   2   2   1   0   1   
1897     1   A    C Pawson  11   5    3    3  15  11   5   2   0   1   0   0   
1898     1   A  C Kavanagh  31   7    7    4  13   9  14   0   1   0   0   0   
1899     0   H    M Oliver  15  12    7    5   9   9  11   2   0   2   0   0   

      B365H  B365D  B365A   BWH 

In [23]:
print(data.describe())

                                Date         FTHG         FTAG         HTHG  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.707895   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     1.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     5.000000   
std                              NaN     1.344233     1.238833     0.859504   

              HTAG           HS           AS          HST          AST  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.586842    13.868421    11.598421     4.824737     4.109474   
min       0.000000     1.000000     1.000000     0.000000     0.00

In [None]:
# Encoding FTR (Full time result: H, D, A)
data['FTR'] = data['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Encoding categorical columns
categorical_columns = ['HTR'] 

for col in categorical_columns:
    data[col] = data[col].map({'H': 0, 'D': 1, 'A': 2})  # Convert text to numbers

print(data.head())  # Check if categorical columns are now numeric

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1    0     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4    2     0   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2    2     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2    2     0   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1    0     1   

   HTAG  HTR   Referee  HS  AS  HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  \
0     1    1  M Oliver  26   5    5    2   8  11   8   1   4   3   0   0   
1     3    2  S Hooper  10  12    5    7  15  11   3   0   4   4   0   0   
2     0    1  C Pawson  17  11    3    4  10   9   7   5   1   3   0   0   
3     2    2   G Scott  20  12    3    6  11   5   4   3   1   0   0   0   
4     0    0  A Taylor  16  22    6    5   5   9   6   5   2   3   0   0   

   B365H  B365D  B365A   BWH   BWD    BWA  IWH  IWD  IWA   PSH   PSD    PSA  \
0   1.1

In [None]:
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [26]:
# Filter Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filter Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA','PSH','PSD',
        	'PSA','WHH','WHD','WHA','VCH','VCD','VCA','MaxH','MaxD','MaxA','AvgH','AvgD','AvgA','B365>2.5','B365<2.5','P>2.5',
        	'P<2.5','Max>2.5','Max<2.5','Avg>2.5','Avg<2.5','AHh','B365AHH','B365AHA','PAHH','PAHA','MaxAHH','MaxAHA',	
            'AvgAHH',	'AvgAHA',	'B365CH',	'B365CD',	'B365CA',	'BWCH',	'BWCD',	'BWCA',	'IWCH',	'IWCD',	'IWCA',	
            'PSCH',	'PSCD',	'PSCA',	'WHCH',	'WHCD'	,'WHCA'	,'VCCH',	'VCCD',	'VCCA',	'MaxCH',	'MaxCD'	,
            'MaxCA'	,'AvgCH'	,'AvgCD',	'AvgCA',	'B365C>2.5',	'B365C<2.5',	'PC>2.5'	,'PC<2.5',	'MaxC>2.5',	'MaxC<2.5'	,
            'AvgC>2.5',	'AvgC<2.5',	'AHCh'	,'B365CAHH',	'B365CAHA',	'PCAHH',	'PCAHA',	'MaxCAHH',	'MaxCAHA',	'AvgCAHH',	'AvgCAHA'
]

target = 'FTR'

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building

### Logistic Regression

In [None]:
log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

print("Logistic Regression Results with Odds:")
y_pred_log = log_reg.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.2f}")

Logistic Regression Results with Odds:
[[94 67 14]
 [16 49 17]
 [ 7 47 69]]
              precision    recall  f1-score   support

           0       0.80      0.54      0.64       175
           1       0.30      0.60      0.40        82
           2       0.69      0.56      0.62       123

    accuracy                           0.56       380
   macro avg       0.60      0.57      0.55       380
weighted avg       0.66      0.56      0.58       380

Accuracy: 0.56


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## League Standings Simulation

### Logistic Regression

In [29]:
# Scale the test features for 2023/24 Season
X_test_scaled = scaler.transform(X_test)

# Predict match outcomes using Logistic Regression for 2023/24 matches
test_data['Predicted_FTR'] = log_reg.predict(X_test_scaled)

# Assign points based on predictions for test dataset only
def assign_points(row):
    if row['Predicted_FTR'] == 0:  # Home win
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

# Calculate total points for each team (2023/24 season only)
home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum()

# Combine home and away points to get final league standings
total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

league_table = total_points.reset_index()
league_table.columns = ['Team', 'Points']

# Display the final league table for 2023/24 season
print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)


=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City      78
1          Liverpool      75
2            Arsenal      71
3          Tottenham      64
4            Chelsea      59
5          Newcastle      58
6           Brighton      57
7        Aston Villa      51
8         Man United      51
9          Brentford      46
10       Bournemouth      45
11     Nott'm Forest      45
12    Crystal Palace      44
13           Everton      43
14            Fulham      40
15            Wolves      39
16          West Ham      35
17  Sheffield United      27
18           Burnley      26
19             Luton      23


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = log_reg.predict(X_test_scaled)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')


In [30]:
lr_coefficients = pd.DataFrame({
    'Feature': features,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print(lr_coefficients)

      Feature  Coefficient
22        IWD     0.804335
0        HTHG     0.793763
5         HST     0.720881
41      P>2.5     0.669334
37       AvgD     0.576230
59       BWCH     0.567173
44    Max<2.5     0.497118
35       MaxA     0.472513
25        PSD     0.433302
78      AvgCD     0.368364
55     AvgAHA     0.329654
73       VCCA     0.314806
57     B365CD     0.272693
10         AC     0.238029
47        AHh     0.237379
84   MaxC>2.5     0.194388
14         AR     0.190750
33       MaxH     0.179687
30        VCH     0.159348
74      MaxCH     0.155692
19        BWD     0.148804
26        PSA     0.147281
45    Avg>2.5     0.145114
71       VCCH     0.137831
39   B365>2.5     0.130583
34       MaxD     0.126117
81  B365C<2.5     0.121789
69       WHCD     0.112366
46    Avg<2.5     0.111563
96    AvgCAHA     0.110227
64       IWCA     0.106149
76      MaxCA     0.104919
32        VCA     0.100090
67       PSCA     0.090874
86   AvgC>2.5     0.087281
68       WHCH     0.084856
8