## Data Preparation

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from dataset_versioning import save_dataset_version, clean_dataset

In [3]:
# Loading and inspecting the data
file_path = 'datasets_versions\EPL_Full_Dataset_filled.csv'
data = pd.read_csv(file_path)

In [4]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [5]:
# Inspect the data
print(data.head())

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG FTR  HTHG  HTAG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1   H     1     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4   A     0     3   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2   A     0     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2   A     0     2   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1   H     1     0   

   ... AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  MaxCAHA  \
0  ...     3.09 -2.00      2.04      1.89   2.02   1.89     2.05     1.91   
1  ...     3.24  0.25      1.93      2.00   1.94   1.98     1.97     2.02   
2  ...     3.46  0.00      2.06      1.87   2.04   1.89     2.12     1.89   
3  ...     2.50  0.25      1.85      2.08   1.85   2.08     1.90     2.09   
4  ...     3.54 -1.50      2.01      1.92   2.02   1.91     2.02     1.96   

   AvgCAHH  AvgCAHA  
0     1.98     1.89  
1     

In [6]:
print(data.tail())

     Div       Date   Time        HomeTeam     AwayTeam  FTHG  FTAG FTR  HTHG  \
1895  E0 2019-08-10  15:00         Burnley  Southampton     3     0   H     0   
1896  E0 2019-08-10  15:00  Crystal Palace      Everton     0     0   D     0   
1897  E0 2019-08-10  15:00         Watford     Brighton     0     3   A     0   
1898  E0 2019-08-10  17:30       Tottenham  Aston Villa     3     1   H     0   
1899  E0 2019-08-09  20:00       Liverpool      Norwich     4     1   H     4   

      HTAG  ... AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  \
1895     0  ...     1.71  0.00      1.87      2.03   1.89   2.03     1.90   
1896     0  ...     1.71  0.25      1.82      2.08   1.97   1.96     2.03   
1897     1  ...     1.73 -0.50      2.04      1.86   2.05   1.88     2.12   
1898     1  ...     2.40 -1.50      2.10      1.70   2.18   1.77     2.21   
1899     0  ...     3.43 -2.25      1.91      1.99   1.94   1.98     1.99   

      MaxCAHA  AvgCAHH  AvgCAHA  
1895     2.07   

In [7]:
print(data.describe())

                                Date         FTHG         FTAG         HTHG  \
count                           1900  1900.000000  1900.000000  1900.000000   
mean   2022-01-12 20:39:09.473684224     1.563158     1.310000     0.707895   
min              2019-08-09 00:00:00     0.000000     0.000000     0.000000   
25%              2020-11-29 00:00:00     1.000000     0.000000     0.000000   
50%              2022-01-02 12:00:00     1.000000     1.000000     1.000000   
75%              2023-04-05 00:00:00     2.000000     2.000000     1.000000   
max              2024-05-19 00:00:00     9.000000     9.000000     5.000000   
std                              NaN     1.344233     1.238833     0.859504   

              HTAG           HS           AS          HST          AST  \
count  1900.000000  1900.000000  1900.000000  1900.000000  1900.000000   
mean      0.586842    13.868421    11.598421     4.824737     4.109474   
min       0.000000     1.000000     1.000000     0.000000     0.00

In [8]:
# Encoding FTR (Full time result: H, D, A)
data['FTR'] = data['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Encode categorical columns (modify this list if necessary)
categorical_columns = ['HTR']  # Add more columns if needed

for col in categorical_columns:
    data[col] = data[col].map({'H': 0, 'D': 1, 'A': 2})  # Convert text to numbers

# Verify encoding
print(data.head())  # Check if categorical columns are now numeric

  Div       Date   Time   HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  \
0  E0 2024-05-19  16:00    Arsenal        Everton     2     1    0     1   
1  E0 2024-05-19  16:00  Brentford      Newcastle     2     4    2     0   
2  E0 2024-05-19  16:00   Brighton     Man United     0     2    2     0   
3  E0 2024-05-19  16:00    Burnley  Nott'm Forest     1     2    2     0   
4  E0 2024-05-19  16:00    Chelsea    Bournemouth     2     1    0     1   

   HTAG  ...  AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  \
0     1  ...      3.09 -2.00      2.04      1.89   2.02   1.89     2.05   
1     3  ...      3.24  0.25      1.93      2.00   1.94   1.98     1.97   
2     0  ...      3.46  0.00      2.06      1.87   2.04   1.89     2.12   
3     2  ...      2.50  0.25      1.85      2.08   1.85   2.08     1.90   
4     0  ...      3.54 -1.50      2.01      1.92   2.02   1.91     2.02   

   MaxCAHA  AvgCAHH  AvgCAHA  
0     1.91     1.98     1.89  
1     2.02     1.92     1.94  

In [9]:
### Assign Points Based on FTR
def assign_points(row):
    if row['FTR'] == 0:  # Home win
        return (3, 0)
    elif row['FTR'] == 2:  # Away win
        return (0, 3)
    else:  # Draw
        return (1, 1)

data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [10]:
# Filter Training Data (Seasons 2019-2023)
train_data = data[data['Date'] < "2023-08-01"]

# Filter Test Data (2023/24 Season)
test_data = data[data['Date'] >= "2023-08-01"]


features = ['HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA','PSH','PSD',
        	'PSA','WHH','WHD','WHA','VCH','VCD','VCA','MaxH','MaxD','MaxA','AvgH','AvgD','AvgA','B365>2.5','B365<2.5','P>2.5',
        	'P<2.5','Max>2.5','Max<2.5','Avg>2.5','Avg<2.5','AHh','B365AHH','B365AHA','PAHH','PAHA','MaxAHH','MaxAHA',	
            'AvgAHH',	'AvgAHA',	'B365CH',	'B365CD',	'B365CA',	'BWCH',	'BWCD',	'BWCA',	'IWCH',	'IWCD',	'IWCA',	
            'PSCH',	'PSCD',	'PSCA',	'WHCH',	'WHCD'	,'WHCA'	,'VCCH',	'VCCD',	'VCCA',	'MaxCH',	'MaxCD'	,
            'MaxCA'	,'AvgCH'	,'AvgCD',	'AvgCA',	'B365C>2.5',	'B365C<2.5',	'PC>2.5'	,'PC<2.5',	'MaxC>2.5',	'MaxC<2.5'	,
            'AvgC>2.5',	'AvgC<2.5',	'AHCh'	,'B365CAHH',	'B365CAHA',	'PCAHH',	'PCAHA',	'MaxCAHH',	'MaxCAHA',	'AvgCAHH',	'AvgCAHA'
]

target = 'FTR'

# Training and Testing Data
X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

### Train - Test Split

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)


rf = RandomForestClassifier(
    n_estimators=100,  
    max_depth=10,      
    min_samples_split=5,  
    random_state=42
)

rf.fit(X_train_scaled, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Building

### Ensemble

In [None]:
lr_probs = log_reg.predict_proba(X_test_scaled)  
rf_probs = rf.predict_proba(X_test_scaled)      

ensemble_probs = (lr_probs + rf_probs) / 2

ensemble_predictions = np.argmax(ensemble_probs, axis=1)

print("\n=== Ensemble Model Results for 2023/24 ===")
print(confusion_matrix(y_test, ensemble_predictions))
print(classification_report(y_test, ensemble_predictions))
print(f"Accuracy: {accuracy_score(y_test, ensemble_predictions):.2f}")


=== Ensemble Model Results for 2023/24 ===
[[117  47  11]
 [ 24  38  20]
 [  6  36  81]]
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       175
           1       0.31      0.46      0.37        82
           2       0.72      0.66      0.69       123

    accuracy                           0.62       380
   macro avg       0.61      0.60      0.60       380
weighted avg       0.67      0.62      0.64       380

Accuracy: 0.62


## League Standings Simulation



In [None]:
test_data['Predicted_FTR'] = ensemble_predictions

def assign_points(row):
    if row['Predicted_FTR'] == 0: 
        return (3, 0)
    elif row['Predicted_FTR'] == 2:  
        return (0, 3)
    else:  # Draw
        return (1, 1)

test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')

home_points = test_data.groupby('HomeTeam')['HomePoints'].sum()
away_points = test_data.groupby('AwayTeam')['AwayPoints'].sum()

total_points = home_points.add(away_points, fill_value=0).sort_values(ascending=False)

league_table = total_points.sort_values(ascending=False).reset_index()
league_table.columns = ['Team', 'Points']

print("\n=== Predicted League Table for 2023/24 ===")
print(league_table)


=== Predicted League Table for 2023/24 ===
                Team  Points
0           Man City      87
1            Arsenal      80
2          Liverpool      79
3            Chelsea      67
4          Tottenham      64
5          Newcastle      63
6        Aston Villa      62
7         Man United      62
8           Brighton      51
9        Bournemouth      49
10         Brentford      44
11            Fulham      44
12    Crystal Palace      43
13     Nott'm Forest      41
14            Wolves      40
15           Everton      37
16          West Ham      36
17           Burnley      24
18  Sheffield United      24
19             Luton      22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_FTR'] = ensemble_predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[['HomePoints', 'AwayPoints']] = test_data.apply(assign_points, axis=1, result_type='expand')
