# Support Vector Machine

## Section 1: Data Importing

In [1]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
df_matches = pd.read_csv('com.csv')
df_matches.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTHG        float64
FTAG        float64
FTR          object
HTHG        float64
HTAG        float64
HTR          object
Referee      object
HS          float64
AS          float64
HST         float64
AST         float64
HC          float64
AC          float64
HF          float64
AF          float64
HY          float64
AY          float64
HR          float64
AR          float64
dtype: object

In [3]:
# Importing Data File
df_matches = pd.read_csv('com.csv')
df_managers = pd.read_excel('PremierLeagueManagers.xlsx')
df_spending = pd.read_excel('Spending_data.xlsx')

# Ensure the match date is in datetime format
df_matches['Date'] = pd.to_datetime(df_matches['Date'],dayfirst = True)

# Sort the df_matches dataframe by ascending date order
df_matches = df_matches.sort_values(by='Date', ascending=True).reset_index(drop=True)

# Drop any rows where all the values are nan
#df_matches = df_matches.dropna()

# Ensure Season_Start and Season_End are in datetime format
df_managers['Season_Start'] = pd.to_datetime(df_managers['Season_Start'],dayfirst = True)
df_managers['Season_End'] = pd.to_datetime(df_managers['Season_End'],dayfirst = True)

df_spending['Season_Start'] = pd.to_datetime(df_spending['Season_Start'],dayfirst = True)
df_spending['Season_End'] = pd.to_datetime(df_spending['Season_End'],dayfirst = True)
#Showing What Each Training Dataset Looks Like
display(df_matches)
display(df_managers)
display(df_spending)

df_matches.dtypes

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,3.0,6.0,4.0,5.0,13.0,6.0,1.0,1.0,0.0,0.0
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,5.0,3.0,12.0,2.0,10.0,19.0,1.0,5.0,0.0,0.0
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,3.0,5.0,1.0,5.0,9.0,5.0,0.0,1.0,0.0,0.0
9329,2024-11-10,Chelsea,Arsenal,1.0,1.0,D,0.0,0.0,D,M Oliver,...,3.0,3.0,4.0,3.0,12.0,12.0,4.0,2.0,0.0,0.0


Unnamed: 0,Season_Start,Season_End,Manager,Club,Manager_Nationality
0,2024-08-16,2025-05-25,Mikel Arteta,Arsenal,Spain
1,2024-08-16,2025-05-25,Unai Emery,Aston Villa,Spain
2,2024-08-16,2025-05-25,Andoni Iraola,Bournemouth,Spain
3,2024-08-16,2025-05-25,Thomas Frank,Brentford,Denmark
4,2024-08-16,2025-05-25,Fabian Hürzeler,Brighton,Germany
...,...,...,...,...,...
696,2000-08-19,2001-05-19,David Pleat,Tottenham,England
697,2000-08-19,2001-05-19,George Graham,Tottenham,Scotland
698,2000-08-19,2001-05-19,Glenn Hoddle,Tottenham,England
699,2000-08-19,2001-05-19,Glenn Roeder,West Ham,England


Unnamed: 0,Team,Expenditure,Season_Start,Season_End
0,Brighton,244.2,2024-08-16,2025-05-25
1,Chelsea,238.5,2024-08-16,2025-05-25
2,Man United,214.5,2024-08-16,2025-05-25
3,Aston Villa,176.2,2024-08-16,2025-05-25
4,Tottenham,148.85,2024-08-16,2025-05-25
...,...,...,...,...
495,Man United,11.7,2000-08-19,2001-05-19
496,Leicester,11.7,2000-08-19,2001-05-19
497,Ipswich,9.9,2000-08-19,2001-05-19
498,Bradford,6.61,2000-08-19,2001-05-19


Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG               float64
FTAG               float64
FTR                 object
HTHG               float64
HTAG               float64
HTR                 object
Referee             object
HS                 float64
AS                 float64
HST                float64
AST                float64
HC                 float64
AC                 float64
HF                 float64
AF                 float64
HY                 float64
AY                 float64
HR                 float64
AR                 float64
dtype: object

In [4]:
df_matches.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR'],
      dtype='object')

In [5]:
# Checking the distribution of FTR Values
print(df_matches['FTR'].value_counts()) # Raw values
print(df_matches['FTR'].value_counts(normalize=True)*100) # Percentage of the results being H, A, or D

FTR
H    4287
A    2738
D    2305
Name: count, dtype: int64
FTR
H    45.948553
A    29.346195
D    24.705252
Name: proportion, dtype: float64


## Section 2: Combining the Different Data Sources Together

The next section is to stitch the manager and spending information to the main dataframe.

In [6]:

# Function to get the manager for a specific team and match date
def get_manager(team, match_date):
    manager_row = df_managers[
        (df_managers['Club'] == team) &
        (df_managers['Season_Start'] <= match_date) &
        (df_managers['Season_End'] >= match_date)
    ]
    return manager_row['Manager'].iloc[0] if not manager_row.empty else None

# Function to get the spending for a specific team and match date
def get_spending(team, match_date):
    spending_row = df_spending[
        (df_spending['Team'] == team) &
        (df_spending['Season_Start'] <= match_date) &
        (df_spending['Season_End'] >= match_date)
    ]
    return spending_row['Expenditure'].iloc[0] if not spending_row.empty else None


In [7]:
# Add HomeManager and AwayManager columns to df_matches
df_matches['HomeManager'] = df_matches.apply(lambda row: get_manager(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwayManager'] = df_matches.apply(lambda row: get_manager(row['AwayTeam'], row['Date']), axis=1)

# Add HomeSpending and AwaySpending columns to df_matches
df_matches['HomeSpending'] = df_matches.apply(lambda row: get_spending(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwaySpending'] = df_matches.apply(lambda row: get_spending(row['AwayTeam'], row['Date']), axis=1)

In [8]:
# Save the DataFrame to an Excel file
df_matches.to_excel("matches_with_managers_and_spending.xlsx", index=False)

print("Data saved to matches_with_managers_and_spending.xlsx")


Data saved to matches_with_managers_and_spending.xlsx


## Section 3: Calculating Input Features for ML Model
### Section 3.1: Calculating Rolling Average Statistics

In [9]:
# Calculating Rolling Average Statistics
k = 38 # Number of matches to look backwards to.

# Function to Calculate Rolling Average Statistics for the Past k Matches. Each row's kAvg does not include the current results.
def generate_kAvg(df, attribute, HomeTeam=True):
    if HomeTeam:
        group_by_team = "HomeTeam"
    elif not HomeTeam:
        group_by_team = "AwayTeam"
    
    kattribute = "k" + attribute
    df[kattribute] = (
    df.groupby(group_by_team)[attribute]     # Group by HomeTeam or AwayTeam
    .transform(lambda x: x.shift(1).rolling(window=k, min_periods=1).mean())  # Shift by 1 to exclude the current match, and create a rolling window of up to k past matches
    )
    df[kattribute] = df[kattribute].fillna(0) # Filling nan values with 0

Home_kAvg_features = ['FTHG','HTHG','HS','HC','HF','HY','HR']
Away_kAvg_features = ['FTAG','HTAG','AS','AC','AF','AY','AR']

for feature in Home_kAvg_features:
    generate_kAvg(df_matches, feature,True)
for feature in Away_kAvg_features:
    generate_kAvg(df_matches, feature,False)

# Set display options to show all rows and columns
#pd.set_option('display.max_rows', None) 
#pd.set_option('display.max_columns', None)
    
display(df_matches)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHF,kHY,kHR,kFTAG,kHTAG,kAS,kAC,kAF,kAY,kAR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,7.868421,1.842105,0.078947,1.868421,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,9.736842,2.289474,0.078947,1.157895,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.763158,2.184211,0.026316,1.500000,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000
9329,2024-11-10,Chelsea,Arsenal,1.0,1.0,D,0.0,0.0,D,M Oliver,...,8.078947,2.473684,0.026316,1.921053,1.000000,13.473684,7.868421,8.842105,1.710526,0.105263


### Section 3.2 Calculating Cumulative Win Rates for the Home and Away Team

In [10]:
# Create a new column to store home wins and away wins
df_matches['HomeWin'] = (df_matches['FTR'] == 'H').astype(int)
df_matches['AwayWin'] = (df_matches['FTR'] == 'A').astype(int)

# Calculate rolling win rate for home games
df_matches['HomeWinRate'] = (
    df_matches.groupby('HomeTeam')['HomeWin']    # Group by HomeTeam
    .cumsum()                                    # Cumulative sum of home wins
    / df_matches.groupby('HomeTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Calculate rolling win rate for away games
df_matches['AwayWinRate'] = (
    df_matches.groupby('AwayTeam')['AwayWin']    # Group by AwayTeam
    .cumsum()                                    # Cumulative sum of away wins
    / df_matches.groupby('AwayTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Display relevant columns
display(df_matches)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHTAG,kAS,kAC,kAF,kAY,kAR,HomeWin,AwayWin,HomeWinRate,AwayWinRate
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,1,0.000000,1.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000,0,1,0.333333,0.249417
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947,0,1,0.596567,0.295455
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000,1,0,0.683761,0.277778
9329,2024-11-10,Chelsea,Arsenal,1.0,1.0,D,0.0,0.0,D,M Oliver,...,1.000000,13.473684,7.868421,8.842105,1.710526,0.105263,0,0,0.628205,0.469957


### Section 3.3: Generating Rolling Average Statistics for this Specific Particular Home and Away Combination

In [11]:
n = 5; # Number of matches to look back for this particular home and away pair

def generate_nAvg_pairwise(df, row, attribute):
    # Getting the matches with the current row's home and away team lineup
    matches = df[ (df['HomeTeam'] == row['HomeTeam']) & (df['AwayTeam'] == row['AwayTeam']) ]
    
    # Get the current index of the row
    current_index = row.name
    
    # Get previous matches before this match
    previous_matches = matches[matches.index < current_index]
    
    return (
        previous_matches[attribute]
        .rolling(window=n, min_periods=1)
        .mean()
        .iloc[-1]  # Get the latest rolling average value
        if not previous_matches.empty
        else 0  # Default value for no previous matches
    )

In [12]:
# List of attributes to calculate team pair specific rolling averages for
attributes = ['FTHG','HTHG','HS','HC','HF','HY','HR',
              'FTAG','HTAG','AS','AC','AF','AY','AR']

for attribute in attributes:
    # Defining new columns for the specific combination
    new_column = f"n{attribute}_Pairwise"
    df_matches[new_column] = df_matches.apply(
        lambda row: generate_nAvg_pairwise(df_matches, row, attribute), axis = 1
    )


In [13]:
# Showing what columns are present in the current dataframe
df_matches.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR', 'HomeManager', 'AwayManager', 'HomeSpending',
       'AwaySpending', 'kFTHG', 'kHTHG', 'kHS', 'kHC', 'kHF', 'kHY', 'kHR',
       'kFTAG', 'kHTAG', 'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 'HomeWin',
       'AwayWin', 'HomeWinRate', 'AwayWinRate', 'nFTHG_Pairwise',
       'nHTHG_Pairwise', 'nHS_Pairwise', 'nHC_Pairwise', 'nHF_Pairwise',
       'nHY_Pairwise', 'nHR_Pairwise', 'nFTAG_Pairwise', 'nHTAG_Pairwise',
       'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise', 'nAY_Pairwise',
       'nAR_Pairwise'],
      dtype='object')

### Section 3.3: Label Encoding the Categorical Columns for Use in the Machine Learning Model

Input features that needs to be encoded:
1. HomeTeam
2. AwayTeam
3. FTR
4. HTR
5. Referee
6. HomeManager
7. AwayManager

In [14]:
# Team Names
le_teams = LabelEncoder()

# Using the same label encoder for the home and away teams.
le_teams.fit(pd.concat([df_matches['HomeTeam'],df_matches['AwayTeam']])) 

# Using the label encoder to encode the home and away teams
df_matches['HomeTeam_Encoded'] = le_teams.transform(df_matches['HomeTeam'])
df_matches['AwayTeam_Encoded'] = le_teams.transform(df_matches['AwayTeam'])

# FTR and HTR
le_results = LabelEncoder()
le_results.fit(pd.concat([df_matches['FTR'],df_matches['HTR']]))

# Using the label encoder to encode the FTR and HTR
df_matches['FTR_Encoded'] = le_results.transform(df_matches['FTR'])
df_matches['HTR_Encoded'] = le_results.transform(df_matches['HTR'])

# Referee
le_referee = LabelEncoder()
le_referee.fit(df_matches['Referee'])

df_matches['Referee_Encoded'] = le_referee.transform(df_matches['Referee'])

# Managers
le_managers = LabelEncoder() 
le_managers.fit(pd.concat([df_matches['HomeManager'],df_matches['AwayManager']]))
df_matches['HomeManager_Encoded'] = le_managers.transform(df_matches['HomeManager'])
df_matches['AwayManager_Encoded'] = le_managers.transform(df_matches['AwayManager'])

display(df_matches)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise,HomeTeam_Encoded,AwayTeam_Encoded,FTR_Encoded,HTR_Encoded,Referee_Encoded,HomeManager_Encoded,AwayManager_Encoded
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.0,0.0,0.0,12,26,2,2,143,2,78
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.0,0.0,0.0,13,43,2,2,65,30,61
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.0,0.0,0.0,14,28,0,1,19,62,164
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.0,0.0,0.0,16,36,1,0,12,77,160
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.0,0.0,0.0,22,17,2,2,46,36,174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,6.5,1.5,0.0,31,29,0,2,4,112,44
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,14.5,1.0,0.0,40,21,0,0,34,10,90
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.8,1.4,0.0,27,23,2,2,116,141,154
9329,2024-11-10,Chelsea,Arsenal,1.0,1.0,D,0.0,0.0,D,M Oliver,...,9.2,2.0,0.0,13,0,1,1,95,46,107


## Section 4: Separating the DataFrame into Input and Output Features

### Section 4.1: Selecting the Input and Output Features of the Model

In [15]:
input_features = ['HomeTeam_Encoded','AwayTeam_Encoded','HomeManager_Encoded',
                 'AwayManager_Encoded','Referee_Encoded','kFTHG', 'kHTHG','kHS', 'kHC', 'kHF', 'kHY', 
                  'kHR', 'kFTAG', 'kHTAG', 'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 
                  'HomeWinRate', 'AwayWinRate', 
                  'nFTHG_Pairwise', 'nHTHG_Pairwise', 'nHS_Pairwise', 'nHC_Pairwise',
                   'nHF_Pairwise', 'nHY_Pairwise', 'nHR_Pairwise', 'nFTAG_Pairwise',
                   'nHTAG_Pairwise', 'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise',
                   'nAY_Pairwise', 'nAR_Pairwise', 'HomeSpending', 'AwaySpending']
df_input = df_matches[input_features]
display(df_input)

Unnamed: 0,HomeTeam_Encoded,AwayTeam_Encoded,HomeManager_Encoded,AwayManager_Encoded,Referee_Encoded,kFTHG,kHTHG,kHS,kHC,kHF,...,nHR_Pairwise,nFTAG_Pairwise,nHTAG_Pairwise,nAS_Pairwise,nAC_Pairwise,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise,HomeSpending,AwaySpending
0,12,26,2,78,143,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.8,20.28
1,13,43,30,61,65,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.39,18.65
2,14,28,62,164,19,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,16,36,77,160,12,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.33,-
4,22,17,36,174,46,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.15,34.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,31,29,112,44,4,1.447368,0.631579,12.605263,7.263158,7.868421,...,0.0,2.5,1.5,11.0,6.5,6.5,1.5,0.0,105.5,68.2
9327,40,21,10,90,34,2.026316,0.684211,17.000000,8.947368,9.736842,...,0.5,1.5,1.0,13.5,4.5,14.5,1.0,0.0,148.85,126.49
9328,27,23,141,154,116,1.500000,0.473684,16.921053,8.394737,9.763158,...,0.0,0.8,0.2,12.8,5.8,9.8,1.4,0.0,214.5,86.05
9329,13,0,46,107,95,1.710526,0.894737,16.026316,8.368421,8.078947,...,0.0,2.0,0.6,11.8,5.8,9.2,2.0,0.0,,108.9


In [None]:
output_features = ['FTR_Encoded']
df_output = df_matches['FTR_Encoded']
display(df_output)

### Section 4.3: Checking the Correlation Between Each Input Feature With Respect to FTR_Encoded

In [None]:
#Checking correlation Matrix
correlation_matrix = pd.concat([df_input, df_output], axis=1).corr()

sorted_correlation = correlation_matrix['FTR_Encoded'].abs().sort_values(ascending = False)

display(sorted_correlation)

### Section 4.4: Splitting the Dataset into Training and Test Datasets

In [None]:
#Splitting the dataset into training and test data
from sklearn.model_selection import train_test_split
testing_size = 0.33
input_train, input_test, output_train, output_test = train_test_split(df_input, df_output, test_size=testing_size, random_state=42)

### Section 4.5: Scaling the Input and Test Datasets

In [None]:
# Scaling the input training and test data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

input_train_scaled = scaler.fit_transform(input_train)
input_test_scaled = scaler.transform(input_test)


### Section 4.6: Training the Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
# Creating an array of numbers from 0 to 1 which can be iterated through to change the ratio of l1 vs l2 regularisation
l1_component = np.arange(0,1,0.01)

# Creates an empty list to hold the weighted f1 scores at each l1 component for train and test data
f1_train_lst = []
f1_test_lst = [] 

for l1 in l1_component:
    clf = LogisticRegression(penalty= 'elasticnet', l1_ratio = l1, solver = 'saga', max_iter = 5000)
    # Training the Classifier
    clf = clf.fit(input_train_scaled, output_train)
    
    pred_test = clf.predict(input_test_scaled) # Prediction using the test split of the data
    pred_train = clf.predict(input_train_scaled) # Prediction using the train split of the data
    
    f1_test = f1_score(output_test, pred_test, average='weighted')
    f1_train = f1_score(output_train, pred_train, average = 'weighted')
    
    f1_train_lst.append(f1_train)
    f1_test_lst.append(f1_test)

In [None]:
plt.plot(l1_component, f1_train_lst)
plt.plot(l1_component, f1_test_lst)
plt.legend(['Training Data','Test Data'])
plt.xlabel('$l_1$ ratio')
plt.ylabel('Weighted F1 Score')
plt.title('Effect of $l_1$ Ratio on F1 Score of a Multinomial Logistic Regression with Elastic Net Regularisation')

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty= 'elasticnet', l1_ratio = 0.25, solver = 'saga', max_iter = 5000)
# Training the Classifier
clf = clf.fit(input_train_scaled, output_train)

### Section 4.7: Evaluating the Performance of the Logistic Regression Model using the Test Dataset

In [None]:
from sklearn.metrics import classification_report
output_pred = clf.predict(input_test_scaled)

output_test = le_results.inverse_transform(output_test)
output_pred = le_results.inverse_transform(output_pred)

# Transforming Back Into Human Understandable Results
print(classification_report(output_test, output_pred))

### Section 4.8: Predicting the Match Results for the 2024-25 Season

In [None]:
# Importing the matches that needs to be predicted
df_test = pd.read_csv('epl-test.csv')

# Ensure the match date is in datetime format
df_test['Date'] = pd.to_datetime(df_test['Date'], dayfirst = True)


# Number of rows of the df_test dataframe
test_rows = len(df_test.index)

df_test.head()

In [None]:
# Replacing the inconsistent team names with the team names used previously
df_test.replace(to_replace = 'AFC Bournemouth', value= 'Bournemouth', inplace = True)
df_test.replace(to_replace = 'Ipswich Town', value= 'Ipswich', inplace = True)
df_test.replace(to_replace = 'Man Utd', value= 'Man United', inplace = True)
df_test.replace(to_replace = 'Nottingham Forest', value= "Nott'm Forest", inplace = True)
df_test.replace(to_replace = 'Leicester City', value= "Leicester", inplace = True)
df_test.replace(to_replace = 'Spurs', value= "Tottenham", inplace = True)

In [None]:
# Add HomeManager and AwayManager columns to df_test
df_test['HomeManager'] = df_test.apply(lambda row: get_manager(row['HomeTeam'], row['Date']), axis=1)
df_test['AwayManager'] = df_test.apply(lambda row: get_manager(row['AwayTeam'], row['Date']), axis=1)
display(df_test)

In [None]:
# Merge df_test into df_matches 
merged_data = pd.concat([df_matches,df_test]) # data

# Generating kAvg Statistics
for feature in Home_kAvg_features:
    generate_kAvg(merged_data, feature,True)
for feature in Away_kAvg_features:
    generate_kAvg(merged_data, feature,False)

    
# Create a new column to store home wins and away wins
merged_data['HomeWin'] = (merged_data['FTR'] == 'H').astype(int)
merged_data['AwayWin'] = (merged_data['FTR'] == 'A').astype(int)

# Calculate rolling win rate for home games
merged_data['HomeWinRate'] = (
    merged_data.groupby('HomeTeam')['HomeWin']    # Group by HomeTeam
    .cumsum()                                    # Cumulative sum of home wins
    / merged_data.groupby('HomeTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Calculate rolling win rate for away games
merged_data['AwayWinRate'] = (
    merged_data.groupby('AwayTeam')['AwayWin']    # Group by AwayTeam
    .cumsum()                                    # Cumulative sum of away wins
    / merged_data.groupby('AwayTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Generating pairwise statistics for each team pair
for attribute in attributes:
    # Defining new columns for the specific combination
    new_column = f"n{attribute}_Pairwise"
    merged_data[new_column] = merged_data.apply(
        lambda row: generate_nAvg_pairwise(merged_data, row, attribute), axis = 1
    )


In [None]:
df_test = merged_data.tail(test_rows).copy()

display(df_test)

In [None]:

def safe_transform_with_zero(encoder, values):
    return [encoder.transform([v])[0] if v in encoder.classes_ else 0 for v in values]


# Using the label encoder to encode the home and away teams
df_test['HomeTeam_Encoded'] = safe_transform_with_zero(le_teams, df_test['HomeTeam'])
df_test['AwayTeam_Encoded'] = safe_transform_with_zero(le_teams, df_test['AwayTeam'])

df_test['Referee_Encoded'] = safe_transform_with_zero(le_referee, df_test['Referee'])

df_test['HomeManager_Encoded'] = safe_transform_with_zero(le_managers, df_test['HomeManager'])
df_test['AwayManager_Encoded'] = safe_transform_with_zero(le_managers, df_test['AwayManager'])

In [None]:
df_test_input = df_test[input_features].copy()
display(df_test_input)
df_test_input.columns

In [None]:


ftr_predict = clf.predict(df_test_input)
ftr_predict = le_results.inverse_transform(ftr_predict)

df_export = pd.read_csv('epl-test.csv')
df_export['FTR'] = ftr_predict

display(df_export)