# 1 Introduction

# 2 Data Import


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [16]:
# Importing the vanilla dataset
df_matches = pd.read_csv('com.csv')


# Ensure the match date is in datetime format
df_matches['Date'] = pd.to_datetime(df_matches['Date'],dayfirst = True)

# Sort the df_matches dataframe by ascending date order
df_matches = df_matches.sort_values(by='Date', ascending=True).reset_index(drop=True)

# Drop any rows where all the values are nan
df_matches = df_matches.dropna()

df_matches.dtypes


Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG               float64
FTAG               float64
FTR                 object
HTHG               float64
HTAG               float64
HTR                 object
Referee             object
HS                 float64
AS                 float64
HST                float64
AST                float64
HC                 float64
AC                 float64
HF                 float64
AF                 float64
HY                 float64
AY                 float64
HR                 float64
AR                 float64
dtype: object

In [17]:
# Travel Distance and Travel Fatigue Index
import math
# Stadium Coordinates
stadium_coordinates = {
    'Swansea': (51.6428, -3.9347),
    'West Ham': (51.5383, -0.0166),
    'Charlton': (51.4865, 0.0368),
    'Wigan': (53.5477, -2.6542),
    'Wolves': (52.5904, -2.1306),
    'Brighton': (50.8609, -0.0801),
    'Bournemouth': (50.7352, -1.8384),
    'Blackpool': (53.8046, -3.0483),
    "Nott'm Forest": (52.9399, -1.1326),
    'Aston Villa': (52.5092, -1.8851),
    'Brentford': (51.4908, -0.2888),
    'Chelsea': (51.4816, -0.1910),
    'Coventry': (52.4481, -1.4956),
    'Sheffield United': (53.3703, -1.4708),
    'Fulham': (51.4749, -0.2216),
    'Leeds': (53.7775, -1.5721),
    'Middlesbrough': (54.5781, -1.2178),
    'Newcastle': (54.9756, -1.6218),
    'Luton': (51.8842, -0.4316),
    'Leicester': (52.6203, -1.1422),
    'Hull': (53.7465, -0.3680),
    'Huddersfield': (53.6543, -1.7684),
    'Southampton': (50.9058, -1.3911),
    'QPR': (51.5093, -0.2322),
    'Bradford': (53.8042, -1.7590),
    'Everton': (53.4387, -2.9662),
    'Blackburn': (53.7286, -2.4894),
    'Man United': (53.4631, -2.2914),
    'Stoke': (52.9884, -2.1754),
    'Reading': (51.4222, -0.9828),
    'Birmingham': (52.4756, -1.8682),
    'Liverpool': (53.4308, -2.9610),
    'Tottenham': (51.6044, -0.0664),
    'Ipswich': (52.0544, 1.1455),
    'Norwich': (52.6221, 1.3091),
    'Watford': (51.6498, -0.4016),
    'Man City': (53.4830, -2.2002),
    'Crystal Palace': (51.3983, -0.0855),
    'Derby': (52.9149, -1.4473),
    'Burnley': (53.7888, -2.2302),
    'Sunderland': (54.9146, -1.3884),
    'West Brom': (52.5090, -1.9639),
    'Arsenal': (51.5549, -0.1084),
    'Portsmouth': (50.7964, -1.0639),
    'Cardiff': (51.4729, -3.2041),
    'Bolton': (53.5805, -2.5357)
}

# Define Haversine function
def haversine(coord1, coord2):
    lon1, lat1 = coord1
    lon2, lat2 = coord2

    R = 6371000  # radius of Earth in meters
    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)

    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c  # output distance in meters
    km = meters / 1000.0  # output distance in kilometers

    return round(km, 3)

# Calculate travel distance
def calculate_travel_distance(row, stadium_coordinates):
    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    if home_team in stadium_coordinates and away_team in stadium_coordinates:
        home_coords = stadium_coordinates[home_team]
        away_coords = stadium_coordinates[away_team]
        return haversine(home_coords, away_coords)
    return None  # Handle missing coordinates

# Apply the function to calculate travel distances
df_matches['TravelDistance'] = df_matches.apply(
    calculate_travel_distance, axis=1, args=(stadium_coordinates,)
)

# Normalize Travel Distance to create a Travel Fatigue Index
min_distance = df_matches['TravelDistance'].min()
max_distance = df_matches['TravelDistance'].max()

# Apply Travel Fatigue Index to the Dataframe
df_matches['TravelFatigueIndex'] = (df_matches['TravelDistance'] - min_distance) / (
    max_distance - min_distance
)

# Display a sample of the updated dataset
display(df_matches[['HomeTeam', 'AwayTeam', 'TravelDistance', 'TravelFatigueIndex']].head(10))

Unnamed: 0,HomeTeam,AwayTeam,TravelDistance,TravelFatigueIndex
0,Charlton,Man City,333.367,0.561248
1,Chelsea,West Ham,20.392,0.032665
2,Coventry,Middlesbrough,238.785,0.401509
3,Derby,Southampton,223.421,0.37556
4,Leeds,Everton,159.522,0.267641
5,Leicester,Aston Villa,83.525,0.13929
6,Liverpool,Bradford,139.946,0.23458
7,Sunderland,Arsenal,399.739,0.673343
8,Tottenham,Ipswich,143.746,0.240997
9,Man United,Newcastle,183.836,0.308705


In [18]:
# Importing Manager Information

df_managers = pd.read_excel('PremierLeagueManagers.xlsx')

# Ensure Season_Start and Season_End are in datetime format
df_managers['Season_Start'] = pd.to_datetime(df_managers['Season_Start'],dayfirst = True)
df_managers['Season_End'] = pd.to_datetime(df_managers['Season_End'],dayfirst = True)

# Ensure Season_Start and Season_End are in datetime format
df_managers['Season_Start'] = pd.to_datetime(df_managers['Season_Start'],dayfirst = True)
df_managers['Season_End'] = pd.to_datetime(df_managers['Season_End'],dayfirst = True)

# Function to get the manager for a specific team and match date
def get_manager(team, match_date):
    manager_row = df_managers[
        (df_managers['Club'] == team) &
        (df_managers['Season_Start'] <= match_date) &
        (df_managers['Season_End'] >= match_date)
    ]
    return manager_row['Manager'].iloc[0] if not manager_row.empty else None

# Add HomeManager and AwayManager columns to df_matches
df_matches['HomeManager'] = df_matches.apply(lambda row: get_manager(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwayManager'] = df_matches.apply(lambda row: get_manager(row['AwayTeam'], row['Date']), axis=1)

# Importing Spending Information

df_spending = pd.read_excel('Spending_data.xlsx')

# Ensure Season_Start and Season_End are in datetime format
df_spending['Season_Start'] = pd.to_datetime(df_spending['Season_Start'],dayfirst = True)
df_spending['Season_End'] = pd.to_datetime(df_spending['Season_End'],dayfirst = True)

# Ensure Season_Start and Season_End are in datetime format
df_spending['Season_Start'] = pd.to_datetime(df_spending['Season_Start'],dayfirst = True)
df_spending['Season_End'] = pd.to_datetime(df_spending['Season_End'],dayfirst = True)

# Function to get the expenditure for a specific team and match date
def get_spending(team, match_date):
    spending_row = df_spending[
        (df_spending['Team'] == team) &
        (df_spending['Season_Start'] <= match_date) &
        (df_spending['Season_End'] >= match_date)
    ]
    return spending_row['Expenditure'].iloc[0] if not spending_row.empty else 0

# Add HomeManager and AwayManager columns to df_matches
df_matches['HomeExpenditure'] = df_matches.apply(lambda row: get_spending(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwayExpenditure'] = df_matches.apply(lambda row: get_spending(row['AwayTeam'], row['Date']), axis=1)

# Display a sample of the updated dataset
display(df_matches[['HomeTeam', 'AwayTeam','HomeManager','AwayManager','HomeExpenditure' ,'AwayExpenditure', 'TravelDistance', 'TravelFatigueIndex']].head(10))

df_matches.columns

Unnamed: 0,HomeTeam,AwayTeam,HomeManager,AwayManager,HomeExpenditure,AwayExpenditure,TravelDistance,TravelFatigueIndex
0,Charlton,Man City,Alan Curbishley,Joe Royle,20.8,20.28,333.367,0.561248
1,Chelsea,West Ham,Claudio Ranieri,Glenn Roeder,52.39,18.65,20.392,0.032665
2,Coventry,Middlesbrough,Gordon Strachan,Terry Venables,0.0,0.0,238.785,0.401509
3,Derby,Southampton,Jim Smith,Stuart Gray,13.33,0.0,223.421,0.37556
4,Leeds,Everton,David O'Leary,Walter Smith,53.15,34.48,159.522,0.267641
5,Leicester,Aston Villa,Peter Taylor,John Gregory,11.7,21.12,83.525,0.13929
6,Liverpool,Bradford,Gérard Houllier,Jim Jefferies,34.0,0.0,139.946,0.23458
7,Sunderland,Arsenal,Peter Reid,Arsène Wenger,16.87,56.3,399.739,0.673343
8,Tottenham,Ipswich,David Pleat,George Burley,25.74,9.9,143.746,0.240997
9,Man United,Newcastle,Alex Ferguson,Bobby Robson,0.0,25.91,183.836,0.308705


Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR', 'TravelDistance', 'TravelFatigueIndex',
       'HomeManager', 'AwayManager', 'HomeExpenditure', 'AwayExpenditure'],
      dtype='object')

In [19]:
# Checking the distribution of FTR Values
print(df_matches['FTR'].value_counts()) # Raw values
print(df_matches['FTR'].value_counts(normalize=True)*100) # Percentage of the results being H, A, or D

FTR
H    4287
A    2738
D    2305
Name: count, dtype: int64
FTR
H    45.948553
A    29.346195
D    24.705252
Name: proportion, dtype: float64


# 3 Data Transformation & Exploration

In [20]:
# Calculating Rolling Average Statistics
k = 38 # Number of matches to look backwards to.

# Function to Calculate Rolling Average Statistics for the Past k Matches. Each row's kAvg does not include the current results.
def generate_kAvg(df, attribute, HomeTeam=True):
    if HomeTeam:
        group_by_team = "HomeTeam"
    elif not HomeTeam:
        group_by_team = "AwayTeam"
    
    kattribute = "k" + attribute
    df[kattribute] = (
    df.groupby(group_by_team)[attribute]     # Group by HomeTeam or AwayTeam
    .transform(lambda x: x.shift(1).rolling(window=k, min_periods=1).mean())  # Shift by 1 to exclude the current match, and create a rolling window of up to k past matches
    )
    df[kattribute] = df[kattribute].fillna(0) # Filling nan values with 0

Home_kAvg_features = ['FTHG','HTHG','HS','HC','HF','HY','HR']
Away_kAvg_features = ['FTAG','HTAG','AS','AC','AF','AY','AR']

for feature in Home_kAvg_features:
    generate_kAvg(df_matches, feature,True)
for feature in Away_kAvg_features:
    generate_kAvg(df_matches, feature,False)
    
display(df_matches)

# Create a new column to store home wins and away wins
df_matches['HomeWin'] = (df_matches['FTR'] == 'H').astype(int)
df_matches['AwayWin'] = (df_matches['FTR'] == 'A').astype(int)

# Calculate rolling win rate for home games
df_matches['HomeWinRate'] = (
    df_matches.groupby('HomeTeam')['HomeWin']    # Group by HomeTeam
    .cumsum()                                    # Cumulative sum of home wins
    / df_matches.groupby('HomeTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Calculate rolling win rate for away games
df_matches['AwayWinRate'] = (
    df_matches.groupby('AwayTeam')['AwayWin']    # Group by AwayTeam
    .cumsum()                                    # Cumulative sum of away wins
    / df_matches.groupby('AwayTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Display relevant columns
display(df_matches)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHF,kHY,kHR,kFTAG,kHTAG,kAS,kAC,kAF,kAY,kAR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,9.473684,1.921053,0.052632,2.157895,0.868421,17.473684,7.500000,7.342105,1.526316,0.000000
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,7.868421,1.842105,0.078947,1.868421,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,9.736842,2.289474,0.078947,1.157895,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.763158,2.184211,0.026316,1.500000,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHTAG,kAS,kAC,kAF,kAY,kAR,HomeWin,AwayWin,HomeWinRate,AwayWinRate
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,1,0.000000,1.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,0.868421,17.473684,7.500000,7.342105,1.526316,0.000000,1,0,0.351724,0.463252
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000,0,1,0.333333,0.249417
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947,0,1,0.596567,0.295455
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000,1,0,0.683761,0.277778


In [21]:
n = 5; # Number of matches to look back for this particular home and away pair

def generate_nAvg_pairwise(df, row, attribute):
    # Getting the matches with the current row's home and away team lineup
    matches = df[ (df['HomeTeam'] == row['HomeTeam']) & (df['AwayTeam'] == row['AwayTeam']) ]
    
    # Get the current index of the row
    current_index = row.name
    
    # Get previous matches before this match
    previous_matches = matches[matches.index < current_index]
    
    return (
        previous_matches[attribute]
        .rolling(window=n, min_periods=1)
        .mean()
        .iloc[-1]  # Get the latest rolling average value
        if not previous_matches.empty
        else 0  # Default value for no previous matches
    )
# List of attributes to calculate team pair specific rolling averages for
attributes = ['FTHG','HTHG','HS','HC','HF','HY','HR',
              'FTAG','HTAG','AS','AC','AF','AY','AR']

for attribute in attributes:
    # Defining new columns for the specific combination
    new_column = f"n{attribute}_Pairwise"
    df_matches[new_column] = df_matches.apply(
        lambda row: generate_nAvg_pairwise(df_matches, row, attribute), axis = 1
    )

# Showing what columns are present in the current dataframe
df_matches.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR', 'TravelDistance', 'TravelFatigueIndex',
       'HomeManager', 'AwayManager', 'HomeExpenditure', 'AwayExpenditure',
       'kFTHG', 'kHTHG', 'kHS', 'kHC', 'kHF', 'kHY', 'kHR', 'kFTAG', 'kHTAG',
       'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 'HomeWin', 'AwayWin', 'HomeWinRate',
       'AwayWinRate', 'nFTHG_Pairwise', 'nHTHG_Pairwise', 'nHS_Pairwise',
       'nHC_Pairwise', 'nHF_Pairwise', 'nHY_Pairwise', 'nHR_Pairwise',
       'nFTAG_Pairwise', 'nHTAG_Pairwise', 'nAS_Pairwise', 'nAC_Pairwise',
       'nAF_Pairwise', 'nAY_Pairwise', 'nAR_Pairwise'],
      dtype='object')

In [22]:
# Team Names
le_teams = LabelEncoder()

# Using the same label encoder for the home and away teams.
le_teams.fit(pd.concat([df_matches['HomeTeam'],df_matches['AwayTeam']])) 

# Using the label encoder to encode the home and away teams
df_matches['HomeTeam_Encoded'] = le_teams.transform(df_matches['HomeTeam'])
df_matches['AwayTeam_Encoded'] = le_teams.transform(df_matches['AwayTeam'])

# FTR and HTR
le_results = LabelEncoder()
le_results.fit(pd.concat([df_matches['FTR'],df_matches['HTR']]))

# Using the label encoder to encode the FTR and HTR
df_matches['FTR_Encoded'] = le_results.transform(df_matches['FTR'])
df_matches['HTR_Encoded'] = le_results.transform(df_matches['HTR'])

# Referee
le_referee = LabelEncoder()
le_referee.fit(df_matches['Referee'])

df_matches['Referee_Encoded'] = le_referee.transform(df_matches['Referee'])

# Managers
le_managers = LabelEncoder() 
le_managers.fit(pd.concat([df_matches['HomeManager'],df_matches['AwayManager']]))
df_matches['HomeManager_Encoded'] = le_managers.transform(df_matches['HomeManager'])
df_matches['AwayManager_Encoded'] = le_managers.transform(df_matches['AwayManager'])

display(df_matches)
df_matches.to_excel('encoded.xlsx', index=False)
df_matches = df_matches.fillna(0)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise,HomeTeam_Encoded,AwayTeam_Encoded,FTR_Encoded,HTR_Encoded,Referee_Encoded,HomeManager_Encoded,AwayManager_Encoded
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.0,0.0,0.0,12,26,2,2,143,2,78
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.0,0.0,0.0,13,43,2,2,65,30,61
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.0,0.0,0.0,14,28,0,1,19,62,164
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.0,0.0,0.0,16,36,1,0,12,77,160
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.0,0.0,0.0,22,17,2,2,46,36,174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,7.0,1.4,0.2,9,26,2,0,148,49,124
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,6.5,1.5,0.0,31,29,0,2,4,112,44
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,14.5,1.0,0.0,40,21,0,0,34,10,90
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.8,1.4,0.0,27,23,2,2,116,141,154


In [23]:
# Defining the input features 
input_features = ['HomeTeam_Encoded','AwayTeam_Encoded','HomeManager_Encoded',
                 'AwayManager_Encoded','Referee_Encoded', 'HomeExpenditure', 'AwayExpenditure', 
                  'TravelDistance', 'TravelFatigueIndex',
                  'kFTHG', 'kHTHG','kHS', 'kHC', 'kHF', 'kHY', 
                  'kHR', 'kFTAG', 'kHTAG', 'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 
                  'HomeWinRate', 'AwayWinRate', 
                  'nFTHG_Pairwise', 'nHTHG_Pairwise', 'nHS_Pairwise', 'nHC_Pairwise',
                   'nHF_Pairwise', 'nHY_Pairwise', 'nHR_Pairwise', 'nFTAG_Pairwise',
                   'nHTAG_Pairwise', 'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise',
                   'nAY_Pairwise', 'nAR_Pairwise']
df_input = df_matches[input_features]
display(df_input)

output_features = ['FTR_Encoded']
df_output = df_matches['FTR_Encoded']
display(df_output)

Unnamed: 0,HomeTeam_Encoded,AwayTeam_Encoded,HomeManager_Encoded,AwayManager_Encoded,Referee_Encoded,HomeExpenditure,AwayExpenditure,TravelDistance,TravelFatigueIndex,kFTHG,...,nHF_Pairwise,nHY_Pairwise,nHR_Pairwise,nFTAG_Pairwise,nHTAG_Pairwise,nAS_Pairwise,nAC_Pairwise,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise
0,12,26,2,78,143,20.80,20.28,333.367,0.561248,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13,43,30,61,65,52.39,18.65,20.392,0.032665,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,28,62,164,19,0.00,0.00,238.785,0.401509,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,36,77,160,12,13.33,0.00,223.421,0.375560,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,17,36,174,46,53.15,34.48,159.522,0.267641,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,9,26,49,124,148,0.00,25.00,374.889,0.631374,1.921053,...,10.0,2.0,0.0,3.2,2.0,16.8,4.0,7.0,1.4,0.2
9326,31,29,112,44,4,105.50,68.20,232.740,0.391299,1.447368,...,10.5,3.0,0.0,2.5,1.5,11.0,6.5,6.5,1.5,0.0
9327,40,21,10,90,34,148.85,126.49,143.746,0.240997,2.026316,...,18.0,0.5,0.5,1.5,1.0,13.5,4.5,14.5,1.0,0.0
9328,27,23,141,154,116,214.50,86.05,158.441,0.265816,1.500000,...,11.2,1.0,0.0,0.8,0.2,12.8,5.8,9.8,1.4,0.0


0       2
1       2
2       0
3       1
4       2
       ..
9325    2
9326    0
9327    0
9328    2
9329    1
Name: FTR_Encoded, Length: 9330, dtype: int32

In [24]:
#Checking correlation Matrix
correlation_matrix = pd.concat([df_input, df_output], axis=1).corr()

sorted_correlation = correlation_matrix['FTR_Encoded'].abs().sort_values(ascending = False)

display(sorted_correlation)

FTR_Encoded            1.000000
HomeWinRate            0.309951
AwayWinRate            0.297890
kFTHG                  0.246896
kFTAG                  0.243459
kAS                    0.241129
kHTHG                  0.223033
kHS                    0.214479
kHTAG                  0.212977
kAC                    0.178061
kHC                    0.171585
nAS_Pairwise           0.161303
nFTAG_Pairwise         0.158576
nFTHG_Pairwise         0.134183
nHTAG_Pairwise         0.129286
nAC_Pairwise           0.115603
nHS_Pairwise           0.113863
kHY                    0.111643
nHTHG_Pairwise         0.106688
nHC_Pairwise           0.097164
nHY_Pairwise           0.095584
AwayExpenditure        0.094791
HomeExpenditure        0.084161
HomeManager_Encoded    0.073002
AwayTeam_Encoded       0.063966
kAF                    0.057522
AwayManager_Encoded    0.055152
kHF                    0.054368
HomeTeam_Encoded       0.053972
nHF_Pairwise           0.043265
TravelDistance         0.029707
TravelFa

In [25]:
# Drop the Referee_Encoded column
df_input = df_input.drop('Referee_Encoded', axis = 1)
input_features.remove("Referee_Encoded")

# Display the finalised input feature list
display(df_input)

Unnamed: 0,HomeTeam_Encoded,AwayTeam_Encoded,HomeManager_Encoded,AwayManager_Encoded,HomeExpenditure,AwayExpenditure,TravelDistance,TravelFatigueIndex,kFTHG,kHTHG,...,nHF_Pairwise,nHY_Pairwise,nHR_Pairwise,nFTAG_Pairwise,nHTAG_Pairwise,nAS_Pairwise,nAC_Pairwise,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise
0,12,26,2,78,20.80,20.28,333.367,0.561248,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13,43,30,61,52.39,18.65,20.392,0.032665,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,28,62,164,0.00,0.00,238.785,0.401509,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,36,77,160,13.33,0.00,223.421,0.375560,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,17,36,174,53.15,34.48,159.522,0.267641,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,9,26,49,124,0.00,25.00,374.889,0.631374,1.921053,0.868421,...,10.0,2.0,0.0,3.2,2.0,16.8,4.0,7.0,1.4,0.2
9326,31,29,112,44,105.50,68.20,232.740,0.391299,1.447368,0.631579,...,10.5,3.0,0.0,2.5,1.5,11.0,6.5,6.5,1.5,0.0
9327,40,21,10,90,148.85,126.49,143.746,0.240997,2.026316,0.684211,...,18.0,0.5,0.5,1.5,1.0,13.5,4.5,14.5,1.0,0.0
9328,27,23,141,154,214.50,86.05,158.441,0.265816,1.500000,0.473684,...,11.2,1.0,0.0,0.8,0.2,12.8,5.8,9.8,1.4,0.0


# 4 Methodology Overview

In [26]:
#Splitting the dataset into training and test data
from sklearn.model_selection import train_test_split
testing_size = 0.33
input_train, input_test, output_train, output_test = train_test_split(df_input, df_output, test_size=testing_size, random_state=42)

In [27]:
# Scaling the input training and test data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

input_train_scaled = scaler.fit_transform(input_train)
input_test_scaled = scaler.transform(input_test)


# 5 Model Training & Validation

In [28]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train the Support Vector Machine model
svm_model = SVC(kernel='linear') # You can choose other kernels like 'rbf', 'poly', etc.
svm_model.fit(input_train_scaled, output_train)

# Predict the outcomes for the test set
predictions = svm_model.predict(input_test_scaled)

# Evaluate the model
accuracy = accuracy_score(output_test, predictions)
print(f'Model Accuracy: {accuracy * 100:.2f}%')
print(classification_report(output_test, predictions, target_names=le_results.classes_))


Model Accuracy: 55.93%
              precision    recall  f1-score   support

           A       0.56      0.51      0.53       906
           D       0.00      0.00      0.00       734
           H       0.56      0.88      0.68      1439

    accuracy                           0.56      3079
   macro avg       0.37      0.46      0.41      3079
weighted avg       0.43      0.56      0.48      3079



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 6 Results

# 7 Final Predictions on Test Set