# Support Vector Machine

## Section 1: Data Importing

In [55]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [56]:
df_matches = pd.read_csv('com.csv')
df_matches.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTHG        float64
FTAG        float64
FTR          object
HTHG        float64
HTAG        float64
HTR          object
Referee      object
HS          float64
AS          float64
HST         float64
AST         float64
HC          float64
AC          float64
HF          float64
AF          float64
HY          float64
AY          float64
HR          float64
AR          float64
dtype: object

In [57]:
# Importing Data File
df_matches = pd.read_csv('com.csv')
df_managers = pd.read_excel('PremierLeagueManagers.xlsx')
df_spending = pd.read_excel('Spending_data.xlsx')

# Ensure the match date is in datetime format
df_matches['Date'] = pd.to_datetime(df_matches['Date'],dayfirst = True)

# Sort the df_matches dataframe by ascending date order
df_matches = df_matches.sort_values(by='Date', ascending=True).reset_index(drop=True)

# Drop any rows where all the values are nan
#df_matches = df_matches.dropna()

# Ensure Season_Start and Season_End are in datetime format
df_managers['Season_Start'] = pd.to_datetime(df_managers['Season_Start'],dayfirst = True)
df_managers['Season_End'] = pd.to_datetime(df_managers['Season_End'],dayfirst = True)

df_spending['Season_Start'] = pd.to_datetime(df_spending['Season_Start'],dayfirst = True)
df_spending['Season_End'] = pd.to_datetime(df_spending['Season_End'],dayfirst = True)


# Remove the last row from df_matches
df_matches = df_matches.drop(df_matches.tail(1).index)


#Showing What Each Training Dataset Looks Like
display(df_matches)
display(df_managers)
display(df_spending)

df_matches.dtypes

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,4.0,6.0,0.0,4.0,12.0,10.0,3.0,3.0,0.0,0.0
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,3.0,6.0,4.0,5.0,13.0,6.0,1.0,1.0,0.0,0.0
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,5.0,3.0,12.0,2.0,10.0,19.0,1.0,5.0,0.0,0.0
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,3.0,5.0,1.0,5.0,9.0,5.0,0.0,1.0,0.0,0.0


Unnamed: 0,Season_Start,Season_End,Manager,Club,Manager_Nationality
0,2024-08-16,2025-05-25,Mikel Arteta,Arsenal,Spain
1,2024-08-16,2025-05-25,Unai Emery,Aston Villa,Spain
2,2024-08-16,2025-05-25,Andoni Iraola,Bournemouth,Spain
3,2024-08-16,2025-05-25,Thomas Frank,Brentford,Denmark
4,2024-08-16,2025-05-25,Fabian Hürzeler,Brighton,Germany
...,...,...,...,...,...
696,2000-08-19,2001-05-19,David Pleat,Tottenham,England
697,2000-08-19,2001-05-19,George Graham,Tottenham,Scotland
698,2000-08-19,2001-05-19,Glenn Hoddle,Tottenham,England
699,2000-08-19,2001-05-19,Glenn Roeder,West Ham,England


Unnamed: 0,Team,Expenditure,Season_Start,Season_End
0,Brighton,244.20,2024-08-16,2025-05-25
1,Chelsea,238.50,2024-08-16,2025-05-25
2,Man United,214.50,2024-08-16,2025-05-25
3,Aston Villa,176.20,2024-08-16,2025-05-25
4,Tottenham,148.85,2024-08-16,2025-05-25
...,...,...,...,...
495,Man United,11.70,2000-08-19,2001-05-19
496,Leicester,11.70,2000-08-19,2001-05-19
497,Ipswich,9.90,2000-08-19,2001-05-19
498,Bradford,6.61,2000-08-19,2001-05-19


Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG               float64
FTAG               float64
FTR                 object
HTHG               float64
HTAG               float64
HTR                 object
Referee             object
HS                 float64
AS                 float64
HST                float64
AST                float64
HC                 float64
AC                 float64
HF                 float64
AF                 float64
HY                 float64
AY                 float64
HR                 float64
AR                 float64
dtype: object

In [58]:
df_matches.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR'],
      dtype='object')

In [59]:
# Checking the distribution of FTR Values
print(df_matches['FTR'].value_counts()) # Raw values
print(df_matches['FTR'].value_counts(normalize=True)*100) # Percentage of the results being H, A, or D

FTR
H    4287
A    2738
D    2305
Name: count, dtype: int64
FTR
H    45.948553
A    29.346195
D    24.705252
Name: proportion, dtype: float64


## Section 2: Combining the Different Data Sources Together

The next section is to stitch the manager and spending information to the main dataframe.

In [60]:

# Function to get the manager for a specific team and match date
def get_manager(team, match_date):
    manager_row = df_managers[
        (df_managers['Club'] == team) &
        (df_managers['Season_Start'] <= match_date) &
        (df_managers['Season_End'] >= match_date)
    ]
    return manager_row['Manager'].iloc[0] if not manager_row.empty else None

# Function to get the spending for a specific team and match date
def get_spending(team, match_date):
    spending_row = df_spending[
        (df_spending['Team'] == team) &
        (df_spending['Season_Start'] <= match_date) &
        (df_spending['Season_End'] >= match_date)
    ]
    return spending_row['Expenditure'].iloc[0] if not spending_row.empty else None


In [61]:
# Add HomeManager and AwayManager columns to df_matches
df_matches['HomeManager'] = df_matches.apply(lambda row: get_manager(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwayManager'] = df_matches.apply(lambda row: get_manager(row['AwayTeam'], row['Date']), axis=1)

# Add HomeSpending and AwaySpending columns to df_matches
df_matches['HomeSpending'] = df_matches.apply(lambda row: get_spending(row['HomeTeam'], row['Date']), axis=1)
df_matches['AwaySpending'] = df_matches.apply(lambda row: get_spending(row['AwayTeam'], row['Date']), axis=1)

In [62]:
# Save the DataFrame to an Excel file
df_matches.to_excel("matches_with_managers_and_spending.xlsx", index=False)

print("Data saved to matches_with_managers_and_spending.xlsx")


Data saved to matches_with_managers_and_spending.xlsx


## Section 3: Calculating Input Features for ML Model
### Section 3.1: Calculating Rolling Average Statistics

In [63]:
# Calculating Rolling Average Statistics
k = 38 # Number of matches to look backwards to.

# Function to Calculate Rolling Average Statistics for the Past k Matches. Each row's kAvg does not include the current results.
def generate_kAvg(df, attribute, HomeTeam=True):
    if HomeTeam:
        group_by_team = "HomeTeam"
    elif not HomeTeam:
        group_by_team = "AwayTeam"
    
    kattribute = "k" + attribute
    df[kattribute] = (
    df.groupby(group_by_team)[attribute]     # Group by HomeTeam or AwayTeam
    .transform(lambda x: x.shift(1).rolling(window=k, min_periods=1).mean())  # Shift by 1 to exclude the current match, and create a rolling window of up to k past matches
    )
    df[kattribute] = df[kattribute].fillna(0) # Filling nan values with 0

Home_kAvg_features = ['FTHG','HTHG','HS','HC','HF','HY','HR']
Away_kAvg_features = ['FTAG','HTAG','AS','AC','AF','AY','AR']

for feature in Home_kAvg_features:
    generate_kAvg(df_matches, feature,True)
for feature in Away_kAvg_features:
    generate_kAvg(df_matches, feature,False)

# Set display options to show all rows and columns
#pd.set_option('display.max_rows', None) 
#pd.set_option('display.max_columns', None)
    
display(df_matches)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHF,kHY,kHR,kFTAG,kHTAG,kAS,kAC,kAF,kAY,kAR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,9.473684,1.921053,0.052632,2.157895,0.868421,17.473684,7.500000,7.342105,1.526316,0.000000
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,7.868421,1.842105,0.078947,1.868421,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,9.736842,2.289474,0.078947,1.157895,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.763158,2.184211,0.026316,1.500000,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000


### Section 3.2 Calculating Cumulative Win Rates for the Home and Away Team

In [64]:
# Create a new column to store home wins and away wins
df_matches['HomeWin'] = (df_matches['FTR'] == 'H').astype(int)
df_matches['AwayWin'] = (df_matches['FTR'] == 'A').astype(int)

# Calculate rolling win rate for home games
df_matches['HomeWinRate'] = (
    df_matches.groupby('HomeTeam')['HomeWin']    # Group by HomeTeam
    .cumsum()                                    # Cumulative sum of home wins
    / df_matches.groupby('HomeTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Calculate rolling win rate for away games
df_matches['AwayWinRate'] = (
    df_matches.groupby('AwayTeam')['AwayWin']    # Group by AwayTeam
    .cumsum()                                    # Cumulative sum of away wins
    / df_matches.groupby('AwayTeam').cumcount()  # Divide by cumulative games played
    .add(1)                                      # To avoid division by zero
)

# Display relevant columns
display(df_matches)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,kHTAG,kAS,kAC,kAF,kAY,kAR,HomeWin,AwayWin,HomeWinRate,AwayWinRate
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,1,0.000000,1.000000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0.000000,0.000000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,0.868421,17.473684,7.500000,7.342105,1.526316,0.000000,1,0,0.351724,0.463252
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,0.763158,12.210526,7.868421,7.921053,2.184211,0.000000,0,1,0.333333,0.249417
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,0.552632,9.210526,4.578947,12.552632,1.447368,0.078947,0,1,0.596567,0.295455
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,0.631579,11.026316,4.105263,10.500000,1.842105,0.000000,1,0,0.683761,0.277778


### Section 3.3: Generating Rolling Average Statistics for this Specific Particular Home and Away Combination

In [65]:
n = 5; # Number of matches to look back for this particular home and away pair

def generate_nAvg_pairwise(df, row, attribute):
    # Getting the matches with the current row's home and away team lineup
    matches = df[ (df['HomeTeam'] == row['HomeTeam']) & (df['AwayTeam'] == row['AwayTeam']) ]
    
    # Get the current index of the row
    current_index = row.name
    
    # Get previous matches before this match
    previous_matches = matches[matches.index < current_index]
    
    return (
        previous_matches[attribute]
        .rolling(window=n, min_periods=1)
        .mean()
        .iloc[-1]  # Get the latest rolling average value
        if not previous_matches.empty
        else 0  # Default value for no previous matches
    )

In [66]:
# List of attributes to calculate team pair specific rolling averages for
attributes = ['FTHG','HTHG','HS','HC','HF','HY','HR',
              'FTAG','HTAG','AS','AC','AF','AY','AR']

for attribute in attributes:
    # Defining new columns for the specific combination
    new_column = f"n{attribute}_Pairwise"
    df_matches[new_column] = df_matches.apply(
        lambda row: generate_nAvg_pairwise(df_matches, row, attribute), axis = 1
    )


In [67]:
# Showing what columns are present in the current dataframe
df_matches.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR', 'HomeManager', 'AwayManager', 'HomeSpending',
       'AwaySpending', 'kFTHG', 'kHTHG', 'kHS', 'kHC', 'kHF', 'kHY', 'kHR',
       'kFTAG', 'kHTAG', 'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 'HomeWin',
       'AwayWin', 'HomeWinRate', 'AwayWinRate', 'nFTHG_Pairwise',
       'nHTHG_Pairwise', 'nHS_Pairwise', 'nHC_Pairwise', 'nHF_Pairwise',
       'nHY_Pairwise', 'nHR_Pairwise', 'nFTAG_Pairwise', 'nHTAG_Pairwise',
       'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise', 'nAY_Pairwise',
       'nAR_Pairwise'],
      dtype='object')

### Section 3.3: Label Encoding the Categorical Columns for Use in the Machine Learning Model

Input features that needs to be encoded:
1. HomeTeam
2. AwayTeam
3. FTR
4. HTR
5. Referee
6. HomeManager
7. AwayManager

In [69]:
# Team Names
le_teams = LabelEncoder()

# Using the same label encoder for the home and away teams.
le_teams.fit(pd.concat([df_matches['HomeTeam'],df_matches['AwayTeam']])) 

# Using the label encoder to encode the home and away teams
df_matches['HomeTeam_Encoded'] = le_teams.transform(df_matches['HomeTeam'])
df_matches['AwayTeam_Encoded'] = le_teams.transform(df_matches['AwayTeam'])

# FTR and HTR
le_results = LabelEncoder()
le_results.fit(pd.concat([df_matches['FTR'],df_matches['HTR']]))

# Using the label encoder to encode the FTR and HTR
df_matches['FTR_Encoded'] = le_results.transform(df_matches['FTR'])
df_matches['HTR_Encoded'] = le_results.transform(df_matches['HTR'])

# Referee
le_referee = LabelEncoder()
le_referee.fit(df_matches['Referee'])

df_matches['Referee_Encoded'] = le_referee.transform(df_matches['Referee'])

# Managers
le_managers = LabelEncoder() 
le_managers.fit(pd.concat([df_matches['HomeManager'],df_matches['AwayManager']]))
df_matches['HomeManager_Encoded'] = le_managers.transform(df_matches['HomeManager'])
df_matches['AwayManager_Encoded'] = le_managers.transform(df_matches['AwayManager'])

display(df_matches)
df_matches.to_excel('encoded.xlsx', index=False)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise,HomeTeam_Encoded,AwayTeam_Encoded,FTR_Encoded,HTR_Encoded,Referee_Encoded,HomeManager_Encoded,AwayManager_Encoded
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,0.0,0.0,0.0,12,26,2,2,143,2,78
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,0.0,0.0,0.0,13,43,2,2,65,30,61
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,0.0,0.0,0.0,14,28,0,1,19,62,164
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,0.0,0.0,0.0,16,36,1,0,12,77,160
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,0.0,0.0,0.0,22,17,2,2,46,36,174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,2024-11-09,Brighton,Man City,2.0,1.0,H,0.0,1.0,A,S Barrott,...,7.0,1.4,0.2,9,26,2,0,148,49,124
9326,2024-11-10,Nott'm Forest,Newcastle,1.0,3.0,A,1.0,0.0,H,A Taylor,...,6.5,1.5,0.0,31,29,0,2,4,112,44
9327,2024-11-10,Tottenham,Ipswich,1.0,2.0,A,0.0,2.0,A,D England,...,14.5,1.0,0.0,40,21,0,0,34,10,90
9328,2024-11-10,Man United,Leicester,3.0,0.0,H,2.0,0.0,H,P Bankes,...,9.8,1.4,0.0,27,23,2,2,116,141,154


## Section 4: Separating the DataFrame into Input and Output Features

### Section 4.1: Selecting the Input and Output Features of the Model

In [70]:
input_features = ['HomeTeam_Encoded','AwayTeam_Encoded','HomeManager_Encoded',
                 'AwayManager_Encoded','Referee_Encoded','kFTHG', 'kHTHG','kHS', 'kHC', 'kHF', 'kHY', 
                  'kHR', 'kFTAG', 'kHTAG', 'kAS', 'kAC', 'kAF', 'kAY', 'kAR', 
                  'HomeWinRate', 'AwayWinRate', 
                  'nFTHG_Pairwise', 'nHTHG_Pairwise', 'nHS_Pairwise', 'nHC_Pairwise',
                   'nHF_Pairwise', 'nHY_Pairwise', 'nHR_Pairwise', 'nFTAG_Pairwise',
                   'nHTAG_Pairwise', 'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise',
                   'nAY_Pairwise', 'nAR_Pairwise', 'HomeSpending', 'AwaySpending']
df_input = df_matches[input_features]
df_input = df_matches[input_features].fillna(0)

display(df_input)

Unnamed: 0,HomeTeam_Encoded,AwayTeam_Encoded,HomeManager_Encoded,AwayManager_Encoded,Referee_Encoded,kFTHG,kHTHG,kHS,kHC,kHF,...,nHR_Pairwise,nFTAG_Pairwise,nHTAG_Pairwise,nAS_Pairwise,nAC_Pairwise,nAF_Pairwise,nAY_Pairwise,nAR_Pairwise,HomeSpending,AwaySpending
0,12,26,2,78,143,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.80,20.28
1,13,43,30,61,65,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.39,18.65
2,14,28,62,164,19,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
3,16,36,77,160,12,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.33,0.00
4,22,17,36,174,46,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.15,34.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,9,26,49,124,148,1.921053,0.868421,17.657895,9.368421,9.473684,...,0.0,3.2,2.0,16.8,4.0,7.0,1.4,0.2,0.00,25.00
9326,31,29,112,44,4,1.447368,0.631579,12.605263,7.263158,7.868421,...,0.0,2.5,1.5,11.0,6.5,6.5,1.5,0.0,105.50,68.20
9327,40,21,10,90,34,2.026316,0.684211,17.000000,8.947368,9.736842,...,0.5,1.5,1.0,13.5,4.5,14.5,1.0,0.0,148.85,126.49
9328,27,23,141,154,116,1.500000,0.473684,16.921053,8.394737,9.763158,...,0.0,0.8,0.2,12.8,5.8,9.8,1.4,0.0,214.50,86.05


In [71]:
output_features = ['FTR_Encoded']
df_output = df_matches['FTR_Encoded']
display(df_output)

0       2
1       2
2       0
3       1
4       2
       ..
9325    2
9326    0
9327    0
9328    2
9329    1
Name: FTR_Encoded, Length: 9330, dtype: int32

### Section 4.3: Checking the Correlation Between Each Input Feature With Respect to FTR_Encoded

In [72]:
#Checking correlation Matrix
correlation_matrix = pd.concat([df_input, df_output], axis=1).corr()

sorted_correlation = correlation_matrix['FTR_Encoded'].abs().sort_values(ascending = False)

display(sorted_correlation)

FTR_Encoded            1.000000
HomeWinRate            0.309951
AwayWinRate            0.297890
kFTHG                  0.246896
kFTAG                  0.243459
kAS                    0.241129
kHTHG                  0.223033
kHS                    0.214479
kHTAG                  0.212977
kAC                    0.178061
kHC                    0.171585
nAS_Pairwise           0.161303
nFTAG_Pairwise         0.158576
nFTHG_Pairwise         0.134183
nHTAG_Pairwise         0.129286
nAC_Pairwise           0.115603
nHS_Pairwise           0.113863
kHY                    0.111643
nHTHG_Pairwise         0.106688
nHC_Pairwise           0.097164
nHY_Pairwise           0.095584
AwaySpending           0.094791
HomeSpending           0.084161
HomeManager_Encoded    0.073002
AwayTeam_Encoded       0.063966
kAF                    0.057522
AwayManager_Encoded    0.055152
kHF                    0.054368
HomeTeam_Encoded       0.053972
nHF_Pairwise           0.043265
nHR_Pairwise           0.025481
kHR     

### Section 4.4: Splitting the Dataset into Training and Test Datasets

In [73]:
#Splitting the dataset into training and test data
from sklearn.model_selection import train_test_split
testing_size = 0.33
input_train, input_test, output_train, output_test = train_test_split(df_input, df_output, test_size=testing_size, random_state=42)

### Section 4.5: Scaling the Input and Test Datasets

In [74]:
# Scaling the input training and test data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

input_train_scaled = scaler.fit_transform(input_train)
input_test_scaled = scaler.transform(input_test)


### Section 4.6: Training the Support Vector Machine Model

In [75]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train the Support Vector Machine model
svm_model = SVC(kernel='linear') # You can choose other kernels like 'rbf', 'poly', etc.
svm_model.fit(input_train_scaled, output_train)

# Predict the outcomes for the test set
predictions = svm_model.predict(input_test_scaled)

# Evaluate the model
accuracy = accuracy_score(output_test, predictions)
print(f'Model Accuracy: {accuracy * 100:.2f}%')
print(classification_report(output_test, predictions, target_names=le_results.classes_))

# Predict the outcome of an arbitrary game
def predict_match(home_team, away_team, home_manager, away_manager, referee, k_fthg, k_hthg, k_hs, k_hc, k_hf, k_hy, k_hr, k_ftag, k_htag, k_as, k_ac, k_af, k_ay, k_ar, home_win_rate, away_win_rate, n_fthg_pairwise, n_hthg_pairwise, n_hs_pairwise, n_hc_pairwise, n_hf_pairwise, n_hy_pairwise, n_hr_pairwise, n_ftag_pairwise, n_htag_pairwise, n_as_pairwise, n_ac_pairwise, n_af_pairwise, n_ay_pairwise, n_ar_pairwise, home_spending, away_spending):
    
    # Encode categorical inputs
    home_team_encoded = le_teams.transform([home_team])[0]
    away_team_encoded = le_teams.transform([away_team])[0]
    home_manager_encoded = le_managers.transform([home_manager])[0]
    away_manager_encoded = le_managers.transform([away_manager])[0]
    referee_encoded = le_referee.transform([referee])[0]
    
    # Create a data frame for the input features
    match_features = [home_team_encoded, away_team_encoded, home_manager_encoded, away_manager_encoded, referee_encoded,
                      k_fthg, k_hthg, k_hs, k_hc, k_hf, k_hy, k_hr,
                      k_ftag, k_htag, k_as, k_ac, k_af, k_ay, k_ar,
                      home_win_rate, away_win_rate, 
                      n_fthg_pairwise, n_hthg_pairwise, n_hs_pairwise, n_hc_pairwise,
                      n_hf_pairwise, n_hy_pairwise, n_hr_pairwise, n_ftag_pairwise,
                      n_htag_pairwise, n_as_pairwise, n_ac_pairwise, n_af_pairwise,
                      n_ay_pairwise, n_ar_pairwise, home_spending, away_spending]

    # Scale the input features
    match_features_scaled = scaler.transform([match_features])

    # Predict the outcome
    prediction = svm_model.predict(match_features_scaled)[0]
    predicted_outcome = le_results.inverse_transform([prediction])[0]
    
    return predicted_outcome



Model Accuracy: 55.89%
              precision    recall  f1-score   support

           A       0.55      0.50      0.53       906
           D       0.00      0.00      0.00       734
           H       0.56      0.88      0.69      1439

    accuracy                           0.56      3079
   macro avg       0.37      0.46      0.40      3079
weighted avg       0.43      0.56      0.48      3079



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 4.7 Prediction

In [80]:
import pandas as pd

# Example usage
home_teamp = 'Liverpool'
away_teamp = 'Arsenal'
match_date = '2024-12-01'  # Define the match date here

# Define the function to prepare data for prediction
def prediction_data(home_team, away_team, date):
    # Read the Excel file
    file_path = "encoded.xlsx"
    df = pd.read_excel(file_path)
    
    # Ensure dates are parsed correctly
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

    # Function to map team names to encoded names
    def get_encoded_team_name(team_name):
        encoded_name = df['HomeTeam_Encoded'][df['HomeTeam'] == team_name].values
        if len(encoded_name) == 0:
            encoded_name = df['AwayTeam_Encoded'][df['AwayTeam'] == team_name].values
        return encoded_name[0] if len(encoded_name) > 0 else None

    # Function to get the past 5 matches for a given team
    def get_past_matches(team, date):
        past_matches = df[((df['HomeTeam_Encoded'] == team) | (df['AwayTeam_Encoded'] == team)) & (df['Date'] < date)]
        past_matches = past_matches.sort_values(by='Date', ascending=False).head(5)
        return past_matches

    # Function to extract relevant columns from matches
    def extract_relevant_data(matches):
        relevant_columns = ['kFTHG', 'kHTHG', 'kHS', 'kHC', 'kHF', 'kHY', 'kHR', 'kFTAG', 'kHTAG', 'kAS', 'kAC',
                            'kAF', 'kAY', 'kAR', 'HomeWinRate', 'AwayWinRate', 'nFTHG_Pairwise', 'nHTHG_Pairwise',
                            'nHS_Pairwise', 'nHC_Pairwise', 'nHF_Pairwise', 'nHY_Pairwise', 'nHR_Pairwise',
                            'nFTAG_Pairwise', 'nHTAG_Pairwise', 'nAS_Pairwise', 'nAC_Pairwise', 'nAF_Pairwise',
                            'nAY_Pairwise', 'nAR_Pairwise', 'HomeSpending', 'AwaySpending', 
                            'HomeManager_Encoded', 'AwayManager_Encoded', 'Referee_Encoded']
        
        # Ensure only existing columns are accessed
        existing_columns = [col for col in relevant_columns if col in matches.columns]
        return matches[existing_columns].mean()

    # Get match date
    match_date = pd.to_datetime(date, dayfirst=True)
    home_team_encoded = get_encoded_team_name(home_team)
    away_team_encoded = get_encoded_team_name(away_team)
    
    home_past_matches = get_past_matches(home_team_encoded, match_date)
    away_past_matches = get_past_matches(away_team_encoded, match_date)

    # Extract relevant data and compute mean
    home_features = extract_relevant_data(home_past_matches)
    away_features = extract_relevant_data(away_past_matches)

    # Collect input features for prediction
    match_features = {
        'home_team': home_team_encoded,
        'away_team': away_team_encoded,
        'home_manager': home_features.get('HomeManager_Encoded', None),
        'away_manager': away_features.get('AwayManager_Encoded', None),
        'referee': home_features.get('Referee_Encoded', None),
        'k_fthg': home_features.get('kFTHG', 0),
        'k_hthg': home_features.get('kHTHG', 0),
        'k_hs': home_features.get('kHS', 0),
        'k_hc': home_features.get('kHC', 0),
        'k_hf': home_features.get('kHF', 0),
        'k_hy': home_features.get('kHY', 0),
        'k_hr': home_features.get('kHR', 0),
        'k_ftag': away_features.get('kFTAG', 0),
        'k_htag': away_features.get('kHTAG', 0),
        'k_as': away_features.get('kAS', 0),
        'k_ac': away_features.get('kAC', 0),
        'k_af': away_features.get('kAF', 0),
        'k_ay': away_features.get('kAY', 0),
        'k_ar': away_features.get('kAR', 0),
        'home_win_rate': home_features.get('HomeWinRate', 0),
        'away_win_rate': away_features.get('AwayWinRate', 0),
        'n_fthg_pairwise': home_features.get('nFTHG_Pairwise', 0),
        'n_hthg_pairwise': home_features.get('nHTHG_Pairwise', 0),
        'n_hs_pairwise': home_features.get('nHS_Pairwise', 0),
        'n_hc_pairwise': home_features.get('nHC_Pairwise', 0),
        'n_hf_pairwise': home_features.get('nHF_Pairwise', 0),
        'n_hy_pairwise': home_features.get('nHY_Pairwise', 0),
        'n_hr_pairwise': home_features.get('nHR_Pairwise', 0),
        'n_ftag_pairwise': away_features.get('nFTAG_Pairwise', 0),
        'n_htag_pairwise': away_features.get('nHTAG_Pairwise', 0),
        'n_as_pairwise': away_features.get('nAS_Pairwise', 0),
        'n_ac_pairwise': away_features.get('nAC_Pairwise', 0),
        'n_af_pairwise': away_features.get('nAF_Pairwise', 0),
        'n_ay_pairwise': away_features.get('nAY_Pairwise', 0),
        'n_ar_pairwise': away_features.get('nAR_Pairwise', 0),
        'home_spending': home_features.get('HomeSpending', 0),
        'away_spending': away_features.get('AwaySpending', 0)
    }

    return match_features

# Using the function to predict match outcome
features = prediction_data(home_teamp, away_teamp, match_date)

# Ensure the SVM model can handle new labels
try:
    predicted_outcome = predict_match(**features)
    print(f"The predicted outcome of the match is: {predicted_outcome}")
except ValueError as e:
    print(f"Error during prediction: {e}")


Error during prediction: y contains previously unseen labels: 24


In [None]:
home_team = home_teamp
    away_team = away_teamp
    home_manager = 'Mikel Arteta'
    away_manager = 'Arne Slot'
    referee = 'A Taylor'
    k_fthg = 1.5
    k_hthg = 1.0
    k_hs = 10.0
    k_hc = 5.0
    k_hf = 15.0
    k_hy = 3.0
    k_hr = 0.1
    k_ftag = 1.2
    k_htag = 0.8
    k_as = 8.0
    k_ac = 4.0
    k_ay = 2.5
    k_ar = 0.2
    home_win_rate = 0.6
    away_win_rate = 0.4
    n_fthg_pairwise = 1.4
    n_hthg_pairwise = 0.9
    n_hs_pairwise = 9.0
    n_hc_pairwise = 4.5
    n_hf_pairwise = 13.0
    n_hy_pairwise = 2.8
    n_hr_pairwise = 0.2
    n_ftag_pairwise = 1.3
    n_htag_pairwise = 0.7
    n_as_pairwise = 7.5
    n_ac_pairwise = 3.5
    n_af_pairwise = 11.0
    n_ay_pairwise = 2.0
    n_ar_pairwise = 0.3
    home_spending = 10000000
    away_spending = 8000000