In [2]:
import numpy as np # For maths operations
import matplotlib.pyplot as plt # For plotting and creating graphs
import pandas as pd # For handling csv files and data handling
import seaborn as sns # For nicer graphs also

In [5]:
# Load the match-results CSV into a DataFrame (path is relative to the notebook's
# working directory), then preview the first five rows with .head().
df = pd.read_csv(filepath_or_buffer='../data/pl_24-25.csv')
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA
0,E0,16/08/2024,20:00,Man United,Fulham,1,0,H,0,0,...,1.86,2.07,1.83,2.11,1.88,2.11,1.82,2.05,1.9,2.08
1,E0,17/08/2024,12:30,Ipswich,Liverpool,0,2,A,0,0,...,2.05,1.88,2.04,1.9,2.2,2.0,1.99,1.88,2.04,1.93
2,E0,17/08/2024,15:00,Arsenal,Wolves,2,0,H,1,0,...,2.02,1.91,2.0,1.9,2.05,1.93,1.99,1.87,2.02,1.96
3,E0,17/08/2024,15:00,Everton,Brighton,0,3,A,0,1,...,1.87,2.06,1.86,2.07,1.92,2.1,1.83,2.04,1.88,2.11
4,E0,17/08/2024,15:00,Newcastle,Southampton,1,0,H,1,0,...,1.87,2.06,1.88,2.06,1.89,2.1,1.82,2.05,1.89,2.1


In [7]:
# Show the column labels as a pandas Index; with this many columns the repr
# abbreviates the middle of the list with "...".
df.columns

Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA',
       'AvgCAHH', 'AvgCAHA', 'BFECAHH', 'BFECAHA'],
      dtype='object', length=120)

In [8]:
# Print a concise summary of the frame: index range, number of columns,
# how many columns there are of each dtype, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Columns: 120 entries, Div to BFECAHA
dtypes: float64(96), int64(16), object(8)
memory usage: 356.4+ KB


In [9]:
# Summary statistics for the numeric columns: count, mean, std, min,
# 25% / 50% / 75% quantiles and max.
df.describe()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,1.513158,1.421053,0.752632,0.610526,13.752632,12.165789,4.834211,4.265789,10.789474,11.276316,...,1.946421,1.964842,1.954632,1.971605,1.987842,2.003026,1.925105,1.939237,1.983079,2.006079
std,1.277917,1.189922,0.875867,0.828831,5.588045,5.409987,2.48826,2.341636,3.63984,3.513257,...,0.086484,0.085585,0.084978,0.086511,0.087387,0.087857,0.080514,0.080919,0.088226,0.089331
min,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,...,1.68,1.73,1.71,1.77,1.83,1.8,1.73,1.75,1.75,1.81
25%,1.0,1.0,0.0,0.0,10.0,9.0,3.0,3.0,8.0,9.0,...,1.88,1.9075,1.88,1.9,1.91,1.94,1.8575,1.87,1.91,1.93
50%,1.0,1.0,1.0,0.0,13.0,11.5,5.0,4.0,11.0,11.0,...,1.95,1.97,1.95,1.97,1.99,2.0,1.92,1.94,1.98,2.01
75%,2.0,2.0,1.0,1.0,17.0,15.0,6.0,6.0,13.0,14.0,...,2.02,2.04,2.02,2.04,2.05,2.0725,1.99,2.01,2.0525,2.07
max,7.0,6.0,4.0,5.0,36.0,37.0,16.0,13.0,21.0,21.0,...,2.1,2.15,2.17,2.26,2.34,2.31,2.13,2.14,2.21,2.3


In [12]:
# Minimal set of columns for a first model: the two teams, the full-time
# result (FTR), the full-time goals for each side, and four home/away match
# statistics (HS, AS, HST, AST).
columns_needed = [
    'HomeTeam', 'AwayTeam',
    'FTR',
    'FTHG', 'FTAG',
    'HS', 'AS',
    'HST', 'AST',
]

# .copy() gives an independent frame, so later edits to df_mvp never touch df.
df_mvp = df.loc[:, columns_needed].copy()
df_mvp.head()

Unnamed: 0,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HS,AS,HST,AST
0,Man United,Fulham,H,1,0,14,10,5,2
1,Ipswich,Liverpool,A,0,2,7,18,2,5
2,Arsenal,Wolves,H,2,0,18,9,6,3
3,Everton,Brighton,A,0,3,9,10,1,5
4,Newcastle,Southampton,H,1,0,3,19,1,4


In [13]:
# Encode the full-time result as an integer class label:
#   'H' (home win) -> 0, 'D' (draw) -> 1, 'A' (away win) -> 2
#
# The mapping is applied to the untouched raw column df['FTR'] rather than to
# df_mvp['FTR'] itself. That keeps this cell idempotent: mapping df_mvp['FTR']
# in place works once, but re-running the cell would push the already-encoded
# integers through the dict again and silently turn the whole column into NaN.
FTR_CODES = {'H': 0, 'D': 1, 'A': 2}
df_mvp['FTR'] = df['FTR'].map(FTR_CODES)

# Series.map yields NaN for any value that is not a key of the dict, so fail
# loudly here instead of handing NaN labels to the model later on.
assert df_mvp['FTR'].notna().all(), "Unexpected FTR value: expected only 'H', 'D' or 'A'"

df_mvp.head()

Unnamed: 0,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HS,AS,HST,AST
0,Man United,Fulham,0,1,0,14,10,5,2
1,Ipswich,Liverpool,2,0,2,7,18,2,5
2,Arsenal,Wolves,0,2,0,18,9,6,3
3,Everton,Brighton,2,0,3,9,10,1,5
4,Newcastle,Southampton,0,1,0,3,19,1,4


In [27]:
# Features (x): everything in df_mvp except
#   - FTR                 -> this is the target
#   - HomeTeam / AwayTeam -> still strings, not encoded yet
#   - FTHG / FTAG         -> full-time goals, only known once the match is over
# NOTE(review): the remaining columns (HS, AS, HST, AST) look like in-match
# statistics too, so they would presumably also be unknown before kick-off;
# confirm whether they are acceptable as features.
x = df_mvp.drop(
    columns=['FTR', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG'],
)

# Target (y): the full-time result the model is trying to predict.
y = df_mvp['FTR']

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,    # fraction of rows reserved for the test set
    random_state=42,  # fixed seed so the same rows are picked on every run
)

In [29]:
# Build and train the classifier.
from sklearn.ensemble import RandomForestClassifier  # ensemble of decision trees

# Hyper-parameters kept in one place: 100 trees, fixed seed for repeatable fits.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)

# Fit on the training split; left as the last expression so the notebook
# still displays the fitted estimator.
model.fit(x_train, y_train)

In [30]:
# Score the fitted model on the held-out test split and report the result
# rounded to two decimals.
accuracy = model.score(X=x_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.59
