<a href="https://colab.research.google.com/github/BrandyTee/My-ML-Projects/blob/main/Liverpool_vs_Atleti_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
#Load our Dataset
leagues = {"SP1":"LaLiga", "E0":"EPL"}

seasons = ["2122", "2223", "2324", "2425", "2526"]

for season in seasons:
    for code, league in leagues.items():
        url = f"https://www.football-data.co.uk/mmz4281/{season}/{code}.csv"
        df = pd.read_csv(url)
        df.to_csv(f"{league}_{season}.csv", index=False)
        print(f"Downloaded {league} {season}")

Downloaded LaLiga 2122
Downloaded EPL 2122
Downloaded LaLiga 2223
Downloaded EPL 2223
Downloaded LaLiga 2324
Downloaded EPL 2324
Downloaded LaLiga 2425
Downloaded EPL 2425
Downloaded LaLiga 2526
Downloaded EPL 2526


In [None]:
#Show Dataset
sp22 = pd.read_csv("/content/LaLiga_2122.csv")
sp23 = pd.read_csv("/content/LaLiga_2223.csv")
sp24 = pd.read_csv("/content/LaLiga_2324.csv")
sp25 = pd.read_csv("/content/LaLiga_2425.csv")
sp26 = pd.read_csv("/content/LaLiga_2526.csv")

ep22 = pd.read_csv("/content/EPL_2122.csv")
ep23 = pd.read_csv("/content/EPL_2223.csv")
ep24 = pd.read_csv("/content/EPL_2324.csv")
ep25 = pd.read_csv("/content/EPL_2425.csv")
ep26 = pd.read_csv("/content/EPL_2526.csv")

In [None]:
#Concatenate Data into 1
data = pd.concat([sp26, ep26, sp25, ep25, sp24, ep24, sp23, ep23, sp22, ep22])
data.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,IWA,VCH,VCD,VCA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA
0,SP1,15/08/2025,18:00,Girona,Vallecano,1,3,A,0,3,...,,,,,,,,,,
1,SP1,15/08/2025,20:30,Villarreal,Oviedo,2,0,H,2,0,...,,,,,,,,,,
2,SP1,16/08/2025,18:30,Mallorca,Barcelona,0,3,A,0,2,...,,,,,,,,,,
3,SP1,16/08/2025,20:30,Alaves,Levante,2,1,H,1,0,...,,,,,,,,,,
4,SP1,16/08/2025,20:30,Valencia,Sociedad,1,1,D,0,0,...,,,,,,,,,,


In [None]:
#Look at only the features we want
df = data.iloc[:, :26]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3121 entries, 0 to 379
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Div       3121 non-null   object 
 1   Date      3121 non-null   object 
 2   Time      3121 non-null   object 
 3   HomeTeam  3121 non-null   object 
 4   AwayTeam  3121 non-null   object 
 5   FTHG      3121 non-null   int64  
 6   FTAG      3121 non-null   int64  
 7   FTR       3121 non-null   object 
 8   HTHG      3121 non-null   int64  
 9   HTAG      3121 non-null   int64  
 10  HTR       3121 non-null   object 
 11  HS        3121 non-null   int64  
 12  AS        3121 non-null   int64  
 13  HST       3121 non-null   int64  
 14  AST       3121 non-null   int64  
 15  HF        3121 non-null   int64  
 16  AF        3121 non-null   int64  
 17  HC        3121 non-null   int64  
 18  AC        3121 non-null   int64  
 19  HY        3121 non-null   int64  
 20  AY        3121 non-null   int64  
 2

In [None]:
#Drop unnecessary columns
df.drop(["HTR", "HTAG", "HTHG", "Time", "Date", "Div"], axis=1, inplace=True)

In [None]:
df.head(2)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,Girona,Vallecano,1,3,A,7,16,2,5,8,17,2,4,0,1,1,0,2.25,3.25,3.3
1,Villarreal,Oviedo,2,0,H,25,5,10,4,10,8,10,1,1,2,0,1,1.4,4.75,8.0


In [None]:
#Let's remove all matches
#Not including Liv or Atleti

clubs = ["Liverpool", "Ath Madrid"]

df2 = df #df[(df["HomeTeam"].isin(clubs) | df["AwayTeam"].isin(clubs))]

In [None]:
df2.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,Girona,Vallecano,1,3,A,7,16,2,5,8,17,2,4,0,1,1,0,2.25,3.25,3.3
1,Villarreal,Oviedo,2,0,H,25,5,10,4,10,8,10,1,1,2,0,1,1.4,4.75,8.0
2,Mallorca,Barcelona,0,3,A,4,24,1,8,8,17,3,6,4,1,2,0,7.0,5.0,1.4
3,Alaves,Levante,2,1,H,15,7,4,3,15,13,10,1,0,1,0,0,2.15,3.0,3.8
4,Valencia,Sociedad,1,1,D,18,13,2,3,5,11,8,7,1,2,0,0,2.6,2.9,3.1


In [None]:
#Lets Loaf our in most important features

features = ["HS", "AS", "HST", "AST", "HF", "AF", "HC", "AC", "HY", "AY", "HR", "AR", "B365H", "B365D", "B365A"]

#Get the average of these features
#For the past 4 games
def feature_averages(home, away, df, games=10):
    #Get Club Matches
    club_matches = df[(df["HomeTeam"] == home) | (df["AwayTeam"] == away)]

    #Get last n games
    club_matches = club_matches.tail(games)

    #Get averages
    return club_matches[features].mean().values

In [None]:
#Load our function
feature_averages("Liverpool", "Ath Madrid", df2)

array([22.6  ,  6.4  ,  7.5  ,  2.2  ,  7.4  ,  7.2  ,  8.1  ,  3.   ,
        1.1  ,  1.6  ,  0.   ,  0.   ,  1.233,  6.975, 12.85 ])

In [None]:
#Use one hot encoder for our clubs
encoder = OneHotEncoder()

featureList = ["HomeTeam", "AwayTeam"]

encoded = encoder.fit_transform(df2[featureList])
# Convert the sparse matrix to a dense array before creating the DataFrame
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(featureList))

#Drop original home and away columns
df2 = df2.drop(["HomeTeam", "AwayTeam"], axis=1)

#Concatenate original with new
df2 = pd.concat([df2.reset_index(drop=True), encoded_df], axis=1)

In [None]:
#Separate our Data
X = df2.drop(["FTR", "FTAG", "FTHG"], axis=1)
Y = df2[["FTHG", "FTAG"]]

In [None]:
X.head()

Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,...,AwayTeam_Southampton,AwayTeam_Sunderland,AwayTeam_Tottenham,AwayTeam_Valencia,AwayTeam_Valladolid,AwayTeam_Vallecano,AwayTeam_Villarreal,AwayTeam_Watford,AwayTeam_West Ham,AwayTeam_Wolves
0,7,16,2,5,8,17,2,4,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,25,5,10,4,10,8,10,1,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,24,1,8,8,17,3,6,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,7,4,3,15,13,10,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18,13,2,3,5,11,8,7,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df2.head()

Unnamed: 0,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,...,AwayTeam_Southampton,AwayTeam_Sunderland,AwayTeam_Tottenham,AwayTeam_Valencia,AwayTeam_Valladolid,AwayTeam_Vallecano,AwayTeam_Villarreal,AwayTeam_Watford,AwayTeam_West Ham,AwayTeam_Wolves
0,1,3,A,7,16,2,5,8,17,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,0,H,25,5,10,4,10,8,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,3,A,4,24,1,8,8,17,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1,H,15,7,4,3,15,13,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,D,18,13,2,3,5,11,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Perform StandardScaling

scaler = StandardScaler()

X2 = scaler.fit_transform(X)

In [None]:

#PolynomialFeatures
poly = PolynomialFeatures()

X3 = poly.fit_transform(X)

In [None]:
#Do Train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X2, Y, test_size=0.1, random_state=5
)

In [None]:
#Run our models
#Create Model Function
def model_function(model):
    model.fit(X_train, y_train)

    #Prediction
    pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    train_accuracy = r2_score(y_train, train_pred)
    accuracy = r2_score(y_test, pred)
    error = mean_squared_error(y_test, pred)

    print("Train Score: ", train_accuracy)
    print("Test Score: ", accuracy)
    print("Error: ", error)

In [None]:
linear = LinearRegression()
model_function(linear)

Train Score:  0.455095672014545
Test Score:  0.39753133515523137
Error:  0.8286808318212919


In [None]:
lasso = Lasso()
model_function(lasso)

Train Score:  5.329070518200751e-15
Test Score:  -0.0057636570703861745
Error:  1.3875844962618937


In [None]:
forest = RandomForestRegressor(n_estimators=150)
model_function(forest)

Train Score:  0.9164848145506479
Test Score:  0.3435390058729809
Error:  0.9054058217962377


In [None]:
#Try predictions

input_data = [
22.6, 6.4, 7.5, 2.2, 7.4, 7.2, 8.1, 3.,
1.1, 1.6, 0., 0., 1.233, 6.975, 12.85
]

#Load our other hot encoded data
hot_teams = [0.0] * 106

#Find the 2 teams
home_index = X.columns.get_loc("HomeTeam_Liverpool") - 12
away_index = X.columns.get_loc("AwayTeam_Ath Madrid") - 12

#Set the corresponding index
hot_teams[home_index] = 1.0
hot_teams[away_index] = 1.0

#Add the other stats and the hot teams
input_data.extend(hot_teams)


input_np = np.asarray(input_data)
input_reshaped = input_np.reshape(1, -1)
prediction = forest.predict(input_reshaped)

print(f"Liverpool {round(prediction[0][0])} - {round(prediction[0][1])} Atleti")

Liverpool 3 - 2 Atleti
