In [1]:
import pandas as pd

In [2]:
# Load the match table; the relative path is resolved against the current working directory.
matches = pd.read_csv(filepath_or_buffer="final_data.csv")

In [3]:
# Integer-encode the home team names: each distinct name is mapped to its
# position in the categorical built from the "HomeTeam" column.
matches["HomeTeam_code"] = pd.Categorical(matches["HomeTeam"]).codes

In [4]:
# Integer-encode the away team names. The categorical is built from the
# "AwayTeam" column on its own, independently of the home-team encoding.
matches["AwayTeam_code"] = pd.Categorical(matches["AwayTeam"]).codes

In [5]:
# Binary label: 1 when the "result" column reads "Win", 0 for every other value.
matches["target"] = matches["result"].eq("Win").astype("int")

In [6]:
# Show the frame to check the newly added HomeTeam_code, AwayTeam_code and target columns.
matches

Unnamed: 0,HomeTeam,AwayTeam,Date,FTHG,FTAG,result,home_total_goals_for_year,away_total_goals_for_year,Head_To_Head_Win_Percentage,HomeTeam_code,AwayTeam_code,target
0,La Coruna,Valencia,1995-09-02,3,0,Win,21,14,38.0952,20,42,1
1,Sp Gijon,Albacete,1995-09-02,3,0,Win,22,5,100.0000,40,1,1
2,Athletic Club,Santander,1995-09-03,4,0,Win,14,10,56.2500,3,38,1
3,Athletico Madrid,Real Sociedad,1995-09-03,4,1,Win,19,12,60.8696,4,35,1
4,Celta Vigo,Compostela,1995-09-03,0,1,Loss,12,6,0.0000,7,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10942,Athletico Madrid,Cadiz,2023-10-01,3,2,Win,39,11,100.0000,4,6,1
10943,Real Betis,Valencia,2023-10-01,3,0,Win,24,15,50.0000,33,42,1
10944,Real Betis,Valencia,2023-10-01,3,0,Win,24,15,50.0000,33,42,1
10945,Las Palmas,Celta Vigo,2023-10-02,2,1,Win,8,27,57.1429,21,7,1


In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Random-forest classifier used for the win / not-win prediction.
rf = RandomForestClassifier(
    n_estimators=50,       # number of trees in the forest
    min_samples_split=10,  # a node needs at least this many samples to be split
    random_state=1,        # fixed seed so repeated runs build the same forest
)

In [9]:
# Training set: every match dated strictly before 2008-01-01.
train = matches.loc[matches["Date"] < '2008-01-01']

In [10]:
# Test set: every match dated on or after 2008-01-01. The comparison is
# inclusive so that, together with the strict "<" used for the training set,
# the two sets partition the data; a strict ">" would silently drop any match
# played exactly on the cut-off date from both sets.
test = matches[matches["Date"] >= '2008-01-01']

In [11]:
# Feature columns fed to the classifier. The single-row frame built inside
# predict_result must use these same column names in this same order.
# NOTE(review): "FTHG" appears to be the home team's goals in the very match
# being predicted, which is not known before kick-off. Training on it likely
# leaks the outcome and inflates the accuracy below — confirm, and consider
# replacing it with a pre-match feature (updating predict_result to match).
predictors = ["HomeTeam_code", "AwayTeam_code", "FTHG", "home_total_goals_for_year"]

In [12]:
# Fit the forest on the pre-2008 matches.
rf.fit(X=train[predictors], y=train["target"])

In [13]:
# Predicted labels for the held-out matches, in the same row order as `test`.
preds = rf.predict(X=test[predictors])

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
# Fraction of held-out matches whose predicted label equals the true label.
acc_score = accuracy_score(y_true=test["target"], y_pred=preds)

In [16]:
# Display the accuracy measured on the held-out matches.
acc_score

0.7850652568974062

In [17]:
# Put true and predicted labels side by side; the frame takes its index from test["target"].
combined = pd.DataFrame({"actual": test["target"], "prediction": preds})

In [18]:
# Confusion table: rows are the actual labels, columns the predicted labels.
pd.crosstab(combined["actual"], combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2500,696
1,605,2252


In [19]:
import numpy as np

In [21]:
def predict_result(home_team, away_team, model=None, data=None):
    """Predict whether ``home_team`` wins a home fixture against ``away_team``.

    The team names are encoded with the same categorical encoding used for the
    ``HomeTeam_code`` / ``AwayTeam_code`` columns. The two remaining model
    inputs cannot be known for a fixture that has not been played, so they are
    estimated deterministically from the home team's own history instead of
    being drawn at random (random inputs made the same fixture yield different
    predictions from one call to the next):

    * ``FTHG`` -- the home team's mean ``FTHG`` over its home matches in ``data``.
    * ``home_total_goals_for_year`` -- the value from the home team's most
      recent home match in ``data`` (ordered by ``Date``).

    Parameters
    ----------
    home_team : str
        Team name exactly as it appears in the ``HomeTeam`` column.
    away_team : str
        Team name exactly as it appears in the ``AwayTeam`` column.
    model : object with a ``predict`` method, optional
        Fitted classifier. Defaults to the notebook-level ``rf``.
    data : pandas.DataFrame, optional
        Match table with ``HomeTeam``, ``AwayTeam``, ``Date``, ``FTHG`` and
        ``home_total_goals_for_year`` columns. Defaults to the notebook-level
        ``matches``.

    Returns
    -------
    str
        ``"Predicted result: <home_team> will win"`` or ``"... will lose"``,
        or a not-found message when either team is unknown. The message is
        returned rather than printed so that ``print(predict_result(...))``
        no longer prints a trailing ``None``.
    """
    # Fall back to the notebook globals only when the caller supplies nothing.
    if model is None:
        model = rf
    if data is None:
        data = matches

    home_team_categories = data["HomeTeam"].astype("category").cat.categories
    away_team_categories = data["AwayTeam"].astype("category").cat.categories
    if home_team not in home_team_categories or away_team not in away_team_categories:
        return "One of the teams was not found in the list."

    # Non-empty by construction: home_team passed the membership check above.
    home_games = data[data["HomeTeam"] == home_team].sort_values("Date")

    # Column names and order must match the columns the model was fitted on.
    feature_vector = pd.DataFrame([{
        'HomeTeam_code': home_team_categories.get_loc(home_team),
        'AwayTeam_code': away_team_categories.get_loc(away_team),
        'FTHG': home_games["FTHG"].mean(),
        'home_total_goals_for_year': home_games["home_total_goals_for_year"].iloc[-1],
    }])
    prediction = model.predict(feature_vector)

    result_map = {0: f"{home_team} will lose", 1: f"{home_team} will win"}
    return f"Predicted result: {result_map[prediction[0]]}"

# Ask for the two team names and print the prediction. Surrounding whitespace
# is stripped because the lookup in predict_result is an exact string match,
# so a stray leading/trailing space would otherwise report the team as unknown.
user_input_home = input("Enter the home team: ").strip()
user_input_away = input("Enter the away team: ").strip()
print(predict_result(user_input_home, user_input_away))

Predicted result: Barcelona will win
