Imports and Constants

In [1]:
from io import StringIO

import pandas as pd
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# path of the CSV that is later loaded into team_stats with pd.read_csv
matches_file = 'matches.csv'

Scrape Euro qualifying match data

In [2]:
matches_url = 'https://fbref.com/en/comps/678/schedule/UEFA-Euro-Qualifying-Scores-and-Fixtures'
# the timeout keeps Run All from hanging on a stalled connection, and
# raise_for_status reports an HTTP error here instead of letting an error page
# reach read_html and fail there with an unrelated-looking message
matches_data = requests.get(matches_url, timeout=30)
matches_data.raise_for_status()

In [3]:
# passing literal HTML as a str is deprecated since pandas 2.1, so hand
# read_html a file-like buffer; [0] takes the first table matching the caption
qualifying_games = pd.read_html(StringIO(matches_data.text), match='Scores & Fixtures')[0]

Scrape Euro match data

In [4]:
euro_games_url = 'https://fbref.com/en/comps/676/schedule/European-Championship-Scores-and-Fixtures'
# the timeout keeps Run All from hanging on a stalled connection, and
# raise_for_status reports an HTTP error here instead of letting an error page
# reach read_html and fail there with an unrelated-looking message
euro_games_data = requests.get(euro_games_url, timeout=30)
euro_games_data.raise_for_status()

In [5]:
# passing literal HTML as a str is deprecated since pandas 2.1, so hand
# read_html a file-like buffer; [0] takes the first table matching the caption
euro_games = pd.read_html(StringIO(euro_games_data.text), match='Scores & Fixtures')[0]

Combine data together

In [6]:
# stack the qualifying and tournament fixtures, discard rows that hold no data
# at all, and renumber the remaining rows 0..n-1
matches = (
    pd.concat([qualifying_games, euro_games])
    .dropna(how='all')
    .reset_index(drop=True)
)
# lower case all column names
matches.columns = [label.lower() for label in matches.columns]

Create Predictors

In [7]:
def normalize_team_name(name):
    """Return a lower-case team name without its short country-code token.

    The name is split on single spaces. A leading token of at most three
    lower-case letters is dropped (unless it is the word 'san'); otherwise a
    trailing token of that shape is dropped. A few abbreviated or accented
    spellings are then mapped to a fixed full name, and every other name is
    simply lower-cased.
    """
    def looks_like_code(token):
        return len(token) <= 3 and token.islower() and token.isalpha()

    tokens = name.split(' ')
    if tokens[0] != 'san' and looks_like_code(tokens[0]):
        tokens = tokens[1:]
    elif looks_like_code(tokens[-1]):
        tokens = tokens[:-1]
    stripped = ' '.join(tokens)

    # spellings that need more than lower-casing
    special_cases = {
        "Bosnia & Herz'na": 'bosnia and herzegovina',
        'N. Macedonia': 'north macedonia',
        'Rep. of Ireland': 'republic of ireland',
        'Türkiye': 'turkiye',
    }
    return special_cases.get(stripped, stripped.lower())

In [8]:
# normalize team names in both fixture columns
for side in ('home', 'away'):
    matches[side] = [normalize_team_name(raw_name) for raw_name in matches[side]]

In [9]:
# convert predictor columns to numeric types: parsed date, kickoff hour (the
# text before the first ':'), weekday number, and an integer code per team
matches['date'] = pd.to_datetime(matches['date'])
kickoff_hour = matches['time'].str.replace(':.+', '', regex=True)
matches = matches.assign(
    hour=kickoff_hour.astype('int'),
    day_code=matches['date'].dt.dayofweek,
    home_code=matches['home'].astype('category').cat.codes,
    away_code=matches['away'].astype('category').cat.codes,
)

Calculate rolling averages for each team

In [10]:
def rolling_averages(group, cols):
    """Add '<col>_rolling' columns holding the mean of the three preceding rows.

    Rows are ordered by 'date' first. Gaps in the rolling means are
    forward-filled from the previous row, and rows that still lack any rolling
    value (the earliest ones) are dropped. The frame passed in is not modified;
    a new frame is returned.
    """
    ordered = group.sort_values('date')
    rolling_names = [col + '_rolling' for col in cols]
    # closed='left' leaves the current row out of its own window, so each
    # average is built only from rows dated before it
    window_means = ordered[cols].rolling(3, closed='left').mean()
    ordered[rolling_names] = window_means.ffill()
    return ordered.dropna(subset=rolling_names)

In [12]:
# read team data from the csv file, using its first column as the row index
team_stats = pd.read_csv(matches_file, index_col=0)

In [13]:
# fix goals columns to not include pen shootout stats (presumably values such
# as '1 (4)'): everything from the first space onward is removed.
# astype(str) comes first because the .str accessor raises on a column that is
# already numeric, which happens when the CSV holds no such values or when this
# cell is run a second time after the conversion has been applied.
for goals_col in ['gf', 'ga']:
    team_stats[goals_col] = (
        team_stats[goals_col]
        .astype(str)
        .str.replace(' (.+)', '', regex=True)
        .astype('float')
    )
# convert date column
team_stats['date'] = pd.to_datetime(team_stats['date'])

In [14]:
# team_stats columns to calculate rolling averages for; each one gets a
# companion '<name>_rolling' column
rolling_cols = ['gf', 'ga', 'poss', 'sh', 'sot', 'crdy', 'fls', 'fld', 'crs', 'int', 'tklw']

In [15]:
# Compute the rolling stats one team at a time and stack the results with a
# fresh 0..n-1 index. Iterating over the groups hands rolling_averages every
# column, including 'team', which the merges on ['date', 'team'] need later.
# groupby.apply is avoided because passing the grouping column to the applied
# function is deprecated as of pandas 2.2.
team_stats = pd.concat(
    [
        rolling_averages(team_rows, rolling_cols)
        for _, team_rows in team_stats.groupby('team')
    ],
    ignore_index=True,
)

Merge matches data with data from each team playing

In [16]:
# team data columns to merge: the two join keys plus every rolling-average column
merge_cols = ['date', 'team'] + [i + '_rolling' for i in rolling_cols]

In [17]:
# merge on home teams: bring in the home side's rolling stats and its result
# (merge defaults to an inner join, so fixtures without a matching
# date/team row in team_stats are dropped)
home_stats = team_stats[merge_cols + ['result']]
matches = matches.assign(team=matches['home']).merge(home_stats, on=['date', 'team'])

In [18]:
# merge on away teams: columns present on both sides of this merge are
# renamed, '_home' for the ones already in matches and '_away' for the new ones
away_stats = team_stats[merge_cols]
matches = matches.assign(team=matches['away']).merge(
    away_stats, on=['date', 'team'], suffixes=['_home', '_away']
)

In [19]:
# prediction target column: integer category code of 'result', which was merged
# in with the home side's stats. Codes follow the sorted distinct values and a
# missing result becomes -1.
matches['target'] = matches['result'].astype('category').cat.codes

Training the model

In [20]:
# single model instance: make_prediction fits it and the Euro prediction cell
# reuses it; random_state fixes the seed so repeated runs give the same forest
rf = RandomForestClassifier(n_estimators=75, min_samples_split=10, random_state=1)

In [21]:
def make_prediction(data, predictors, train_end='2023-11-01', test_end='2024-06-13'):
    """Fit the notebook-level ``rf`` on earlier matches and score it on a hold-out window.

    Parameters
    ----------
    data : DataFrame
        Needs a datetime 'date' column, a 'target' column and every column
        named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    train_end : str, default '2023-11-01'
        Rows dated strictly before this are used for fitting.
    test_end : str, default '2024-06-13'
        Rows dated from ``train_end`` up to, but not including, this date are
        used for evaluation.

    Returns
    -------
    (DataFrame, float)
        A frame with 'actual' and 'prediction' columns indexed like the test
        rows, and the accuracy on those rows.

    Note: this fits the shared ``rf`` object in place, so later cells that call
    ``rf.predict`` use the model trained here.
    """
    # separate training and testing; the test window starts ON train_end so a
    # match played that day is not left out of both sets
    in_train = data['date'] < train_end
    in_test = (data['date'] >= train_end) & (data['date'] < test_end)
    train = data[in_train]
    test = data[in_test]
    # fit model
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    # combine predictions and actuals
    combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
    # evaluate
    accuracy = accuracy_score(test['target'], preds)
    return combined, accuracy

In [22]:
# model inputs: the match-context columns followed by the home side's and then
# the away side's rolling stats
home_form = [stat + '_rolling_home' for stat in rolling_cols]
away_form = [stat + '_rolling_away' for stat in rolling_cols]
predictors = ['hour', 'day_code', 'home_code', 'away_code'] + home_form + away_form

In [23]:
# fit rf and evaluate it on the hold-out window; the bare `acc` on the last
# line makes the notebook display the accuracy
combined, acc = make_prediction(matches, predictors)
acc

0.543859649122807

In [24]:
# confusion table: rows are the actual outcomes, columns the predicted ones
# 0 - draw; 1 - away win; 2 - home win
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,10,4
1,0,11,2
2,0,10,18


Predicting Euro games

In [25]:
# make_prediction's test window stops just before '2024-06-13', so start the
# tournament set ON that day; with a strict '>' a match dated 2024-06-13 would
# be neither evaluated nor predicted
euro_matches = matches[matches['date'] >= '2024-06-13']

In [26]:
# rf was fitted inside make_prediction, so this reuses that trained model on
# the tournament fixtures
predictions = rf.predict(euro_matches[predictors])

In [27]:
# Build one record per Euro fixture and create the table in a single call,
# rather than growing a DataFrame row by row with .loc.
# Outcome codes: 0 - draw; 1 - away win; 2 - home win; -1 - no result recorded.
table_rows = []
for home, away, predicted_code, actual_code in zip(
    euro_matches['home'], euro_matches['away'], predictions, euro_matches['target']
):
    if predicted_code == 2:
        pred = home
    elif predicted_code == 1:
        pred = away
    else:
        pred = 'draw'

    if actual_code == -1:
        actual = ''  # leave the cell blank when there is no result yet
    elif actual_code == 0:
        actual = 'draw'
    elif actual_code == 1:
        actual = away
    else:
        actual = home
    table_rows.append((home, away, pred, actual))

prediction_table = pd.DataFrame(
    table_rows, columns=['home', 'away', 'predicted winner', 'actual winner']
)
prediction_table

Unnamed: 0,home,away,predicted winner,actual winner
0,hungary,switzerland,switzerland,switzerland
1,spain,croatia,spain,spain
2,italy,albania,italy,italy
3,poland,netherlands,netherlands,netherlands
4,slovenia,denmark,denmark,draw
5,serbia,england,serbia,england
6,romania,ukraine,ukraine,romania
7,belgium,slovakia,slovakia,slovakia
8,austria,france,france,france
9,turkiye,georgia,turkiye,turkiye
