In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta
import pickle
import requests
import numpy as np
from glicko2 import Glicko2
from trueskill import TrueSkill
from scipy.stats import norm
from collections import defaultdict

## First look at the data

This dataset contains results from every Bundesliga match from 1993-1994 to 2021-2022. It also includes half time results, but only since 1995-96. Columns include Division (denoted as D1), HomeTeam, AwayTeam, FTHG (final time home goals), FTAG (final time away goals), FTR (full time result), HTHG (half time home goals), HTAG (half time away goals), HTR (half time result), and season.

Data compiled into one file from this site: http://www.football-data.co.uk/germanym.php

In [2]:
df = pd.read_csv('Bundesliga_Results.csv', parse_dates=['Date'])

In [3]:
# some basic cleaning and rewriting
df.sort_values(by='Date', inplace=True)
df.reset_index(inplace=True)
# clean the team names
df.HomeTeam = df.HomeTeam.apply(lambda x: x.lower().replace(' ', ''))
df.AwayTeam = df.AwayTeam.apply(lambda x: x.lower().replace(' ', ''))

# deduct winner and loser from the scores
# add draws to the winner, doesn't really matter
df['home_win'] = np.where(df.FTHG >= df.FTAG, 1, 0)
df['draw'] = np.where(df.FTHG==df.FTAG, 1, 0)
df['away_win'] = np.where(df.FTHG < df.FTAG, 1, 0)

df['winner'] = (df.HomeTeam * (df.home_win) + df.AwayTeam * df.away_win)
df['loser'] = (df.HomeTeam * (1 - df.home_win) + df.AwayTeam * (1 - df.away_win))

print('Dataset contains {} matches from {} to {}. Home won {} ({}%). There were {} draws ({}%).'.format(len(df), df.Date.min().date(), df.Date.max().date(), df.home_win.sum(), int(100*df.home_win.sum()/len(df)), df.draw.sum(), int(100*df.draw.sum()/len(df)) ))

Dataset contains 8874 matches from 1993-08-07 to 2022-05-14. Home won 6379 (71%). There were 2259 draws (25%).


In [4]:
df.tail()

Unnamed: 0.1,level_0,Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,season,home_win,draw,away_win,winner,loser
8869,8867,8867,298,D1,2022-05-14,bielefeld,rbleipzig,1.0,1.0,D,0.0,0.0,D,2021-2022,1,1,0,bielefeld,rbleipzig
8870,8866,8866,297,D1,2022-05-14,augsburg,greutherfurth,2.0,1.0,H,1.0,1.0,D,2021-2022,1,0,0,augsburg,greutherfurth
8871,8865,8865,305,D1,2022-05-14,wolfsburg,bayernmunich,2.0,2.0,D,1.0,2.0,A,2021-2022,1,1,0,wolfsburg,bayernmunich
8872,8868,8868,299,D1,2022-05-14,dortmund,hertha,2.0,1.0,H,0.0,1.0,A,2021-2022,1,0,0,dortmund,hertha
8873,8873,8873,304,D1,2022-05-14,unionberlin,bochum,3.0,2.0,H,2.0,0.0,H,2021-2022,1,0,0,unionberlin,bochum


In [5]:
df['month'] = df.Date.dt.month
df.tail()

Unnamed: 0.1,level_0,Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,season,home_win,draw,away_win,winner,loser,month
8869,8867,8867,298,D1,2022-05-14,bielefeld,rbleipzig,1.0,1.0,D,0.0,0.0,D,2021-2022,1,1,0,bielefeld,rbleipzig,5
8870,8866,8866,297,D1,2022-05-14,augsburg,greutherfurth,2.0,1.0,H,1.0,1.0,D,2021-2022,1,0,0,augsburg,greutherfurth,5
8871,8865,8865,305,D1,2022-05-14,wolfsburg,bayernmunich,2.0,2.0,D,1.0,2.0,A,2021-2022,1,1,0,wolfsburg,bayernmunich,5
8872,8868,8868,299,D1,2022-05-14,dortmund,hertha,2.0,1.0,H,0.0,1.0,A,2021-2022,1,0,0,dortmund,hertha,5
8873,8873,8873,304,D1,2022-05-14,unionberlin,bochum,3.0,2.0,H,2.0,0.0,H,2021-2022,1,0,0,unionberlin,bochum,5


In [6]:
df.head()

Unnamed: 0.1,level_0,Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,season,home_win,draw,away_win,winner,loser,month
0,0,0,0,D1,1993-08-07,bayernmunich,freiburg,3.0,1.0,H,,,,1993-1994,1,0,0,bayernmunich,freiburg,8
1,1,1,8,D1,1993-08-07,werderbremen,stuttgart,5.0,1.0,H,,,,1993-1994,1,0,0,werderbremen,stuttgart,8
2,2,2,7,D1,1993-08-07,wattenscheid,schalke04,3.0,0.0,H,,,,1993-1994,1,0,0,wattenscheid,schalke04,8
3,3,3,6,D1,1993-08-07,m'gladbach,einfrankfurt,0.0,4.0,A,,,,1993-1994,0,0,1,einfrankfurt,m'gladbach,8
4,4,4,1,D1,1993-08-07,dortmund,karlsruhe,2.0,1.0,H,,,,1993-1994,1,0,0,dortmund,karlsruhe,8


In [7]:
df.to_csv('data_preprocessed.csv')