In [1]:
import pandas as pd
import os

csv_files = os.listdir('odds-archive')
csv_files = ["odds-archive/" + e for e in csv_files if not e.startswith('.') ]

def create_df(file_paths):
    full_data = pd.DataFrame()
    for file in file_paths:
        with open(file) as file_in:
            print(f"reading {file}")
            part_df = pd.read_csv(file)
            full_data = pd.concat([full_data, part_df], axis=0)

    return full_data
        
df = create_df(csv_files)

reading odds-archive/football-odds-2007-2008.csv
reading odds-archive/football-odds-2008-2009.csv
reading odds-archive/football-odds-2009-2010.csv
reading odds-archive/football-odds-2010-2011.csv
reading odds-archive/football-odds-2011-2012.csv
reading odds-archive/football-odds-2012-2013.csv
reading odds-archive/football-odds-2013-2014.csv
reading odds-archive/football-odds-2014-2015.csv
reading odds-archive/football-odds-2015-2016.csv
reading odds-archive/football-odds-2016-2017.csv
reading odds-archive/football-odds-2017-2018.csv
reading odds-archive/football-odds-2018-2019.csv
reading odds-archive/football-odds-2019-2020.csv
reading odds-archive/football-odds-2020-2021.csv
reading odds-archive/football-odds-2021-2022.csv


In [2]:
df.columns = ['match', 'bookmaker', 'home-odds', 'draw-odds', 'away-odds', 'date', 'score']
df.shape

(276960, 7)

In [3]:
df.head()

Unnamed: 0,match,bookmaker,home-odds,draw-odds,away-odds,date,score
0,Sunderland\n3\n–\nMiddlesbrough\n2,bet-at-home,120,220,215,"Saturday, 26 Apr 2008, 14:00",3:2
1,Sunderland\n3\n–\nMiddlesbrough\n2,bet365,129,210,240,"Saturday, 26 Apr 2008, 14:00",3:2
2,Sunderland\n3\n–\nMiddlesbrough\n2,bwin,120,210,210,"Saturday, 26 Apr 2008, 14:00",3:2
3,Sunderland\n3\n–\nMiddlesbrough\n2,Interwetten,120,179,229,"Saturday, 26 Apr 2008, 14:00",3:2
4,Sunderland\n3\n–\nMiddlesbrough\n2,Pinnacle,143,200,233,"Saturday, 26 Apr 2008, 14:00",3:2


## Unique bookmakers

In [4]:
df['bookmaker'].unique()

array(['bet-at-home', 'bet365', 'bwin', 'Interwetten', 'Pinnacle',
       'Unibet', 'William Hill', 'Marathonbet', '1xBet', 'Marsbet',
       'GGBET', '10x10bet', 'Curebet', 'Lasbet', 'VOBET'], dtype=object)

# Preprocess

1. create a column describing the match result: home, draw, or away win
2. convert the odds to probability

## Clean data

there is columns that does not retrieve data.
Thankfully, it's only few dozens

In [5]:
df = df.drop(df[df['home-odds'] == '-'].index)

## Create column *match-result*

In [6]:
def create_match_result(score)-> str:
    parts = score.split(':')
    home = int(parts[0])
    away = int(parts[1])
    if home > away:
        'home'
        return 0
    elif home == away:
        'draw'
        return 1 
    else:
        'away'
        return 2
    
    
df['match-result'] = df['score'].apply(create_match_result)

In [7]:
df.sample(10)

Unnamed: 0,match,bookmaker,home-odds,draw-odds,away-odds,date,score,match-result
17317,Newcastle\n0\n–\nManchester City\n2,William Hill,400,275,-149,"Sunday, 17 Aug 2014, 15:00",0:2,2
78,Blackburn\n3\n–\nDerby\n1,bet365,-400,450,1200,"Saturday, 03 May 2008, 14:00",3:1,0
7250,Birmingham\n0\n–\nEverton\n2,bet365,220,210,138,"Saturday, 02 Oct 2010, 14:00",0:2,2
22145,West Brom\n4\n–\nBurnley\n0,Interwetten,100,240,280,"Monday, 21 Nov 2016, 19:00",4:0,0
5323,Chelsea\n3\n–\nWest Ham\n0,Pinnacle,-500,655,1625,"Saturday, 23 Apr 2011, 16:30",3:0,0
1655,Stoke\n1\n–\nWest Brom\n0,bet365,138,240,200,"Saturday, 22 Nov 2008, 14:00",1:0,0
24174,Manchester Utd\n2\n–\nChelsea\n1,bwin,135,220,220,"Sunday, 25 Feb 2018, 13:05",2:1,0
22551,Southampton\n1\n–\nSwansea\n0,Marathonbet,-143,290,490,"Sunday, 18 Sep 2016, 13:15",1:0,0
7492,Swansea\n4\n–\nWolves\n4,Unibet,-200,300,575,"Saturday, 28 Apr 2012, 14:00",4:4,1
17234,Tottenham\n4\n–\nQPR\n0,Marathonbet,-172,340,540,"Sunday, 24 Aug 2014, 12:30",4:0,0


## odds to probability

In [8]:
import numpy as np

def convert_american_to_decimal(odds):
    float_odds = np.float32(odds)
    if float_odds > 0:
        return (float_odds / 100) + 1
    if float_odds < 0:
        return (100 / float_odds) + 1
    if float_odds == 0:
        return 0
    raise Exception("How could this happen?" + float_odds)
    

df['decimal-home'] = df['home-odds'].apply(convert_american_to_decimal)
df['decimal-draw'] = df['draw-odds'].apply(convert_american_to_decimal)
df['decimal-away'] = df['away-odds'].apply(convert_american_to_decimal)

In [9]:
def convert_to_probability(decimal):
    probability = (1 / decimal)
    return round(probability, 3)

def get_margin_of_three_odds(margin):
    """
    probability over 1, multiply by margin will be the odds:
    (1/prob) * margin = odd
    There are 3 odds available, we assume the margin is a fixed percentage for each odd
    home_odd + draw_odd + away_odd = total payout
    (1/home_prob) * margin + (1/draw_prob) * margin + (1/away_prob) * margin = total payout
    to get the margin, we simply calculate the cube root of the overround
    """
    return margin ** (1/3)

df['probability-home'] = df['decimal-home'].apply(convert_to_probability)
df['probability-draw'] = df['decimal-draw'].apply(convert_to_probability)
df['probability-away'] = df['decimal-away'].apply(convert_to_probability)
df['overround'] = df['probability-home'] + df['probability-draw'] + df['probability-away'] - 1
df['margin'] = df['overround'].apply(get_margin_of_three_odds)
df.head()

Unnamed: 0,match,bookmaker,home-odds,draw-odds,away-odds,date,score,match-result,decimal-home,decimal-draw,decimal-away,probability-home,probability-draw,probability-away,overround,margin
0,Sunderland\n3\n–\nMiddlesbrough\n2,bet-at-home,120,220,215,"Saturday, 26 Apr 2008, 14:00",3:2,0,2.2,3.2,3.15,0.455,0.312,0.317,0.084,0.437952
1,Sunderland\n3\n–\nMiddlesbrough\n2,bet365,129,210,240,"Saturday, 26 Apr 2008, 14:00",3:2,0,2.29,3.1,3.4,0.437,0.323,0.294,0.054,0.377976
2,Sunderland\n3\n–\nMiddlesbrough\n2,bwin,120,210,210,"Saturday, 26 Apr 2008, 14:00",3:2,0,2.2,3.1,3.1,0.455,0.323,0.323,0.101,0.465701
3,Sunderland\n3\n–\nMiddlesbrough\n2,Interwetten,120,179,229,"Saturday, 26 Apr 2008, 14:00",3:2,0,2.2,2.79,3.29,0.455,0.358,0.304,0.117,0.489097
4,Sunderland\n3\n–\nMiddlesbrough\n2,Pinnacle,143,200,233,"Saturday, 26 Apr 2008, 14:00",3:2,0,2.43,3.0,3.33,0.412,0.333,0.3,0.045,0.355689


## Ensemble modelling: soft vote

In [10]:
unibet_df = df[df['bookmaker'] == 'Unibet']
william_hill_df = df[df['bookmaker'] == 'William Hill']
bet365_df = df[df['bookmaker'] == 'bet365']


other_bookmakers = df.drop(df[df['bookmaker'] == 'bet365'].index)
other_bookmakers = other_bookmakers.drop(other_bookmakers[other_bookmakers['bookmaker'] == 'William Hill'].index)
other_bookmakers = other_bookmakers.drop(other_bookmakers[other_bookmakers['bookmaker'] == 'Unibet'].index)

In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split

x_var = ['probability-home', 'probability-draw','probability-away']
y = ['match-result']

unibet_x_train , unibet_x_test, unibet_y_train, unibet_y_test = train_test_split(unibet_df[x_var], unibet_df[y], random_state=0)
william_hill_x_train , william_hill_x_test, william_hill_y_train, william_hill_y_test = train_test_split(william_hill_df[x_var], william_hill_df[y], random_state=0)
bet365_x_train , bet365_x_test, bet365_y_train, bet365_y_test = train_test_split(bet365_df[x_var], bet365_df[y], random_state=0)
other_x_train , other_x_test, other_y_train, other_y_test = train_test_split(other_bookmakers[x_var], other_bookmakers[y], random_state=0)

In [16]:


unibet_clf = svm.SVC(random_state=0)
william_hill_clf = svm.SVC(random_state=0)
bet365_clf = svm.SVC(random_state=0)
other_bookmakers_clf = svm.SVC(random_state=0)

ensemble_model = VotingClassifier(
    estimators=[('unibet', unibet_clf), 
                ('william_hill', william_hill_clf), 
                ('bet365', bet365_clf), 
                ('others', other_bookmakers_clf)],
                        voting='soft')



unibet_clf = unibet_clf.fit(unibet_x_train, unibet_y_train)
william_hill_clf = william_hill_clf.fit(william_hill_x_train, william_hill_y_train)
bet365_clf = bet365_clf.fit(bet365_x_train, bet365_y_train)
other_bookmakers_clf = other_bookmakers_clf.fit(other_x_train, other_y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [24]:
from sklearn.metrics import accuracy_score
def evaluate_model(clf, clf_name, X_test, y_test):
    score = clf.score(X_test, y_test)
    print(f"{clf_name}: {score}")
    
clfs = [unibet_clf, william_hill_clf, bet365_clf, other_bookmakers_clf]
clfs_name = ['unibet_clf', 'william_hill_clf', 'bet365_clf', 'other_bookmakers_clf']
test_x = [unibet_x_test, william_hill_x_test, bet365_x_test, other_x_test]
test_y = [unibet_y_test, william_hill_y_test, bet365_y_test, other_y_test]

for (clf, name, x, y) in zip(clfs, clfs_name, test_x, test_y):
    evaluate_model(clf=clf, clf_name=name, X_test=x, y_test=y)

unibet_clf: 0.5390005743825388
william_hill_clf: 0.537820061075875
bet365_clf: 0.5363198900091659
other_bookmakers_clf: 0.5368941787279896


In [27]:
other_y_test

Unnamed: 0,match-result
33122,2
4121,0
21659,0
22016,1
11235,2
...,...
12049,0
33019,1
7008,0
13631,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df[x_var], df['match-result'], test_size=0.3, random_state=42)

ensemble_model = ensemble_model.fit(X_train, y_train)
ensemble_model.score(X_test, y_test)

In [None]:
# save to file
from joblib import dump

dump(ensemble_model, 'ensemble_model.joblib') 