# Sportsbetting Project Bundesliga

This Notebook contains the development of a machine learning model to predict future Bundesliga games. The goal is to pass the algorithm two team names (i.e. Home Team and Away Team) to get the probability of the outcome of the game.

### Setup

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn

import tensorflow as tf
from tensorflow import keras

# fetching data from website 
import requests
import bs4
import lxml
from bs4 import BeautifulSoup

### Fetching Team market values from website

In [2]:
req = requests.get('https://de.statista.com/statistik/daten/studie/216031/umfrage/marktwert-der-bundesligamannschaften/')
type(req)

requests.models.Response

In [3]:
# raw text
req.text

'<!DOCTYPE html><html lang="de"  prefix="og: http://ogp.me/ns#"><head><meta charset="UTF-8" /><link rel="preconnect" href="https://cdn.statcdn.com" crossorigin><title>1. Fußball-Bundesliga: Marktwerte der Teams 2021 | Statista</title><meta itemprop="description" name="description" content="Rekordmeister FC Bayern München besitzt die Fußballmannschaft mit dem höchsten gesamten Marktwert der 1." /><meta id="gtm_routeName" data-page="statistic.de" /><meta id="gtm_automatedTest" data-page="false" /><meta id="gtm_userProductGroup" data-page="anonymous" /><meta id="gtm_accountTypeId" data-page="31" /><meta id="gtm_locale" data-page="de" /><meta id="gtm_pageType" data-page="statistic.de" /><meta id="gtm_userPhase" data-page="content" /><meta id="gtm_userId" data-page="0" /><meta id="gtm_userProductId" data-page="31" /><meta id="gtm_userLog" data-page="false" /><meta id="gtm_lastContentId" data-page="" /><meta id="gtm_cookieConsentEnabled" data-page="true" /><meta id="dl_gtm_contentview" data-

In [4]:
soup = BeautifulSoup(req.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="de" prefix="og: http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <link crossorigin="" href="https://cdn.statcdn.com" rel="preconnect"/>
  <title>
   1. Fußball-Bundesliga: Marktwerte der Teams 2021 | Statista
  </title>
  <meta content="Rekordmeister FC Bayern München besitzt die Fußballmannschaft mit dem höchsten gesamten Marktwert der 1." itemprop="description" name="description"/>
  <meta data-page="statistic.de" id="gtm_routeName"/>
  <meta data-page="false" id="gtm_automatedTest"/>
  <meta data-page="anonymous" id="gtm_userProductGroup"/>
  <meta data-page="31" id="gtm_accountTypeId"/>
  <meta data-page="de" id="gtm_locale"/>
  <meta data-page="statistic.de" id="gtm_pageType"/>
  <meta data-page="content" id="gtm_userPhase"/>
  <meta data-page="0" id="gtm_userId"/>
  <meta data-page="31" id="gtm_userProductId"/>
  <meta data-page="false" id="gtm_userLog"/>
  <meta data-page="" id="gtm_lastContentId"/>
  <meta data-page="true" id="gtm_cookieConsen

In [5]:
teams = soup.find('tbody').text
teams

'FC Bayern München \r891,4Borussia Dortmund\r615,2RasenBallsport Leipzig 552,68Bayer 04 Leverkusen\r343,45Borussia Mönchengladbach\r343,13Hertha BSC\r251,93TSG 1899 Hoffenheim\r238,33VfL Wolfsburg\r218,55Eintracht Frankfurt\r186,2FC Schalke 04\r165,8SC Freiburg 111,7SV Werder Bremen\r107,2FC Augsburg\r104,28VfB Stuttgart103,451. FC Köln101,81. FSV Mainz 0589,55Union Berlin60,6Arminia Bielefeld45,15'

In [6]:
# Remove unwanted characters and match teamnames with names used later on

teams_clean = teams.replace(' ', '')
teams_clean = teams_clean.replace('\r', '')
teams_clean = teams_clean.replace('FCBayernMünchen', 'BayernMunich')
teams_clean = teams_clean.replace('BorussiaDortmund', 'Dortmund')
teams_clean = teams_clean.replace('RasenBallsportLeipzig', 'RBLeipzig')
teams_clean = teams_clean.replace('BorussiaMönchengladbach', 'Gladbach')
teams_clean = teams_clean.replace('VfLWolfsburg', 'Wolfsburg')
teams_clean = teams_clean.replace('EintrachtFrankfurt', 'EinFrankfurt')
teams_clean = teams_clean.replace('HerthaBSC', 'Hertha')
teams_clean = teams_clean.replace('FCAugsburg', 'Augsburg')
teams_clean = teams_clean.replace('VfbSturrgart', 'Stuttgart')
teams_clean = teams_clean.replace('SCFreiburg', 'Freiburg')
teams_clean = teams_clean.replace('FCKöln', 'FCKoln')
teams_clean = teams_clean.replace('Bayer04Leverkusen', 'Leverkusen')
teams_clean = teams_clean.replace('FSVMainz05', 'Mainz')
teams_clean = teams_clean.replace('ArminiaBielefeld', 'Bielefeld')
teams_clean = teams_clean.replace('FCSchalke04', 'Schalke')
teams_clean = teams_clean.replace('TSG1899Hoffenheim', 'Hoffenheim')
teams_clean = teams_clean.replace('SVWerderBremen', 'WerderBremen')
teams_clean

'BayernMunich891,4Dortmund615,2RBLeipzig552,68Leverkusen343,45Gladbach343,13Hertha251,93Hoffenheim238,33Wolfsburg218,55EinFrankfurt186,2Schalke165,8Freiburg111,7WerderBremen107,2Augsburg104,28VfBStuttgart103,451.FCKoln101,81.Mainz89,55UnionBerlin60,6Bielefeld45,15'

In [7]:
# Split the string into alphabetic and numeric values
from itertools import groupby 

def split_text(s):
    for k, g in groupby(s, str.isalpha):
        yield ''.join(g)

teamsdone = (list(split_text(teams_clean)))
print(teamsdone)

['BayernMunich', '891,4', 'Dortmund', '615,2', 'RBLeipzig', '552,68', 'Leverkusen', '343,45', 'Gladbach', '343,13', 'Hertha', '251,93', 'Hoffenheim', '238,33', 'Wolfsburg', '218,55', 'EinFrankfurt', '186,2', 'Schalke', '165,8', 'Freiburg', '111,7', 'WerderBremen', '107,2', 'Augsburg', '104,28', 'VfBStuttgart', '103,451.', 'FCKoln', '101,81.', 'Mainz', '89,55', 'UnionBerlin', '60,6', 'Bielefeld', '45,15']


In [8]:
# Convert list into dictionary to bind team names to values 
def Convert(lst):
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct

teamsdone = Convert(teamsdone)
teamsdone

{'BayernMunich': '891,4',
 'Dortmund': '615,2',
 'RBLeipzig': '552,68',
 'Leverkusen': '343,45',
 'Gladbach': '343,13',
 'Hertha': '251,93',
 'Hoffenheim': '238,33',
 'Wolfsburg': '218,55',
 'EinFrankfurt': '186,2',
 'Schalke': '165,8',
 'Freiburg': '111,7',
 'WerderBremen': '107,2',
 'Augsburg': '104,28',
 'VfBStuttgart': '103,451.',
 'FCKoln': '101,81.',
 'Mainz': '89,55',
 'UnionBerlin': '60,6',
 'Bielefeld': '45,15'}

In [9]:
sortedTeams = dict(sorted(teamsdone.items()))
sortedTeams

{'Augsburg': '104,28',
 'BayernMunich': '891,4',
 'Bielefeld': '45,15',
 'Dortmund': '615,2',
 'EinFrankfurt': '186,2',
 'FCKoln': '101,81.',
 'Freiburg': '111,7',
 'Gladbach': '343,13',
 'Hertha': '251,93',
 'Hoffenheim': '238,33',
 'Leverkusen': '343,45',
 'Mainz': '89,55',
 'RBLeipzig': '552,68',
 'Schalke': '165,8',
 'UnionBerlin': '60,6',
 'VfBStuttgart': '103,451.',
 'WerderBremen': '107,2',
 'Wolfsburg': '218,55'}

These values will be used later on in this project as a new feature. After Evaluating the process, it would have been easier to create a simple Excel sheet and update the values from time to time. However, now we have automated the process and don't need to spend more time with this feature.

### Load Data

In [10]:
data1 = pd.read_csv('D1.csv')
data1.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,D1,18/09/2020,19:30,Bayern Munich,Schalke 04,8,0,H,3,0,...,4.34,-2.5,1.89,2.04,1.87,2.02,1.95,2.18,1.85,2.02
1,D1,19/09/2020,14:30,Ein Frankfurt,Bielefeld,1,1,D,0,0,...,2.33,-0.75,1.96,1.97,1.96,1.96,2.02,1.98,1.94,1.93
2,D1,19/09/2020,14:30,FC Koln,Hoffenheim,2,3,A,1,2,...,2.27,0.0,1.91,2.02,1.92,2.01,1.97,2.08,1.89,1.98
3,D1,19/09/2020,14:30,Stuttgart,Freiburg,2,3,A,0,2,...,2.33,-0.25,1.92,2.01,1.91,2.02,1.94,2.04,1.88,1.99
4,D1,19/09/2020,14:30,Union Berlin,Augsburg,1,3,A,0,1,...,1.71,-0.25,2.02,1.91,2.0,1.92,2.05,1.93,2.0,1.87


In [11]:
data2 = pd.read_csv('MD_10.csv', skiprows=1)
data2.head()

Unnamed: 0,Team,Rank,P,M,W,D,L,G,GA,GD,...,GD.1,Rank.2,P.2,M.2,W.2,D.2,L.2,G.2,GA.2,GD.2
0,Bayern Munich,1,23,10,7,2,1,34,16,18,...,14,2,12,5,4,0,1,13,9,4
1,Bayer Leverkusen,2,22,10,6,4,0,19,9,10,...,3,1,14,6,4,2,0,11,4,7
2,RB Leipzig,3,21,10,6,3,1,21,9,12,...,11,9,6,5,1,3,1,7,6,1
3,Borussia Dortmund,4,19,10,6,1,3,22,10,12,...,8,4,10,5,3,1,1,9,5,4
4,VfL Wolfsburg,5,18,10,4,6,0,16,10,6,...,4,7,7,5,1,4,0,7,5,2


As of first, we will take care of the training features **X** included in the datasets. This includes the selection of important features. The goal is to use as many important features to create a row vector for each team with their according stats.

In [12]:
X1 = data1.iloc[:, 3:15] # Leaving only most relevant features
X1.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST
0,Bayern Munich,Schalke 04,8,0,H,3,0,H,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,D,0,0,D,18,14,6,4
2,FC Koln,Hoffenheim,2,3,A,1,2,A,13,13,6,7
3,Stuttgart,Freiburg,2,3,A,0,2,A,22,7,7,6
4,Union Berlin,Augsburg,1,3,A,0,1,A,13,9,3,5


### Abbreviation Definition

- **FTHG** = Full time home team goals  $\qquad\qquad$  **FTAG** = Full time away team goals 
- **FTR** = Full time result            $\qquad\qquad\qquad\qquad\;\;$  **HTR** = Half time result           
- **HTHG** = Half time home team goals  $\qquad\quad\;\;\;$  **HTAG** = Half time away team goals 
- **HS** = Home team shots              $\qquad\qquad\qquad\;\;\;\;\;\;\;$  **AS** = Away team shots             
- **HST** = Home team shots on target   $\qquad\quad\;\;\;\;\;$  **AST** Away team shots on target         

In [13]:
X1_clean = X1.drop(['HTR','FTR'], axis=1) # Dropping half time and full time result for training data
X1_clean[0:10]

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST
0,Bayern Munich,Schalke 04,8,0,3,0,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,0,0,18,14,6,4
2,FC Koln,Hoffenheim,2,3,1,2,13,13,6,7
3,Stuttgart,Freiburg,2,3,0,2,22,7,7,6
4,Union Berlin,Augsburg,1,3,0,1,13,9,3,5
5,Werder Bremen,Hertha,1,4,0,2,17,13,7,6
6,Dortmund,M'gladbach,3,0,1,0,9,8,4,2
7,RB Leipzig,Mainz,3,1,2,0,23,8,10,1
8,Wolfsburg,Leverkusen,0,0,0,0,9,6,1,2
9,Hertha,Ein Frankfurt,1,3,0,2,12,10,6,3


In [14]:
X2_clean = data2.iloc[:, 0:9] # Leaving only important features
X2_clean[0:10]

Unnamed: 0,Team,Rank,P,M,W,D,L,G,GA
0,Bayern Munich,1,23,10,7,2,1,34,16
1,Bayer Leverkusen,2,22,10,6,4,0,19,9
2,RB Leipzig,3,21,10,6,3,1,21,9
3,Borussia Dortmund,4,19,10,6,1,3,22,10
4,VfL Wolfsburg,5,18,10,4,6,0,16,10
5,Union Berlin,6,16,10,4,4,2,22,14
6,M&ouml;nchengladbach,7,16,10,4,4,2,19,16
7,VfB Stuttgart,8,14,10,3,5,2,19,16
8,Eintracht Frankfurt,9,13,10,2,7,1,15,17
9,1899 Hoffenheim,10,12,10,3,3,4,18,17


### Integrate both Datasets
In this section, we will write a function that will combine the two datasets and sorts the features respectively. The Idea is that each Team has it's own row, including their rank, goals scored etc. as well as the stats of the last games

In [15]:
# First, we need to make sure the team names of both datasets match

for name in range(len(X2_clean['Team'])):
    if (X2_clean.iloc[name, 0] == '1. FC K&ouml;ln'):
        X2_clean.iloc[name, 0] = 'FC Koln'
    if (X2_clean.iloc[name, 0] == 'VfL Wolfsburg'):
        X2_clean.iloc[name, 0] = 'Wolfsburg'
    if (X2_clean.iloc[name, 0] == 'M&ouml;nchengladbach'):
        X2_clean.iloc[name, 0] = 'Gladbach'
    if (X2_clean.iloc[name, 0] == 'Borussia Dortmund'):
        X2_clean.iloc[name, 0] = 'Dortmund'
    if (X2_clean.iloc[name, 0] == 'VfB Stuttgart'):
        X2_clean.iloc[name, 0] = 'Stuttgart'
    if (X2_clean.iloc[name, 0] == 'Eintracht Frankfurt'):
        X2_clean.iloc[name, 0] = 'Ein Frankfurt'
    if (X2_clean.iloc[name, 0] == '1899 Hoffenheim'):
        X2_clean.iloc[name, 0] = 'Hoffenheim'
    if (X2_clean.iloc[name, 0] == 'FC Augsburg'):
        X2_clean.iloc[name, 0] = 'Augsburg'
    if (X2_clean.iloc[name, 0] == 'Hertha BSC Berlin'):
        X2_clean.iloc[name, 0] = 'Hertha'
    if (X2_clean.iloc[name, 0] == 'SC Freiburg'):
        X2_clean.iloc[name, 0] = 'Freiburg'
    if (X2_clean.iloc[name, 0] == 'Arminia Bielefeld'):
        X2_clean.iloc[name, 0] = 'Bielefeld'
    if (X2_clean.iloc[name, 0] == 'FSV Mainz 05'):
        X2_clean.iloc[name, 0] = 'Mainz'
    if (X2_clean.iloc[name, 0] == 'FC Schalke 04'):
        X2_clean.iloc[name, 0] = 'Schalke'
    if (X2_clean.iloc[name, 0] == 'Bayer Leverkusen'):
        X2_clean.iloc[name, 0] = 'Leverkusen'
        
X2_clean['Team']

0     Bayern Munich
1        Leverkusen
2        RB Leipzig
3          Dortmund
4         Wolfsburg
5      Union Berlin
6          Gladbach
7         Stuttgart
8     Ein Frankfurt
9        Hoffenheim
10         Augsburg
11           Hertha
12    Werder Bremen
13         Freiburg
14          FC Koln
15        Bielefeld
16            Mainz
17          Schalke
Name: Team, dtype: object

In [16]:
X1_clean = X1_clean.replace('M\'gladbach', 'Gladbach') # Need to make one change on this Dataset too
X1_clean = X1_clean.replace('Schalke 04', 'Schalke')
X1_clean[0:7]

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST
0,Bayern Munich,Schalke,8,0,3,0,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,0,0,18,14,6,4
2,FC Koln,Hoffenheim,2,3,1,2,13,13,6,7
3,Stuttgart,Freiburg,2,3,0,2,22,7,7,6
4,Union Berlin,Augsburg,1,3,0,1,13,9,3,5
5,Werder Bremen,Hertha,1,4,0,2,17,13,7,6
6,Dortmund,Gladbach,3,0,1,0,9,8,4,2


In [17]:
# Converting Panda dataframe to Numpy Array (maybe needed later)
X1_clean_num = X1_clean.iloc[:,:].values # using numpy array
X1_clean_num[0:5]

array([['Bayern Munich', 'Schalke', 8, 0, 3, 0, 22, 5, 12, 1],
       ['Ein Frankfurt', 'Bielefeld', 1, 1, 0, 0, 18, 14, 6, 4],
       ['FC Koln', 'Hoffenheim', 2, 3, 1, 2, 13, 13, 6, 7],
       ['Stuttgart', 'Freiburg', 2, 3, 0, 2, 22, 7, 7, 6],
       ['Union Berlin', 'Augsburg', 1, 3, 0, 1, 13, 9, 3, 5]],
      dtype=object)

### Determining Home Team stats
Since the Home and Away performances differ drastically for some teams, we will divide these performances in the following code to HomeTeam stats and AwayTeam stats. Then, we will take the average.

In [18]:
X1_clean.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST
0,Bayern Munich,Schalke,8,0,3,0,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,0,0,18,14,6,4
2,FC Koln,Hoffenheim,2,3,1,2,13,13,6,7
3,Stuttgart,Freiburg,2,3,0,2,22,7,7,6
4,Union Berlin,Augsburg,1,3,0,1,13,9,3,5


##### Total home team stats for each Bundesliga team

In [19]:
teamnames = X2_clean['Team']
sum_HGS = {key: 0 for key in teamnames} #Initialize dictionary (Team : HG) and initialize values to 0
sum_HGC = {key: 0 for key in teamnames} #Initialize dictionary (Team : HGC) and initialize values to 0
sum_HST = {key: 0 for key in teamnames} #Initialize dictionary (Team : HST) and initialize values to 0
sum_HSTC = {key: 0 for key in teamnames} #Initialize dictionary (Team : HST) and initialize values to 0
sum_HS = {key: 0 for key in teamnames} #Initialize dictionary (Team : HS) and initialize values to 0
sum_HSC = {key: 0 for key in teamnames} #Initialize dictionary (Team : HS) and initialize values to 0

hometeams = X1_clean['HomeTeam']
HGS = X1_clean['FTHG']
HGC = X1_clean['FTAG']
HST = X1_clean['HST']
HSTC = X1_clean['AST']
HS = X1_clean['HS']
HSC = X1_clean['AS']

homegames_count = {key: 0 for key in teamnames} # Counting total home games for each team to calculate the average later on

# Loop through all the matches and add up the HST for each team
for team in range(len(hometeams)): 
        sum_HGS[hometeams[team]] = sum_HGS[hometeams[team]] + HGS[team] # Hometeam goals scored
        sum_HGC[hometeams[team]] = sum_HGC[hometeams[team]] + HGC[team] # Hometeam goals conceded
        sum_HST[hometeams[team]] = sum_HST[hometeams[team]] + HST[team] # Hometeam shots on target
        sum_HSTC[hometeams[team]] = sum_HSTC[hometeams[team]] + HSTC[team] # Hometeam shots on target conceded
        sum_HS[hometeams[team]] = sum_HS[hometeams[team]] + HS[team] # Hometeam shots
        sum_HSC[hometeams[team]] = sum_HSC[hometeams[team]] + HSC[team] # Hometeam shots conceded
        homegames_count[hometeams[team]] = homegames_count[hometeams[team]] + 1 # Increasing count of homegames by 1
        #print(hometeams[team], sum_HST[hometeams[team]])

print("Total goals Home Team: ")
print(sum_HGS)
print("\n")
print("Total goals conceded Home Team: ")
print(sum_HGC)
print("\n")
print("Total shots on Target at home for each Team: ")
print(sum_HST)
print("\n")
print("Total shots on Target conceded at home for each Team: ")
print(sum_HSTC)
print("\n")
print("Total shots at home for each Team: ")
print(sum_HS)
print("\n")
print("Total shots conceded at home for each Team: ")
print(sum_HSC)
print("\n")
print("Total Home games for each team: ")
print(homegames_count)

Total goals Home Team: 
{'Bayern Munich': 30, 'Leverkusen': 14, 'RB Leipzig': 17, 'Dortmund': 17, 'Wolfsburg': 14, 'Union Berlin': 20, 'Gladbach': 13, 'Stuttgart': 11, 'Ein Frankfurt': 14, 'Hoffenheim': 12, 'Augsburg': 9, 'Hertha': 10, 'Werder Bremen': 8, 'Freiburg': 18, 'FC Koln': 8, 'Bielefeld': 6, 'Mainz': 6, 'Schalke': 7}


Total goals conceded Home Team: 
{'Bayern Munich': 11, 'Leverkusen': 9, 'RB Leipzig': 6, 'Dortmund': 11, 'Wolfsburg': 8, 'Union Berlin': 11, 'Gladbach': 9, 'Stuttgart': 15, 'Ein Frankfurt': 10, 'Hoffenheim': 13, 'Augsburg': 15, 'Hertha': 12, 'Werder Bremen': 12, 'Freiburg': 12, 'FC Koln': 18, 'Bielefeld': 11, 'Mainz': 15, 'Schalke': 13}


Total shots on Target at home for each Team: 
{'Bayern Munich': 63, 'Leverkusen': 32, 'RB Leipzig': 56, 'Dortmund': 45, 'Wolfsburg': 39, 'Union Berlin': 51, 'Gladbach': 39, 'Stuttgart': 36, 'Ein Frankfurt': 40, 'Hoffenheim': 45, 'Augsburg': 29, 'Hertha': 35, 'Werder Bremen': 29, 'Freiburg': 51, 'FC Koln': 28, 'Bielefeld': 20, '

##### Average stats per game as a home team for each Bundesliga team

In [20]:
avg_HGS = {key: 0 for key in teamnames} #Initialize dictionary (Team : average FTHG)
avg_HGC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average FTHG)
avg_HST = {key: 0 for key in teamnames} #Initialize dictionary (Team : average HST)
avg_HSTC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average HST)
avg_HS = {key: 0 for key in teamnames} #Initialize dictionary (Team : average HS)
avg_HSC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average HS)

for team in range(len(teamnames)):
    avg_HGS[teamnames[team]] = sum_HGS[teamnames[team]] / homegames_count[teamnames[team]]
    avg_HGC[teamnames[team]] = sum_HGC[teamnames[team]] / homegames_count[teamnames[team]]
    avg_HST[teamnames[team]] = sum_HST[teamnames[team]] / homegames_count[teamnames[team]]
    avg_HSTC[teamnames[team]] = sum_HSTC[teamnames[team]] / homegames_count[teamnames[team]]
    avg_HS[teamnames[team]] = sum_HS[teamnames[team]] / homegames_count[teamnames[team]]
    avg_HSC[teamnames[team]] = sum_HSC[teamnames[team]] / homegames_count[teamnames[team]]

print("Average goals scored as a Home team")
rounded_HGS = {key : round(avg_HGS[key], 2) for key in avg_HGS} # Round numbers
print(rounded_HGS)
print('\n')
print("Average goals conceded as a Home team")
rounded_HGC = {key : round(avg_HGC[key], 2) for key in avg_HGC} # Round numbers
print(rounded_HGC)
print('\n')
print("Average shots on target as a Home team")
rounded_HST = {key : round(avg_HST[key], 2) for key in avg_HST} # Round numbers
print(rounded_HST)
print('\n')
print("Average shots on target conceded as a Home team")
rounded_HSTC = {key : round(avg_HSTC[key], 2) for key in avg_HSTC} # Round numbers
print(rounded_HSTC)
print('\n')
print("Average shots as a Home team")
rounded_HS = {key : round(avg_HS[key], 2) for key in avg_HS} # Round numbers
print(rounded_HS)
print('\n')
print("Average shots conceded as a Home team")
rounded_HSC = {key : round(avg_HSC[key], 2) for key in avg_HSC} # Round numbers
print(rounded_HSC)

Average goals scored as a Home team
{'Bayern Munich': 3.75, 'Leverkusen': 2.0, 'RB Leipzig': 2.12, 'Dortmund': 2.12, 'Wolfsburg': 1.75, 'Union Berlin': 2.22, 'Gladbach': 1.62, 'Stuttgart': 1.38, 'Ein Frankfurt': 1.75, 'Hoffenheim': 1.5, 'Augsburg': 1.12, 'Hertha': 1.43, 'Werder Bremen': 1.0, 'Freiburg': 2.25, 'FC Koln': 0.89, 'Bielefeld': 0.75, 'Mainz': 0.75, 'Schalke': 0.88}


Average goals conceded as a Home team
{'Bayern Munich': 1.38, 'Leverkusen': 1.29, 'RB Leipzig': 0.75, 'Dortmund': 1.38, 'Wolfsburg': 1.0, 'Union Berlin': 1.22, 'Gladbach': 1.12, 'Stuttgart': 1.88, 'Ein Frankfurt': 1.25, 'Hoffenheim': 1.62, 'Augsburg': 1.88, 'Hertha': 1.71, 'Werder Bremen': 1.5, 'Freiburg': 1.5, 'FC Koln': 2.0, 'Bielefeld': 1.38, 'Mainz': 1.88, 'Schalke': 1.62}


Average shots on target as a Home team
{'Bayern Munich': 7.88, 'Leverkusen': 4.57, 'RB Leipzig': 7.0, 'Dortmund': 5.62, 'Wolfsburg': 4.88, 'Union Berlin': 5.67, 'Gladbach': 4.88, 'Stuttgart': 4.5, 'Ein Frankfurt': 5.0, 'Hoffenheim': 5.62

Below is a more convenient and shorter way of coding that represents the same findings as above, without creating a dictionary, but a panda dataframe for a cleaner look. However, the above presented dictionary can be used as a double-check to make sure that each statistic has the right value.

In [32]:
StatsHome = pd.DataFrame(columns=('Team','HGS','HGC','HST','HSTC','HS','HSC'))

TeamHome = X1_clean.groupby('HomeTeam')
StatsHome.Team = sorted(teamnames) # Sort alphabetically, take care that statistics match with team
StatsHome.HGS = TeamHome.FTHG.mean().values
StatsHome.HGC = TeamHome.FTAG.mean().values
StatsHome.HST = TeamHome.HST.mean().values
StatsHome.HSTC = TeamHome.AST.mean().values
StatsHome.HS = TeamHome.HS.mean().values
StatsHome.HSC = TeamHome.AS.mean().values
StatsHome = StatsHome.round(2)
StatsHome.head()

Unnamed: 0,Team,HGS,HGC,HST,HSTC,HS,HSC
0,Augsburg,1.12,1.88,3.62,6.12,10.38,14.75
1,Bayern Munich,3.75,1.38,7.88,4.12,18.25,9.62
2,Bielefeld,0.75,1.38,2.5,5.75,8.25,14.25
3,Dortmund,2.12,1.38,5.62,4.38,14.25,11.0
4,Ein Frankfurt,1.75,1.25,5.0,3.38,13.25,10.38


### Calculating Away Team stats
In the following code we sum up the total stats (goals, shots, ..) for each Awayteam and calculate the average for each feature

In [22]:
sum_AGS = {key: 0 for key in teamnames} #Initialize dictionary (Team : AG) and set values to 0
sum_AGC = {key: 0 for key in teamnames} #Initialize dictionary (Team : AGC) and set values to 0
sum_AST = {key: 0 for key in teamnames} #Initialize dictionary (Team : AST) and set values to 0
sum_ASTC = {key: 0 for key in teamnames} #Initialize dictionary (Team : ASTC) and set values to 0
sum_AS = {key: 0 for key in teamnames} #Initialize dictionary (Team : AS) and set values to 0
sum_ASC = {key: 0 for key in teamnames} #Initialize dictionary (Team : ASC) and set values to 0

awayteams = X1_clean['AwayTeam']
AGS = X1_clean['FTAG']
AGC = X1_clean['FTHG']
AST = X1_clean['AST']
ASTC = X1_clean['HST']
AS = X1_clean['AS']
ASC = X1_clean['HS']

awaygames_count = {key: 0 for key in teamnames} # Counting total away games for each team to calculate the average later on

# Loop through all matches and add up the AST for each team
for team in range(len(awayteams)): 
        sum_AGS[awayteams[team]] = sum_AGS[awayteams[team]] + AGS[team] # Awayteam goals scored
        sum_AGC[awayteams[team]] = sum_AGC[awayteams[team]] + AGC[team] # Awayteam goals conceded
        sum_AST[awayteams[team]] = sum_AST[awayteams[team]] + AST[team] # Awayteam shots on target
        sum_ASTC[awayteams[team]] = sum_ASTC[awayteams[team]] + ASTC[team] # Awayteam shots on target conceded
        sum_AS[awayteams[team]] = sum_AS[awayteams[team]] + AS[team] # Awayteam shots
        sum_ASC[awayteams[team]] = sum_ASC[awayteams[team]] + ASC[team] # Awayteam shots conceded
        awaygames_count[awayteams[team]] = awaygames_count[awayteams[team]] + 1

print("Total goals scored away for each Team: ")
print(sum_AGS)
print("\n")
print("Total goals conceded away for each Team: ")
print(sum_AGC)
print("\n")
print("Total shots on Target away for each Team: ")
print(sum_AST)
print("\n")
print("Total shots on Target conceded away for each Team: ")
print(sum_ASTC)
print("\n")
print("Total shots away for each Team: ")
print(sum_AS)
print("\n")
print("Total shots conceded away for each Team: ")
print(sum_ASC)
print("\n")
print("Total Away games for each team: ")
print(awaygames_count)

Total goals scored away for each Team: 
{'Bayern Munich': 18, 'Leverkusen': 16, 'RB Leipzig': 11, 'Dortmund': 15, 'Wolfsburg': 10, 'Union Berlin': 12, 'Gladbach': 17, 'Stuttgart': 21, 'Ein Frankfurt': 14, 'Hoffenheim': 10, 'Augsburg': 8, 'Hertha': 13, 'Werder Bremen': 11, 'Freiburg': 11, 'FC Koln': 5, 'Bielefeld': 4, 'Mainz': 9, 'Schalke': 6}


Total goals conceded away for each Team: 
{'Bayern Munich': 14, 'Leverkusen': 7, 'RB Leipzig': 8, 'Dortmund': 9, 'Wolfsburg': 11, 'Union Berlin': 9, 'Gladbach': 17, 'Stuttgart': 9, 'Ein Frankfurt': 14, 'Hoffenheim': 17, 'Augsburg': 10, 'Hertha': 13, 'Werder Bremen': 12, 'Freiburg': 14, 'FC Koln': 9, 'Bielefeld': 13, 'Mainz': 19, 'Schalke': 29}


Total shots on Target away for each Team: 
{'Bayern Munich': 43, 'Leverkusen': 46, 'RB Leipzig': 39, 'Dortmund': 55, 'Wolfsburg': 42, 'Union Berlin': 27, 'Gladbach': 51, 'Stuttgart': 52, 'Ein Frankfurt': 39, 'Hoffenheim': 41, 'Augsburg': 25, 'Hertha': 38, 'Werder Bremen': 34, 'Freiburg': 32, 'FC Koln': 2

##### Average shots on target per game as an away team for each Bundesliga team

In [23]:
avg_AGS = {key: 0 for key in teamnames} #Initialize dictionary (Team : average AGS)
avg_AGC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average AGC)
avg_AST = {key: 0 for key in teamnames} #Initialize dictionary (Team : average AST)
avg_ASTC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average ASTC)
avg_AS = {key: 0 for key in teamnames} #Initialize dictionary (Team : average AS)
avg_ASC = {key: 0 for key in teamnames} #Initialize dictionary (Team : average ASC)


for team in range(len(teamnames)):
    avg_AGS[teamnames[team]] = sum_AGS[teamnames[team]] / awaygames_count[teamnames[team]]
    avg_AGC[teamnames[team]] = sum_AGC[teamnames[team]] / awaygames_count[teamnames[team]]
    avg_AST[teamnames[team]] = sum_AST[teamnames[team]] / awaygames_count[teamnames[team]]
    avg_ASTC[teamnames[team]] = sum_ASTC[teamnames[team]] / awaygames_count[teamnames[team]]
    avg_AS[teamnames[team]] = sum_AS[teamnames[team]] / awaygames_count[teamnames[team]]
    avg_ASC[teamnames[team]] = sum_ASC[teamnames[team]] / awaygames_count[teamnames[team]]
    
print("Average goals scored as an Away team")
rounded_AGS = {key : round(avg_AGS[key], 2) for key in avg_AGS} # Round numbers
print(rounded_AGS)
print('\n')
print("Average goals conceded as an Away team")
rounded_AGC = {key : round(avg_AGC[key], 2) for key in avg_AGC} # Round numbers
print(rounded_AGC)
print('\n')
print("Average shots on target as an Away team")
rounded_AST = {key : round(avg_AST[key], 2) for key in avg_AST} # Round numbers
print(rounded_AST)
print('\n')
print("Average shots on target conceded as an Away team")
rounded_ASTC = {key : round(avg_ASTC[key], 2) for key in avg_ASTC} # Round numbers
print(rounded_ASTC)
print('\n')
print("Average shots as an Away team")
rounded_AS = {key : round(avg_AS[key], 2) for key in avg_AS} # Round numbers
print(rounded_AS)
print('\n')
print("Average shots conceded as an Away team")
rounded_ASC = {key : round(avg_ASC[key], 2) for key in avg_ASC} # Round numbers
print(rounded_ASC)

Average goals scored as an Away team
{'Bayern Munich': 2.25, 'Leverkusen': 1.78, 'RB Leipzig': 1.38, 'Dortmund': 1.88, 'Wolfsburg': 1.25, 'Union Berlin': 1.71, 'Gladbach': 2.12, 'Stuttgart': 2.62, 'Ein Frankfurt': 1.75, 'Hoffenheim': 1.25, 'Augsburg': 1.0, 'Hertha': 1.44, 'Werder Bremen': 1.38, 'Freiburg': 1.38, 'FC Koln': 0.71, 'Bielefeld': 0.5, 'Mainz': 1.12, 'Schalke': 0.75}


Average goals conceded as an Away team
{'Bayern Munich': 1.75, 'Leverkusen': 0.78, 'RB Leipzig': 1.0, 'Dortmund': 1.12, 'Wolfsburg': 1.38, 'Union Berlin': 1.29, 'Gladbach': 2.12, 'Stuttgart': 1.12, 'Ein Frankfurt': 1.75, 'Hoffenheim': 2.12, 'Augsburg': 1.25, 'Hertha': 1.44, 'Werder Bremen': 1.5, 'Freiburg': 1.75, 'FC Koln': 1.29, 'Bielefeld': 1.62, 'Mainz': 2.38, 'Schalke': 3.62}


Average shots on target as an Away team
{'Bayern Munich': 5.38, 'Leverkusen': 5.11, 'RB Leipzig': 4.88, 'Dortmund': 6.88, 'Wolfsburg': 5.25, 'Union Berlin': 3.86, 'Gladbach': 6.38, 'Stuttgart': 6.5, 'Ein Frankfurt': 4.88, 'Hoffenhei

Again, a shorter way to visualize each teams individual performances

In [24]:
StatsAway = pd.DataFrame(columns=('Team','AGS','AGC','AST','ASTC','AS','ASC'))

TeamAway = X1_clean.groupby('AwayTeam')
StatsAway.Team = sorted(teamnames) # Sort alphabetically, take care that statistics match with team
StatsAway.AGS = TeamAway.FTAG.mean().values
StatsAway.AGC = TeamAway.FTHG.mean().values
StatsAway.AST = TeamAway.AST.mean().values
StatsAway.ASTC = TeamAway.HST.mean().values
StatsAway.AS = TeamAway.AS.mean().values
StatsAway.ASC = TeamAway.HS.mean().values
StatsAway = StatsAway.round(2)
StatsAway.head()

Unnamed: 0,Team,AGS,AGC,AST,ASTC,AS,ASC
0,Augsburg,1.0,1.25,3.12,4.75,9.12,13.0
1,Bayern Munich,2.25,1.75,5.38,4.62,13.88,12.0
2,Bielefeld,0.5,1.62,2.5,5.5,8.88,12.62
3,Dortmund,1.88,1.12,6.88,3.12,16.12,8.5
4,Ein Frankfurt,1.75,1.75,4.88,5.62,12.38,13.88


Now, that we have the Home and Away performances gathered together for each team, we will crate a feature table that includes the games of each matchgame and includes the specific performances for the home and away team

In [31]:
FeatureTable = X1[['HomeTeam', 'AwayTeam', 'FTR']]
FeatureTable

Unnamed: 0,HomeTeam,AwayTeam,FTR
0,Bayern Munich,Schalke 04,H
1,Ein Frankfurt,Bielefeld,D
2,FC Koln,Hoffenheim,A
3,Stuttgart,Freiburg,A
4,Union Berlin,Augsburg,A
...,...,...,...
139,Werder Bremen,Augsburg,H
140,Wolfsburg,RB Leipzig,D
141,Stuttgart,M'gladbach,D
142,Bayern Munich,Freiburg,H


In [30]:
# Including home Team stats
f_HGS = []
f_HGC = []
f_HST = []
f_HSTC = []
f_HS = []
f_HSC = []

for index,row in FeatureTable.iterrows():
    f_HGS.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HGS'].values[0])
    f_HGC.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HGC'].values[0])
    f_HST.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HST'].values[0])
    f_HSTC.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HSTC'].values[0])
    f_HS.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HS'].values[0])
    f_HSC.append(StatsHome[StatsHome['Team'] == row['HomeTeam']]['HSC'].values[0])
    
FeatureTable['HGS'] = f_HGS
FeatureTable['HGC'] = f_HGC
FeatureTable['HST'] = f_HST
FeatureTable['HSTC'] = f_HSTC
FeatureTable['HS'] = f_HST
FeatureTable['HSC'] = f_HST

FeatureTable.head()

IndexError: index 0 is out of bounds for axis 0 with size 0

In [26]:
# Including Away Team stats

f_AGS = []
f_AGC = []
f_AST = []
f_ASTC = []
f_AS = []
f_ASC = []

for index,row in FeatureTable.iterrows():
    f_AGS.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['AGS'].values[0])
    f_AGC.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['AGC'].values[0])
    f_AST.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['AST'].values[0])
    f_ASTC.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['ASTC'].values[0])
    f_AS.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['AS'].values[0])
    f_ASC.append(StatsAway[StatsAway['Team'] == row['AwayTeam']]['ASC'].values[0])
    
FeatureTable['AGS'] = f_AGS
FeatureTable['AGC'] = f_AGC
FeatureTable['AST'] = f_AST
FeatureTable['ASTC'] = f_ASTC
FeatureTable['AS'] = f_AS
FeatureTable['ASC'] = f_ASC
FeatureTable.head()

IndexError: index 0 is out of bounds for axis 0 with size 0

#### Using Ordinal Encoder instead of one Hot Encoder to ensure a one-dimensional target shape

In [None]:
#from sklearn.preprocessing import OrdinalEncoder
#
#ordinal_encoder = OrdinalEncoder()
#y_encoded = ordinal_encoder.fit_transform(y)
#y_encoded[0:10] # Home win = 2, Draw = 1, Away win = 0

In [None]:
def transformResult(row):
    '''Converts results (H,A or D) into numeric values'''
    if(row.FTR == 'H'): # Win HomeTeam = 1
        return 1
    elif(row.FTR == 'A'): # Win AwayTeam = -1
        return -1
    else:
        return 0 # Draw = 0

In [None]:
FeatureTable["FTR"] = FeatureTable.apply(lambda row: transformResult(row),axis=1)
FeatureTable.head()

In [None]:
X_train = FeatureTable.drop(['HomeTeam', 'AwayTeam', 'FTR'], axis=1) # Dropping Team names and FTR
y_train = FeatureTable['FTR']

#### Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Training a Logistic regression Model for game prediction

In [None]:
from sklearn.linear_model import LogisticRegression

reg_clf = LogisticRegression(random_state=42)
reg_clf.fit(X_train, y_train)

### Evaluation of training model

In [None]:
## After further research, corss valiodation is not valid to use in this application since it shuffles the matches randomly
#from sklearn.model_selection import cross_val_score
#cross_val_score(reg_clf, X_clean, y_encoded, cv=3, scoring='accuracy')

In [None]:
y_pred = reg_clf.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score 

y_pred = reg_clf.fit(X_train, y_train).predict(X_train)
accuracy_score(y_pred, y_train)
scores = cross_val_score(reg_clf, X_train, y_train, cv=10)
print(scores)
print(scores.mean())

### Evaluation score of Prediction from Logistic Regression model

In [None]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average='micro')

In [None]:
comparison = [y_pred, y_test.ravel()]
comparison

# Train a NN for Game predictions

In [None]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(X_train.shape)),
    keras.layers.Dense(10, activation="sigmoid"),
    keras.layers.Dense(10, activation="softmax")
])

In [None]:
model