# Sportbetting Project

## Setup

In [2]:
import pandas as pd
import matplotlib as mpl
import numpy as np
import sklearn

## Load Data

In [3]:
data = pd.read_csv('D1.csv')
data.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,D1,18/09/2020,19:30,Bayern Munich,Schalke 04,8,0,H,3,0,...,4.34,-2.5,1.89,2.04,1.87,2.02,1.95,2.18,1.85,2.02
1,D1,19/09/2020,14:30,Ein Frankfurt,Bielefeld,1,1,D,0,0,...,2.33,-0.75,1.96,1.97,1.96,1.96,2.02,1.98,1.94,1.93
2,D1,19/09/2020,14:30,FC Koln,Hoffenheim,2,3,A,1,2,...,2.27,0.0,1.91,2.02,1.92,2.01,1.97,2.08,1.89,1.98
3,D1,19/09/2020,14:30,Stuttgart,Freiburg,2,3,A,0,2,...,2.33,-0.25,1.92,2.01,1.91,2.02,1.94,2.04,1.88,1.99
4,D1,19/09/2020,14:30,Union Berlin,Augsburg,1,3,A,0,1,...,1.71,-0.25,2.02,1.91,2.0,1.92,2.05,1.93,2.0,1.87


### Abbreviation Definition
##### FTHG = Full time home team goals --- FTAG = Full time away team goals
##### FTR = Full time result Half time result
##### HTHG = Half time home team goals --- HTAG = Half time away team goals
##### HS = Home team shots --- Away team shots
##### HST = Home team shots on goal --- Away team shots on goal

# Splitting set into Data X and target y

In [5]:
X = data.iloc[:, 3:15]
X.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST
0,Bayern Munich,Schalke 04,8,0,H,3,0,H,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,D,0,0,D,18,14,6,4
2,FC Koln,Hoffenheim,2,3,A,1,2,A,13,13,6,7
3,Stuttgart,Freiburg,2,3,A,0,2,A,22,7,7,6
4,Union Berlin,Augsburg,1,3,A,0,1,A,13,9,3,5


In [6]:
X = X.drop(['HTR','FTR'], axis=1) # Dropping half time and full time result
X

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST
0,Bayern Munich,Schalke 04,8,0,3,0,22,5,12,1
1,Ein Frankfurt,Bielefeld,1,1,0,0,18,14,6,4
2,FC Koln,Hoffenheim,2,3,1,2,13,13,6,7
3,Stuttgart,Freiburg,2,3,0,2,22,7,7,6
4,Union Berlin,Augsburg,1,3,0,1,13,9,3,5
...,...,...,...,...,...,...,...,...,...,...
58,Stuttgart,Ein Frankfurt,2,2,2,0,16,15,5,8
59,Union Berlin,Bielefeld,5,0,3,0,9,5,8,1
60,Dortmund,Bayern Munich,2,3,1,1,15,16,5,8
61,Wolfsburg,Hoffenheim,2,1,2,0,14,17,8,5


In [7]:
#X = pd.DataFrame()
X = X.iloc[:,:].values # using numpy array
X

array([['Bayern Munich', 'Schalke 04', 8, 0, 3, 0, 22, 5, 12, 1],
       ['Ein Frankfurt', 'Bielefeld', 1, 1, 0, 0, 18, 14, 6, 4],
       ['FC Koln', 'Hoffenheim', 2, 3, 1, 2, 13, 13, 6, 7],
       ['Stuttgart', 'Freiburg', 2, 3, 0, 2, 22, 7, 7, 6],
       ['Union Berlin', 'Augsburg', 1, 3, 0, 1, 13, 9, 3, 5],
       ['Werder Bremen', 'Hertha', 1, 4, 0, 2, 17, 13, 7, 6],
       ['Dortmund', "M'gladbach", 3, 0, 1, 0, 9, 8, 4, 2],
       ['RB Leipzig', 'Mainz', 3, 1, 2, 0, 23, 8, 10, 1],
       ['Wolfsburg', 'Leverkusen', 0, 0, 0, 0, 9, 6, 1, 2],
       ['Hertha', 'Ein Frankfurt', 1, 3, 0, 2, 12, 10, 6, 3],
       ['Augsburg', 'Dortmund', 2, 0, 1, 0, 6, 16, 4, 7],
       ['Bielefeld', 'FC Koln', 1, 0, 0, 0, 7, 12, 3, 2],
       ['Leverkusen', 'RB Leipzig', 1, 1, 1, 1, 11, 7, 4, 2],
       ['Mainz', 'Stuttgart', 1, 4, 1, 1, 11, 15, 3, 7],
       ["M'gladbach", 'Union Berlin', 1, 1, 0, 0, 14, 15, 5, 2],
       ['Schalke 04', 'Werder Bremen', 1, 3, 0, 2, 17, 17, 4, 8],
       ['Hoffenheim',

In [8]:
y = data[['FTR']]
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   FTR     63 non-null     object
dtypes: object(1)
memory usage: 632.0+ bytes


### Changing the categorical values to 0 or 1 values trough OneHotEncoder

In [9]:
from sklearn.preprocessing import OneHotEncoder

hot = OneHotEncoder()
y = hot.fit_transform(y)
y

<63x3 sparse matrix of type '<class 'numpy.float64'>'
	with 63 stored elements in Compressed Sparse Row format>

In [10]:
y.toarray() ## creating a numpy array
#y = y.iloc[:,:].values

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0

### Splitting train and test set

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Further Data preprocessing

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[:, 2:] = scaler.fit_transform(X_train[:, 2:])
X_test[:, 2:] = scaler.transform(X_test[:, 2:])

In [13]:
print(X_train)

[['Bayern Munich' 'Ein Frankfurt' 2.55835439070771 -1.218543591689885
  1.4263397393121144 -0.9305415914315354 1.209714563796368
  -0.6830290187405538 2.1923736068209188 -0.9209824856474367]
 ['Hertha' 'Wolfsburg' -0.5055430831937391 -0.3481553119113957
  0.2572088054497255 0.326947045638107 0.8408991480047924
  -0.2451899041632756 -0.01768043231307194 -0.20146491873537664]
 ['Werder Bremen' 'FC Koln' -0.5055430831937391 -0.3481553119113957
  -0.9119221284126633 -0.9305415914315354 0.8408991480047924
  -0.9019485760291929 -0.45969124013987006 -1.2807412691034665]
 ['RB Leipzig' 'Schalke 04' 1.7923800222323478 -1.218543591689885
  2.595470673174503 -0.9305415914315354 1.3941222716921557
  -1.7776268051837492 0.8663411833405242 -1.2807412691034665]
 ['Union Berlin' 'Mainz' 1.7923800222323478 -1.218543591689885
  0.2572088054497255 -0.9305415914315354 -0.08113939147414653
  -1.55870724789511 0.8663411833405242 -1.6405000525594966]
 ['Freiburg' 'Werder Bremen' -0.5055430831937391 -0.348155

In [14]:
X_test

array([['Bayern Munich', 'Hertha', 1.7923800222323478, 1.392621247645583,
        0.2572088054497255, -0.9305415914315354, 1.5785299795879435,
        -0.6830290187405538, 1.7503627989941204, 0.15829386472065332],
       ['Dortmund', 'Schalke 04', 1.0264056537569857, -1.218543591689885,
        -0.9119221284126633, -0.9305415914315354, 0.6564914401090046,
        -1.9965463624723883, 0.8663411833405242, -1.6405000525594966],
       ['Union Berlin', 'Augsburg', -0.5055430831937391,
        1.392621247645583, -0.9119221284126633, 0.326947045638107,
        -0.08113939147414653, -0.6830290187405538, -0.9017020479666681,
        0.15829386472065332],
       ['Augsburg', 'Mainz', 1.0264056537569857, -0.3481553119113957,
        0.2572088054497255, -0.9305415914315354, 0.10326831642164125,
        -0.9019485760291929, 0.8663411833405242, -0.20146491873537664],
       ['Ein Frankfurt', 'Werder Bremen', -0.5055430831937391,
        -0.3481553119113957, -0.9119221284126633, -0.9305415914315354,

In [15]:
from sklearn.linear_model import LogisticRegression

reg_clf = LogisticRegression(random_state=42)
reg_clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Bayern Munich'