In [1]:
#Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
#Import EPL data
url_EPLSeason1819 = 'https://datahub.io/sports-data/english-premier-league/r/season-1819.csv'
EPLSeason1819 = pd.read_csv(url_EPLSeason1819)
EPLSeason1819.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,2018-08-10,Man United,Leicester,2,1,H,1,0,H,A Marriner,...,6,4,11,8,2,5,2,1,0,0
1,2018-08-11,Bournemouth,Cardiff,2,0,H,1,0,H,K Friend,...,4,1,11,9,7,4,1,1,0,0
2,2018-08-11,Fulham,Crystal Palace,0,2,A,0,1,A,M Dean,...,6,9,9,11,5,5,1,2,0,0
3,2018-08-11,Huddersfield,Chelsea,0,3,A,0,2,A,C Kavanagh,...,1,4,9,8,2,5,2,1,0,0
4,2018-08-11,Newcastle,Tottenham,1,2,A,1,2,A,M Atkinson,...,2,5,11,12,3,5,2,2,0,0


In [2]:
#View shape of dataframe
EPLSeason1819.shape

(335, 22)

In [3]:
#View columns
EPLSeason1819.columns

#'Date': Match Date (dd/mm/yy)
#'HomeTeam': Home Team
#'AwayTeam': Away Team
#'FTHG': Full Time Home Team Goals
#'FTAG': Full Time Away Team Goals
#'FTR': Full Time Result (H=Home Win, D=Draw, A=Away Win)
#'HTHG': Half Time Home Team Goals
#'HTAG': Half Time Away Team Goals
#'HTR': Half Time Result (H=Home Win, D=Draw, A=Away Win)
#'Referee': 
#'HS': Home Team Shots
#'AS': Away Team Shots
#'HST': Home Team Shots on Target
#'AST': Away Team Shots on Target
#'HF': Home Team Fouls Committed
#'AF': Away Team Fouls Committed
#'HC': Home Team Corners
#'AC': Away Team Corners
#'HY': Home Team Yellow Cards
#'AY': Away Team Yellow Cards
#'HR': Home Team Red Cards
#'AR': Away Team Red Cards 

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
       'HY', 'AY', 'HR', 'AR'],
      dtype='object')

In [4]:
#Create another column that yields goal difference
EPLSeason1819['GD'] = EPLSeason1819['FTHG']-EPLSeason1819['FTAG']
EPLSeason1819.shape

(335, 23)

In [5]:
#Encode categorical data - Convert categorical column in the dataset to numerical data.
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
EPLSeason1819['HomeTeam'] = enc.fit_transform(EPLSeason1819['HomeTeam'])
EPLSeason1819['AwayTeam'] = enc.fit_transform(EPLSeason1819['AwayTeam'])
EPLSeason1819['FTR'] = enc.fit_transform(EPLSeason1819['FTR'])
EPLSeason1819['HTR'] = enc.fit_transform(EPLSeason1819['HTR'])
EPLSeason1819['Referee'] = enc.fit_transform(EPLSeason1819['Referee'])

enc.classes_ #Maintains the information of the encoded values. Encode a non-numerical data into a numerical data

EPLSeason1819.head()
#Notice how all of the non-numerical columns are now numerical

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,GD
0,2018-08-10,13,10,2,1,2,1,0,2,1,...,4,11,8,2,5,2,1,0,0,1
1,2018-08-11,1,4,2,0,2,1,0,2,8,...,1,11,9,7,4,1,1,0,0,2
2,2018-08-11,8,6,0,2,0,0,1,0,12,...,9,9,11,5,5,1,2,0,0,-2
3,2018-08-11,9,5,0,3,0,0,2,0,3,...,4,9,8,2,5,2,1,0,0,-3
4,2018-08-11,14,16,1,2,0,1,2,0,11,...,5,11,12,3,5,2,2,0,0,-1


In [6]:
#View columns
EPLSeason1819.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
       'HY', 'AY', 'HR', 'AR', 'GD'],
      dtype='object')

In [7]:
#Convert date string to float
EPLSeason1819['Date'] = pd.to_datetime(arg=EPLSeason1819['Date'])
#Separating the days, months, and years into descrete columns
EPLSeason1819["Month"] = EPLSeason1819["Date"].dt.month
EPLSeason1819["Day"] = EPLSeason1819["Date"].dt.day
EPLSeason1819["Year"] = EPLSeason1819["Date"].dt.year
#Delete the 'Date' column as it causes problems later on in scaling
del EPLSeason1819['Date']
#View latest EPLSeason1819
EPLSeason1819.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,...,HC,AC,HY,AY,HR,AR,GD,Month,Day,Year
0,13,10,2,1,2,1,0,2,1,8,...,2,5,2,1,0,0,1,8,10,2018
1,1,4,2,0,2,1,0,2,8,12,...,7,4,1,1,0,0,2,8,11,2018
2,8,6,0,2,0,0,1,0,12,15,...,5,5,1,2,0,0,-2,8,11,2018
3,9,5,0,3,0,0,2,0,3,6,...,2,5,2,1,0,0,-3,8,11,2018
4,14,16,1,2,0,1,2,0,11,15,...,3,5,2,2,0,0,-1,8,11,2018


In [8]:
#Scale data (0 to 1) to normalize the relationships
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(EPLSeason1819)
EPLSeason1819_scaled = pd.DataFrame(scaler.transform(EPLSeason1819), index=EPLSeason1819.index, columns=EPLSeason1819.columns)
#View scaled data
EPLSeason1819_scaled.columns

Index(['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
       'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
       'HR', 'AR', 'GD', 'Month', 'Day', 'Year'],
      dtype='object')

In [9]:
#Seperate features from output
x = EPLSeason1819[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR',
       'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
       'HR', 'AR', 'GD', 'Month', 'Day', 'Year']]
y = EPLSeason1819[['FTR']]
print('Class labels:', np.unique(y))


#Split test and training data for target object 1
#By default 75% training data and 25% testing data but we will do 80% training data and 20% testing data
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.2,
                                                    random_state=0)

Class labels: [0 1 2]




In [10]:
#View shape of train and test data sets for both feature and response
print (x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(268, 24) (67, 24) (268, 1) (67, 1)


In [11]:
#Train a logistic Regression Model on the training set
logReg = LogisticRegression()
logReg.fit(x_train,y_train)
#Make predictions for the testing set using Logistic Regression
logReg_y_predict = logReg.predict(x_test)
log_score = accuracy_score(y_test,logReg_y_predict)
#Print coefficients of Logistic Regression Model
print('Coefficients per Logistic Regression Model:',logReg.coef_)
#Print the accuracy score of the model
print('Accuracy: %.2f' % accuracy_score(y_test, logReg_y_predict))
print('Accuracy: %.2f' % logReg.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Coefficients per Logistic Regression Model: [[ 1.94628976e-02 -2.68709801e-02 -1.44672372e+00  1.64637180e+00
  -2.82130978e-01  3.35017514e-01 -4.10344640e-01  4.18944107e-02
  -5.88029013e-02  5.02940155e-02 -4.27578465e-02  5.79018631e-02
  -5.09742994e-02 -6.35266773e-02  7.02898157e-02  2.29328632e-02
  -1.27031665e-01  6.95597355e-03  1.57225648e-01 -6.99568666e-02
  -3.09309553e+00 -8.55828846e-03 -7.68455000e-03 -6.16275902e-04]
 [-3.26400158e-02  7.12166874e-02 -7.19311398e-01 -5.88609748e-01
   6.46440959e-02 -5.76014416e-03  2.83105199e-01 -3.15222371e-02
   4.33996980e-02 -6.39727686e-02  2.78437316e-02  9.09033686e-02
   7.71837278e-02  6.97137702e-02 -1.66884905e-03  4.18571250e-03
   3.09619989e-02  2.43157588e-01 -3.55417701e-01  2.71605024e-02
  -1.30701649e-01 -5.16297351e-04  4.50220915e-03 -1.51977919e-03]
 [ 4.21533917e-02 -4.60911672e-02  1.88481175e+00 -1.32277804e+00
   1.30111616e-01 -1.15095757e-01  1.59384400e-01  3.54633783e-02
  -3.14234543e-02  8.39129597e