In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

## 0. DataFrame

In [2]:
football_df = pd.read_csv('all_data.csv', low_memory = False)
football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,...,Unnamed: 61,Unnamed: 62,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 42,Unnamed: 43,Unnamed: 39,Unnamed: 40
0,0,F1,2000-07-28,,Marseille,Troyes,3.0,1.0,H,2.0,...,,,,,,,,,,
1,1,F1,2000-07-28,,Paris SG,Strasbourg,3.0,1.0,H,1.0,...,,,,,,,,,,
2,2,F1,2000-07-29,,Auxerre,Sedan,0.0,1.0,A,0.0,...,,,,,,,,,,
3,3,F1,2000-07-29,,Bordeaux,Metz,1.0,1.0,D,1.0,...,,,,,,,,,,
4,4,F1,2000-07-29,,Guingamp,St Etienne,2.0,2.0,D,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37412,37412,,,,,,,,,,...,,,,,,,,,,
37413,37413,,,,,,,,,,...,,,,,,,,,,
37414,37414,,,,,,,,,,...,,,,,,,,,,
37415,37415,,,,,,,,,,...,,,,,,,,,,


## 1. Descriptive Statistics 

**1.1 DataFrame Shape**

In [3]:
# no. rows and no. cols
football_df.shape

(37417, 187)

In [4]:
# feature names
football_df.columns.tolist()

['Unnamed: 0',
 'Div',
 'Date',
 'Time',
 'HomeTeam',
 'AwayTeam',
 'FTHG',
 'FTAG',
 'FTR',
 'HTHG',
 'HTAG',
 'HTR',
 'Referee',
 'HS',
 'AS',
 'HST',
 'AST',
 'HF',
 'AF',
 'HC',
 'AC',
 'HY',
 'AY',
 'HR',
 'AR',
 'B365H',
 'B365D',
 'B365A',
 'BWH',
 'BWD',
 'BWA',
 'IWH',
 'IWD',
 'IWA',
 'PSH',
 'PSD',
 'PSA',
 'WHH',
 'WHD',
 'WHA',
 'VCH',
 'VCD',
 'VCA',
 'MaxH',
 'MaxD',
 'MaxA',
 'AvgH',
 'AvgD',
 'AvgA',
 'B365>2.5',
 'B365<2.5',
 'P>2.5',
 'P<2.5',
 'Max>2.5',
 'Max<2.5',
 'Avg>2.5',
 'Avg<2.5',
 'AHh',
 'B365AHH',
 'B365AHA',
 'PAHH',
 'PAHA',
 'MaxAHH',
 'MaxAHA',
 'AvgAHH',
 'AvgAHA',
 'B365CH',
 'B365CD',
 'B365CA',
 'BWCH',
 'BWCD',
 'BWCA',
 'IWCH',
 'IWCD',
 'IWCA',
 'PSCH',
 'PSCD',
 'PSCA',
 'WHCH',
 'WHCD',
 'WHCA',
 'VCCH',
 'VCCD',
 'VCCA',
 'MaxCH',
 'MaxCD',
 'MaxCA',
 'AvgCH',
 'AvgCD',
 'AvgCA',
 'B365C>2.5',
 'B365C<2.5',
 'PC>2.5',
 'PC<2.5',
 'MaxC>2.5',
 'MaxC<2.5',
 'AvgC>2.5',
 'AvgC<2.5',
 'AHCh',
 'B365CAHH',
 'B365CAHA',
 'PCAHH',
 'PCAHA',
 'MaxC

**1.2 NaN Values**

In [5]:
football_df.isnull().sum()

Unnamed: 0         0
Div               18
Date              18
Time           33400
HomeTeam          18
               ...  
Unnamed: 47    37417
Unnamed: 42    37417
Unnamed: 43    37417
Unnamed: 39    37417
Unnamed: 40    37417
Length: 187, dtype: int64

In [6]:
# total elements in 
football_df.size

6996979

In [7]:
# total number of NaN
football_df.size - football_df.count().sum()

4513631

In [8]:
# total number of NaN rows
football_df.isnull().any(axis = 1).sum()

37417

In [9]:
# total number of NaN columns
football_df.isnull().any(axis = 0).sum()

186

## 2. Data Wrangling and Feature Transformation/Development

**2.1 NaN Handling**

`TODO`: drop NaN values along columns: {Date, Home Team, Away Team, FTR} <br>
`TODO`: identify betting odds w/ most available data

In [10]:
#football_df.FTR.replace('nan', np.nan, inplace=True)
nan_football_df = football_df.dropna(subset = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR'])
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,...,Unnamed: 61,Unnamed: 62,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 42,Unnamed: 43,Unnamed: 39,Unnamed: 40
0,0,F1,2000-07-28,,Marseille,Troyes,3.0,1.0,H,2.0,...,,,,,,,,,,
1,1,F1,2000-07-28,,Paris SG,Strasbourg,3.0,1.0,H,1.0,...,,,,,,,,,,
2,2,F1,2000-07-29,,Auxerre,Sedan,0.0,1.0,A,0.0,...,,,,,,,,,,
3,3,F1,2000-07-29,,Bordeaux,Metz,1.0,1.0,D,1.0,...,,,,,,,,,,
4,4,F1,2000-07-29,,Guingamp,St Etienne,2.0,2.0,D,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37394,37394,SP1,2021-10-24,13:00,Sevilla,Levante,5.0,3.0,H,3.0,...,,,,,,,,,,
37395,37395,D1,2021-10-24,16:30,Stuttgart,Union Berlin,1.0,1.0,D,0.0,...,,,,,,,,,,
37396,37396,I1,2021-10-24,14:00,Verona,Lazio,4.0,1.0,H,2.0,...,,,,,,,,,,
37397,37397,E0,2021-10-24,14:00,West Ham,Tottenham,1.0,0.0,H,0.0,...,,,,,,,,,,


**2.2 Feature Transformation** <br>
* One hot encode Division, Home and Away Teams
* Label encode Full Time Result (Win/Draw/Loss)
* $\phi(Date)$ $\Rightarrow$ one column for *year*, second column for *day of year*
* $\phi(x)$ feature transformation $\Rightarrow$ win/loss streak to date
* Betting odds (select website odds w/ least number of NaN or most available)

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [12]:
learning_df = nan_football_df.copy()[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR']]
learning_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTR
0,F1,2000-07-28,Marseille,Troyes,H
1,F1,2000-07-28,Paris SG,Strasbourg,H
2,F1,2000-07-29,Auxerre,Sedan,A
3,F1,2000-07-29,Bordeaux,Metz,D
4,F1,2000-07-29,Guingamp,St Etienne,D
...,...,...,...,...,...
37394,SP1,2021-10-24,Sevilla,Levante,H
37395,D1,2021-10-24,Stuttgart,Union Berlin,D
37396,I1,2021-10-24,Verona,Lazio,H
37397,E0,2021-10-24,West Ham,Tottenham,H


**2.2.1 Division and Home/Away Team Encoding**

In [13]:
div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()

In [14]:
onehot_div = div_encoder.fit_transform(learning_df.Div.values.reshape(-1,1)).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns = ["Div "+str(int(i)) for i in range(onehot_div.shape[1])])

onehot_home = home_encoder.fit_transform(learning_df.HomeTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns = ['HomeTeam ' + str(int(i)) for i in np.arange(onehot_home.shape[1])])

onehot_away = away_encoder.fit_transform(learning_df.AwayTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns = ['AwayTeam ' + str(int(i)) for i in np.arange(onehot_away.shape[1])])

In [15]:
learning_df = pd.concat([learning_df, onehot_div_df, onehot_home_df, onehot_away_df], axis = 1)
learning_df.drop(columns = ['Div'], inplace = True)

**2.2.2 Date Transformation**

In [16]:
learning_df['Year'] = pd.DatetimeIndex(learning_df.Date).year
learning_df['DayofYear'] = pd.DatetimeIndex(learning_df.Date).dayofyear
learning_df.drop(columns = ['Date'], inplace = True)

**2.2.3 Full Time Result Encoding**

In [17]:
target_encoder = LabelEncoder()
learning_df['Result'] = target_encoder.fit_transform(learning_df.FTR) 
learning_df.drop(columns = ['FTR'], inplace = True)

**2.2.4 Win/Loss Streak Feature Creation** <br>
Important note about this feature: the win/loss streak is the teams *home* and *away* win streak, *not* its ***consecutive*** win/loss streak.

In [18]:
# https://stackoverflow.com/questions/52976336/compute-winning-streak-with-pandas
# https://joshdevlin.com/blog/calculate-streaks-in-pandas/

In [19]:
def compute_winstreak(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinStreak'] = None
        year_df['AwayWinStreak'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.HomeWin != team_df.HomeWin.shift()).cumsum()
            team_df['HomeWinStreak'] = team_df[['HomeWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.HomeWinStreak >0, 'HomeWinStreak'] -= 1
            year_df.loc[team_df.index, 'HomeWinStreak'] = team_df.HomeWinStreak
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.AwayWin != team_df.AwayWin.shift()).cumsum()
            team_df['AwayWinStreak'] = team_df[['AwayWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.AwayWinStreak >0, 'AwayWinStreak'] -= 1
            year_df.loc[team_df.index, 'AwayWinStreak'] = team_df.AwayWinStreak
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin'])

In [20]:
learning_df = compute_winstreak(learning_df)
learning_df.drop(columns = ['HomeTeam', 'AwayTeam'], inplace = True)

**2.2.5 Last Match Result** <br>
Indicate the result from the last match played between both teams

In [21]:
## TODO ##

**2.2.6 Team Wins to Date**

In [None]:
## TODO ##

**2.2.7 Website Odds** <br>
The `betting odds` recorded by various betting websites offer insight into sentiment surrounding the outcome of a particular game. 

In [22]:
## TODO ##

**2.2.8 Peek @ Learning DataFrame**

In [23]:
learning_df

Unnamed: 0,Div 0,Div 1,Div 2,Div 3,Div 4,HomeTeam 0,HomeTeam 1,HomeTeam 2,HomeTeam 3,HomeTeam 4,...,AwayTeam 206,AwayTeam 207,AwayTeam 208,AwayTeam 209,AwayTeam 210,Year,DayofYear,Result,HomeWinStreak,AwayWinStreak
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2000,210,2,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2000,210,2,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2000,211,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2000,211,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2000,211,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,2021,297,2,5,0
37395,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2021,297,1,0,0
37396,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,2021,297,2,2,0
37397,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2021,297,2,0,0


## 3. Preliminary Regression

* Establish a baseline Linear Regression model fit over the entire learning dataframe without special regard to *division* and *team*. 
* Train model over 18 seasons, and predict for the remaining 3 seasons (approximate 80-20 split)

In [24]:
from sklearn.metrics import accuracy_score

**3.1 Train and Test Split**

In [25]:
split = 0.80
no_seasons = 22

print('No. seasons to train over: ' + str(round(split*no_seasons)))

No. seasons to train over: 18


In [26]:
X, y = learning_df.loc[:, learning_df.columns != 'Result'], learning_df[['Result']]

In [27]:
X

Unnamed: 0,Div 0,Div 1,Div 2,Div 3,Div 4,HomeTeam 0,HomeTeam 1,HomeTeam 2,HomeTeam 3,HomeTeam 4,...,AwayTeam 205,AwayTeam 206,AwayTeam 207,AwayTeam 208,AwayTeam 209,AwayTeam 210,Year,DayofYear,HomeWinStreak,AwayWinStreak
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2000,210,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2000,210,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2000,211,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2000,211,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2000,211,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37394,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2021,297,5,0
37395,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2021,297,0,0
37396,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2021,297,2,0
37397,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2021,297,0,0


In [28]:
xTr, xTe = X[X.Year <= 2018], X[X.Year > 2018]
yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]

**3.2 Normalization** <br>
Following our various feature transformations and development, we arrived to a sparse dataframe with the exception of a few features(*Year, DayofYear*). It will be important to *normalize* these features as they are in gross magnitudes compared to the remaining features. During model training, having dominating features (in scale relative to others) can be dangerous as the weight updates may mistakengly favor these larger-scale features because it will have the largest influence on the target output. 

In [29]:
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
xTr.loc[:, ['Year', 'DayofYear']] = minmax_scaler.fit_transform(xTr.loc[:, ['Year', 'DayofYear']])
xTe.loc[:, ['Year', 'DayofYear']] = minmax_scaler.transform(xTe.loc[:, ['Year', 'DayofYear']])

In [31]:
xTr

Unnamed: 0,Div 0,Div 1,Div 2,Div 3,Div 4,HomeTeam 0,HomeTeam 1,HomeTeam 2,HomeTeam 3,HomeTeam 4,...,AwayTeam 205,AwayTeam 206,AwayTeam 207,AwayTeam 208,AwayTeam 209,AwayTeam 210,Year,DayofYear,HomeWinStreak,AwayWinStreak
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.572603,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.572603,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.575342,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.575342,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.575342,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32442,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.0,0.991781,0,0
32443,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1.0,0.994521,0,0
32444,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.0,0.994521,0,2
32445,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.0,0.994521,2,0


**3.3 HomeWins Baseline Model**

In [43]:
baseline_preds = np.full((4952, 1), 2) #predicts home wins all the time
accuracy_score(yTe.Result.values, baseline_preds.ravel())

0.42851373182552505

In [47]:
accuracy_score(yTr.Result.values, np.full((32447, 1), 2))

0.46512774678706814

**3.4 Multinomial Logistic Regression**

**3.4.1 Full Model Fit**

In [44]:
from sklearn.linear_model import LogisticRegression
linear_model = LogisticRegression(max_iter = 10000).fit(xTr, yTr.values.ravel())

In [45]:
lr_preds = linear_model.predict(xTe)
accuracy_score(yTe.Result.values, lr_preds)

0.574313408723748

In [46]:
accuracy_score(yTr.Result.values, linear_model.predict(xTr))

0.6099177119610442

**3.4.2 Team Fit Model**

In [64]:
X_barcelona = X[X['HomeTeam 18'] == 1].loc[:, 'AwayTeam 0':]
y_barcelona = y.loc[X_barcelona.index, :]

In [65]:
bxTr, bxTe = X_barcelona[X_barcelona.Year <= 2018], X_barcelona[X_barcelona.Year > 2018]
byTr, byTe = y_barcelona.loc[bxTr.index, :], y_barcelona.loc[bxTe.index, :]

In [66]:
bminmax_scaler = MinMaxScaler()
bxTr.loc[:, ['Year', 'DayofYear']] = minmax_scaler.fit_transform(bxTr.loc[:, ['Year', 'DayofYear']])
bxTe.loc[:, ['Year', 'DayofYear']] = minmax_scaler.transform(bxTe.loc[:, ['Year', 'DayofYear']])

In [67]:
barcelona_model = LogisticRegression(max_iter = 10000).fit(bxTr, byTr.values.ravel())

In [68]:
barcelona_preds = barcelona_model.predict(bxTe)
accuracy_score(byTe.Result.values, barcelona_preds)

0.8333333333333334

In [70]:
fullModel_preds = linear_model.predict(xTe[xTe['HomeTeam 18'] == 1])
accuracy_score(byTe.Result.values, fullModel_preds)

0.8333333333333334

**3.5 Ridge Classifier**

In [None]:
from sklearn.linear_model import RidgeClassifier
ridge_model = RidgeClassifier().fit(xTr, yTr.values.ravel())

In [None]:
ridge_preds = ridge_model.predict(xTe)
accuracy_score(yTe.Result.values, ridge_preds)

In [None]:
yTe.shape