In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

## 0. DataFrame

In [62]:
football_df = pd.read_csv('data/all_data_with_elo.csv', low_memory = False)
football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,HomeTeamELO,AwayTeamELO
0,0,D1,2003-08-01,Bayern Munich,Ein Frankfurt,3.0,1.0,H,17.0,6.0,...,5.000,11.000,1.25,4.5,9.0,1.25,4.50,10.00,,
1,1,F1,2003-08-01,Lille,Lyon,1.0,0.0,H,,,...,3.000,2.250,2.70,2.9,2.4,3.00,3.00,2.20,,
2,2,F1,2003-08-02,Auxerre,Nice,1.0,2.0,A,,,...,3.100,4.500,1.70,3.1,4.4,1.66,3.10,5.00,,
3,3,F1,2003-08-02,Guingamp,Marseille,0.0,1.0,A,,,...,2.875,2.625,2.70,2.9,2.4,2.60,3.10,2.40,,
4,4,D1,2003-08-02,Hamburg,Hannover,0.0,3.0,A,10.0,16.0,...,3.500,5.000,1.65,3.3,4.4,1.57,3.50,5.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37776,37776,F1,2024-10-06,Reims,Montpellier,4.0,2.0,H,18.0,10.0,...,3.800,4.330,,,,1.83,3.80,4.00,1633.626221,1645.806641
37777,37777,SP1,2024-10-06,Sevilla,Betis,1.0,0.0,H,14.0,11.0,...,3.200,3.000,,,,2.60,3.25,2.88,1676.242676,1709.259521
37778,37778,SP1,2024-10-06,Sociedad,Ath Madrid,1.0,1.0,D,16.0,4.0,...,3.200,2.300,,,,3.30,3.20,2.30,1766.551880,1828.522095
37779,37779,F1,2024-10-06,Strasbourg,Lens,2.0,2.0,D,8.0,12.0,...,3.300,2.400,,,,2.90,3.40,2.40,1608.732544,1719.138184


## 1. Descriptive Statistics 

**1.1 DataFrame Shape**

In [45]:
# no. rows and no. cols
football_df.shape

(37781, 23)

In [46]:
# feature names
print(football_df.columns.tolist())

['Unnamed: 0', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'HomeTeamELO', 'AwayTeamELO']


**1.2 NaN Values**

In [47]:
football_df.isnull().sum()

Unnamed: 0         0
Div                0
Date               0
HomeTeam           0
AwayTeam           0
FTHG               0
FTAG               0
FTR                0
HS              1762
AS              1762
HST             2568
AST             2568
B365H             49
B365D             49
B365A             49
IWH             1324
IWD             1324
IWA             1324
WHH              573
WHD              573
WHA              573
HomeTeamELO    37422
AwayTeamELO    37424
dtype: int64

In [55]:
# total elements in 
football_df.size

868963

In [56]:
# total number of NaN
football_df.size - football_df.count().sum()

89344

In [57]:
# total number of NaN rows
football_df.isnull().any(axis = 1).sum()

37780

In [58]:
# total number of NaN columns
football_df.isnull().any(axis = 0).sum()

15

## 2. Data Wrangling and Feature Transformation/Development

**2.1 NaN Handling**

`TODO`: drop NaN values along columns: {Date, Home Team, Away Team, FTR} <br>
`TODO`: identify betting odds w/ most available data

In [59]:
nan_mask = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 
            'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA', 'HomeTeamELO', 'AwayTeamELO']

In [60]:
#football_df.FTR.replace('nan', np.nan, inplace=True)
nan_football_df = football_df.dropna(subset = nan_mask)
nan_football_df

Unnamed: 0.1,Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,...,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,HomeTeamELO,AwayTeamELO
15761,15761,D1,2012-08-25,Stuttgart,Wolfsburg,0.0,1.0,A,10.0,13.0,...,3.5,3.6,1.9,3.3,3.7,2.05,3.6,3.4,1773.486572,1690.165649


In [61]:
# resize shape
football_df.shape[0] - nan_football_df.shape[0]

37780

**2.2 Feature Transformation** <br>
* One hot encode Division, Home and Away Teams
* Label encode Full Time Result (Win/Draw/Loss)
* $\phi(Date)$ $\Rightarrow$ one column for *year*, second column for *day of year*
* $\phi(x)$ feature transformation $\Rightarrow$ win/loss streak to date
* Betting odds $\Rightarrow$ average the home, away, and draw odds from the two odd sites

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [18]:
feats = nan_mask

In [19]:
learning_df = nan_football_df.copy()[feats]
learning_df.reset_index(inplace=True, drop=True)

**2.2.1 Division and Home/Away Team Encoding**

In [20]:
div_encoder = OneHotEncoder()
home_encoder = OneHotEncoder()
away_encoder = OneHotEncoder()

In [21]:
onehot_div = div_encoder.fit_transform(learning_df.Div.values.reshape(-1,1)).toarray().astype(int)
onehot_div_df = pd.DataFrame(onehot_div, columns = ["Div "+str(int(i)) for i in range(onehot_div.shape[1])])

onehot_home = home_encoder.fit_transform(learning_df.HomeTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_home_df = pd.DataFrame(onehot_home, columns = ['HomeTeam ' + str(int(i)) for i in np.arange(onehot_home.shape[1])])

onehot_away = away_encoder.fit_transform(learning_df.AwayTeam.values.reshape(-1,1)).toarray().astype(int)
onehot_away_df = pd.DataFrame(onehot_away, columns = ['AwayTeam ' + str(int(i)) for i in np.arange(onehot_away.shape[1])])

In [22]:
learning_df = pd.concat([learning_df, onehot_div_df, onehot_home_df, onehot_away_df], axis = 1)
learning_df.drop(columns = ['Div'], inplace = True)

In [23]:
learning_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,IWH,IWD,IWA,...,AwayTeam 194,AwayTeam 195,AwayTeam 196,AwayTeam 197,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203
0,2003-08-01,Bayern Munich,Ein Frankfurt,H,1.200,5.000,11.000,1.25,4.50,9.00,...,0,0,0,0,0,0,0,0,0,0
1,2003-08-01,Lille,Lyon,H,2.875,3.000,2.250,2.70,2.90,2.40,...,0,0,0,0,0,0,0,0,0,0
2,2003-08-02,Auxerre,Nice,A,1.727,3.100,4.500,1.70,3.10,4.40,...,0,0,0,0,0,0,0,0,0,0
3,2003-08-02,Guingamp,Marseille,A,2.500,2.875,2.625,2.70,2.90,2.40,...,0,0,0,0,0,0,0,0,0,0
4,2003-08-02,Hamburg,Hannover,A,1.571,3.500,5.000,1.65,3.30,4.40,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35629,2024-01-03,Granada,Cadiz,H,2.250,3.100,3.500,2.25,3.20,3.35,...,0,0,0,0,0,0,0,0,0,0
35630,2024-01-03,Real Madrid,Mallorca,H,1.200,6.500,15.000,1.22,6.50,13.00,...,0,0,0,0,0,0,0,0,0,0
35631,2024-01-04,Las Palmas,Barcelona,A,5.500,4.330,1.570,5.50,4.30,1.57,...,0,0,0,0,0,0,0,0,0,0
35632,2024-01-04,Osasuna,Almeria,H,1.700,4.000,4.750,1.70,3.90,4.80,...,0,0,0,0,0,0,0,0,0,0


**2.2.2 Date Transformation**

In [24]:
learning_df['Year'] = pd.DatetimeIndex(learning_df.Date).year
learning_df['DayofYear'] = pd.DatetimeIndex(learning_df.Date).dayofyear
learning_df.drop(columns = ['Date'], inplace = True)

**2.2.3 Full Time Result Encoding**

In [25]:
target_encoder = LabelEncoder()
learning_df['Result'] = target_encoder.fit_transform(learning_df.FTR) 
learning_df.drop(columns = ['FTR'], inplace = True)

**2.2.4 Win/Loss Streak Feature Creation** <br>
Important note about this feature: the win/loss streak is the teams *home* and *away* win streak, *not* its ***consecutive*** win/loss streak.

In [24]:
# https://stackoverflow.com/questions/52976336/compute-winning-streak-with-pandas
# https://joshdevlin.com/blog/calculate-streaks-in-pandas/

In [26]:
def compute_winstreak(df):
    
    years = df.Year.unique()
    df_lst = []    
    for year in years:
        
        year_df = df[df.Year == year]
        year_df['HomeWin'] = year_df.Result.replace([0, 1, 2], [0, 0, 1])
        year_df['AwayWin'] = year_df.Result.replace([0, 1, 2], [1, 0, 0])
        year_df['HomeWinStreak'] = None
        year_df['AwayWinStreak'] = None
        
        hometeams = year_df.HomeTeam.unique()
        awayteams = year_df.AwayTeam.unique()
        
        for team in hometeams:
            team_df = year_df[(year_df.HomeTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.HomeWin != team_df.HomeWin.shift()).cumsum()
            team_df['HomeWinStreak'] = team_df[['HomeWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.HomeWinStreak >0, 'HomeWinStreak'] -= 1
            year_df.loc[team_df.index, 'HomeWinStreak'] = team_df.HomeWinStreak
            
        for team in awayteams:
            team_df = year_df[(year_df.AwayTeam == team)]
            team_df = team_df.sort_values(['Year', 'DayofYear'], ascending = (True, True))

            team_grouper = (team_df.AwayWin != team_df.AwayWin.shift()).cumsum()
            team_df['AwayWinStreak'] = team_df[['AwayWin']].groupby(team_grouper).cumsum()
            team_df.loc[team_df.AwayWinStreak >0, 'AwayWinStreak'] -= 1
            year_df.loc[team_df.index, 'AwayWinStreak'] = team_df.AwayWinStreak
            
        df_lst.append(year_df)
        
    return pd.concat(df_lst, axis = 0).drop(columns = ['HomeWin', 'AwayWin'])

In [27]:
learning_df = compute_winstreak(learning_df)
learning_df.drop(columns = ['HomeTeam', 'AwayTeam'], inplace = True)

**2.2.5 Last Match Result** <br>
Indicate the result from the last match played between both teams

In [27]:
## TODO ##

**2.2.6 Team Wins to Date**

In [28]:
## TODO ##

**2.2.7 Website Odds** <br>
The `betting odds` recorded by various betting websites offer insight into sentiment surrounding the outcome of a particular game. 

In [29]:
## TODO ##

In [28]:
betting_feats = ['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA']
betting_feats

['B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'WHH', 'WHD', 'WHA']

In [29]:
def compute_odds(df, betting_feats):
    """
    """
    home_odds = []
    away_odds = []
    draw_odds = []
    for odd in betting_feats:
        odd_type = odd[-1]
        if odd_type == 'H':
            home_odds.append(odd)
        elif odd_type == 'A':
            away_odds.append(odd)
        else:
            draw_odds.append(odd)
    avg_home_odds = df[home_odds].mean(axis=1)
    avg_away_odds = df[away_odds].mean(axis=1)
    avg_draw_odds = df[draw_odds].mean(axis=1)
    
    ordered_cols = ['HomeOdds', 'AwayOdds', 'DrawOdds'] + df.columns.tolist()
    
    df['HomeOdds'] = avg_home_odds
    df['AwayOdds'] = avg_away_odds
    df['DrawOdds'] = avg_draw_odds
    
    return df[ordered_cols]

In [30]:
learning_df = compute_odds(learning_df, betting_feats)

**2.2.8 Elo Difference**

In [33]:
# hometeamELO = learning_df.HomeTeamELO - learning_df.AwayTeamELO

In [34]:
# learning_df.drop(columns = ['HomeTeamELO', 'AwayTeamELO'], inplace = True)
# learning_df['HomeTeamELO'] = hometeamELO

**2.2.8 Peek @ Learning DataFrame**

In [31]:
learning_df

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,Year,DayofYear,Result,HomeWinStreak,AwayWinStreak
0,1.233333,10.000000,4.666667,1.200,5.000,11.000,1.25,4.50,9.00,1.25,...,0,0,0,0,0,2003,213,2,0,0
1,2.858333,2.283333,2.966667,2.875,3.000,2.250,2.70,2.90,2.40,3.00,...,0,0,0,0,0,2003,213,2,0,0
2,1.695667,4.633333,3.100000,1.727,3.100,4.500,1.70,3.10,4.40,1.66,...,0,0,0,0,0,2003,214,0,0,0
3,2.600000,2.475000,2.958333,2.500,2.875,2.625,2.70,2.90,2.40,2.60,...,0,0,0,0,0,2003,214,0,0,0
4,1.597000,4.800000,3.433333,1.571,3.500,5.000,1.65,3.30,4.40,1.57,...,0,0,0,0,0,2003,214,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35629,2.216667,3.416667,3.166667,2.250,3.100,3.500,2.25,3.20,3.35,2.15,...,0,0,0,0,0,2024,3,2,0,0
35630,1.206667,13.666667,6.333333,1.200,6.500,15.000,1.22,6.50,13.00,1.20,...,0,0,0,0,0,2024,3,2,0,0
35631,5.500000,1.556667,4.276667,5.500,4.330,1.570,5.50,4.30,1.57,5.50,...,0,0,0,0,0,2024,4,0,0,0
35632,1.683333,4.716667,3.933333,1.700,4.000,4.750,1.70,3.90,4.80,1.65,...,0,0,0,0,0,2024,4,2,0,0


## 3. Preliminary Regression

* Establish a baseline Linear Regression model fit over the entire learning dataframe without special regard to *division* and *team*. 
* Train model over 18 seasons, and predict for the remaining 3 seasons (approximate 80-20 split)

In [32]:
from sklearn.metrics import accuracy_score

**3.1 Train and Test Split**

In [33]:
split = 0.80
no_seasons = 20

print('No. seasons to train over: ' + str(round(split*no_seasons)))

No. seasons to train over: 16


In [34]:
X, y = learning_df.loc[:, learning_df.columns != 'Result'], learning_df[['Result']]

In [35]:
X

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,Year,DayofYear,HomeWinStreak,AwayWinStreak
0,1.233333,10.000000,4.666667,1.200,5.000,11.000,1.25,4.50,9.00,1.25,...,0,0,0,0,0,0,2003,213,0,0
1,2.858333,2.283333,2.966667,2.875,3.000,2.250,2.70,2.90,2.40,3.00,...,0,0,0,0,0,0,2003,213,0,0
2,1.695667,4.633333,3.100000,1.727,3.100,4.500,1.70,3.10,4.40,1.66,...,0,0,0,0,0,0,2003,214,0,0
3,2.600000,2.475000,2.958333,2.500,2.875,2.625,2.70,2.90,2.40,2.60,...,0,0,0,0,0,0,2003,214,0,0
4,1.597000,4.800000,3.433333,1.571,3.500,5.000,1.65,3.30,4.40,1.57,...,0,0,0,0,0,0,2003,214,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35629,2.216667,3.416667,3.166667,2.250,3.100,3.500,2.25,3.20,3.35,2.15,...,0,0,0,0,0,0,2024,3,0,0
35630,1.206667,13.666667,6.333333,1.200,6.500,15.000,1.22,6.50,13.00,1.20,...,0,0,0,0,0,0,2024,3,0,0
35631,5.500000,1.556667,4.276667,5.500,4.330,1.570,5.50,4.30,1.57,5.50,...,0,0,0,0,0,0,2024,4,0,0
35632,1.683333,4.716667,3.933333,1.700,4.000,4.750,1.70,3.90,4.80,1.65,...,0,0,0,0,0,0,2024,4,0,0


In [92]:
xTr, xTe = X[X.Year <= 2016], X[X.Year > 2016]
yTr, yTe = y.loc[xTr.index, :], y.loc[xTe.index, :]

In [93]:
xTe

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,Year,DayofYear,HomeWinStreak,AwayWinStreak
24233,1.306667,9.333333,5.500000,1.29,5.50,9.50,1.30,5.50,9.50,1.33,...,0,0,0,0,0,0,2017,1,0,0
24234,5.333333,1.683333,3.683333,5.50,3.75,1.62,5.00,3.70,1.70,5.50,...,0,0,0,0,0,0,2017,1,0,0
24235,2.200000,3.500000,3.333333,2.20,3.40,3.60,2.10,3.40,3.50,2.30,...,0,0,0,0,0,0,2017,2,0,0
24236,1.166667,17.333333,7.666667,1.17,8.50,19.00,1.15,7.50,18.00,1.18,...,0,0,0,0,0,0,2017,2,0,0
24237,2.666667,2.860000,3.166667,2.75,3.20,2.88,2.50,3.20,2.90,2.75,...,0,0,0,0,0,0,2017,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36845,2.216667,3.416667,3.166667,2.25,3.10,3.50,2.25,3.20,3.35,2.15,...,0,0,0,0,0,0,2024,3,0,0
36846,1.206667,13.666667,6.333333,1.20,6.50,15.00,1.22,6.50,13.00,1.20,...,0,0,0,0,0,0,2024,3,0,0
36847,5.500000,1.556667,4.276667,5.50,4.33,1.57,5.50,4.30,1.57,5.50,...,0,0,0,0,0,0,2024,4,0,0
36848,1.683333,4.716667,3.933333,1.70,4.00,4.75,1.70,3.90,4.80,1.65,...,0,0,0,0,0,0,2024,4,0,0


**3.2 Normalization** <br>
Following our various feature transformations and development, we arrived to a sparse dataframe with the exception of a few features(*Year, DayofYear*). It will be important to *normalize* these features as they are in gross magnitudes compared to the remaining features. During model training, having dominating features (in scale relative to others) can be dangerous as the weight updates may mistakengly favor these larger-scale features because it will have the largest influence on the target output. 

In [94]:
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
xTr.loc[:, ['Year', 'DayofYear']] = minmax_scaler.fit_transform(xTr.loc[:, ['Year', 'DayofYear']])
xTe.loc[:, ['Year', 'DayofYear']] = minmax_scaler.transform(xTe.loc[:, ['Year', 'DayofYear']])

In [95]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
to_scale = ['HomeTeamELO', 'AwayTeamELO', 'HomeOdds', 'AwayOdds', 'DrawOdds'] + betting_feats

xTr.loc[:, to_scale] = std_scaler.fit_transform(xTr.loc[:, to_scale])
xTe.loc[:, to_scale] = std_scaler.transform(xTe.loc[:, to_scale])

In [96]:
xTr

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,Year,DayofYear,HomeWinStreak,AwayWinStreak
0,0.062661,-0.570352,-0.706764,0.001623,-0.757236,-0.557594,0.176368,-0.887701,-0.566919,0.029169,...,0,0,0,0,0,0,0.0,0.586301,0,0
1,-0.191482,-0.414957,-0.659551,-0.252136,-0.636665,-0.391415,-0.137369,-0.887701,-0.376579,-0.170640,...,0,0,0,0,0,0,0.0,0.586301,0,0
2,-0.156826,-0.446681,-0.659551,-0.156977,-0.636665,-0.485342,-0.137369,-0.758063,-0.414647,-0.170640,...,0,0,0,0,0,0,0.0,0.586301,0,0
3,-0.391793,-0.175143,-0.621780,-0.421521,-0.636665,-0.160210,-0.372671,-0.758063,-0.110104,-0.370449,...,0,0,0,0,0,0,0.0,0.586301,0,0
4,-0.367303,-0.228913,-0.584010,-0.373306,-0.636665,-0.232462,-0.294237,-0.758063,-0.262376,-0.417071,...,0,0,0,0,0,0,0.0,0.586301,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24228,-0.237690,-0.223536,-0.092994,-0.252136,-0.057923,-0.246912,-0.215803,-0.109874,-0.224308,-0.237243,...,1,0,0,0,0,0,1.0,1.000000,0,0
24229,-0.041307,-0.417108,-0.168535,-0.093537,-0.057923,-0.420316,0.019499,-0.239512,-0.433681,-0.037434,...,0,0,0,0,0,0,1.0,1.000000,1,0
24230,-0.822219,3.276881,2.513167,-0.791375,2.257045,3.307861,-0.866806,3.001437,2.440450,-0.803367,...,0,0,0,0,0,0,1.0,1.000000,2,0
24231,-0.480281,0.319539,0.020317,-0.474176,0.038534,0.273298,-0.529540,0.279040,0.346712,-0.437051,...,0,0,0,0,0,0,1.0,1.000000,0,0


In [97]:
xTe

Unnamed: 0,HomeOdds,AwayOdds,DrawOdds,B365H,B365D,B365A,IWH,IWD,IWA,WHH,...,AwayTeam 198,AwayTeam 199,AwayTeam 200,AwayTeam 201,AwayTeam 202,AwayTeam 203,Year,DayofYear,HomeWinStreak,AwayWinStreak
24233,-0.787563,1.610016,2.211003,-0.765999,1.774760,1.429322,-0.843276,2.612523,2.059771,-0.750085,...,0,0,0,0,0,0,1.071429,0.000000,0,0
24234,2.003390,-0.858020,0.152514,1.904817,0.086762,-0.848045,2.058786,0.279040,-0.909531,2.027254,...,0,0,0,0,0,0,1.071429,0.000000,0,0
24235,-0.168378,-0.271929,-0.244076,-0.188696,-0.250837,-0.275813,-0.215803,-0.109874,-0.224308,-0.104037,...,0,0,0,0,0,0,1.071429,0.002740,0,0
24236,-0.884600,4.190969,4.666083,-0.842126,4.668470,4.174879,-0.960927,5.205283,5.295548,-0.849989,...,0,0,0,0,0,0,1.071429,0.002740,0,0
24237,0.155077,-0.478405,-0.432928,0.160222,-0.443751,-0.483897,0.097933,-0.369150,-0.452715,0.195676,...,0,0,0,0,0,0,1.071429,0.002740,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36845,-0.156826,-0.298814,-0.432928,-0.156977,-0.540208,-0.304713,-0.098152,-0.369150,-0.281410,-0.203941,...,0,0,0,0,0,0,1.571429,0.005479,0,0
36846,-0.856875,3.008032,3.155265,-0.823094,2.739330,3.018855,-0.906023,3.908903,3.392149,-0.836669,...,0,0,0,0,0,0,1.571429,0.005479,0,0
36847,2.118910,-0.898886,0.824828,1.904817,0.646213,-0.862495,2.450957,1.056868,-0.959019,2.027254,...,0,0,0,0,0,0,1.571429,0.008219,0,0
36848,-0.526489,0.120591,0.435792,-0.505895,0.327905,0.056544,-0.529540,0.538316,0.270576,-0.536956,...,0,0,0,0,0,0,1.571429,0.008219,0,0


**3.3 HomeWins Baseline Model**

In [98]:
yTe.Result.values.shape

(12617,)

In [99]:
baseline_preds = np.full((yTe.Result.values.shape, 1), 2) #predicts home wins all the time
accuracy_score(yTe.Result.values, baseline_preds.ravel())

0.4415471189664738

**3.4 Multinomial Logistic Regression**

**3.4.1 Full Model Fit**

In [100]:
from sklearn.linear_model import LogisticRegression
linear_model = LogisticRegression(max_iter = 10000).fit(xTr, yTr.values.ravel())

In [101]:
lr_preds = linear_model.predict(xTe)
accuracy_score(yTe.Result.values, lr_preds)

0.6126654513751288

In [102]:
accuracy_score(yTr.Result.values, linear_model.predict(xTr))

0.6336813436223332

**3.4.2 Parameter Tuning**

In [117]:
from sklearn.model_selection import RandomizedSearchCV

logistic_params = {'penalty': ['l1','l2','elasticnet'],
                   'C':[0.001,0.01,0.10,0.50,1.0],
                   'l1_ratio': [0.2, 0.5, 0.8]}

logistic_randsearch = RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000, solver='saga'),
                                         param_distributions=logistic_params,
                                         scoring='accuracy',
                                         verbose=1,
                                         cv=5)

logistic_rand_results = logistic_randsearch.fit(xTr, yTr.values.ravel())
print("Best: %f using %s" % (logistic_rand_results.best_score_, logistic_rand_results.best_params_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best: 0.609252 using {'penalty': 'elasticnet', 'l1_ratio': 0.8, 'C': 0.01}


In [118]:
rs_preds = logistic_rand_results.best_estimator_.predict(xTe)

In [116]:
accuracy_score(yTe.Result.values, rs_preds)

0.6229690100657843

**3.4.3 Team Fit Model**

In [83]:
X_barcelona = X[X['HomeTeam 17'] == 1].loc[:, 'AwayTeam 0':]
y_barcelona = y.loc[X_barcelona.index, :]

In [84]:
bxTr, bxTe = X_barcelona[X_barcelona.Year <= 2016], X_barcelona[X_barcelona.Year > 2018]
byTr, byTe = y_barcelona.loc[bxTr.index, :], y_barcelona.loc[bxTe.index, :]

In [85]:
bminmax_scaler = MinMaxScaler()
bxTr.loc[:, ['Year', 'DayofYear']] = minmax_scaler.fit_transform(bxTr.loc[:, ['Year', 'DayofYear']])
bxTe.loc[:, ['Year', 'DayofYear']] = minmax_scaler.transform(bxTe.loc[:, ['Year', 'DayofYear']])

In [86]:
barcelona_model = LogisticRegression(max_iter = 10000).fit(bxTr, byTr.values.ravel())

In [87]:
barcelona_preds = barcelona_model.predict(bxTe)
accuracy_score(byTe.Result.values, barcelona_preds)

0.7916666666666666

In [88]:
fullModel_preds = linear_model.predict(xTe[xTe['HomeTeam 18'] == 1])
accuracy_score(byTe.Result.values, fullModel_preds)

ValueError: Found array with 0 sample(s) (shape=(0, 431)) while a minimum of 1 is required by LogisticRegression.

**3.5 Ridge Classifier**

In [81]:
from sklearn.linear_model import RidgeClassifier
ridge_model = RidgeClassifier().fit(xTr, yTr.values.ravel())

In [82]:
ridge_preds = ridge_model.predict(xTe)
accuracy_score(yTe.Result.values, ridge_preds)

0.594436078307046