In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from collections import defaultdict
from sklearn.metrics import brier_score_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
pd.set_option('display.max_columns', 100)

In [3]:
df = pd.read_csv('soccer18.csv', index_col = False)

In [4]:
df['GD_Home'] = df.FTHG - df.FTAG
df['GD_Away'] = df.FTAG - df.FTHG
df['Pts_Home'] = np.where(df['FTHG'] == df['FTAG'], 1, np.where( df['FTHG'] >  df['FTAG'], 3, 0))
df['Pts_Away'] = np.where(df['FTAG'] == df['FTHG'], 1, np.where( df['FTAG'] >  df['FTHG'], 3, 0))
df['xPts_Home'] = np.where(round(df['home_xG']) == round(df['away_xG']), 1, np.where( round(df['home_xG']) >  round(df['away_xG']), 3, 0))
df['xPts_Away'] = np.where(round(df['away_xG']) == round(df['home_xG']), 1, np.where( round(df['away_xG']) >  round(df['home_xG']), 3, 0))
df['Conversion_Home'] = df.FTHG/df.home_xG
df['Conversion_Away'] = df.FTAG/df.away_xG
df['Home_win'] = (df['FTHG'] - df['FTAG'] > 0).astype(int)

In [5]:
df = df.rename({'HomeTeam':'Team_Home','AwayTeam':'Team_Away','FTHG':'FTG_Home','FTAG':'FTG_Away', 
               'HTHG':'HTG_Home','HTAG':'HTG_Away', 'HS':'S_Home','AS':'S_Away', 'HST':'ST_Home','AST':'ST_Away',
               'home_xG':'xG_Home','away_xG':'xG_Away'}, axis = 1)
df.loc[:,'GameID'] = df.index

In [6]:
df = pd.wide_to_long(df, ['Team','FTG','HTG','S','ST','xG','GD','Pts','xPts','Conversion'], i = ['GameID'], j = 'isHome', sep = '_', suffix = r'\w+')

In [7]:
df = df.reset_index().sort_values(['Date','GameID'])
df.loc[:,'isHome'] = 1*(df.isHome == 'Home')

In [8]:
df.loc[:,'GP'] = df.groupby('Team').Pts.transform(lambda x : x.rolling(len(df), 1).count().shift(1, fill_value = 0))
df['GP_Season'] = (df['GP']) % np.where(df['Div'] == 'Bundesliga', 34, 38)
df.loc[:,'G_total'] = df.groupby('Team').FTG.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'Pts_total'] = df.groupby('Team').Pts.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'Form3'] = df.groupby(['Team','Y']).Pts.transform(lambda x : x.rolling(3, 1).sum().shift(1, fill_value = 0))
df.loc[:,'Form5'] = df.groupby(['Team','Y']).Pts.transform(lambda x : x.rolling(5, 1).sum().shift(1, fill_value = 0))
df.loc[:,'Form7'] = df.groupby(['Team','Y']).Pts.transform(lambda x : x.rolling(7, 1).sum().shift(1, fill_value = 0))
df.loc[:,'Form3g'] = df.groupby(['Team','Y']).FTG.transform(lambda x : x.rolling(3, 1).sum().shift(1, fill_value = 0))
df.loc[:,'Form5g'] = df.groupby(['Team','Y']).FTG.transform(lambda x : x.rolling(5, 1).sum().shift(1, fill_value = 0))
df.loc[:,'Form7g'] = df.groupby(['Team','Y']).FTG.transform(lambda x : x.rolling(7, 1).sum().shift(1, fill_value = 0))
df.loc[:,'G_season'] = df.groupby(['Y','Team']).FTG.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'Pts_season'] = df.groupby(['Y','Team']).Pts.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'Rank'] = df.groupby(['Y','Div', 'GP_Season'])['Pts_season'].rank("average", ascending=False)
df.loc[:,'GD_total'] = df.groupby('Team').GD.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'xG_total'] = df.groupby('Team').xG.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'xPts_total'] = df.groupby(['Team']).xPts.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'xPts_season'] = df.groupby(['Y','Team']).xPts.transform(lambda x : x.cumsum().shift(1, fill_value = 0))
df.loc[:,'xRank'] = df.groupby(['Y','Div', 'GP_Season'])['xPts_season'].rank("average", ascending=False)
df['AGD'] = df['GD_total']/df['GP']
df['AxG'] = df['xG_total']/df['GP']
df['Reason'] = np.where((df['Rank'] >= 15) & (df['GP_Season'] >= 30), 1, 0)
df = df.fillna(0)

In [9]:
df1 = df.loc[(df.isHome == 1)]
df2 = df.loc[(df.isHome == 0)]
dfmerge = pd.merge(df1, df2, on=('GameID', 'Y', 'Date','Home_win','Div'), how='outer', suffixes=('_Home', '_Away'))
dfmerge.tail(6)

Unnamed: 0,GameID,isHome_Home,Date,Div,Home_win,Y,Team_Home,FTG_Home,HTG_Home,S_Home,ST_Home,xG_Home,GD_Home,Pts_Home,xPts_Home,Conversion_Home,GP_Home,GP_Season_Home,G_total_Home,Pts_total_Home,Form3_Home,Form5_Home,Form7_Home,Form3g_Home,Form5g_Home,Form7g_Home,G_season_Home,Pts_season_Home,Rank_Home,GD_total_Home,xG_total_Home,xPts_total_Home,xPts_season_Home,xRank_Home,AGD_Home,AxG_Home,Reason_Home,isHome_Away,Team_Away,FTG_Away,HTG_Away,S_Away,ST_Away,xG_Away,GD_Away,Pts_Away,xPts_Away,Conversion_Away,GP_Away,GP_Season_Away,G_total_Away,Pts_total_Away,Form3_Away,Form5_Away,Form7_Away,Form3g_Away,Form5g_Away,Form7g_Away,G_season_Away,Pts_season_Away,Rank_Away,GD_total_Away,xG_total_Away,xPts_total_Away,xPts_season_Away,xRank_Away,AGD_Away,AxG_Away,Reason_Away
9124,9080,1,2019-05-26,Serie_A,0,18,Fiorentina,0,0,5,5,0.296039,0,1,1,0.0,189,37,285,285,0,0,1,0,1,1,47,40,15.5,49,276.947506,301,57,8.0,0.259259,1.465331,1,0,Genoa,0,0,2,1,0.074391,0,1,1,0.0,189,37,217,219,2,3,4,3,4,5,39,37,18.0,-42,234.476966,221,39,13.0,-0.222222,1.240619,1
9125,9081,1,2019-05-26,Serie_A,1,18,Inter,2,0,20,15,2.74302,1,3,3,0.729123,189,37,302,322,4,6,10,3,5,8,55,66,3.5,105,304.790265,336,75,3.0,0.555556,1.612647,0,0,Empoli,1,0,9,5,1.77275,-1,0,0,0.564095,151,37,165,158,9,9,10,7,10,12,50,38,17.0,-65,172.881224,151,35,15.0,-0.430464,1.144909,1
9126,9082,1,2019-05-26,Serie_A,1,18,Roma,2,1,16,8,1.72281,1,3,3,1.160894,189,37,352,377,5,9,15,3,7,9,64,63,6.0,167,330.576535,371,67,5.0,0.883598,1.749082,0,0,Parma,1,0,9,5,1.08938,-1,0,0,0.917953,75,37,73,67,4,6,8,5,7,7,40,41,13.0,-61,71.775209,52,32,16.0,-0.813333,0.957003,0
9127,9083,1,2019-05-26,Serie_A,1,18,Sampdoria,2,0,10,3,0.686933,2,3,3,2.911492,189,37,259,248,2,2,5,4,5,7,58,50,9.0,-10,239.932721,215,50,9.0,-0.05291,1.269485,0,0,Juventus,0,0,6,1,0.487175,-2,0,0,0.0,189,37,380,454,2,6,9,2,5,8,70,90,1.0,257,314.071637,423,76,2.0,1.359788,1.661755,0
9128,9084,1,2019-05-26,Serie_A,0,18,Spal,2,1,7,4,0.433984,-1,0,0,4.608465,75,37,81,80,3,7,10,7,12,15,42,42,11.0,-31,77.463586,70,39,13.0,-0.413333,1.032848,0,0,Milan,3,2,16,8,1.7846,1,3,3,1.681049,189,37,270,301,9,10,13,5,6,8,52,65,5.0,56,261.907505,285,59,7.0,0.296296,1.385754,0
9129,9085,1,2019-05-26,Serie_A,1,18,Torino,3,0,9,7,2.39804,2,3,3,1.251022,189,37,274,266,4,10,12,5,8,9,49,60,7.0,26,249.261002,243,45,10.0,0.137566,1.318841,0,0,Lazio,1,0,9,4,1.1792,-2,0,0,0.848033,189,37,341,324,4,7,10,6,9,11,55,59,8.0,108,309.183868,324,61,6.0,0.571429,1.635893,0


In [10]:
dfhw = dfmerge.loc[dfmerge['Y'] < 17]
df17 = dfmerge.loc[dfmerge['Y'] == 17]
df18 = dfmerge.loc[dfmerge['Y'] == 18]
result = smf.glm('Home_win ~ AGD_Home + AGD_Away + Form5g_Home + Form5g_Away + AxG_Home + xPts_season_Home + Rank_Home + xRank_Away + Reason_Home', data = dfhw, family = sm.families.Binomial()).fit()
result.summary(), print(brier_score_loss(df17.Home_win, result.predict(df17))), print(brier_score_loss(df18.Home_win, result.predict(df18)))


0.2124450746097292
0.2140392383540378


(<class 'statsmodels.iolib.summary.Summary'>
 """
                  Generalized Linear Model Regression Results                  
 Dep. Variable:               Home_win   No. Observations:                 5478
 Model:                            GLM   Df Residuals:                     5468
 Model Family:                Binomial   Df Model:                            9
 Link Function:                  logit   Scale:                          1.0000
 Method:                          IRLS   Log-Likelihood:                -3429.7
 Date:                Wed, 17 Feb 2021   Deviance:                       6859.4
 Time:                        20:29:16   Pearson chi2:                 5.46e+03
 No. Iterations:                     4                                         
 Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
 ---------------------------------------------------------------

## Other Attempts

In [11]:
def getBS(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  brier = brier_score_loss(y_test, y_pred)
  return brier

def forward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    best_features = []
    best = 'none'
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        for new_column in remaining_features:
            temp_list = best_features+[new_column]
            brier = getBS(model, X_train[temp_list], X_test[temp_list], y_train, y_test)
            if brier<temp:
                temp = brier
                best = new_column
        if best not in best_features:
            best_features.append(best)
        else:
            break
    return brier, best_features

def backward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    worst_features = []
    worst = 'none'
    while (len(worst_features)<len(initial_features)):
        subset = list(set(initial_features)-set(worst_features))
        for new_column in subset:
            remaining_features = list(set(subset)-set([new_column]))
            brier = getBS(model, X_train[remaining_features], X_test[remaining_features], y_train, y_test)
            if brier<temp:
                temp = brier
                worst = new_column
        if worst not in worst_features:
            worst_features.append(worst)
        else:
            break
    
    best_features = list(set(initial_features)-set(worst_features))
    
    brier = getBS(model, X_train[best_features], X_test[best_features], y_train, y_test)
    return brier, best_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rf = RandomForestClassifier(criterion = 'entropy')
lr = LogisticRegression(C=1000)
svm = SVC(probability=True)

In [12]:
X_train = dfhw[['GP_Away','GP_Season_Away','G_total_Away','Pts_total_Away','Form3_Away','Form5_Away','Form7_Away','Form3g_Away','Form5g_Away','Form7g_Away','Form3g_Home','Form5g_Home','Form7g_Home','G_season_Away','Pts_season_Away','Rank_Away','GD_total_Away','xG_total_Away','xPts_total_Away','xPts_season_Away','xRank_Away','AGD_Away','AxG_Away','Reason_Away','GP_Home','GP_Season_Home','G_total_Home','Pts_total_Home','Form3_Home','Form5_Home','Form7_Home','G_season_Home','Pts_season_Home','Rank_Home','GD_total_Home','xG_total_Home','xPts_total_Home','xPts_season_Home','xRank_Home','AGD_Home','AxG_Home','Reason_Home']]
X_val = df17[['GP_Away','GP_Season_Away','G_total_Away','Pts_total_Away','Form3_Away','Form5_Away','Form7_Away','Form3g_Away','Form5g_Away','Form7g_Away','Form3g_Home','Form5g_Home','Form7g_Home','G_season_Away','Pts_season_Away','Rank_Away','GD_total_Away','xG_total_Away','xPts_total_Away','xPts_season_Away','xRank_Away','AGD_Away','AxG_Away','Reason_Away','GP_Home','GP_Season_Home','G_total_Home','Pts_total_Home','Form3_Home','Form5_Home','Form7_Home','G_season_Home','Pts_season_Home','Rank_Home','GD_total_Home','xG_total_Home','xPts_total_Home','xPts_season_Home','xRank_Home','AGD_Home','AxG_Home','Reason_Home']]
X_test = df18[['GP_Away','GP_Season_Away','G_total_Away','Pts_total_Away','Form3_Away','Form5_Away','Form7_Away','Form3g_Away','Form5g_Away','Form7g_Away','Form3g_Home','Form5g_Home','Form7g_Home','G_season_Away','Pts_season_Away','Rank_Away','GD_total_Away','xG_total_Away','xPts_total_Away','xPts_season_Away','xRank_Away','AGD_Away','AxG_Away','Reason_Away','GP_Home','GP_Season_Home','G_total_Home','Pts_total_Home','Form3_Home','Form5_Home','Form7_Home','G_season_Home','Pts_season_Home','Rank_Home','GD_total_Home','xG_total_Home','xPts_total_Home','xPts_season_Home','xRank_Home','AGD_Home','AxG_Home','Reason_Home']]
y_train = dfhw['Home_win']
y_val = df17['Home_win']
y_test = df18['Home_win']

In [13]:
# column_maxes = X_train.max()
# df_max = column_maxes.max()
# column_mins = X_train.min()
# df_min = column_mins.min()
# X_train = (X_train - df_min) / (df_max - df_min)

# column_maxes = X_val.max()
# df_max = column_maxes.max()
# column_mins = X_val.min()
# df_min = column_mins.min()
# X_val = (X_val - df_min) / (df_max - df_min)

# column_maxes = X_test.max()
# df_max = column_maxes.max()
# column_mins = X_test.min()
# df_min = column_mins.min()
# X_test = (X_test - df_min) / (df_max - df_min)

In [14]:
rf.fit(X_train,y_train)
lr.fit(X_train,y_train)
svm.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC(probability=True)

In [15]:
print(brier_score_loss(y_test, rf.predict(X_test)))
print(brier_score_loss(y_test, lr.predict(X_test)))
print(brier_score_loss(y_test, svm.predict(X_test)))

0.3614457831325301
0.37130339539978097
0.37513691128148957


In [16]:
# Forward selection
# RF
brier_rf, frf = forward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)

0.4167579408543264 ['xRank_Home']


In [17]:
# Logistic Regression
brier_lr, flr = forward_selection(lr, X_train, X_test, y_train, y_test)
print(brier_lr, flr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.34446878422782035 ['GD_total_Home', 'xPts_total_Away', 'Form3g_Away', 'Form5_Home', 'Form5g_Home', 'Reason_Home', 'Reason_Away']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
# SVM
brier_svm, fsvm = forward_selection(svm, X_train, X_test, y_train, y_test)
print(brier_svm, fsvm)

0.3466593647316539 ['AxG_Home', 'xRank_Away', 'AGD_Away', 'Rank_Home', 'Form7g_Home', 'GP_Season_Home', 'Form3g_Away']


In [19]:
# Backward selection

# RF
brier_rf, frf = backward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)

0.35104052573932093 ['GP_Season_Home', 'Form5g_Home', 'GP_Season_Away', 'G_total_Away', 'Pts_season_Home', 'G_season_Home', 'xPts_season_Away', 'Form7_Home', 'G_season_Away', 'xPts_season_Home', 'xPts_total_Away', 'Reason_Home', 'GP_Away', 'Form3_Home', 'Pts_season_Away', 'GD_total_Away', 'xG_total_Home', 'Form7g_Home', 'AxG_Home', 'Form3g_Away', 'Rank_Home', 'xPts_total_Home', 'Form7g_Away', 'AGD_Away', 'AxG_Away', 'xRank_Home', 'xRank_Away', 'xG_total_Away', 'Pts_total_Home', 'Pts_total_Away', 'Form7_Away', 'Form3g_Home', 'Reason_Away', 'Form5_Away', 'GP_Home', 'AGD_Home', 'Form5g_Away', 'G_total_Home', 'GD_total_Home', 'Form3_Away']


In [20]:
# Logistic Regression
brier_lr, flr = backward_selection(lr, X_train, X_test, y_train, y_test)
print(brier_lr, flr)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.35706462212486306 ['Form5_Home', 'GP_Season_Home', 'Form5g_Home', 'GP_Season_Away', 'G_total_Away', 'Pts_season_Home', 'G_season_Home', 'xPts_season_Away', 'Form7_Home', 'G_season_Away', 'xPts_season_Home', 'xPts_total_Away', 'Reason_Home', 'GP_Away', 'Form3_Home', 'Pts_season_Away', 'GD_total_Away', 'xG_total_Home', 'Form7g_Home', 'Form3g_Away', 'Rank_Home', 'xPts_total_Home', 'Form7g_Away', 'AGD_Away', 'AxG_Away', 'xRank_Home', 'xRank_Away', 'xG_total_Away', 'Pts_total_Home', 'Rank_Away', 'Pts_total_Away', 'Form7_Away', 'Form3g_Home', 'Reason_Away', 'Form5_Away', 'GP_Home', 'AGD_Home', 'Form5g_Away', 'GD_total_Home', 'Form3_Away']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# SVM
brier_svm, fsvm = backward_selection(svm, X_train, X_test, y_train, y_test)
print(brier_svm, fsvm)

0.3663745892661555 ['Form5_Home', 'GP_Season_Home', 'Form5g_Home', 'GP_Season_Away', 'G_total_Away', 'Pts_season_Home', 'xPts_season_Away', 'Form7_Home', 'G_season_Away', 'xPts_season_Home', 'xPts_total_Away', 'Reason_Home', 'GP_Away', 'Form3_Home', 'Pts_season_Away', 'GD_total_Away', 'xG_total_Home', 'Form7g_Home', 'AxG_Home', 'Form3g_Away', 'Rank_Home', 'xPts_total_Home', 'Form7g_Away', 'AGD_Away', 'AxG_Away', 'xRank_Home', 'xRank_Away', 'xG_total_Away', 'Rank_Away', 'Form7_Away', 'Form3g_Home', 'Reason_Away', 'Form5_Away', 'GP_Home', 'AGD_Home', 'Form5g_Away', 'G_total_Home', 'GD_total_Home']


In [22]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
# rfr = RandomForestRegressor()
# lin = LinearRegression()

# rfr.fit(X_train, y_train)
# y_pred = rfr.predict(X_test)

# features = X_train.columns
# importances = rfr.feature_importances_
# indices = np.argsort(importances)  
# plt.figure(figsize=(7,12))
# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')