# Import & Ingest

In [354]:
import sys
import pathlib
SOURCE_PATH = pathlib.Path.cwd().resolve().parent
sys.path.append(str(SOURCE_PATH))

In [355]:
from config import *
from utils import *
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, classification_report
import matplotlib.pyplot as plt

In [356]:
data = DATA_PATH.joinpath("modeling_table.csv")
df = pd.read_csv(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16979 entries, 0 to 16978
Data columns (total 57 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   match_api_id                             16979 non-null  int64  
 1   season                                   16979 non-null  object 
 2   stage                                    16979 non-null  int64  
 3   match_date                               16979 non-null  object 
 4   Country                                  16979 non-null  object 
 5   League                                   16979 non-null  object 
 6   buildUpPlayPositioningClass              16554 non-null  object 
 7   chanceCreationPositioningClass           16554 non-null  object 
 8   defenceDefenderLineClass                 16554 non-null  object 
 9   away_buildUpPlayPositioningClass         16979 non-null  object 
 10  away_chanceCreationPositioningClass      16979

# Regression Test

In [357]:
df = df.drop(columns = "buildUpPlayDribbling_home_diff")
df = df.dropna()
df.info()
df.columns

<class 'pandas.core.frame.DataFrame'>
Index: 16554 entries, 0 to 16978
Data columns (total 56 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   match_api_id                             16554 non-null  int64  
 1   season                                   16554 non-null  object 
 2   stage                                    16554 non-null  int64  
 3   match_date                               16554 non-null  object 
 4   Country                                  16554 non-null  object 
 5   League                                   16554 non-null  object 
 6   buildUpPlayPositioningClass              16554 non-null  object 
 7   chanceCreationPositioningClass           16554 non-null  object 
 8   defenceDefenderLineClass                 16554 non-null  object 
 9   away_buildUpPlayPositioningClass         16554 non-null  object 
 10  away_chanceCreationPositioningClass      16554 non-

Index(['match_api_id', 'season', 'stage', 'match_date', 'Country', 'League',
       'buildUpPlayPositioningClass', 'chanceCreationPositioningClass',
       'defenceDefenderLineClass', 'away_buildUpPlayPositioningClass',
       'away_chanceCreationPositioningClass', 'away_defenceDefenderLineClass',
       'buildUpPlaySpeed_home_diff', 'buildUpPlayPassing_home_diff',
       'chanceCreationPassing_home_diff', 'chanceCreationCrossing_home_diff',
       'chanceCreationShooting_home_diff', 'defencePressure_home_diff',
       'defenceAggression_home_diff', 'defenceTeamWidth_home_diff',
       'Home_Score_Adv', 'home_diff_avg_player_overall_rating',
       'home_diff_avg_player_potential', 'home_diff_avg_player_crossing',
       'home_diff_avg_player_finishing',
       'home_diff_avg_player_heading_accuracy',
       'home_diff_avg_player_short_passing', 'home_diff_avg_player_volleys',
       'home_diff_avg_player_dribbling', 'home_diff_avg_player_curve',
       'home_diff_avg_player_free_kick_

In [358]:
X = df.iloc[:, 12:].drop(columns = "Home_Score_Adv")
y = df.Home_Score_Adv
y.describe()

count    16554.000000
mean         0.384982
std          1.811005
min         -9.000000
25%         -1.000000
50%          0.000000
75%          1.000000
max         10.000000
Name: Home_Score_Adv, dtype: float64

In [359]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, shuffle = False)

In [360]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)

In [361]:
mean_absolute_error(y_train, lr_train_pred), mean_absolute_error(y_test, lr_test_pred)

(1.2725819556719797, 1.2744565373742582)

In [362]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

In [363]:
mean_absolute_error(y_train, rf_train_pred), mean_absolute_error(y_test, rf_test_pred)

(0.47578838353327013, 1.2892852828669217)

In [364]:
lr = pd.DataFrame(lr_test_pred)
rf = pd.DataFrame(rf_test_pred)
target = pd.DataFrame(y_test)
target = target.reset_index().drop(columns = "index")
df = pd.concat([lr, rf, target], axis = 1)
df.columns = ["lr", "rf", "target"]

In [365]:
df.head(50)

Unnamed: 0,lr,rf,target
0,0.352203,0.66,-1
1,0.48716,0.71,2
2,0.431056,0.3,-2
3,0.403014,0.01,-3
4,0.298129,0.42,1
5,1.853346,1.76,3
6,0.050862,-0.03,-2
7,0.113308,0.32,1
8,-0.359027,0.26,0
9,1.334639,1.44,2


In [366]:
df.describe() #clearly from the standard deviations, the predictions hug the mean to minimize error--not useful
#classification time

Unnamed: 0,lr,rf,target
count,4967.0,4967.0,4967.0
mean,0.40126,0.396292,0.335212
std,0.743835,0.779578,1.824944
min,-2.069363,-3.42,-9.0
25%,-0.082098,-0.05,-1.0
50%,0.404107,0.4,0.0
75%,0.882752,0.85,1.0
max,2.930913,4.08,8.0


# Classification: Multinomial Logistic Regression vs. XGBoost

In [367]:
data = DATA_PATH.joinpath("modeling_table.csv")
df = pd.read_csv(data)
df = df.drop(columns = "buildUpPlayDribbling_home_diff")
df = df.dropna()

def win_lose_draw(df):
    if df["Home_Score_Adv"] > 0:
        return "W"
    elif df["Home_Score_Adv"] < 0:
        return "L"
    else:
        return "D"

df["Home_Result"] = df.apply(win_lose_draw, axis = 1)
df.head(10)

Unnamed: 0,match_api_id,season,stage,match_date,Country,League,buildUpPlayPositioningClass,chanceCreationPositioningClass,defenceDefenderLineClass,away_buildUpPlayPositioningClass,...,home_diff_avg_player_penalties,home_diff_avg_player_marking,home_diff_avg_player_standing_tackle,home_diff_avg_player_sliding_tackle,home_diff_gk_diving,home_diff_gk_handling,home_diff_gk_kicking,home_diff_gk_positioning,home_diff_gk_reflexes,Home_Result
0,684955,2009/2010,23,2010-02-22,Spain,Spain LIGA BBVA,Free Form,Organised,Offside Trap,Organised,...,9.181818,-1.636364,-4.909091,-0.636364,0.272727,-0.454545,-0.636364,1.181818,-0.545455,W
1,659091,2009/2010,28,2010-02-23,England,England Premier League,Organised,Free Form,Cover,Organised,...,6.727273,7.272727,3.727273,2.909091,-0.272727,-0.272727,-0.090909,-0.909091,-0.727273,W
2,704654,2009/2010,17,2010-02-24,Italy,Italy Serie A,Organised,Organised,Offside Trap,Organised,...,5.636364,6.181818,8.272727,1.727273,-0.363636,0.0,0.090909,-0.181818,-0.636364,W
3,704632,2009/2010,17,2010-02-24,Italy,Italy Serie A,Free Form,Free Form,Offside Trap,Free Form,...,-4.363636,2.636364,3.090909,4.636364,0.636364,0.363636,-0.181818,0.636364,0.454545,L
4,674484,2009/2010,24,2010-02-27,Germany,Germany 1. Bundesliga,Organised,Organised,Cover,Organised,...,2.454545,-3.454545,-3.909091,-6.181818,-0.545455,-0.090909,0.0,0.090909,-0.636364,D
5,674486,2009/2010,24,2010-02-27,Germany,Germany 1. Bundesliga,Organised,Organised,Cover,Organised,...,2.909091,-4.090909,-2.272727,-1.0,0.636364,1.181818,1.090909,0.636364,0.272727,L
6,674480,2009/2010,24,2010-02-27,Germany,Germany 1. Bundesliga,Organised,Organised,Cover,Organised,...,5.454545,-0.909091,-0.454545,-0.909091,1.363636,0.636364,0.0,0.545455,1.545455,D
7,659090,2009/2010,28,2010-02-27,England,England Premier League,Organised,Organised,Cover,Free Form,...,-13.363636,-8.545455,-7.181818,-9.818182,-0.181818,-0.363636,-0.909091,0.181818,-0.727273,L
8,659024,2009/2010,27,2010-02-27,Scotland,Scotland Premier League,Organised,Organised,Cover,Organised,...,7.909091,-6.454545,-8.272727,-13.272727,-0.363636,0.181818,0.090909,-0.272727,-0.454545,D
9,659094,2009/2010,28,2010-02-27,England,England Premier League,Free Form,Free Form,Cover,Organised,...,1.909091,-11.727273,-5.818182,-7.363636,-0.909091,-1.363636,-1.636364,-1.272727,-0.727273,L


In [368]:
df.iloc[:, 1:12].nunique()

season                                    7
stage                                    38
match_date                             1252
Country                                  11
League                                   11
buildUpPlayPositioningClass               2
chanceCreationPositioningClass            2
defenceDefenderLineClass                  2
away_buildUpPlayPositioningClass          2
away_chanceCreationPositioningClass       2
away_defenceDefenderLineClass             2
dtype: int64

In [369]:
X = df.iloc[:, 12:-1].drop(columns = "Home_Score_Adv")
y = df.Home_Result

In [370]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.6, test_size = 0.4, shuffle = False)

In [371]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_train_pred = rfc.predict(X_train)
rfc_test_pred = rfc.predict(X_test)

rfc_df = pd.DataFrame(rfc_test_pred)
target = pd.DataFrame(y_test)
target_df = target.reset_index().drop(columns = "index")
df = pd.concat([rfc_df, target_df], axis = 1)
df.columns = ["rfc", "target"]

In [372]:
df.head(50)

Unnamed: 0,rfc,target
0,W,L
1,W,W
2,W,W
3,W,W
4,W,D
5,L,L
6,W,D
7,W,L
8,W,W
9,W,W


In [373]:
report = classification_report(y_train, rfc_train_pred)
print("Train\n", report)

Train
               precision    recall  f1-score   support

           D       1.00      1.00      1.00      2521
           L       1.00      1.00      1.00      2802
           W       1.00      1.00      1.00      4609

    accuracy                           1.00      9932
   macro avg       1.00      1.00      1.00      9932
weighted avg       1.00      1.00      1.00      9932



In [374]:
report = classification_report(y_test, rfc_test_pred)
print("Test\n", report)

Test
               precision    recall  f1-score   support

           D       0.29      0.04      0.08      1654
           L       0.50      0.41      0.45      2016
           W       0.52      0.82      0.64      2952

    accuracy                           0.50      6622
   macro avg       0.43      0.43      0.39      6622
weighted avg       0.45      0.50      0.44      6622



In [375]:
logr = LogisticRegression(multi_class = "multinomial", solver = "lbfgs")
logr.fit(X_train, y_train)
logr_train_pred = logr.predict(X_train)
logr_test_pred = logr.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [376]:
logr_df = pd.DataFrame(logr_test_pred)
target = pd.DataFrame(y_test)
target_df = target.reset_index().drop(columns = "index")
df = pd.concat([rfc_df, target_df], axis = 1)
df.columns = ["logr", "target"]

In [377]:
df.head(50)

Unnamed: 0,logr,target
0,W,L
1,W,W
2,W,W
3,W,W
4,W,D
5,L,L
6,W,D
7,W,L
8,W,W
9,W,W


In [378]:
print("Train\n", classification_report(y_train, logr_train_pred))

Train
               precision    recall  f1-score   support

           D       0.36      0.01      0.02      2521
           L       0.49      0.48      0.48      2802
           W       0.55      0.84      0.66      4609

    accuracy                           0.53      9932
   macro avg       0.46      0.44      0.39      9932
weighted avg       0.48      0.53      0.45      9932



In [379]:
print("Test\n", classification_report(y_test, logr_test_pred))

Test
               precision    recall  f1-score   support

           D       0.17      0.00      0.01      1654
           L       0.50      0.44      0.47      2016
           W       0.52      0.84      0.64      2952

    accuracy                           0.51      6622
   macro avg       0.40      0.43      0.37      6622
weighted avg       0.42      0.51      0.43      6622

