# Import & Ingest

In [1]:
import sys
import pathlib
SOURCE_PATH = pathlib.Path.cwd().resolve().parent
sys.path.append(str(SOURCE_PATH))

In [2]:
from config import *
from utils import *
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, classification_report, make_scorer, f1_score, precision_score, recall_score
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [3]:
data = DATA_PATH.joinpath("match_predict.csv")
df = pd.read_csv(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 53 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   stage                                    16554 non-null  int64  
 1   Country                                  16554 non-null  object 
 2   League                                   16554 non-null  object 
 3   home_buildUpPlayPositioningClass         16554 non-null  object 
 4   home_chanceCreationPositioningClass      16554 non-null  object 
 5   home_defenceDefenderLineClass            16554 non-null  object 
 6   away_buildUpPlayPositioningClass         16554 non-null  object 
 7   away_chanceCreationPositioningClass      16554 non-null  object 
 8   away_defenceDefenderLineClass            16554 non-null  object 
 9   buildUpPlaySpeed_home_diff               16554 non-null  float64
 10  buildUpPlayPassing_home_diff             16554

# Regression Test

In [4]:
#one-hot encode
cat_cols = ['Country',
 'League',
 'home_buildUpPlayPositioningClass',
 'home_chanceCreationPositioningClass',
 'home_defenceDefenderLineClass',
 'away_buildUpPlayPositioningClass',
 'away_chanceCreationPositioningClass',
 'away_defenceDefenderLineClass']

df[cat_cols].nunique()

Country                                11
League                                 11
home_buildUpPlayPositioningClass        2
home_chanceCreationPositioningClass     2
home_defenceDefenderLineClass           2
away_buildUpPlayPositioningClass        2
away_chanceCreationPositioningClass     2
away_defenceDefenderLineClass           2
dtype: int64

In [5]:
multi_cat_cols = ['Country',
'League']

binary_cat_cols = [col for col in df.columns if "class" in col.lower()]

def one_hot_encode(df):
    df_multi = pd.get_dummies(df[multi_cat_cols], drop_first = False).astype(int)
    df_binary = pd.get_dummies(df[binary_cat_cols], drop_first = True).astype(int)
    df_num = df.drop(columns = cat_cols)
    df = pd.concat([df_num, df_binary, df_multi], axis = 1)
    return df

In [6]:
df = one_hot_encode(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 73 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   stage                                          16554 non-null  int64  
 1   buildUpPlaySpeed_home_diff                     16554 non-null  float64
 2   buildUpPlayPassing_home_diff                   16554 non-null  float64
 3   chanceCreationPassing_home_diff                16554 non-null  float64
 4   chanceCreationCrossing_home_diff               16554 non-null  float64
 5   chanceCreationShooting_home_diff               16554 non-null  float64
 6   defencePressure_home_diff                      16554 non-null  float64
 7   defenceAggression_home_diff                    16554 non-null  float64
 8   defenceTeamWidth_home_diff                     16554 non-null  float64
 9   Home_Score_Adv                                 165

In [8]:
df.head()

Unnamed: 0,stage,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,Home_Score_Adv,...,League_England Premier League,League_France Ligue 1,League_Germany 1. Bundesliga,League_Italy Serie A,League_Netherlands Eredivisie,League_Poland Ekstraklasa,League_Portugal Liga ZON Sagres,League_Scotland Premier League,League_Spain LIGA BBVA,League_Switzerland Super League
0,23,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,1,...,0,0,0,0,0,0,0,0,1,0
1,28,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,3,...,1,0,0,0,0,0,0,0,0,0
2,17,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,1,...,0,0,0,1,0,0,0,0,0,0
3,17,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-1,...,0,0,0,1,0,0,0,0,0,0
4,24,5.0,15.0,15.0,-25.0,-20.0,-15.0,5.0,0.0,0,...,0,0,1,0,0,0,0,0,0,0


In [9]:
X, y = df.drop(columns = ["Home_Score_Adv"]), df.Home_Score_Adv
y.describe()

count    16554.000000
mean         0.384982
std          1.811005
min         -9.000000
25%         -1.000000
50%          0.000000
75%          1.000000
max         10.000000
Name: Home_Score_Adv, dtype: float64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, shuffle = False)

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)

In [12]:
mean_absolute_error(y_train, lr_train_pred), mean_absolute_error(y_test, lr_test_pred)

(1.2691620804206911, 1.2723404327993317)

In [13]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

In [14]:
mean_absolute_error(y_train, rf_train_pred), mean_absolute_error(y_test, rf_test_pred)

(0.4781591438681281, 1.2861143547412928)

In [15]:
lr = pd.DataFrame(lr_test_pred)
rf = pd.DataFrame(rf_test_pred)
target = pd.DataFrame(y_test)
target = target.reset_index().drop(columns = "index")
df = pd.concat([lr, rf, target], axis = 1)
df.columns = ["lr", "rf", "target"]

In [16]:
df.head(50)

Unnamed: 0,lr,rf,target
0,0.286304,0.63,-1
1,0.310011,0.31,2
2,0.510287,0.48,-2
3,0.469057,0.09,-3
4,0.264682,0.43,1
5,1.782994,1.69,3
6,0.030939,0.01,-2
7,0.084212,0.3,1
8,-0.252838,0.38,0
9,1.36748,1.16,2


In [17]:
df.describe() #clearly from the standard deviations, the predictions hug the mean to minimize error--not useful

#it's classification time

Unnamed: 0,lr,rf,target
count,4967.0,4967.0,4967.0
mean,0.390003,0.391003,0.335212
std,0.747804,0.78087,1.824944
min,-2.074012,-3.49,-9.0
25%,-0.102181,-0.07,-1.0
50%,0.383508,0.4,0.0
75%,0.866152,0.86,1.0
max,3.141341,3.56,8.0


# Match Outcome Classification: Multinomial Logistic Regression vs. XGBoost

In [18]:
df = pd.read_csv(data)

df = one_hot_encode(df)

def home_away_draw(df):
    if df["Home_Score_Adv"] > 0:
        return "H"
    elif df["Home_Score_Adv"] < 0:
        return "A"
    else:
        return "D"

df["Outcome"] = df.apply(home_away_draw, axis = 1)
df = df.drop(columns = "Home_Score_Adv")
df.head(10)

Unnamed: 0,stage,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,home_diff_avg_player_overall_rating,...,League_France Ligue 1,League_Germany 1. Bundesliga,League_Italy Serie A,League_Netherlands Eredivisie,League_Poland Ekstraklasa,League_Portugal Liga ZON Sagres,League_Scotland Premier League,League_Spain LIGA BBVA,League_Switzerland Super League,Outcome
0,23,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,5.090909,...,0,0,0,0,0,0,0,1,0,H
1,28,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,7.272727,...,0,0,0,0,0,0,0,0,0,H
2,17,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,3.0,...,0,0,1,0,0,0,0,0,0,H
3,17,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-3.181818,...,0,0,1,0,0,0,0,0,0,A
4,24,5.0,15.0,15.0,-25.0,-20.0,-15.0,5.0,0.0,0.636364,...,0,1,0,0,0,0,0,0,0,D
5,24,-25.0,0.0,-25.0,30.0,0.0,-20.0,5.0,-20.0,-4.272727,...,0,1,0,0,0,0,0,0,0,A
6,24,15.0,-35.0,5.0,30.0,10.0,25.0,5.0,0.0,5.363636,...,0,1,0,0,0,0,0,0,0,D
7,28,-1.0,40.0,40.0,25.0,20.0,5.0,30.0,-15.0,-8.272727,...,0,0,0,0,0,0,0,0,0,A
8,27,0.0,0.0,0.0,0.0,10.0,0.0,5.0,0.0,1.727273,...,0,0,0,0,0,0,1,0,0,D
9,28,0.0,0.0,1.0,0.0,0.0,-15.0,5.0,-10.0,2.272727,...,0,0,0,0,0,0,0,0,0,A


In [19]:
X, y = df.drop(columns = "Outcome"), df.Outcome.map(dict(zip(["D", "H", "A"], [0, 1, 2])))

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 72 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   stage                                          16554 non-null  int64  
 1   buildUpPlaySpeed_home_diff                     16554 non-null  float64
 2   buildUpPlayPassing_home_diff                   16554 non-null  float64
 3   chanceCreationPassing_home_diff                16554 non-null  float64
 4   chanceCreationCrossing_home_diff               16554 non-null  float64
 5   chanceCreationShooting_home_diff               16554 non-null  float64
 6   defencePressure_home_diff                      16554 non-null  float64
 7   defenceAggression_home_diff                    16554 non-null  float64
 8   defenceTeamWidth_home_diff                     16554 non-null  float64
 9   home_diff_avg_player_overall_rating            165

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = False)

In [22]:
xgb = XGBClassifier()

f1 = make_scorer(f1_score, average = "weighted") #ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

param_grid = {"max_depth": [3, 4, 5, 6], "learning_rate": [0.01, 0.02, 0.03, 0.05], "gamma": [1, 2, 4, 5]}
grid = GridSearchCV(estimator = xgb, param_grid = param_grid, n_jobs = -1, scoring = f1)

class_weights = class_weight.compute_sample_weight(class_weight = "balanced", y = y_train)

In [23]:
grid.fit(X_train, y_train, sample_weight = class_weights)
xgb_train_pred = grid.predict(X_train)
xgb_test_pred = grid.predict(X_test)

In [24]:
xgb_df = pd.DataFrame(xgb_test_pred)
target = pd.DataFrame(y_test)
target_df = target.reset_index().drop(columns = "index") #Resetting the index is needed for concatenation
df = pd.concat([xgb_df, target_df], axis = 1)
df.columns = ["xgb", "target"]

In [25]:
grid.best_estimator_

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [26]:
df.head(20)

Unnamed: 0,xgb,target
0,0,0
1,1,1
2,1,2
3,1,1
4,0,0
5,1,1
6,2,2
7,1,0
8,1,2
9,0,0


In [27]:
train_report, test_report = classification_report(y_train, xgb_train_pred), classification_report(y_test, xgb_test_pred)
print("Train\n", train_report)
print("Test\n", test_report)

Train
               precision    recall  f1-score   support

           0       0.33      0.36      0.35      3140
           1       0.65      0.57      0.60      5716
           2       0.49      0.56      0.52      3559

    accuracy                           0.51     12415
   macro avg       0.49      0.49      0.49     12415
weighted avg       0.52      0.51      0.52     12415

Test
               precision    recall  f1-score   support

           0       0.28      0.32      0.30      1035
           1       0.61      0.54      0.57      1845
           2       0.47      0.49      0.48      1259

    accuracy                           0.47      4139
   macro avg       0.46      0.45      0.45      4139
weighted avg       0.49      0.47      0.48      4139



## Multinomial Logistic Regression

In [28]:
#for logistic regression (but not tree methods), you first have to scale the data (except categorical 0/1 columns)!

In [29]:
logr = LogisticRegression(multi_class = "multinomial", solver = "lbfgs")
logr.fit(X_train, y_train)
logr_train_pred = logr.predict(X_train)
logr_test_pred = logr.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
logr_df = pd.DataFrame(logr_test_pred)
target = pd.DataFrame(y_test)
target_df = target.reset_index().drop(columns = "index")
df = pd.concat([xgb_df, target_df], axis = 1)
df.columns = ["logr", "target"]

In [31]:
df.head(20)

Unnamed: 0,logr,target
0,0,0
1,1,1
2,1,2
3,1,1
4,0,0
5,1,1
6,2,2
7,1,0
8,1,2
9,0,0


In [32]:
print("Train\n", classification_report(y_train, logr_train_pred))

Train
               precision    recall  f1-score   support

           0       0.32      0.02      0.04      3140
           1       0.55      0.83      0.66      5716
           2       0.49      0.48      0.49      3559

    accuracy                           0.53     12415
   macro avg       0.45      0.45      0.40     12415
weighted avg       0.47      0.53      0.45     12415



In [33]:
print("Test\n", classification_report(y_test, logr_test_pred))

Test
               precision    recall  f1-score   support

           0       0.28      0.02      0.03      1035
           1       0.52      0.82      0.64      1845
           2       0.49      0.46      0.48      1259

    accuracy                           0.51      4139
   macro avg       0.43      0.43      0.38      4139
weighted avg       0.45      0.51      0.44      4139

