# Import & Ingest

In [61]:
import sys
import pathlib
#Directory one level above the notebook's current working directory
SOURCE_PATH = pathlib.Path.cwd().resolve().parent
#Put it on the import path--presumably so the `config` and `utils` imports in the next cell resolve (confirm)
sys.path.append(str(SOURCE_PATH))

In [62]:
from config import *
from utils import *
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, classification_report, make_scorer, f1_score
from sklearn.utils import class_weight
from xgboost import XGBClassifier

In [63]:
#Location of the match-level CSV; `data` is kept as a name because later cells re-read the file through it
data = DATA_PATH / "match_predict.csv"
df = pd.read_csv(data)
#Column names, dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 53 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   stage                                    16554 non-null  int64  
 1   Country                                  16554 non-null  object 
 2   League                                   16554 non-null  object 
 3   home_buildUpPlayPositioningClass         16554 non-null  object 
 4   home_chanceCreationPositioningClass      16554 non-null  object 
 5   home_defenceDefenderLineClass            16554 non-null  object 
 6   away_buildUpPlayPositioningClass         16554 non-null  object 
 7   away_chanceCreationPositioningClass      16554 non-null  object 
 8   away_defenceDefenderLineClass            16554 non-null  object 
 9   buildUpPlaySpeed_home_diff               16554 non-null  float64
 10  buildUpPlayPassing_home_diff             16554

# Regression Probe

In [64]:
#To begin, I test linear and Random Forest regression on predicting the home score advantage (home goals minus away goals)
#to probe the viability of precise prediction.

In [65]:
#One-hot encoding the categorical columns
cat_cols = ['Country',
 'League',
 'home_buildUpPlayPositioningClass',
 'home_chanceCreationPositioningClass',
 'home_defenceDefenderLineClass',
 'away_buildUpPlayPositioningClass',
 'away_chanceCreationPositioningClass',
 'away_defenceDefenderLineClass']

df[cat_cols].nunique() #I will drop_first for the binary columns but not the ones with more than two categories.
#For linear regression, this allows all the columns to be used.
#For ensemble methods, if a feature has more than one category, then the dropped level could implicitly become important to a tree split,
#and therefore keeping it would afford greater explicit interpretability after the fact. (This isn't an issue with binary columns, because
#keeping the second column is just redundant.)

Country                                11
League                                 11
home_buildUpPlayPositioningClass        2
home_chanceCreationPositioningClass     2
home_defenceDefenderLineClass           2
away_buildUpPlayPositioningClass        2
away_chanceCreationPositioningClass     2
away_defenceDefenderLineClass           2
dtype: int64

In [66]:
#Categorical columns with more than two levels (11 each, per the nunique() check above): keep every dummy level.
multi_cat_cols = ['Country',
'League']

#The remaining categorical columns are the two-level "...Class" columns.
#They are derived from cat_cols rather than by scanning df.columns for "class": once df has been one-hot encoded
#its dummy columns also contain "Class" in their names, so a scan of the live frame would return the wrong
#columns if this cell were ever re-run after the encoding cell.
binary_cat_cols = [col for col in cat_cols if col not in multi_cat_cols]

def one_hot_encode(df, multi_cols = None, binary_cols = None):
    """One-hot encode the categorical columns of ``df`` and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``multi_cols`` and ``binary_cols``.
        It is not modified.
    multi_cols : list of str, optional
        Columns for which every dummy level is kept.
        Defaults to the notebook-level ``multi_cat_cols``.
    binary_cols : list of str, optional
        Columns for which the first dummy level is dropped (``drop_first = True``).
        Defaults to the notebook-level ``binary_cat_cols``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns first, then the ``binary_cols`` dummies, then the
        ``multi_cols`` dummies. All dummy columns are cast to int.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df`` (for example when ``df``
        has already been encoded).
    """
    #Fall back to the notebook-level lists so existing one_hot_encode(df) calls behave as before
    if multi_cols is None:
        multi_cols = multi_cat_cols
    if binary_cols is None:
        binary_cols = binary_cat_cols
    multi_cols, binary_cols = list(multi_cols), list(binary_cols)

    df_multi = pd.get_dummies(df[multi_cols], drop_first = False).astype(int)
    df_binary = pd.get_dummies(df[binary_cols], drop_first = True).astype(int)
    #Drop exactly the columns that were encoded, so no column is silently lost or left in un-encoded
    df_num = df.drop(columns = multi_cols + binary_cols)
    return pd.concat([df_num, df_binary, df_multi], axis = 1)

In [67]:
#Replace the categorical columns with their dummy encodings.
#Note: this rebinds df, so the cell cannot be re-run on its own--one_hot_encode selects the original
#categorical columns, which no longer exist after the first run. Reload df from the CSV before re-running.
df = one_hot_encode(df)

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 73 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   stage                                          16554 non-null  int64  
 1   buildUpPlaySpeed_home_diff                     16554 non-null  float64
 2   buildUpPlayPassing_home_diff                   16554 non-null  float64
 3   chanceCreationPassing_home_diff                16554 non-null  float64
 4   chanceCreationCrossing_home_diff               16554 non-null  float64
 5   chanceCreationShooting_home_diff               16554 non-null  float64
 6   defencePressure_home_diff                      16554 non-null  float64
 7   defenceAggression_home_diff                    16554 non-null  float64
 8   defenceTeamWidth_home_diff                     16554 non-null  float64
 9   Home_Score_Adv                                 165

In [69]:
df.head()

Unnamed: 0,stage,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,Home_Score_Adv,...,League_England Premier League,League_France Ligue 1,League_Germany 1. Bundesliga,League_Italy Serie A,League_Netherlands Eredivisie,League_Poland Ekstraklasa,League_Portugal Liga ZON Sagres,League_Scotland Premier League,League_Spain LIGA BBVA,League_Switzerland Super League
0,23,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,1,...,0,0,0,0,0,0,0,0,1,0
1,28,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,3,...,1,0,0,0,0,0,0,0,0,0
2,17,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,1,...,0,0,0,1,0,0,0,0,0,0
3,17,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-1,...,0,0,0,1,0,0,0,0,0,0
4,24,5.0,15.0,15.0,-25.0,-20.0,-15.0,5.0,0.0,0,...,0,0,1,0,0,0,0,0,0,0


In [70]:
#Target: the home score advantage. Features: every other column of the encoded frame.
y = df["Home_Score_Adv"]
X = df.drop(columns = ["Home_Score_Adv"])
#Summary statistics of the target
y.describe()

count    16554.000000
mean         0.384982
std          1.811005
min         -9.000000
25%         -1.000000
50%          0.000000
75%          1.000000
max         10.000000
Name: Home_Score_Adv, dtype: float64

In [71]:
#No shuffling--maintaining temporal integrity and predicting future matches, as the model would do if realistically deployed.
#With shuffle = False the first 75% of the rows become the training set and the last 25% the test set.
#NOTE(review): that is only a past/future split if the CSV rows are already in chronological order--confirm upstream.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = False)

In [72]:
#Ordinary least squares baseline; fit() returns the estimator itself, so it can be chained
lr = LinearRegression().fit(X_train, y_train)
#Predictions on the rows the model was fitted on and on the held-out rows
lr_train_pred, lr_test_pred = lr.predict(X_train), lr.predict(X_test)

In [73]:
#Mean absolute error of the linear regression, as a (train, test) pair
tuple(
    mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, lr_train_pred), (y_test, lr_test_pred))
)

(1.267188244970302, 1.2789098158541359)

In [74]:
#Baseline Random Forest regressor with default hyperparameters.
#random_state is pinned because the forest is built with randomness; without a seed the errors reported
#below change on every Restart & Run All.
rf = RandomForestRegressor(random_state = 42)
rf.fit(X_train, y_train)
#Predictions on the rows the model was fitted on and on the held-out rows
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

In [75]:
#Mean absolute error of the Random Forest regression, as a (train, test) pair.
#Interestingly, the baseline forest fits the training data far more closely than linear regression did
#(it overfits), yet it doesn't perform much differently at all on the test data.
tuple(
    mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, rf_train_pred), (y_test, rf_test_pred))
)

(0.4760515505436972, 1.2959458806474995)

In [76]:
#Side-by-side table of each model's test-set predictions against the true home score advantage.
#It is built straight from the prediction arrays so that `lr` and `rf` keep referring to the fitted models;
#rebinding them to DataFrames would break any earlier cell that is re-run and expects an estimator.
#.to_numpy() discards y_test's original row labels, so the three columns line up positionally on a fresh
#0..n-1 index.
df = pd.DataFrame({
    "Linear Regression": lr_test_pred,
    "Random Forest Regression": rf_test_pred,
    "Home_Score_Adv_Target": y_test.to_numpy(),
})

In [77]:
df.head(15)

Unnamed: 0,Linear Regression,Random Forest Regression,Home_Score_Adv_Target
0,0.433091,0.56,0
1,1.697285,1.67,2
2,1.42894,1.18,-1
3,0.542874,0.31,3
4,0.378278,0.07,0
5,0.233211,1.0,1
6,-0.493826,-0.08,-3
7,1.229472,0.55,0
8,0.924966,1.08,-1
9,0.015831,0.41,0


In [78]:
df.describe()

#Ah. The means of the predictions (both models) tightly hug the mean of the target, but their standard deviations
#are less than half that of the target. In other words, both regression models are playing it safe: minimizing the
#error pulls their predictions toward the average outcome instead of toward the lopsided scorelines. Regression isn't
#the right paradigm here, and classification is more practical anyway than exact score difference prediction.

Unnamed: 0,Linear Regression,Random Forest Regression,Home_Score_Adv_Target
count,4139.0,4139.0,4139.0
mean,0.387095,0.389969,0.344044
std,0.749353,0.783722,1.826188
min,-2.120662,-3.25,-9.0
25%,-0.111625,-0.08,-1.0
50%,0.376791,0.39,0.0
75%,0.863467,0.85,1.0
max,3.003734,3.57,8.0


# Match Outcome Prediction: XGBoost

In [79]:
#Start again from the raw CSV (df was overwritten above) and re-apply the same one-hot encoding
df = one_hot_encode(pd.read_csv(data))

#Get our outcome category feature from the home score advantage feature, and then drop the home score advantage feature
def home_away_draw(row):
    """Label one match from its home score advantage.

    ``row`` is anything indexable by ``"Home_Score_Adv"`` (a DataFrame row when
    used with ``df.apply(..., axis = 1)``). Returns ``"H"`` when the advantage is
    positive, ``"A"`` when it is negative, and ``"D"`` in every other case.
    """
    advantage = row["Home_Score_Adv"]
    if advantage > 0:
        return "H"
    if advantage < 0:
        return "A"
    #Neither comparison held: the advantage is zero (or not comparable, e.g. NaN)
    return "D"

#Append the H/A/D label as the last column, then remove the score difference it was derived from
#so that the target cannot leak into the features
df = (
    df.assign(Outcome = df.apply(home_away_draw, axis = 1))
      .drop(columns = "Home_Score_Adv")
)
df.head(10)

Unnamed: 0,stage,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,home_diff_avg_player_overall_rating,...,League_France Ligue 1,League_Germany 1. Bundesliga,League_Italy Serie A,League_Netherlands Eredivisie,League_Poland Ekstraklasa,League_Portugal Liga ZON Sagres,League_Scotland Premier League,League_Spain LIGA BBVA,League_Switzerland Super League,Outcome
0,23,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,5.090909,...,0,0,0,0,0,0,0,1,0,H
1,28,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,7.272727,...,0,0,0,0,0,0,0,0,0,H
2,17,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,3.0,...,0,0,1,0,0,0,0,0,0,H
3,17,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-3.181818,...,0,0,1,0,0,0,0,0,0,A
4,24,5.0,15.0,15.0,-25.0,-20.0,-15.0,5.0,0.0,0.636364,...,0,1,0,0,0,0,0,0,0,D
5,24,-25.0,0.0,-25.0,30.0,0.0,-20.0,5.0,-20.0,-4.272727,...,0,1,0,0,0,0,0,0,0,A
6,24,15.0,-35.0,5.0,30.0,10.0,25.0,5.0,0.0,5.363636,...,0,1,0,0,0,0,0,0,0,D
7,28,-1.0,40.0,40.0,25.0,20.0,5.0,30.0,-15.0,-8.272727,...,0,0,0,0,0,0,0,0,0,A
8,27,0.0,0.0,0.0,0.0,10.0,0.0,5.0,0.0,1.727273,...,0,0,0,0,0,0,1,0,0,D
9,28,0.0,0.0,1.0,0.0,0.0,-15.0,5.0,-10.0,2.272727,...,0,0,0,0,0,0,0,0,0,A


In [80]:
#Features are every column except the label; the label is encoded as integers: D -> 0, H -> 1, A -> 2
outcome_codes = {"D": 0, "H": 1, "A": 2}
X = df.drop(columns = "Outcome")
y = df["Outcome"].map(outcome_codes)

In [81]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 72 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   stage                                          16554 non-null  int64  
 1   buildUpPlaySpeed_home_diff                     16554 non-null  float64
 2   buildUpPlayPassing_home_diff                   16554 non-null  float64
 3   chanceCreationPassing_home_diff                16554 non-null  float64
 4   chanceCreationCrossing_home_diff               16554 non-null  float64
 5   chanceCreationShooting_home_diff               16554 non-null  float64
 6   defencePressure_home_diff                      16554 non-null  float64
 7   defenceAggression_home_diff                    16554 non-null  float64
 8   defenceTeamWidth_home_diff                     16554 non-null  float64
 9   home_diff_avg_player_overall_rating            165

In [82]:
#Same unshuffled 75/25 split as in the regression probe: the first 75% of rows train, the last 25% test.
#NOTE(review): this is a past/future split only if the CSV rows are in chronological order--confirm upstream.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = False)

In [83]:
#I balance the class weights and set scoring to f1 to promote recall on draws (minority class) while
#maintaining overall precision and avoiding sloppy over-guessing.

#By default, the model almost completely neglects to recognize draws for the sake of slight advantages
#in overall accuracy--not what we want.

xgb = XGBClassifier()
#An explicit `average` is required: with f1_score's default (average='binary') scoring fails on this
#three-class target with "ValueError: Target is multiclass but average='binary'".
#NOTE(review): 'weighted' averages the per-class F1 by class support, so the larger classes still dominate
#the score; 'macro' would weigh draws equally--confirm which is intended.
f1 = make_scorer(f1_score, average = "weighted")
#Limiting the ceiling on these params to control overfitting.
param_grid = {"max_depth": [3, 4, 5, 6], "learning_rate": [0.01, 0.02, 0.03, 0.05], "gamma": [1, 2, 4, 5]}
#Exhaustive search over param_grid, scored with the weighted F1 above, using all available cores
grid = GridSearchCV(estimator = xgb, param_grid = param_grid, n_jobs = -1, scoring = f1)
#Despite the name, these are per-sample weights computed from y_train with the "balanced" scheme;
#they are passed to fit() as sample_weight in the next cell.
class_weights = class_weight.compute_sample_weight(class_weight = "balanced", y = y_train)

In [84]:
#Run the grid search on the training rows, passing the balanced per-sample weights through to fit()
grid.fit(X_train, y_train, sample_weight = class_weights)
#Predict with the fitted search object (under GridSearchCV's default refit, this is the best parameter
#combination retrained on the whole training set)
xgb_train_pred = grid.predict(X_train)
xgb_test_pred = grid.predict(X_test)

In [85]:
#Side-by-side table of predicted vs. actual outcome on the test rows, decoded back to D/H/A.
#The decoding map is defined once and used for both columns so the two can never drift apart.
label_names = {0: "D", 1: "H", 2: "A"}
df = pd.DataFrame({
    "XGBoost": pd.Series(xgb_test_pred).map(label_names),
    #y_test still carries its original row labels; resetting to 0..n-1 aligns it with the predictions
    "Target_Outcome": y_test.reset_index(drop = True).map(label_names),
})

In [86]:
grid.best_estimator_

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [87]:
df.head(15)

Unnamed: 0,XGBoost,Target_Outcome
0,D,D
1,H,H
2,H,A
3,H,H
4,D,D
5,H,H
6,A,A
7,H,D
8,H,A
9,D,D


In [88]:
#Per-class precision / recall / F1 on the training rows and on the held-out rows (0 = D, 1 = H, 2 = A)
train_report = classification_report(y_train, xgb_train_pred)
test_report = classification_report(y_test, xgb_test_pred)

print("Train\n", train_report)
print("Test\n", test_report)

Train
               precision    recall  f1-score   support

           0       0.33      0.36      0.35      3140
           1       0.65      0.57      0.60      5716
           2       0.49      0.56      0.52      3559

    accuracy                           0.51     12415
   macro avg       0.49      0.49      0.49     12415
weighted avg       0.52      0.51      0.52     12415

Test
               precision    recall  f1-score   support

           0       0.28      0.32      0.30      1035
           1       0.61      0.54      0.57      1845
           2       0.47      0.49      0.48      1259

    accuracy                           0.47      4139
   macro avg       0.46      0.45      0.45      4139
weighted avg       0.49      0.47      0.48      4139



In [89]:
#Share of each outcome across the full, un-encoded dataset: the baseline that any classifier has to beat.
df = pd.read_csv(data)
df["Outcome"] = df.apply(home_away_draw, axis = 1)
print("Class percentage breakdown")
#normalize = True returns fractions of the total directly (and labels the result as a proportion, not a count)
df["Outcome"].value_counts(normalize = True)

Class percentage breakdown


Outcome
H    0.456748
A    0.291047
D    0.252205
Name: count, dtype: float64

In [90]:
#The overall test accuracy is 47%, which significantly outperforms a 33% random guess accuracy and slightly outperforms
#the 45.7% home win rate. This ceiling on accuracy reflects the limits of static data, the unpredictability of the sport,
#and the goal of balancing accuracy with overall classification performance. Hiking up the accuracy score without sacrificing
#overall performance would take some smart feature engineering requiring time, effort, and a greater knowledge of the sport, plus
#more rigorous and comprehensive hyperparameter tuning. For now, this project--focused more on the data engineering feats
#than the final numbers--is a home win.