# Import & Ingest

In [None]:
import sys
import pathlib
#Make the parent of the working directory importable (presumably where the local config and
#utils modules live -- confirm against the repo layout).
SOURCE_PATH = pathlib.Path.cwd().resolve().parent
#Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if str(SOURCE_PATH) not in sys.path:
    sys.path.append(str(SOURCE_PATH))

In [None]:
from config import *
from utils import *
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, classification_report, make_scorer, f1_score
from sklearn.utils import class_weight
from xgboost import XGBClassifier

In [None]:
#Locate the match-prediction CSV under DATA_PATH and load it into a DataFrame.
data = DATA_PATH / "match_predict.csv"
df = pd.read_csv(data)
#Print the column / dtype / non-null summary of the raw frame.
df.info()

# Regression Probe

In [None]:
#To begin, I fit linear and Random Forest regressors to predict the home score advantage (home goals minus away goals),
#probing whether precise score-margin prediction is viable.

In [None]:
#Categorical columns that will be one-hot encoded.
cat_cols = [
    "Country",
    "League",
    "home_buildUpPlayPositioningClass",
    "home_chanceCreationPositioningClass",
    "home_defenceDefenderLineClass",
    "away_buildUpPlayPositioningClass",
    "away_chanceCreationPositioningClass",
    "away_defenceDefenderLineClass",
]

#Encoding policy: drop_first for the binary columns, but not for the ones with more than two categories.
#For linear regression, this allows all the columns to be used.
#For ensemble methods, if a feature has more than two categories, then the dropped level could implicitly
#become important to a tree split, so keeping it affords greater explicit interpretability after the fact.
#(This isn't an issue with binary columns, because keeping the second column is just redundant.)

#Number of distinct levels per categorical column, used to tell binary from multi-level columns.
df[cat_cols].nunique()

In [None]:
#Columns with more than two levels: every level keeps its own dummy column.
multi_cat_cols = ["Country", "League"]

#Binary columns are picked by name: any column of the current df whose name contains "class".
#NOTE(review): this reads the live df, so re-running the cell after df has been encoded or
#overwritten yields a different list -- confirm the cell is only run against the raw frame.
binary_cat_cols = [name for name in df.columns if "class" in name.lower()]

def one_hot_encode(df):
    """Return a copy of df with its categorical columns replaced by integer dummy columns.

    Relies on the module-level lists cat_cols, binary_cat_cols and multi_cat_cols:
    the cat_cols columns are removed, the binary columns are encoded with
    drop_first=True (one dummy each) and the multi-level columns keep every level.
    The result is ordered as remaining columns, binary dummies, multi-level dummies.
    """
    remaining = df.drop(columns=cat_cols)
    binary_dummies = pd.get_dummies(df[binary_cat_cols], drop_first=True).astype(int)
    multi_dummies = pd.get_dummies(df[multi_cat_cols], drop_first=False).astype(int)
    return pd.concat([remaining, binary_dummies, multi_dummies], axis=1)

In [None]:
#Swap the raw categorical columns for their one-hot encoded versions.
df = one_hot_encode(df)

In [None]:
#Check the column list, dtypes and non-null counts after encoding.
df.info()

In [None]:
#Preview the first rows of the encoded frame.
df.head()

In [None]:
#Regression target is the home score advantage; every other column is a feature.
y = df["Home_Score_Adv"]
X = df.drop(columns="Home_Score_Adv")
#Summary statistics of the target, for comparison with the predictions later on.
y.describe()

In [None]:
#No shuffling: the first 75% of rows train and the last 25% test. Assuming the rows are in
#chronological order, this maintains temporal integrity, so the model predicts future matches
#as it would if realistically deployed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = False)

In [None]:
#Baseline ordinary least squares model; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, y_train)
#Predictions on both partitions so train and test error can be compared.
lr_train_pred, lr_test_pred = (lr.predict(features) for features in (X_train, X_test))

In [None]:
#Linear regression mean absolute error, shown as a (train, test) pair
lr_train_mae = mean_absolute_error(y_train, lr_train_pred)
lr_test_mae = mean_absolute_error(y_test, lr_test_pred)
lr_train_mae, lr_test_mae

In [None]:
#Baseline Random Forest with default hyperparameters. The seed is pinned so the fitted forest,
#and therefore the train/test errors discussed in this notebook, are the same on every run.
rf = RandomForestRegressor(random_state = 42)
rf.fit(X_train, y_train)
#Predictions on both partitions so train and test error can be compared.
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

In [None]:
#Random Forest regression mean absolute error, shown as a (train, test) pair.
#Interestingly, a baseline Random Forest model overfits on the training data (unlike linear regression)
#but doesn't perform much differently at all on the test data.
rf_train_mae = mean_absolute_error(y_train, rf_train_pred)
rf_test_mae = mean_absolute_error(y_test, rf_test_pred)
rf_train_mae, rf_test_mae

In [None]:
#Side-by-side frame of each model's test predictions against the true target.
#The frame is built straight from the prediction arrays, so lr and rf keep referring to the
#fitted models instead of being rebound to DataFrames.
target = y_test.reset_index(drop = True) #Positional index so the target lines up row-for-row with the predictions
df = pd.DataFrame(
    {
        "Linear Regression": lr_test_pred,
        "Random Forest Regression": rf_test_pred,
        "Home_Score_Adv_Target": target,
    }
)

In [None]:
#First 15 test rows: both models' predictions next to the true score advantage.
df.head(15)

In [None]:
df.describe()

#The means of both models' predictions tightly hug the mean of the target, but their standard
#deviations are less than half that of the target. In other words, these regression models are
#playing it safe, fitting the best line to the data rather than committing to large margins.
#Regression isn't the right paradigm here, and classification is more practical anyway than
#exact score difference prediction.

# Match Outcome Prediction: XGBoost

In [None]:
#Start again from the raw CSV (df was reused for the regression comparison frame) and re-encode it.
df = one_hot_encode(pd.read_csv(data))

#Get our outcome category feature from the home score advantage feature, and then drop the home score advantage feature
def home_away_draw(row):
    """Label a match row by the sign of its "Home_Score_Adv" value.

    Returns "H" when the advantage is positive, "A" when it is negative,
    and "D" in every other case.
    """
    advantage = row["Home_Score_Adv"]
    if advantage > 0:
        return "H"
    if advantage < 0:
        return "A"
    return "D"

#Label every match row, then swap the numeric score advantage for the outcome label so the
#advantage cannot leak into the classifier's features.
outcome = df.apply(home_away_draw, axis=1)
df = df.assign(Outcome=outcome).drop(columns="Home_Score_Adv")
df.head(10)

In [None]:
#Integer class codes for the classifier: draw = 0, home win = 1, away win = 2.
outcome_codes = {"D": 0, "H": 1, "A": 2}
y = df["Outcome"].map(outcome_codes)
X = df.drop(columns="Outcome")

In [None]:
#Feature summary (columns, dtypes, non-null counts) before splitting.
X.info()

In [None]:
#Same unshuffled 75/25 split as in the regression probe: the last quarter of the rows is held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = False)

In [None]:
#I balance the class weights and set scoring to f1 to promote recall on draws (minority class) while
#maintaining overall precision and avoiding sloppy over-guessing.

#By default, the model almost completely neglects to recognize draws for the sake of slight advantages
#in overall accuracy--not what we want.

xgb = XGBClassifier()

#f1_score defaults to average='binary', which raises a ValueError on a multiclass target, so one of
#the multiclass settings (None, 'micro', 'macro', 'weighted') has to be chosen explicitly.
f1 = make_scorer(f1_score, average="weighted")

#Limiting the ceiling on these params to control overfitting.
param_grid = {
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.02, 0.03, 0.05],
    "gamma": [1, 2, 4, 5],
}

#One weight per training row, inversely proportional to how frequent that row's class is.
class_weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train)

#NOTE(review): cv is left at its default, so the cross-validation folds are not guaranteed to respect
#the chronological ordering that the unshuffled train/test split preserves -- consider TimeSeriesSplit.
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=f1, n_jobs=-1)

In [None]:
#Run the grid search on the training rows, passing the balanced per-row weights through to fit.
grid.fit(X_train, y_train, sample_weight=class_weights)
#Predict both partitions with the fitted search object so train and test reports can be compared.
xgb_train_pred, xgb_test_pred = (grid.predict(features) for features in (X_train, X_test))

In [None]:
#Inverse of the integer coding used for y: 0 = draw, 1 = home win, 2 = away win.
outcome_labels = {0: "D", 1: "H", 2: "A"}

xgb_df = pd.DataFrame(xgb_test_pred)
target = pd.DataFrame(y_test)
#The test rows keep their original row labels; a fresh positional index is needed so the
#concatenation lines them up with the predictions.
target_df = target.reset_index(drop=True)

df = pd.concat([xgb_df, target_df], axis=1)
df.columns = ["XGBoost", "Target_Outcome"]
#Translate the integer codes back to readable labels in both columns.
df["XGBoost"] = df["XGBoost"].map(outcome_labels)
df["Target_Outcome"] = df["Target_Outcome"].map(outcome_labels)

In [None]:
#Show the estimator fitted with the best-scoring parameter combination from the grid search.
grid.best_estimator_

In [None]:
#First 15 test rows: predicted vs. actual outcome labels.
df.head(15)

In [None]:
#Per-class precision / recall / F1 on the training rows and on the held-out rows.
#Classes appear by integer code: 0 = draw, 1 = home win, 2 = away win.
train_report = classification_report(y_train, xgb_train_pred)
test_report = classification_report(y_test, xgb_test_pred)
for heading, report in (("Train\n", train_report), ("Test\n", test_report)):
    print(heading, report)

In [None]:
#Reload the raw data and label every match to get the base rate of each outcome.
df = pd.read_csv(data)
df = df.assign(Outcome=df.apply(home_away_draw, axis=1))
print("Class percentage breakdown")
#Share of all matches per outcome, expressed as a fraction of the row count.
df["Outcome"].value_counts().div(len(df))

In [None]:
#The overall test accuracy is 47%, which significantly outperforms a 33% random guess accuracy and slightly outperforms
#the 45.7% home win rate. This ceiling on accuracy reflects the limits of static data, the unpredictability of the sport,
#and the goal of balancing accuracy with overall classification performance. Hiking up the accuracy score without sacrificing
#overall performance would take some smart feature engineering requiring time, effort, and a greater knowledge of the sport, plus
#more rigorous and comprehensive hyperparameter tuning. For now, this project--focused more on the data engineering feats
#than the final numbers--is a home win.