# 05. Model Training

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import xgboost as xgb

In [2]:
# Importing dataset

df = pd.read_csv('../data/processed/dataset_processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 39 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Round         380 non-null    int64  
 1   Day           380 non-null    object 
 2   Venue         380 non-null    object 
 3   Result        380 non-null    object 
 4   Home_Goals    380 non-null    int64  
 5   Away_Goals    380 non-null    int64  
 6   Away_Team     380 non-null    object 
 7   Home_xG       380 non-null    float64
 8   Away_xG       380 non-null    float64
 9   Home_Poss     380 non-null    float64
 10  Home_Shots    380 non-null    float64
 11  Home_SoT      380 non-null    float64
 12  Dist          380 non-null    float64
 13  FK            380 non-null    float64
 14  PK            380 non-null    int64  
 15  PKatt         380 non-null    int64  
 16  Home_Team     380 non-null    object 
 17  GD            380 non-null    int64  
 18  xGD           380 non-null    

In [3]:
# Encoding away, home, and days

features = pd.DataFrame()

home_encoder = LabelEncoder()
away_encoder = LabelEncoder()
features['home_team_id'] = home_encoder.fit_transform(df['Home_Team'])
features['away_team_id'] = away_encoder.fit_transform(df['Away_Team'])

day_encoder = LabelEncoder()
features['day_id'] = day_encoder.fit_transform(df['Day'])
features['is_weekend'] = df['Day'].isin(['Sat', 'Sun']).astype(int)

# Saving encoders
encoders = {'home': home_encoder, 'away': away_encoder, 'day': day_encoder}
print(f"Features: {features.columns.tolist()}")

Features: ['home_team_id', 'away_team_id', 'day_id', 'is_weekend']


In [4]:
# Adding rolling features like recent form (Calculating it from previous games in the dataset)

df = df.rename(columns={
    "Home_Team": "HomeTeam",
    "Away_Team": "AwayTeam",
    "Home_Goals": "FTHG",
    "Away_Goals": "FTAG"
})

team_stats = {}
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
for team in all_teams:
    team_stats[team] = {'games': 0, 'goals_for': [], 'goals_against': [], 'points': []}

home_form, away_form = [], []
home_goals_recent, away_goals_recent = [], []
h2h_history = []

for idx, row in df.iterrows():
    home, away = row['HomeTeam'], row['AwayTeam']

    # home form
    if len(team_stats[home]['points']) >= 5:
        home_form.append(sum(team_stats[home]['points'][-5:]))
        home_goals_recent.append(np.mean(team_stats[home]['goals_for'][-5:]))
    else:
        home_form.append(team_stats[home]['games'] * 1.5)
        home_goals_recent.append(1.5)

    # away form
    if len(team_stats[away]['points']) >= 5:
        away_form.append(sum(team_stats[away]['points'][-5:]))
        away_goals_recent.append(np.mean(team_stats[away]['goals_for'][-5:]))
    else:
        away_form.append(team_stats[away]['games'] * 1.5)
        away_goals_recent.append(1.5)

    # head to head
    h2h = df[:idx][(df[:idx]['HomeTeam'] == home) & (df[:idx]['AwayTeam'] == away)]
    if len(h2h) > 0:
        h2h_history.append(h2h.iloc[-1]['Result'])
    else:
        h2h_history.append('N')

    # update stats
    if row['Result'] == 'H':
        team_stats[home]['points'].append(3)
        team_stats[away]['points'].append(0)
    elif row['Result'] == 'A':
        team_stats[home]['points'].append(0)
        team_stats[away]['points'].append(3)
    else:
        team_stats[home]['points'].append(1)
        team_stats[away]['points'].append(1)

    team_stats[home]['goals_for'].append(row['FTHG'])
    team_stats[home]['goals_against'].append(row['FTAG'])
    team_stats[away]['goals_for'].append(row['FTAG'])
    team_stats[away]['goals_against'].append(row['FTHG'])
    team_stats[home]['games'] += 1
    team_stats[away]['games'] += 1

df['home_form'] = home_form
df['away_form'] = away_form
df['home_goals_avg'] = home_goals_recent
df['away_goals_avg'] = away_goals_recent
df['h2h_last'] = h2h_history

features['home_form'] = df['home_form']
features['away_form'] = df['away_form']
features['home_goals_avg'] = df['home_goals_avg']
features['away_goals_avg'] = df['away_goals_avg']

In [5]:
# One-hot encode team-ids to
features = pd.get_dummies(features, columns=["home_team_id", "away_team_id"])

In [6]:
# Splitting data into train and test
y = df['Result']
mask = df.index >= 50
features = features[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# Encoding results

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_train_encoded

array([2, 0, 1, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 1, 0, 0, 2, 2, 1, 2, 0, 1,
       0, 2, 0, 0, 2, 1, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 2, 2,
       2, 1, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2,
       1, 0, 2, 2, 2, 0, 1, 2, 0, 2, 2, 2, 1, 1, 1, 2, 0, 2, 2, 2, 0, 1,
       0, 2, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 2, 0, 0, 2, 0, 2, 2, 0, 2, 1, 1, 1, 2, 2, 2, 2, 0, 0, 2, 0,
       0, 2, 1, 2, 1, 0, 1, 2, 0, 2, 1, 2, 0, 0, 2, 2, 2, 2, 1, 1, 0, 2,
       0, 2, 2, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 1, 0,
       2, 0, 2, 0, 2, 0, 1, 2, 2, 2, 1, 1, 1, 2, 1, 0, 2, 0, 1, 2, 2, 1,
       2, 0, 0, 2, 0, 0, 1, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 0,
       0, 0, 2, 1, 1, 0, 1, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 1, 0, 2, 0])

In [8]:
# First prototype

model_1 = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

model_1.fit(X_train, y_train_encoded)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [9]:
print(model_1.get_xgb_params())

{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'eval_metric': 'mlogloss', 'gamma': None, 'grow_policy': None, 'interaction_constraints': None, 'learning_rate': 0.05, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 5, 'max_leaves': None, 'min_child_weight': None, 'monotone_constraints': None, 'multi_strategy': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}


In [10]:
# accuracy score

y_pred = model_1.predict(X_test)
y_test_encoded = label_encoder.transform(y_test)
accuracy = accuracy_score(y_test_encoded, y_pred)

print(f"Test Accuracy: {accuracy:.3f}")

Test Accuracy: 0.439


44% is mediocre, can be improved (aiming for 50%-60%)

In [11]:
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))

if hasattr(model_1, 'feature_importances_'):
    importance = pd.DataFrame({
        'feature': features.columns,
        'importance': model_1.feature_importances_
    }).sort_values('importance', ascending=False)
    print("Feature Importance:")
    print(importance)

              precision    recall  f1-score   support

           A       0.50      0.58      0.54        24
           D       0.11      0.07      0.08        15
           H       0.48      0.52      0.50        27

    accuracy                           0.44        66
   macro avg       0.36      0.39      0.37        66
weighted avg       0.40      0.44      0.42        66

Feature Importance:
            feature  importance
38  away_team_id_12    0.033077
27   away_team_id_1    0.029608
22  home_team_id_16    0.028698
8    home_team_id_2    0.028137
21  home_team_id_15    0.028010
32   away_team_id_6    0.027993
11   home_team_id_5    0.027910
17  home_team_id_11    0.027326
26   away_team_id_0    0.025265
36  away_team_id_10    0.025212
20  home_team_id_14    0.025210
9    home_team_id_3    0.025060
15   home_team_id_9    0.024890
45  away_team_id_19    0.024837
1        is_weekend    0.024413
37  away_team_id_11    0.024080
23  home_team_id_17    0.023529
28   away_team_id_2    

Weekend seems to be more prominent

In [12]:
# Second model with more finetuned params
model_2 = xgb.XGBClassifier(
    objective="multi:softmax",  # For probabilities
    num_class=3,                # 3 outcomes: H, D, A
    eval_metric="mlogloss",     
    n_estimators=500,           
    learning_rate=0.05,        
    max_depth=7,                
    subsample=0.8,              
    colsample_bytree=0.8,      
    min_child_weight=3,        
    reg_alpha=0.1,              
    reg_lambda=1,               
    random_state=42
)

model_2.fit(X_train, y_train_encoded)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
# accuracy score

y_pred = model_2.predict(X_test)
accuracy_2 = accuracy_score(y_test_encoded, y_pred)

print(f"Test Accuracy: {accuracy_2:.3f}")

Test Accuracy: 0.409


Worse accuracy, Should conitnue with model_1

In [14]:
# Trying to train model_1 with GridSearchCV

from sklearn.model_selection import GridSearchCV

model_3 = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

param_grid = {
    'max_depth': [4, 5, 6],
    'learning_rate': [0.03, 0.05, 0.1],
    'n_estimators': [150, 200, 300],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=model_3,
    param_grid=param_grid,
    scoring='accuracy',  # or 'f1_macro' if multiclass
    cv=3,                # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train_encoded)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best parameters: {'colsample_bytree': 0.8, 'gamma': 0.3, 'learning_rate': 0.03, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 150, 'subsample': 0.7}
Best cross-validated accuracy: 0.4924242424242425


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
# accuracy score

y_pred = grid_search.predict(X_test)
accuracy_3 = accuracy_score(y_test_encoded, y_pred)

print(f"Test Accuracy: {accuracy_3}")

Test Accuracy: 0.5


Exactly 50%

In [18]:
# Save model
joblib.dump(model_3, "../models/xgboost_model.pkl")

['../models/xgboost_model.pkl']