The modeling approach has two parts:
- a primary model, which predicts the final outcome of a match (home win, away win or draw)
- secondary models, which predict the number of goals scored by the home and away teams given the final match outcome

This notebook starts with the primary model, which is covered in the following sections:
- feature engineering
- feature selection
- fitting
- model evaluation

After that, the notebook turns to the secondary models, which are developed through the same sections.

In [None]:
import pandas as pd
import numpy as np
import optuna
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, log_loss
from sklearn.linear_model import LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(root_path)

from src.config import load_config
from src.feature_engineering import create_diff_features
from src.feature_selection import find_highly_correlated_cols, remove_low_variance_features, select_top_features
from src.modeling import run_primary_modeling, run_secondary_modeling, load_model, evaluate_model_metrics, evaluate_regression_model

# Load the project configuration from config.yaml, located directly under root_path
config_file = 'config.yaml'
config_path = os.path.join(root_path, config_file)
config = load_config(config_path)

# Loading the preprocessed data

In [None]:
# Paths of the preprocessed train / test CSV files, built from the config entries
preprocessed_data_path = os.path.join(root_path, config['preprocessed_dir'])
df_train_path, df_test_path = (
    os.path.join(preprocessed_data_path, f"{config[key]}.csv")
    for key in ('preprocessed_train_df_name', 'preprocessed_test_df_name')
)

# Read both splits (train first, then test) and preview the training set
df_train, df_test = (pd.read_csv(path) for path in (df_train_path, df_test_path))
df_train.head()

In [None]:
# Names of the target columns, as declared in the configuration
primary_target = config['final_result_column']
secondary_target_home = config['nb_goals_home_column']
secondary_target_away = config['nb_goals_away_column']

# Features: every column except the three targets and the date / season columns
X_train, X_test = (
    frame.drop(columns=[
        primary_target,
        secondary_target_home,
        secondary_target_away,
        config['date_column'],
        config['season_column'],
    ])
    for frame in (df_train, df_test)
)

# Target for primary model
y_train_primary, y_test_primary = df_train[primary_target], df_test[primary_target]

# Targets for secondary models (one per team)
y_train_secondary_home, y_test_secondary_home = (
    df_train[secondary_target_home],
    df_test[secondary_target_home],
)
y_train_secondary_away, y_test_secondary_away = (
    df_train[secondary_target_away],
    df_test[secondary_target_away],
)

In [None]:
# Display the feature columns left after dropping the targets and the date / season columns
X_train.columns

# Primary model

Target: outcome of a match (home / draw / away).

## Feature engineering

Since we want to model the final outcome of a match, the full detail of the home and away team data is not necessary. For each pair of matching home and away columns, we therefore replace the pair with the difference between the two columns.

In [None]:
# (home suffix, away suffix) pairs handed to create_diff_features
patterns = [
    ("_home_team_ranking_at_home", "_away_team_ranking_away"),
    ("_home_team_at_home", "_away_team_away"),
    ("_home_team", "_away_team"),
    ("_at_home", "_away"),
    ("_home", "_away"),
]

# Same transformation for both splits: train first, then test
X_train_primary, X_test_primary = (
    create_diff_features(frame, patterns=patterns) for frame in (X_train, X_test)
)

X_train_primary.head()

## Feature Selection

We implemented 3 methods to select features:
- remove highly correlated features
- remove low variance features
- select the top K features that best explain the primary target

### Correlation method

In [None]:
# Heatmap of the pairwise correlations between the numeric primary features
corr = X_train_primary.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0).set_title("Correlation matrix")
plt.show()

In [None]:
# Columns returned by find_highly_correlated_cols for the primary train features
# (they are dropped from both splits in the next cell); displayed for inspection
highly_correlated_cols = find_highly_correlated_cols(X_train_primary)
highly_correlated_cols

In [None]:
# Remove the columns identified on the train set from both splits
X_train_primary = X_train_primary.drop(highly_correlated_cols, axis=1)
X_test_primary = X_test_primary.drop(highly_correlated_cols, axis=1)

### Low variance method

In [None]:
# NOTE: despite the helper's name, its return value is used in the next cell as the
# list of columns to drop; it is displayed here for inspection
low_variance_cols = remove_low_variance_features(X_train_primary)
low_variance_cols

In [None]:
# Remove the low-variance columns (computed on the train set) from both splits
X_train_primary = X_train_primary.drop(low_variance_cols, axis=1)
X_test_primary = X_test_primary.drop(low_variance_cols, axis=1)

In [None]:
# Display the primary features remaining after the correlation and low-variance drops
X_train_primary.columns

### Top K features 

In [None]:
# Result of select_top_features for the primary target; only displayed here
# (the cell that would apply the selection is commented out)
top_k_cols = select_top_features(X_train_primary, y_train_primary)
top_k_cols

In [None]:
# Optional (disabled): uncomment to keep only the top K features in both splits
# X_train_primary = X_train_primary[top_k_cols]
# X_test_primary = X_test_primary[top_k_cols]

## Fitting

Three types of classifiers will be tested:
- logistic regression
- random forest
- XGBoost

Hyperparameters are tuned with a grid search.

### Logistic regression

In [None]:
# Team-name columns are one-hot encoded; int64 / float64 columns are standardized.
# Columns of any other dtype are not listed in either transformer.
cat_cols = [config['home_column'], config['away_column']]
num_cols = X_train_primary.select_dtypes(include=['int64','float64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# `multi_class` is no longer passed: it is deprecated since scikit-learn 1.5, and with
# the saga solver the default ('auto') already selects the multinomial loss for 3+ classes.
pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression(solver='saga', max_iter=5000))
])

### Random forest

In [None]:
# Categorical inputs: the two team-name columns; numeric inputs: int64 / float64 columns
cat_cols = [config['home_column'], config['away_column']]
num_cols = list(X_train_primary.select_dtypes(include=['int64', 'float64']).columns)

# Standardize the numeric columns and one-hot encode the team names
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# Preprocessing followed by a seeded, single-job random forest classifier
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_jobs=1, random_state=42)),
])

### XGBoost

In [None]:
# Categorical inputs: the two team-name columns; numeric inputs: int64 / float64 columns
cat_cols = [config['home_column'], config['away_column']]
num_cols = list(X_train_primary.select_dtypes(include=['int64', 'float64']).columns)

# Standardize the numeric columns and one-hot encode the team names
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# Preprocessing followed by a multi-class XGBoost classifier returning class probabilities
pipe_xgb = Pipeline(steps=[
    ('pre', preprocessor),
    (
        'clf',
        XGBClassifier(
            objective='multi:softprob',
            use_label_encoder=False,
            eval_metric='mlogloss',
        ),
    ),
])

### Run

In [None]:
# Encode the outcome labels as integers; `le` is reused later to encode the test
# target and to recover the label names
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_primary)

# Training run (disabled): uncomment to fit the three primary pipelines with their
# parameter grids from the config; the evaluation cells load models from disk instead
# run_primary_modeling(X=X_train_primary,
#                      y=y_train_enc,
#                      param_grid_lr=config['param_grid_lr'],
#                      param_grid_rf=config['param_grid_primary_rf'],
#                      param_grid_xgb=config['param_grid_xgb'],
#                      preprocessing_pipeline_lr=pipe_lr,
#                      preprocessing_pipeline_rf=pipe_rf,
#                      preprocessing_pipeline_xgb=pipe_xgb,
#                      outdir=config['primary_models_dir'])

## Model evaluation

In [None]:
# Encode the test target with the encoder fitted on the train target
y_test_enc = le.transform(y_test_primary)

### Logistic Regression

In [None]:
# Load the logistic regression model from disk and evaluate it on the test split
# (`metrics` is overwritten by each evaluation cell)
best_lr = load_model(os.path.join('..', config['primary_models_dir'], 'logistic.joblib'))
metrics = evaluate_model_metrics(best_lr, X_test_primary, y_test_enc)

### Random Forest

In [None]:
# Load the random forest model from disk and evaluate it on the test split
# (`metrics` is overwritten by each evaluation cell)
best_rf = load_model(os.path.join('..', config['primary_models_dir'], 'rf.joblib'))
metrics = evaluate_model_metrics(best_rf, X_test_primary, y_test_enc)

### XGBoost

In [None]:
# XGBoost evaluation (disabled). The path now carries the '..' prefix used by the
# other load_model calls, so that uncommenting it resolves to the same directory.
# best_xgb = load_model(os.path.join('..', config['primary_models_dir'], 'xgb.joblib'))
# metrics = evaluate_model_metrics(best_xgb, X_test_primary, y_test_enc)

# Secondary models

Targets: the number of goals scored by each of the two teams, given the predicted probabilities of the match outcome. There are therefore two different models: one dedicated to predicting the number of goals scored by the home team, and one dedicated to the number of goals scored by the away team.

## Feature engineering

We recover the home and away columns, which are provided as input to the two secondary models. Note that for each model, we keep only the performance indicators of the corresponding team, plus the average of the outcome probabilities generated by the primary models (these features replace the odds columns).

In [None]:
# Display the raw feature columns that are split into home / away sets below
X_train.columns

In [None]:
# (home suffix, away suffix) pairs, longest first: a column matched by an earlier
# pair is recorded in classified_cols and skipped by the shorter suffixes it also ends with
patterns = [
        ("_home_team_ranking_at_home", "_away_team_ranking_away"),
        ("_home_team_at_home", "_away_team_away"),
        ("_home_team", "_away_team"),
        ("_at_home", "_away"),
        ("_home", "_away")
]

# Both secondary models receive the two team-name columns
home_columns = [config['home_column'], config['away_column']]
away_columns = [config['home_column'], config['away_column']]
classified_cols = set(home_columns + away_columns)

for home_suffix, away_suffix in patterns:
    # The home / away odds columns are deliberately left out of both sets
    home_cols_with_suffix = [
        c for c in X_train.columns
        if c.endswith(home_suffix) and c != config['odd_home_column'] and c not in classified_cols
    ]
    away_cols_with_suffix = [
        c for c in X_train.columns
        if c.endswith(away_suffix) and c != config['odd_away_column'] and c not in classified_cols
    ]

    home_columns += home_cols_with_suffix
    away_columns += away_cols_with_suffix

    classified_cols.update(home_cols_with_suffix + away_cols_with_suffix)

# QC test: every column of X_train must be accounted for.
# NOTE(review): the +1 offset presumably nets the two team-name columns, present in
# both lists, against the odds columns left unclassified - confirm against the data.
n_accounted_cols = len(home_columns) + len(away_columns) + 1
if n_accounted_cols != len(X_train.columns):
    # Report the same count as the one tested (the message previously used a +3 offset)
    raise ValueError(f"X_train has {len(X_train.columns)} columns, {n_accounted_cols} columns have been classified")

In [None]:
# Match outcome probabilities, averaged over the primary models

models_to_consider = [best_lr, best_rf]
name_models = ['lr', 'rf'] # in the same order

if len(models_to_consider) != len(name_models):
    raise ValueError("models_to_consider and name_models must have the same length")

# Label names for the encoded classes 0, 1, 2.
# NOTE(review): assumed to match the column order of predict_proba, i.e. the models
# are expected to have been fitted on targets encoded by `le` - confirm.
labels = le.inverse_transform(np.array((0,1,2)))


def _average_outcome_probas(X):
    """Return the outcome probabilities of X averaged over `models_to_consider`.

    Each model contributes one `proba_<label>_<name>` column per label. The
    returned frame keeps only the per-label means, in the column order
    proba_home, proba_draw, proba_away.
    """
    per_model = pd.concat(
        [
            pd.DataFrame(
                model.predict_proba(X),
                columns=[f"proba_{lab}_{name}" for lab in labels],
            )
            for model, name in zip(models_to_consider, name_models)
        ],
        axis=1,
    )
    # The averaged columns are derived from name_models, so adding a model to the
    # two lists above is enough to include it in the mean.
    return pd.DataFrame({
        f"proba_{outcome}": per_model[
            [f"proba_{outcome}_{name}" for name in name_models]
        ].mean(axis=1)
        for outcome in ('home', 'draw', 'away')
    })


probs_df_train = _average_outcome_probas(X_train_primary)
probs_df_test = _average_outcome_probas(X_test_primary)

In [None]:
# Train features: the selected columns followed by the averaged outcome probabilities
X_train_secondary_home = pd.concat([X_train[home_columns], probs_df_train], axis=1)
X_train_secondary_away = pd.concat([X_train[away_columns], probs_df_train], axis=1)

# Test features: same layout, built from the test split
X_test_secondary_home = pd.concat([X_test[home_columns], probs_df_test], axis=1)
X_test_secondary_away = pd.concat([X_test[away_columns], probs_df_test], axis=1)

## Feature Selection

We implemented 3 methods to select features:

- remove highly correlated features
- remove low variance features
- select the top K features that best explain the secondary targets (goals scored by each team)

### Correlation method

#### Home

In [None]:
# Heatmap of the pairwise correlations between the numeric home-model features
corr = X_train_secondary_home.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0).set_title("Correlation matrix")
plt.show()

In [None]:
# Columns returned by find_highly_correlated_cols for the home-model train features
# (dropped from both splits in the next cell); displayed for inspection
highly_correlated_cols = find_highly_correlated_cols(X_train_secondary_home)
highly_correlated_cols

In [None]:
# Remove the columns identified on the train set from both home-model splits
X_train_secondary_home = X_train_secondary_home.drop(highly_correlated_cols, axis=1)
X_test_secondary_home = X_test_secondary_home.drop(highly_correlated_cols, axis=1)

#### Away

In [None]:
# Heatmap of the pairwise correlations between the numeric away-model features
corr = X_train_secondary_away.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0).set_title("Correlation matrix")
plt.show()

In [None]:
# Columns returned by find_highly_correlated_cols for the away-model train features
# (dropped from both splits in the next cell); displayed for inspection
highly_correlated_cols = find_highly_correlated_cols(X_train_secondary_away)
highly_correlated_cols

In [None]:
# Remove the columns identified on the train set from both away-model splits
X_train_secondary_away = X_train_secondary_away.drop(highly_correlated_cols, axis=1)
X_test_secondary_away = X_test_secondary_away.drop(highly_correlated_cols, axis=1)

### Top K features

#### Home

In [None]:
# Result of select_top_features for the home-goals target; only displayed here
# (the cell that would apply the selection is commented out)
top_k_cols = select_top_features(X_train_secondary_home, y_train_secondary_home)
top_k_cols

In [None]:
# Optional (disabled): uncomment to keep only the top K features in both home-model splits
# X_train_secondary_home = X_train_secondary_home[top_k_cols]
# X_test_secondary_home = X_test_secondary_home[top_k_cols]

#### Away

In [None]:
# Result of select_top_features for the away-goals target; only displayed here
# (the cell that would apply the selection is commented out)
top_k_cols = select_top_features(X_train_secondary_away, y_train_secondary_away)
top_k_cols

In [None]:
# Optional (disabled): uncomment to keep only the top K features in both away-model splits
# X_train_secondary_away = X_train_secondary_away[top_k_cols]
# X_test_secondary_away = X_test_secondary_away[top_k_cols]

## Fitting

Three types of regressors will be tested:

- Poisson regressor
- random forest
- XGBoost

Hyperparameters are tuned with a grid search.

### Poisson regressor

In [None]:
# Categorical inputs: the two team-name columns
cat_cols = [config['home_column'], config['away_column']]

# Numeric inputs of each model: its int64 / float64 columns
num_cols_home = list(X_train_secondary_home.select_dtypes(include=['int64', 'float64']).columns)
num_cols_away = list(X_train_secondary_away.select_dtypes(include=['int64', 'float64']).columns)

# MinMaxScaler is used here so that the scaled numeric inputs are not negative
# (original rationale: no negative values for Poisson regression)
preprocessor_home = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), num_cols_home),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])
preprocessor_away = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), num_cols_away),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# One Poisson pipeline per team, with identical regressor settings
pipe_poisson_home = Pipeline(steps=[
    ('pre', preprocessor_home),
    ('clf', PoissonRegressor(alpha=1.0, max_iter=5000)),
])
pipe_poisson_away = Pipeline(steps=[
    ('pre', preprocessor_away),
    ('clf', PoissonRegressor(alpha=1.0, max_iter=5000)),
])

### Random Forest

In [None]:
# Categorical inputs: the two team-name columns
cat_cols = [config['home_column'], config['away_column']]

# Numeric inputs of each model: its int64 / float64 columns
num_cols_home = list(X_train_secondary_home.select_dtypes(include=['int64', 'float64']).columns)
num_cols_away = list(X_train_secondary_away.select_dtypes(include=['int64', 'float64']).columns)

# Standardize the numeric columns and one-hot encode the team names
preprocessor_home = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols_home),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])
preprocessor_away = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols_away),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# One seeded, single-job random forest regressor per team
pipe_rf_home = Pipeline(steps=[
    ('pre', preprocessor_home),
    ('clf', RandomForestRegressor(n_jobs=1, random_state=42)),
])
pipe_rf_away = Pipeline(steps=[
    ('pre', preprocessor_away),
    ('clf', RandomForestRegressor(n_jobs=1, random_state=42)),
])

### XGBoost

In [None]:
# Categorical inputs: the two team-name columns
cat_cols = [config['home_column'], config['away_column']]

# Numeric inputs of each model: its int64 / float64 columns
num_cols_home = X_train_secondary_home.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols_away = X_train_secondary_away.select_dtypes(include=['int64','float64']).columns.tolist()

# Standardize the numeric columns and one-hot encode the team names
preprocessor_home = ColumnTransformer([
    ('num', StandardScaler(), num_cols_home),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

preprocessor_away = ColumnTransformer([
    ('num', StandardScaler(), num_cols_away),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# The regressors previously reused the classifier settings: 'mlogloss' is a
# multi-class classification metric and `use_label_encoder` is a classifier-only
# option. The metric is now 'rmse', which matches the squared-error objective.
pipe_xgb_home = Pipeline([
    ('pre', preprocessor_home),
    ('clf', XGBRegressor(objective='reg:squarederror', eval_metric='rmse'))
])

pipe_xgb_away = Pipeline([
    ('pre', preprocessor_away),
    ('clf', XGBRegressor(objective='reg:squarederror', eval_metric='rmse'))
])

### Run

In [None]:
# Training run (disabled): uncomment to fit the home / away pipelines with their
# parameter grids from the config; the evaluation cells load models from disk instead.
# NOTE(review): 'param_grid_xgb' is the same config key as in the primary run, whereas
# the random forest grids use separate primary / secondary keys - confirm this is intended.
# run_secondary_modeling(X_home=X_train_secondary_home,
#                        y_home=y_train_secondary_home,
#                        X_away=X_train_secondary_away,
#                        y_away=y_train_secondary_away,
#                        param_grid_poisson=config['param_grid_poisson'],
#                        param_grid_rf=config['param_grid_secondary_rf'],
#                        param_grid_xgb=config['param_grid_xgb'],
#                        preprocessing_pipeline_poisson_home=pipe_poisson_home,
#                        preprocessing_pipeline_poisson_away=pipe_poisson_away,
#                        preprocessing_pipeline_rf_home=pipe_rf_home,
#                        preprocessing_pipeline_rf_away=pipe_rf_away,
#                        preprocessing_pipeline_xgb_home=pipe_xgb_home,
#                        preprocessing_pipeline_xgb_away=pipe_xgb_away,
#                        outdir=config['secondary_models_dir'])

## Model evaluation

### Poisson

In [None]:
# Load the home-goals Poisson model from disk and evaluate it on the home test split
# (`metrics` is overwritten by each evaluation cell)
best_home_poisson = load_model(os.path.join('..', config['secondary_models_dir'], 'home_poisson.joblib'))
print('Home model: \n')
metrics = evaluate_regression_model(best_home_poisson, X_test_secondary_home, y_test_secondary_home)

In [None]:
# Load the away-goals Poisson model from disk and evaluate it on the away test split
# (`metrics` is overwritten by each evaluation cell)
best_away_poisson = load_model(os.path.join('..', config['secondary_models_dir'], 'away_poisson.joblib'))
print('Away model: \n')
metrics = evaluate_regression_model(best_away_poisson, X_test_secondary_away, y_test_secondary_away)

### Random Forest

In [None]:
# Load the home-goals random forest model from disk and evaluate it on the home test split
# (`metrics` is overwritten by each evaluation cell)
best_home_rf = load_model(os.path.join('..', config['secondary_models_dir'], 'home_rf.joblib'))
print('Home model: \n')
metrics = evaluate_regression_model(best_home_rf, X_test_secondary_home, y_test_secondary_home)

In [None]:
# Load the away-goals random forest model from disk and evaluate it on the away test split
# (`metrics` is overwritten by each evaluation cell)
best_away_rf = load_model(os.path.join('..', config['secondary_models_dir'], 'away_rf.joblib'))
print('Away model: \n')
metrics = evaluate_regression_model(best_away_rf, X_test_secondary_away, y_test_secondary_away)

In [None]:
# Display the away-goals test target for inspection
y_test_secondary_away

### XGBoost

In [None]:
# XGBoost home-goals evaluation (disabled): uncomment to load and evaluate the model
# best_home_xgb = load_model(os.path.join('..', config['secondary_models_dir'], 'home_xgb.joblib'))
# print('Home model: \n')
# metrics = evaluate_regression_model(best_home_xgb, X_test_secondary_home, y_test_secondary_home)

In [None]:
# XGBoost away-goals evaluation (disabled): uncomment to load and evaluate the model
# best_away_xgb = load_model(os.path.join('..', config['secondary_models_dir'], 'away_xgb.joblib'))
# print('Away model: \n')
# metrics = evaluate_regression_model(best_away_xgb, X_test_secondary_away, y_test_secondary_away)