<a href="https://colab.research.google.com/github/chorltonm/fa-cup-upsets/blob/main/notebooks/models/ml_models_unseen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import general python libraries
import os
import json
import pandas as pd
import numpy as np
import importlib

# Google Cloud libraries
from google.cloud import bigquery
from google.oauth2 import service_account
from google.colab import drive
from google.colab import userdata
import pandas_gbq

# Scikit Learn libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, log_loss
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Other
from matplotlib import pyplot
import seaborn as sns
from xgboost import XGBClassifier

In [None]:
# Mount Google Drive so the project files are reachable from the Colab runtime
drive.mount('/content/drive')

# Make the folder holding the project's Python helper files the working directory
python_files_dir = '/content/drive/MyDrive/birkbeck_msc-project/python_files'
os.chdir(python_files_dir)



In [None]:
# Import user defined python functions. Used importlib as having stability issues with simple import and not picking up the files
# `import importlib` on its own does not guarantee that the `util` submodule is
# loaded, so import it explicitly before using it.
import importlib.util

spec = importlib.util.spec_from_file_location("create_model_results", "/content/drive/MyDrive/birkbeck_msc-project/python_files/create_model_results.py")
create_model_results = importlib.util.module_from_spec(spec)
# Execute the module's code so its functions become attributes of `create_model_results`
spec.loader.exec_module(create_model_results)


In [None]:
# Authentication credentials and keys
# Google Service Account

# Load the JSON key from the Google Drive file; the context manager closes the
# file handle as soon as the key has been parsed
with open('/content/drive/MyDrive/service_account.json', 'r') as key_file:
    key = json.load(key_file)

# Authenticate using the loaded key
credentials = service_account.Credentials.from_service_account_info(key)

# Set up the BigQuery client with the credentials to project
client = bigquery.Client(credentials=credentials, project='birkbeck-msc-project-422917')

In [None]:
# Feature data for FA Cup

# Query Google BigQuery: pull every column of the round 3 features view
fa_cup_features_all = """
    SELECT * FROM preparation_layer.view_fa_cup_round_3_features
"""

# Run the query through the authenticated client and materialise it as a DataFrame
fa_cup_features_all_df = client.query(fa_cup_features_all).to_dataframe()
display(fa_cup_features_all_df)


# Label passed to create_model_results.create_model_results_df by the modelling function
test_flag = 'Unseen'


In [None]:
# Function to calculate home advantage
def add_home_advantage(X):
    """Append a ``home_win_factor`` column: the mean ``home_win`` per league-level pairing.

    The factor is the mean of ``home_win`` over all rows of ``X`` that share the
    same (``home_team_league_level``, ``away_team_league_level``) pair, rounded
    to three decimals.

    Args:
        X: DataFrame containing ``home_team_league_level``,
            ``away_team_league_level`` and ``home_win`` columns.

    Returns:
        Tuple ``(frame, column_name)``: a new DataFrame equal to ``X`` plus the
        ``home_win_factor`` column, and the name of that column. The returned
        frame keeps the row order and the index of ``X``.
    """
    level_columns = ['home_team_league_level', 'away_team_league_level']

    home_win_rates = (
        X.groupby(level_columns)['home_win']
        .mean()
        .round(3)
        .rename('home_win_factor')
        .reset_index()
    )

    merged = X.merge(home_win_rates, on=level_columns, how='left')
    # `merge` returns a fresh 0..n-1 index. The lookup has one row per level
    # pair, so the left join keeps X's row count and order; restoring X's index
    # keeps the result alignable with other frames derived from the same rows.
    merged.index = X.index
    return merged, 'home_win_factor'

In [None]:
# Function to calculate weights
def calculate_weights(y):
    """Return an integer weight per class label in ``y``.

    Each weight is the percentage of samples that do NOT belong to the class,
    rounded to the nearest integer, so rarer classes get larger weights.
    """
    n_samples = len(y)
    weights = {}
    for label, n_label in y.value_counts().items():
        share_of_other_classes = 1 - (n_label / n_samples)
        weights[label] = int(round(share_of_other_classes * 100))
    return weights


In [None]:
def classifer_models_optimisation_single_split(fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state):
    """Fit and score ``model_classifier`` on a fixed train / held-out season split.

    Seasons '21/22' and '22/23' form the test set; all other seasons are used
    for training. For every ranking system the stored parameters are fetched
    from BigQuery, applied to ``model_classifier``, a preprocessing + classifier
    pipeline is fitted on the training rows, and accuracy / recall are printed
    for both splits.

    Relies on the notebook globals ``client`` (BigQuery client),
    ``create_model_results`` (helper module), ``test_flag`` and
    ``add_home_advantage``.

    Args:
        fa_cup_features_all_df: feature DataFrame containing ``season_year``,
            ``match_id``, ``match_name``, ``match_final_score``,
            ``target_variable``, ``home_win``, both league-level columns and the
            ``home_team_<ranking>`` / ``away_team_<ranking>`` columns.
        model_name: prefix used to build the ``"<model_name> <ranking>"`` key.
        home_advt: ``'yes'`` to add the ``home_win_factor`` feature.
        weighted: accepted for interface compatibility; not read here.
        model_classifier: estimator instance; its parameters are overwritten in
            place with the values fetched from BigQuery.
        random_state: accepted for interface compatibility; not read here.

    Returns:
        Tuple ``(results_df, all_data_act_pred_df)`` for the LAST ranking system
        in the loop: the metric table and the test rows with actual / predicted
        targets and the analysis columns.

    Raises:
        ValueError: if BigQuery returns no parameter row for a model/ranking key.
    """
    # Define the ranking systems
    ranking_systems = ['no_ranking','round_3_position', 'massey', 'colley', 'keener', 'trueskill', 'borda_count', 'local_kemeny_optimisation']
    unseen_seasons = ['21/22', '22/23']

    # Columns kept out of modelling but re-attached to the test rows for analysis
    analysis_columns = ['season_year', 'match_id', 'match_name','match_final_score']
    target_variable = 'target_variable'
    level_columns = ['home_team_league_level', 'away_team_league_level']
    # Outcome / league-level columns are needed to derive the home advantage
    # feature but must not reach the model themselves
    columns_to_drop = ['home_win'] + level_columns

    # Split data into training and test sets
    is_unseen = fa_cup_features_all_df['season_year'].isin(unseen_seasons)
    fa_cup_features_train = fa_cup_features_all_df[~is_unseen].drop(analysis_columns, axis=1)
    fa_cup_features_test = fa_cup_features_all_df[is_unseen]

    # Positional (0..n-1) copy so it can be re-attached after merges reset the index
    test_analysis_columns = fa_cup_features_test[analysis_columns].reset_index(drop=True)
    fa_cup_features_test = fa_cup_features_test.drop(analysis_columns, axis=1)

    # Get all columns except the target and ranking columns
    base_features = [col for col in fa_cup_features_train.columns if col != target_variable and not any(f"{team}_{ranking}" in col for team in ['home_team', 'away_team'] for ranking in ranking_systems)]

    # Identify numeric and categorical columns once; each iteration works on a
    # copy so that per-ranking additions never accumulate across iterations
    base_numeric_features = fa_cup_features_train[base_features].select_dtypes(include=['int64', 'float64']).columns.tolist()
    base_categorical_features = fa_cup_features_train[base_features].select_dtypes(include=['object']).columns.tolist()

    # Define preprocessing steps
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    ranking_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])

    # Fetch parameter grid from BigQuery for model name ranking
    query = """
        SELECT param_grid FROM analysis_layer.view_ml_models_best_recall_param_grid
        WHERE model_name_ranking = @model_name_ranking
        """

    for ranking in ranking_systems:
        model_name_ranking = f"{model_name} {ranking}"
        print(model_name_ranking)

        # Set up the query configuration with the parameter
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("model_name_ranking", "STRING", model_name_ranking)
            ]
        )

        # Execute the query and fail with a clear message when nothing is stored
        rows = list(client.query(query, job_config=job_config).result())
        if not rows:
            raise ValueError(f"No parameter grid found in BigQuery for '{model_name_ranking}'")

        # NOTE(security): eval executes whatever text is stored in the view. It
        # is only acceptable while that table is fully trusted; consider
        # ast.literal_eval if the stored values are plain Python literals.
        param_grid_raw = eval(rows[0].param_grid)
        # Remove the pipeline step prefix so keys match the bare estimator
        param_grid = {k.replace('classifier__', ''): v for k, v in param_grid_raw.items()}

        # Update the model_classifier with the fetched parameters
        model_classifier.set_params(**param_grid)

        print("Updated model parameters:")

        for param, value in model_classifier.get_params().items():
            print(f"  {param}: {value}")

        if ranking == 'no_ranking':
            ranking_features = []
        else:
            ranking_features = [f'home_team_{ranking}', f'away_team_{ranking}']
        features = base_features + ranking_features

        # Create X_train and y_train
        X_train = fa_cup_features_train[features]
        y_train = fa_cup_features_train[target_variable]

        # Create X_test and y_test
        X_test = fa_cup_features_test[features]
        y_test = fa_cup_features_test[target_variable]

        # Fresh per-iteration feature lists, without the columns dropped below
        numeric_features = [feat for feat in base_numeric_features if feat not in columns_to_drop]
        categorical_features = [feat for feat in base_categorical_features if feat not in columns_to_drop]

        if home_advt == 'yes':
            X_train, home_advantage_column = add_home_advantage(X_train)
            # Apply the factors learned on the training seasons to the test
            # rows. Deriving them from the test rows would feed the held-out
            # match outcomes (home_win) back into the test features.
            home_win_lookup = X_train[level_columns + [home_advantage_column]].drop_duplicates()
            # Level pairings unseen in training get NaN, which the numeric
            # pipeline's median imputer fills
            X_test = X_test.merge(home_win_lookup, on=level_columns, how='left')
            numeric_features = [home_advantage_column] + numeric_features

        # Drop home win and league level features so they are not used in
        # modelling, as they would give away the winner
        X_train = X_train.drop(columns_to_drop, axis=1)
        X_test = X_test.drop(columns_to_drop, axis=1)
        print(f'numeric features {numeric_features}')

        # Create preprocessor
        transformers = [
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
        if ranking_features:
            transformers.append(('rank', ranking_transformer, ranking_features))
        preprocessor = ColumnTransformer(transformers=transformers)

        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model_classifier)
        ])

        # Fit the model
        pipeline.fit(X_train, y_train)

        # Calculate train metrics
        y_train_pred = pipeline.predict(X_train).astype(int)
        y_train_pred_proba = pipeline.predict_proba(X_train)[:, 1]  # positive-class probability for ROC

        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)

        # Calculate test metrics
        y_test_pred = pipeline.predict(X_test).astype(int)
        y_test_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # positive-class probability for ROC

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_recall = recall_score(y_test, y_test_pred)

        print(f"{model_name_ranking} Train Accuracy: {train_accuracy:.3f}, test Accuracy: {test_accuracy:.3f}")
        print(f"{model_name_ranking} Train Recall: {train_recall:.3f}, test Recall: {test_recall:.3f}")

        # The helper takes list arguments; with a single split each metric list
        # holds exactly one value
        results_df, _cm_fig, _roc_fig = create_model_results.create_model_results_df(
            list(y_train), list(y_train_pred), list(y_test), list(y_test_pred),
            [train_accuracy], [train_recall], [test_accuracy], [test_recall],
            list(y_train_pred_proba), list(y_test_pred_proba), model_name_ranking, test_flag)
        results_df = results_df.reset_index()
        results_df['metric_id'] = results_df.index + 1
        results_df = results_df[['metric_id', 'metric', model_name_ranking]]
        # Cross-validation metrics do not apply to a single split
        results_df = results_df[~results_df['metric'].str.startswith('Cross')]

        # Create a DataFrame with actual and predicted target, aligned by position
        comparison_df = pd.DataFrame({
            'Actual': y_test.reset_index(drop=True),
            'Predicted': y_test_pred
        })

        # Re-attach the analysis columns by position: when the home advantage
        # merge has run, X_test no longer carries the original row labels
        validation_df = X_test.reset_index(drop=True)
        for col in analysis_columns:
            validation_df[col] = test_analysis_columns[col]

        all_data_act_pred_df = comparison_df.merge(validation_df, left_index=True, right_index=True)

    # NOTE(review): both frames are overwritten on every iteration, so only the
    # last ranking system's results are returned; earlier rankings are only
    # printed. Confirm whether results should be accumulated across rankings.
    return results_df, all_data_act_pred_df



In [None]:
# Standard logistic regression: no class weights, no home advantage feature
model_name = "Standard LogisticRegression"
home_advt, weighted = 'no', 'no'
random_state = 47
model_classifier = LogisticRegression(max_iter=1000)

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_slr, fold_results_df_slr = all_results_df, fold_results_df
display(all_results_df_slr)
display(fold_results_df_slr)

# Switch to the output folder and save both result tables to Excel
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_slr.to_excel("all_slr_results_unseen.xlsx")
fold_results_df_slr.to_excel("fold_results_slr_unseen.xlsx")



In [None]:
# Standard logistic regression with the home advantage feature, no class weights
model_name = "Standard LogisticRegression with home advantage"
home_advt, weighted = 'yes', 'no'
random_state = 47
model_classifier = LogisticRegression(max_iter=1000)

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_slrh, fold_results_df_slrh = all_results_df, fold_results_df
display(all_results_df_slrh)
display(fold_results_df_slrh)

# Switch to the output folder and save both result tables to Excel
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_slrh.to_excel("all_slrh_results_unseen.xlsx")
fold_results_df_slrh.to_excel("fold_results_slrh_unseen.xlsx")

In [None]:
# Weighted Logistic Regression (the stray leading `0` expression was removed)
model_name = "Weighted LogisticRegression"
home_advt = 'no'
weighted = 'yes'
random_state = 47
# Initial class weights; the parameters fetched from BigQuery inside the
# modelling function are applied on top of the classifier's settings
weights = {0: 50, 1: 50}
model_classifier  = LogisticRegression(class_weight=weights, max_iter=1000)

all_results_df, fold_results_df  = classifer_models_optimisation_single_split (fa_cup_features_all_df,model_name, home_advt, weighted, model_classifier, random_state)

all_results_df_wlr = all_results_df
display(all_results_df_wlr)

fold_results_df_wlr = fold_results_df
display(fold_results_df_wlr)

# Change default output directory
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
# Save results to excel
all_results_df_wlr.to_excel("all_wlr_results_unseen.xlsx")
fold_results_df_wlr.to_excel("fold_results_wlr_unseen.xlsx")

In [None]:
# Weighted Logistic Regression with home advantage
model_name = "Weighted LogisticRegression with home advantage"
home_advt = 'yes'
weighted = 'yes'
random_state = 47
# Defined here as well so this cell runs on its own, without relying on the
# previous weighted cell having been executed first
weights = {0: 50, 1: 50}
model_classifier  = LogisticRegression(class_weight=weights, max_iter=1000)

all_results_df, fold_results_df  = classifer_models_optimisation_single_split (fa_cup_features_all_df,model_name, home_advt, weighted, model_classifier, random_state)

all_results_df_wlrh = all_results_df
display(all_results_df_wlrh)

fold_results_df_wlrh = fold_results_df
display(fold_results_df_wlrh)

# Change default output directory
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
# Save results to excel
# NOTE(review): unlike the earlier logistic regression cells, these file names
# carry no `_unseen` suffix — confirm they cannot overwrite files written elsewhere
all_results_df_wlrh.to_excel("all_wlrh_results.xlsx")
fold_results_df_wlrh.to_excel("fold_results_wlrh.xlsx")

In [None]:
# MLP neural network: no class weights, no home advantage feature
model_name = "MLP Classifier Neural Network"
home_advt, weighted = 'no', 'no'
random_state = 47
model_classifier = MLPClassifier(max_iter=10000)

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_nn, fold_results_df_nn = all_results_df, fold_results_df
display(all_results_df_nn)
display(fold_results_df_nn)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_nn.to_excel("all_nn_results.xlsx")
fold_results_df_nn.to_excel("fold_results_nn.xlsx")

In [None]:
# MLP neural network with the home advantage feature, no class weights
model_name = "MLP Classifier Neural Network with home advantage"
home_advt, weighted = 'yes', 'no'
random_state = 47
model_classifier = MLPClassifier(max_iter=10000)

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_nnh, fold_results_df_nnh = all_results_df, fold_results_df
display(all_results_df_nnh)
display(fold_results_df_nnh)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_nnh.to_excel("all_nnh_results.xlsx")
fold_results_df_nnh.to_excel("fold_results_nnh.xlsx")

In [None]:
# Random forest: no class weights, no home advantage feature
model_name = "Random Forest Classifier"
home_advt, weighted = 'no', 'no'
random_state = 47
model_classifier = RandomForestClassifier()

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_rf, fold_results_df_rf = all_results_df, fold_results_df
display(all_results_df_rf)
display(fold_results_df_rf)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_rf.to_excel("all_rf_results.xlsx")
fold_results_df_rf.to_excel("fold_results_rf.xlsx")


In [None]:
# Random forest with the home advantage feature, no class weights
model_name = "Random Forest Classifier with home advantage"
home_advt, weighted = 'yes', 'no'
random_state = 47
model_classifier = RandomForestClassifier()

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_rfh, fold_results_df_rfh = all_results_df, fold_results_df
display(all_results_df_rfh)
display(fold_results_df_rfh)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_rfh.to_excel("all_rfh_results.xlsx")
fold_results_df_rfh.to_excel("fold_results_rfh.xlsx")

In [None]:
# XGBoost: no class weights, no home advantage feature
model_name = "XG Boost"
home_advt, weighted = 'no', 'no'
random_state = 47
model_classifier = XGBClassifier()

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_xg, fold_results_df_xg = all_results_df, fold_results_df
display(all_results_df_xg)
display(fold_results_df_xg)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_xg.to_excel("all_xg_results.xlsx")
fold_results_df_xg.to_excel("fold_results_xg.xlsx")

In [None]:
# XGBoost with the home advantage feature, no class weights
model_name = "XG Boost with home advantage"
home_advt, weighted = 'yes', 'no'
random_state = 47
model_classifier = XGBClassifier()

all_results_df, fold_results_df = classifer_models_optimisation_single_split(
    fa_cup_features_all_df, model_name, home_advt, weighted, model_classifier, random_state
)

# Keep model-specific handles so the later cells can combine every model
all_results_df_xgh, fold_results_df_xgh = all_results_df, fold_results_df
display(all_results_df_xgh)
display(fold_results_df_xgh)

# Switch to the output folder and save both result tables to Excel
# NOTE(review): the logistic regression cells use an `_unseen` suffix in their
# file names; these do not — confirm that is intended
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')
all_results_df_xgh.to_excel("all_xgh_results.xlsx")
fold_results_df_xgh.to_excel("fold_results_xgh.xlsx")

In [None]:
# Combine the metric tables of every model into one wide table, one column per
# model. Frames are merged from the last model backwards so the final column
# order is slr, slrh, wlr, wlrh, nn, nnh, rf, rfh, xg, xgh.
model_result_frames = [
    all_results_df_slr, all_results_df_slrh,
    all_results_df_wlr, all_results_df_wlrh,
    all_results_df_nn, all_results_df_nnh,
    all_results_df_rf, all_results_df_rfh,
    all_results_df_xg, all_results_df_xgh,
]
all_results_df = model_result_frames[-1]
for model_frame in reversed(model_result_frames[:-1]):
    all_results_df = model_frame.merge(all_results_df, on=['metric_id', 'metric'], how='outer')

# Normalise column names: lower case, spaces become underscores, parentheses removed
all_results_df = all_results_df.rename(columns=lambda x: x.lower().replace(' ','(_)').replace('(', '').replace(')', ''))
display(all_results_df)

results_excel_file = "ml_model_confusion_matrix_results.xlsx"
all_results_df.to_excel(results_excel_file)

# Load data from Excel to Google BigQuery.
# Read back the file written just above; the previous code read
# "all_results.xlsx", which this cell never writes.
all_results_from_excel = pd.read_excel(results_excel_file)
load_dataset_name = 'analysis_layer'
load_table_name = 'ml_model_confusion_matrix_results'
full_table_name = f"{load_dataset_name}.{load_table_name}"

# if_exists='replace' overwrites any existing table of that name
pandas_gbq.to_gbq(all_results_from_excel, full_table_name,
                  project_id='birkbeck-msc-project-422917',
                  if_exists='replace')

print(f"\nData loaded to BigQuery table: {full_table_name}")


In [None]:
# Load updated Fold results to Google Big Query

# Stack the per-match actual / predicted frames from each model
fold_results_df = pd.concat([fold_results_df_slr, fold_results_df_slrh, fold_results_df_wlr, fold_results_df_wlrh, fold_results_df_nn, fold_results_df_nnh, fold_results_df_rf, fold_results_df_rfh, fold_results_df_xg, fold_results_df_xgh])
# Normalise column names: lower case, spaces become underscores, parentheses removed
fold_results_df = fold_results_df.rename(columns=lambda x: x.lower().replace(' ','(_)').replace('(', '').replace(')', ''))
display(fold_results_df)

# Write to Excel
fold_results_excel_file = "ml_model_fold_results.xlsx"
fold_results_df.to_excel(fold_results_excel_file)

# Load fold results data from Excel to Google BigQuery.
# Read back the file written just above; the previous code read
# "all_results_.xlsx", which this cell never writes.
fold_results_from_excel = pd.read_excel(fold_results_excel_file)
load_dataset_name = 'analysis_layer'
load_table_name = 'ml_model_fold_results'
full_table_name = f"{load_dataset_name}.{load_table_name}"

# if_exists='replace' overwrites any existing table of that name
pandas_gbq.to_gbq(fold_results_from_excel, full_table_name,
                  project_id='birkbeck-msc-project-422917',
                  if_exists='replace')

print(f"\nData loaded to BigQuery table: {full_table_name}")
