# Compare models trained on the primary cohort (Sheffield), the replication cohort (KCL), and both combined.
Models are trained on one dataset and tested either on that same dataset or on the other dataset.

In [None]:
%pylab
%matplotlib inline
%reload_ext autoreload
import pandas as pd
import sys
import seaborn as sns
sns.set_style('ticks')
sns.set_context('poster')
sys.path.append('../src')
import abrTools as at
import os
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report
from collections import Counter
import pretty_confusion_matrix as pcm
from scipy.signal import savgol_filter


fs = 195000.0/2.0 # Acquisition sampling rate

from datetime import date
# Results are grouped in one folder per calendar day on which the notebook is run.
savefolder = os.path.join('..','results',str(date.today()))

# Create savefolder. Declare the folder explicitly if continuing a previous run.
# exist_ok=True makes this a no-op when the folder is already present and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(savefolder, exist_ok=True)

We set the sound level interval between 15 and 85 dB to include the largest possible amount of replication cohort data while avoiding imputation. We consider only Click stimuli.

In [None]:
# Bounds of the sound-level window considered in the analysis:
# lowestInt is the lowest level kept, highestInt the highest.
lowestInt, highestInt = 15, 85


In [None]:
from abrTools import loadKingsData, loadSheffieldData,interFunc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV2
from sklearn.feature_selection import f_classif,mutual_info_classif, SelectFpr, SelectPercentile
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from joblib import dump, load

# Retrain Sheffield models (on a dataset comparable to KCL)

In [None]:
# Define the parameters for the experiment
frequencies = [[100]]
suffices = ['Click']
results = []
njobs = -1
anovaPercentile = 10


def _result_stem(model_prefix, suffix, underscore=True):
    """Return the extension-less output path shared by one model's result files.

    model_prefix: model and cohort tag, e.g. 'SVCSheffield'.
    suffix: stimulus suffix appended at the end, e.g. 'Click'.
    underscore: whether an underscore separates 'percent' from the suffix.
    """
    separator = '_' if underscore else ''
    return os.path.join(savefolder, f'{model_prefix}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent{separator}' + suffix)


def _cv_fit_and_save(pipeline, file_stem, model_name, X_fit, y_fit, X_eval, y_eval,
                     data_version, y_eval_labels=None, tolerate_value_error=False,
                     **cv_kwargs):
    """Evaluate, refit and persist one pipeline unless its results already exist.

    Three files are written: ``file_stem + '.csv'`` (the output of
    at.fitClassificationModel), ``file_stem + '.joblib'`` (the pipeline refitted
    on X_fit / y_fit) and ``file_stem + 'testResults.csv'`` (true labels and
    predictions for X_eval). The result is also appended to the module-level
    ``results`` list. Nothing happens when the '.csv' file is already present.

    pipeline: scikit-learn pipeline to evaluate.
    file_stem: output path without extension.
    model_name: forwarded to at.fitClassificationModel as modelName.
    X_fit, y_fit: training split.
    X_eval, y_eval: held-out split.
    data_version: forwarded to at.fitClassificationModel as dataVersion.
    y_eval_labels: original string labels of the held-out split. When given,
        predictions are treated as 0/1 codes and mapped back to
        '6N' / 'Repaired' before being saved next to these labels.
    tolerate_value_error: when True, a ValueError raised by
        at.fitClassificationModel is replaced with zeroed placeholder scores.
    cv_kwargs: extra keyword arguments for at.fitClassificationModel.

    Returns the result of at.fitClassificationModel (or the placeholder), or
    None when the run was skipped.
    """
    if os.path.exists(file_stem + '.csv'):
        return None

    try:
        res = at.fitClassificationModel(pipeline, X_fit, y_fit, X_test=X_eval, y_test=y_eval, saveToWandb=False,
                                        modelName=model_name, dataVersion=data_version, crossValidation=True,
                                        makePlot=False, njobs=njobs, **cv_kwargs)
    except ValueError:
        if not tolerate_value_error:
            raise
        # np.array rather than the bare `array` that only exists after %pylab
        res = {'accuracy': np.array([0, 0, 0]), 'roc_auc_score': np.array([0, 0, 0])}
    pd.DataFrame(res).to_csv(file_stem + '.csv')
    results.append(res)

    # Refit on the whole training split and store the fitted pipeline
    pipeline.fit(X_fit, y_fit)
    dump(pipeline, file_stem + '.joblib')

    y_predict = pipeline.predict(X_eval)
    if y_eval_labels is not None:
        # Map the integer predictions back to the original class names
        predicted_codes = y_predict
        y_predict = predicted_codes.copy().astype(str)
        y_predict[predicted_codes == 0] = '6N'
        y_predict[predicted_codes == 1] = 'Repaired'
        y_eval = y_eval_labels
    pd.DataFrame({'y_test': y_eval, 'y_predict': y_predict}).to_csv(file_stem + 'testResults.csv')
    return res


# Loop through each frequency
for i, freq in enumerate(frequencies):
    print(freq)

    # Load Sheffield data
    X_train, X_test, y_train, y_test, X_full, y_full, dataVersion = loadSheffieldData()

    # Random Forest Classifier with ANOVA feature selection
    print('Forest - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    forest_cl = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1, max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=5, bootstrap=True)
    forest_pip = make_pipeline(anova_fs, forest_cl)
    # The forest files have no underscore before the suffix; the evaluation
    # cell that loads the models tries both spellings, so the name is kept.
    _cv_fit_and_save(forest_pip, _result_stem('forestSheffield', suffices[i], underscore=False), 'Forest classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    # Support Vector Classifier with ANOVA feature selection
    print('svc - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    svc_cl = SVC(probability=True, C=0.01, class_weight='balanced', degree=2, gamma=0.01, kernel='poly', shrinking=True)
    svc_pip = make_pipeline(anova_fs, svc_cl)
    _cv_fit_and_save(svc_pip, _result_stem('SVCSheffield', suffices[i]), 'SVC classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    # XGBoost Classifier with ANOVA feature selection
    print('xgboost - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
    # NOTE(review): sample_weight is a fit-time argument in xgboost's scikit-learn
    # API; passed to the constructor it probably does not weight the samples.
    # Left unchanged so results stay comparable with earlier runs - confirm.
    reg = xgb.XGBClassifier(n_estimators=200, verbosity=0, n_jobs=-1, random_state=42, max_depth=3, sample_weight=sample_weights, colsample_bytree=0.8, learning_rate=0.1, subsample=0.6)
    xg_pip = make_pipeline(anova_fs, reg)

    # Encode labels for XGBoost
    y_train2 = y_train.copy()
    y_train2[y_train == '6N'] = 0
    y_train2[y_train == 'Repaired'] = 1
    y_test2 = y_test.copy()
    y_test2[y_test == '6N'] = 0
    y_test2[y_test == 'Repaired'] = 1
    y_train2 = y_train2.astype(int)
    y_test2 = y_test2.astype(int)

    # calculatePValue is deliberately left at its default here, as before
    _cv_fit_and_save(xg_pip, _result_stem('XGBOOSTSheffield', suffices[i]), 'XGBOOST',
                     X_train, y_train2, X_test, y_test2, dataVersion, y_eval_labels=y_test, encode_labels=True)

    # Rocket Classifier with ANOVA feature selection
    print('rocket - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    rocket = RocketClassifier(num_kernels=5000, n_jobs=-1, max_dilations_per_kernel=16, n_features_per_kernel=2, use_multivariate='yes', random_state=42)
    rocket_pip = make_pipeline(anova_fs, rocket)
    _cv_fit_and_save(rocket_pip, _result_stem('RocketSheffield', suffices[i]), 'Rocket classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)

    # MLP Classifier with ANOVA feature selection
    print('MLP - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    mlp = MLPClassifier(solver='lbfgs', random_state=42, early_stopping=True, activation='tanh', alpha=0.05, hidden_layer_sizes=(150,), learning_rate_init=0.001, max_iter=100)
    mlp_pip = make_pipeline(anova_fs, mlp)
    _cv_fit_and_save(mlp_pip, _result_stem('MLPSheffield', suffices[i]), 'MLP classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    # HIVE-COTE Classifier with ANOVA feature selection
    print('hive cote - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    hc2 = HIVECOTEV2(n_jobs=-1, time_limit_in_minutes=0.2)
    hc2_pip = make_pipeline(anova_fs, hc2)
    _cv_fit_and_save(hc2_pip, _result_stem('hivecoteSheffield', suffices[i]), 'HIVE-COTE classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)


# Train on KCL data

Note that the KCL data are shifted by 54 points (to the left) to account for the longer distance between mouse and speaker used in the setup (10 cm vs 20 cm) and to align Wave 1.

In [None]:
# Load the KCL (Kings) cohort with the 54-point shift and without scaling,
# then unpack the train/test split and the full dataset.
_kings_splits = loadKingsData(shift=54, scaling=False)
X_train, X_test, y_train, y_test, X_kings, y_kings = _kings_splits


In [None]:
# Define the parameters for the experiment
frequencies = [[100]]
suffices = ['Click']
results = []
njobs = -1
anovaPercentile = 10
dataVersion = 'None'


def _result_stem(model_prefix, suffix, underscore=True):
    """Return the extension-less output path shared by one model's result files.

    model_prefix: model and cohort tag, e.g. 'SVCKings'.
    suffix: stimulus suffix appended at the end, e.g. 'Click'.
    underscore: whether an underscore separates 'percent' from the suffix.
    """
    separator = '_' if underscore else ''
    return os.path.join(savefolder, f'{model_prefix}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent{separator}' + suffix)


def _cv_fit_and_save(pipeline, file_stem, model_name, X_fit, y_fit, X_eval, y_eval,
                     data_version, y_eval_labels=None, tolerate_value_error=False,
                     **cv_kwargs):
    """Evaluate, refit and persist one pipeline unless its results already exist.

    Three files are written: ``file_stem + '.csv'`` (the output of
    at.fitClassificationModel), ``file_stem + '.joblib'`` (the pipeline refitted
    on X_fit / y_fit) and ``file_stem + 'testResults.csv'`` (true labels and
    predictions for X_eval). The result is also appended to the module-level
    ``results`` list. Nothing happens when the '.csv' file is already present.

    pipeline: scikit-learn pipeline to evaluate.
    file_stem: output path without extension.
    model_name: forwarded to at.fitClassificationModel as modelName.
    X_fit, y_fit: training split.
    X_eval, y_eval: held-out split.
    data_version: forwarded to at.fitClassificationModel as dataVersion.
    y_eval_labels: original string labels of the held-out split. When given,
        predictions are treated as 0/1 codes and mapped back to
        '6N' / 'Repaired' before being saved next to these labels.
    tolerate_value_error: when True, a ValueError raised by
        at.fitClassificationModel is replaced with zeroed placeholder scores.
    cv_kwargs: extra keyword arguments for at.fitClassificationModel.

    Returns the result of at.fitClassificationModel (or the placeholder), or
    None when the run was skipped.
    """
    if os.path.exists(file_stem + '.csv'):
        return None

    try:
        res = at.fitClassificationModel(pipeline, X_fit, y_fit, X_test=X_eval, y_test=y_eval, saveToWandb=False,
                                        modelName=model_name, dataVersion=data_version, crossValidation=True,
                                        makePlot=False, njobs=njobs, **cv_kwargs)
    except ValueError:
        if not tolerate_value_error:
            raise
        # np.array rather than the bare `array` that only exists after %pylab
        res = {'accuracy': np.array([0, 0, 0]), 'roc_auc_score': np.array([0, 0, 0])}
    pd.DataFrame(res).to_csv(file_stem + '.csv')
    results.append(res)

    # Refit on the whole training split and store the fitted pipeline
    pipeline.fit(X_fit, y_fit)
    dump(pipeline, file_stem + '.joblib')

    y_predict = pipeline.predict(X_eval)
    if y_eval_labels is not None:
        # Map the integer predictions back to the original class names
        predicted_codes = y_predict
        y_predict = predicted_codes.copy().astype(str)
        y_predict[predicted_codes == 0] = '6N'
        y_predict[predicted_codes == 1] = 'Repaired'
        y_eval = y_eval_labels
    pd.DataFrame({'y_test': y_eval, 'y_predict': y_predict}).to_csv(file_stem + 'testResults.csv')
    return res


# Loop through each frequency
for i, freq in enumerate(frequencies):
    print(freq)
    # Load data
    X_train, X_test, y_train, y_test, X_kings, y_kings = loadKingsData(shift=54, scaling=False)

    ### ANOVA FEATURE EXTRACTION
    print('Forest - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    forest_cl = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1, max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=5, bootstrap=True)
    forest_pip = make_pipeline(anova_fs, forest_cl)
    # The forest files have no underscore before the suffix; the evaluation
    # cell that loads the models tries both spellings, so the name is kept.
    _cv_fit_and_save(forest_pip, _result_stem('forestKings', suffices[i], underscore=False), 'Forest classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('svc - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    svc_cl = SVC(probability=True, C=0.01, class_weight='balanced', degree=2, gamma=0.01, kernel='poly', shrinking=True)
    svc_pip = make_pipeline(anova_fs, svc_cl)
    _cv_fit_and_save(svc_pip, _result_stem('SVCKings', suffices[i]), 'SVC classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('xgboost - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
    # NOTE(review): sample_weight is a fit-time argument in xgboost's scikit-learn
    # API; passed to the constructor it probably does not weight the samples.
    # Left unchanged so results stay comparable with earlier runs - confirm.
    reg = xgb.XGBClassifier(n_estimators=200, verbosity=0, n_jobs=-1, random_state=42, max_depth=3, sample_weight=sample_weights, colsample_bytree=0.8, learning_rate=0.1, subsample=0.6)
    xg_pip = make_pipeline(anova_fs, reg)

    # Encode labels for XGBoost
    y_train2 = y_train.copy()
    y_train2[y_train == '6N'] = 0
    y_train2[y_train == 'Repaired'] = 1
    y_test2 = y_test.copy()
    y_test2[y_test == '6N'] = 0
    y_test2[y_test == 'Repaired'] = 1
    y_train2 = y_train2.astype(int)
    y_test2 = y_test2.astype(int)

    # calculatePValue is deliberately left at its default here, as before
    _cv_fit_and_save(xg_pip, _result_stem('XGBOOSTKings', suffices[i]), 'XGBOOST',
                     X_train, y_train2, X_test, y_test2, dataVersion, y_eval_labels=y_test, encode_labels=True)

    print('rocket - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    rocket = RocketClassifier(num_kernels=5000, n_jobs=-1, max_dilations_per_kernel=16, n_features_per_kernel=2, use_multivariate='yes', random_state=42)
    rocket_pip = make_pipeline(anova_fs, rocket)
    _cv_fit_and_save(rocket_pip, _result_stem('RocketKings', suffices[i]), 'Rocket classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)

    print('MLP - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    mlp = MLPClassifier(solver='lbfgs', random_state=42, early_stopping=True, activation='tanh', alpha=0.05, hidden_layer_sizes=(150,), learning_rate_init=0.001, max_iter=100)
    mlp_pip = make_pipeline(anova_fs, mlp)
    _cv_fit_and_save(mlp_pip, _result_stem('MLPKings', suffices[i]), 'MLP classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('hive cote - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    hc2 = HIVECOTEV2(n_jobs=-1, time_limit_in_minutes=0.2)
    hc2_pip = make_pipeline(anova_fs, hc2)
    _cv_fit_and_save(hc2_pip, _result_stem('hivecoteKings', suffices[i]), 'HIVE-COTE classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)


# Train on combined. Same as before, but the two datasets are combined

In [None]:
# Define the parameters for the experiment
frequencies = [[100]]
suffices = ['Click']
results = []
njobs = -1
anovaPercentile = 10


def _result_stem(model_prefix, suffix, underscore=True):
    """Return the extension-less output path shared by one model's result files.

    model_prefix: model and cohort tag, e.g. 'SVCCombined'.
    suffix: stimulus suffix appended at the end, e.g. 'Click'.
    underscore: whether an underscore separates 'percent' from the suffix.
    """
    separator = '_' if underscore else ''
    return os.path.join(savefolder, f'{model_prefix}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent{separator}' + suffix)


def _cv_fit_and_save(pipeline, file_stem, model_name, X_fit, y_fit, X_eval, y_eval,
                     data_version, y_eval_labels=None, tolerate_value_error=False,
                     **cv_kwargs):
    """Evaluate, refit and persist one pipeline unless its results already exist.

    Three files are written: ``file_stem + '.csv'`` (the output of
    at.fitClassificationModel), ``file_stem + '.joblib'`` (the pipeline refitted
    on X_fit / y_fit) and ``file_stem + 'testResults.csv'`` (true labels and
    predictions for X_eval). The result is also appended to the module-level
    ``results`` list. Nothing happens when the '.csv' file is already present.

    pipeline: scikit-learn pipeline to evaluate.
    file_stem: output path without extension.
    model_name: forwarded to at.fitClassificationModel as modelName.
    X_fit, y_fit: training split.
    X_eval, y_eval: held-out split.
    data_version: forwarded to at.fitClassificationModel as dataVersion.
    y_eval_labels: original string labels of the held-out split. When given,
        predictions are treated as 0/1 codes and mapped back to
        '6N' / 'Repaired' before being saved next to these labels.
    tolerate_value_error: when True, a ValueError raised by
        at.fitClassificationModel is replaced with zeroed placeholder scores.
    cv_kwargs: extra keyword arguments for at.fitClassificationModel.

    Returns the result of at.fitClassificationModel (or the placeholder), or
    None when the run was skipped.
    """
    if os.path.exists(file_stem + '.csv'):
        return None

    try:
        res = at.fitClassificationModel(pipeline, X_fit, y_fit, X_test=X_eval, y_test=y_eval, saveToWandb=False,
                                        modelName=model_name, dataVersion=data_version, crossValidation=True,
                                        makePlot=False, njobs=njobs, **cv_kwargs)
    except ValueError:
        if not tolerate_value_error:
            raise
        # np.array rather than the bare `array` that only exists after %pylab
        res = {'accuracy': np.array([0, 0, 0]), 'roc_auc_score': np.array([0, 0, 0])}
    pd.DataFrame(res).to_csv(file_stem + '.csv')
    results.append(res)

    # Refit on the whole training split and store the fitted pipeline
    pipeline.fit(X_fit, y_fit)
    dump(pipeline, file_stem + '.joblib')

    y_predict = pipeline.predict(X_eval)
    if y_eval_labels is not None:
        # Map the integer predictions back to the original class names
        predicted_codes = y_predict
        y_predict = predicted_codes.copy().astype(str)
        y_predict[predicted_codes == 0] = '6N'
        y_predict[predicted_codes == 1] = 'Repaired'
        y_eval = y_eval_labels
    pd.DataFrame({'y_test': y_eval, 'y_predict': y_predict}).to_csv(file_stem + 'testResults.csv')
    return res


for i, freq in enumerate(frequencies):
    # Load Kings data
    X_kings_train, X_kings_test, y_kings_train, y_kings_test, X_kings, y_kings = loadKingsData(shift=54, scaling=False)

    # Load Sheffield data
    X_train_Sheffield, X_test_Sheffled, y_train_Sheffield, y_test_Sheffield, X_full, y_full, dataVersion = loadSheffieldData()

    # Combine data: Sheffield rows first, Kings rows second, split by split
    X_train = np.vstack((X_train_Sheffield, X_kings_train))
    X_test = np.vstack((X_test_Sheffled, X_kings_test))
    y_train = np.hstack((y_train_Sheffield, y_kings_train))
    y_test = np.hstack((y_test_Sheffield, y_kings_test))

    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    ### ANOVA FEATURE EXTRACTION
    print('Forest - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    forest_cl = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1, max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=5, bootstrap=True)
    forest_pip = make_pipeline(anova_fs, forest_cl)
    # The forest files have no underscore before the suffix; the evaluation
    # cell that loads the models tries both spellings, so the name is kept.
    _cv_fit_and_save(forest_pip, _result_stem('forestCombined', suffices[i], underscore=False), 'Forest classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('svc - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    svc_cl = SVC(probability=True, C=0.01, class_weight='balanced', degree=2, gamma=0.01, kernel='poly', shrinking=True)
    svc_pip = make_pipeline(anova_fs, svc_cl)
    _cv_fit_and_save(svc_pip, _result_stem('SVCCombined', suffices[i]), 'SVC classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('xgboost - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
    # NOTE(review): sample_weight is a fit-time argument in xgboost's scikit-learn
    # API; passed to the constructor it probably does not weight the samples.
    # Left unchanged so results stay comparable with earlier runs - confirm.
    reg = xgb.XGBClassifier(n_estimators=200, verbosity=0, n_jobs=-1, random_state=42, max_depth=3, sample_weight=sample_weights, colsample_bytree=0.8, learning_rate=0.1, subsample=0.6)
    xg_pip = make_pipeline(anova_fs, reg)

    # Encode labels for XGBoost
    y_train2 = y_train.copy()
    y_train2[y_train == '6N'] = 0
    y_train2[y_train == 'Repaired'] = 1
    y_test2 = y_test.copy()
    y_test2[y_test == '6N'] = 0
    y_test2[y_test == 'Repaired'] = 1
    y_train2 = y_train2.astype(int)
    y_test2 = y_test2.astype(int)

    # calculatePValue is deliberately left at its default here, as before
    _cv_fit_and_save(xg_pip, _result_stem('XGBOOSTCombined', suffices[i]), 'XGBOOST',
                     X_train, y_train2, X_test, y_test2, dataVersion, y_eval_labels=y_test, encode_labels=True)

    print('rocket - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    rocket = RocketClassifier(num_kernels=5000, n_jobs=-1, max_dilations_per_kernel=16, n_features_per_kernel=2, use_multivariate='yes', random_state=42)
    rocket_pip = make_pipeline(anova_fs, rocket)
    _cv_fit_and_save(rocket_pip, _result_stem('RocketCombined', suffices[i]), 'Rocket classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)

    print('MLP - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    mlp = MLPClassifier(solver='lbfgs', random_state=42, early_stopping=True, activation='tanh', alpha=0.05, hidden_layer_sizes=(150,), learning_rate_init=0.001, max_iter=100)
    mlp_pip = make_pipeline(anova_fs, mlp)
    _cv_fit_and_save(mlp_pip, _result_stem('MLPCombined', suffices[i]), 'MLP classifier',
                     X_train, y_train, X_test, y_test, dataVersion, calculatePValue=False)

    print('hive cote - anova')
    anova_fs = SelectPercentile(f_classif, percentile=anovaPercentile)
    hc2 = HIVECOTEV2(n_jobs=-1, time_limit_in_minutes=0.2)
    hc2_pip = make_pipeline(anova_fs, hc2)
    _cv_fit_and_save(hc2_pip, _result_stem('hivecoteCombined', suffices[i]), 'HIVE-COTE classifier',
                     X_train, y_train, X_test, y_test, dataVersion, tolerate_value_error=True, calculatePValue=False)


# Test models on different datasets than trained

Load the previously trained models and test them on a dataset other than the one they were trained on.

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from joblib import load

In [None]:
# Reload both cohorts: KCL (Kings) with the 54-point shift and no scaling,
# Sheffield with the loader's defaults.
(X_kings_train, X_kings_test,
 y_kings_train, y_kings_test,
 X_kings, y_kings) = loadKingsData(shift=54, scaling=False)

(X_train_Sheffield, X_test_Sheffled,
 y_train_Sheffield, y_test_Sheffield,
 X_full, y_full, dataVersion) = loadSheffieldData()

# Combined train/test splits: Sheffield entries first, Kings entries second
X_train = np.vstack([X_train_Sheffield, X_kings_train])
y_train = np.hstack([y_train_Sheffield, y_kings_train])
X_test = np.vstack([X_test_Sheffled, X_kings_test])
y_test = np.hstack([y_test_Sheffield, y_kings_test])

# Whole combined dataset: training part followed by test part
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([y_train, y_test])

## Train: Sheffield

In [None]:
# Uncomment to read the models from (and write the results to) an earlier run.
#savefolder = '..\\results\\2024-03-18-sheffieldvKings'

# Settings shared by this cell and the two cross-testing cells that follow.
anovaPercentile = 10      # percentile used by the ANOVA feature selection
le = LabelEncoder()       # label encoder, used for the XGBoost models
i = 0                     # index into `suffices` for the output file names

# Which training cohort the loaded models come from ('Sheffield', 'Kings', 'Combined').
dataType = 'Sheffield'  # 'Kings', 'Combined

for modelType in ['forest', 'SVC', 'XGBOOST', 'MLP', 'rocket', 'hivecote']:
    # Common part of every file name that belongs to this model.
    stem = f'{modelType}{dataType}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent'

    # Saved models exist under two naming schemes: without and with an
    # underscore in front of the 'Click' suffix. Try them in that order.
    try:
        model = load(os.path.join(savefolder, stem + 'Click' + '.joblib'))
    except FileNotFoundError:
        model = load(os.path.join(savefolder, stem + '_' + 'Click' + '.joblib'))

    isXgboost = modelType == 'XGBOOST'
    if isXgboost:
        y_kings2 = le.fit_transform(y_kings)

    # A Sheffield-trained model is tested on all KCL data and on the pooled test set.
    for cohort, X_eval, y_eval in (('Kings', X_kings, y_kings), ('Combined', X_test, y_test)):
        predicted = model.predict(X_eval)
        if isXgboost:
            # The XGBoost model predicts integer codes: map 0 -> '6N', 1 -> 'Repaired'.
            codes = predicted
            predicted = codes.copy().astype(str)
            predicted[codes == 0] = '6N'
            predicted[codes == 1] = 'Repaired'
        table = pd.DataFrame({'y_test': y_eval, 'y_predict': predicted})
        table.to_csv(os.path.join(savefolder, stem + suffices[i] + f'testResults_{cohort}.csv'))


## Train: KCL

In [None]:
# Models trained on the KCL cohort ('Sheffield', 'Kings', 'Combined' are the valid values).
dataType = 'Kings'  # 'Sheffield', 'Kings', 'Combined'

for modelType in ['forest', 'SVC', 'XGBOOST', 'MLP', 'rocket', 'hivecote']:
    # Common part of every file name that belongs to this model.
    stem = f'{modelType}{dataType}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent'

    # Saved models exist under two naming schemes: without and with an
    # underscore in front of the 'Click' suffix. Try them in that order.
    try:
        model = load(os.path.join(savefolder, stem + 'Click' + '.joblib'))
    except FileNotFoundError:
        model = load(os.path.join(savefolder, stem + '_' + 'Click' + '.joblib'))

    # A KCL-trained model is tested on all Sheffield data and on the pooled test set.
    for cohort, X_eval, y_eval in (('Sheffield', X_full, y_full), ('Combined', X_test, y_test)):
        predicted = model.predict(X_eval)
        if modelType == 'XGBOOST':
            # The XGBoost model predicts integer codes: map 0 -> '6N', 1 -> 'Repaired'.
            codes = predicted
            predicted = codes.copy().astype(str)
            predicted[codes == 0] = '6N'
            predicted[codes == 1] = 'Repaired'
        table = pd.DataFrame({'y_test': y_eval, 'y_predict': predicted})
        table.to_csv(os.path.join(savefolder, stem + suffices[i] + f'testResults_{cohort}.csv'))


## Train: Combined

Note that testing on the two separate test sets should be equivalent to testing on the "combined" test set, because the latter is made up of the same mice (in other words, for each element of the confusion matrix, Sheffield + KCL = combined).

In [None]:
# Models trained on the pooled data are tested separately on each cohort's test split.
dataType = 'Combined'  # 'Sheffield', 'Kings', 'Combined'

for modelType in ['forest', 'SVC', 'XGBOOST', 'MLP', 'rocket', 'hivecote']:
    # Common part of every file name that belongs to this model.
    stem = f'{modelType}{dataType}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent'

    # Saved models exist under two naming schemes: without and with an
    # underscore in front of the 'Click' suffix. Try them in that order.
    try:
        model = load(os.path.join(savefolder, stem + 'Click' + '.joblib'))
    except FileNotFoundError:
        model = load(os.path.join(savefolder, stem + '_' + 'Click' + '.joblib'))

    if modelType == 'XGBOOST':
        # Balanced accuracy on the pooled test set. The signature is
        # balanced_accuracy_score(y_true, y_pred) and the metric is not
        # symmetric, so the encoded ground truth must be the first argument.
        print(balanced_accuracy_score(le.fit_transform(y_test), model.predict(X_test)))

    for cohort, X_eval, y_eval in (
        ('Sheffield', X_test_Sheffled, y_test_Sheffield),
        ('Kings', X_kings_test, y_kings_test),
    ):
        predicted = model.predict(X_eval)
        if modelType == 'XGBOOST':
            # The XGBoost model predicts integer codes: map 0 -> '6N', 1 -> 'Repaired'.
            codes = predicted
            predicted = codes.copy().astype(str)
            predicted[codes == 0] = '6N'
            predicted[codes == 1] = 'Repaired'
        table = pd.DataFrame({'y_test': y_eval, 'y_predict': predicted})
        table.to_csv(os.path.join(savefolder, stem + suffices[i] + f'testResults_{cohort}.csv'))


# Calculate Shapley values for the KCL, Sheffield and combined models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV2
from sklearn.feature_selection import f_classif, mutual_info_classif, SelectFpr, SelectPercentile
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load
import shap
#from fastshap import KernelExplainer

# Folder the fitted pipelines are loaded from; the SHAP CSVs are written there too.
savefolder = '../results/2024-10-27-sheffieldvKings-unscaled'

# One entry per experiment: suffices[k] is the file-name label used for frequencies[k].
suffices = ['Click']  # 'NoHighFreq'
frequencies = [[100]]  # [100,3000,6000,12000,18000,24000,30000]

# Bookkeeping variables kept for parity with the training section.
njobs = -1
results = []

# Kings dataset processing
for i, freq in enumerate(frequencies):
    print(freq)
    # Load Kings data
    X_kings_train, X_kings_test, y_kings_train, y_kings_test, X_kings, y_kings = loadKingsData(shift=54, scaling=False)
    X_train = X_kings_train
    X_test = X_kings_test
    
    for anovaPercentile in [10]:
        # Random forest model
        savefile = os.path.join(savefolder, f'forestKings_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'forestKings_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                class2Coeff = model[0].inverse_transform(sv[:, :, 1])
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass
        
        # XGBoost model
        savefile = os.path.join(savefolder, f'XGBOOSTKings_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'XGBOOSTKings_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv)
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass

        # Other models (e.g., SVC)
        for modelname in ['SVCKings']:  # :, 'hivecote','Rocket']:,'MLPKings'
            print(modelname)
            savefile = os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
            if not os.path.exists(savefile):
                try:
                    model = load(os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                    X_test_reduced = model[0].transform(X_test[:5, :])  # we test on a subsample of data
                    X_train_reduced = model[0].transform(X_train)
                    X100 = shap.utils.sample(X_train_reduced, 50)  # we subsample 50 samples to speed up
                    ke = shap.KernelExplainer(model[1].predict_proba, X100, approximate=True, check_additivity=False)
                    sv = ke.shap_values(X_test_reduced)
                    if sv.ndim == 3:
                        class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                    else:
                        if len(sv) > 1:
                            class1Coeff = model[0].inverse_transform(sv[0])
                        else:
                            class1Coeff = model[0].inverse_transform(sv)
                    pd.DataFrame(class1Coeff.T).to_csv(savefile)
                except FileNotFoundError:
                    pass

# Sheffield dataset processing
for i, freq in enumerate(frequencies):
    print(freq)
    # Load Sheffield data
    X_train_Sheffield, X_test_Sheffled, y_train_Sheffield, y_test_Sheffield, X_full, y_full, dataVersion = loadSheffieldData(shift=54)
    X_train = X_train_Sheffield
    X_test = X_test_Sheffled
    
    for anovaPercentile in [10]:
        # Random forest model
        savefile = os.path.join(savefolder, f'forestSheffield_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'forestSheffield_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                class2Coeff = model[0].inverse_transform(sv[:, :, 1])
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass
        
        # XGBoost model
        savefile = os.path.join(savefolder, f'XGBOOSTSheffield_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'XGBOOSTSheffield_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv)
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass
        
        # Other models (e.g., SVC)
        for modelname in ['SVCSheffield']:  # :, 'hivecote','Rocket']:,'MLPSheffield'
            print(modelname)
            savefile = os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
            if not os.path.exists(savefile):
                try:
                    model = load(os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                    X_test_reduced = model[0].transform(X_test[:5, :])  # we test on a subsample of data
                    X_train_reduced = model[0].transform(X_train)
                    X100 = shap.utils.sample(X_train_reduced, 50)  # we subsample 50 samples to speed up
                    ke = shap.KernelExplainer(model[1].predict_proba, X100, approximate=True, check_additivity=False)
                    sv = ke.shap_values(X_test_reduced)
                    if sv.ndim == 3:
                        class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                    else:
                        if len(sv) > 1:
                            class1Coeff = model[0].inverse_transform(sv[0])
                        else:
                            class1Coeff = model[0].inverse_transform(sv)
                    pd.DataFrame(class1Coeff.T).to_csv(savefile)
                except FileNotFoundError:
                    pass

# Combined dataset processing
for i, freq in enumerate(frequencies):
    print(freq)
    # Load Kings and Sheffield data
    X_kings_train, X_kings_test, y_kings_train, y_kings_test, X_kings, y_kings = loadKingsData(shift=54, scaling=False)
    X_train_Sheffield, X_test_Sheffled, y_train_Sheffield, y_test_Sheffield, X_full, y_full, dataVersion = loadSheffieldData(shift=54)
    
    # Combine Kings and Sheffield data
    X_train = np.vstack((X_train_Sheffield, X_kings_train))
    X_test = np.vstack((X_test_Sheffled, X_kings_test))
    y_train = np.hstack((y_train_Sheffield, y_kings_train))
    y_test = np.hstack((y_test_Sheffield, y_kings_test))
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    for anovaPercentile in [10]:
        # Random forest model
        savefile = os.path.join(savefolder, f'forestCombined_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'forestCombined_kFoldCrossValidation_AnovaFS{anovaPercentile}percent' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                class2Coeff = model[0].inverse_transform(sv[:, :, 1])
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass
        
        # XGBoost model
        savefile = os.path.join(savefolder, f'XGBOOSTCombined_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
        if not os.path.exists(savefile):
            try:
                model = load(os.path.join(savefolder, f'XGBOOSTCombined_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                X_test_reduced = model[0].transform(X_test)
                X_train_reduced = model[0].transform(X_train)
                X100 = shap.utils.sample(X_train_reduced, 100)
                ke = shap.TreeExplainer(model[1], X100, approximate=True)
                sv = ke.shap_values(X_test_reduced)
                class1Coeff = model[0].inverse_transform(sv)
                pd.DataFrame(class1Coeff.T).to_csv(savefile)
            except FileNotFoundError:
                pass

        # Other models (e.g., SVC)
        for modelname in ['SVCCombined']:  # :, 'hivecote','Rocket']:,'MLPCombined'
            print(modelname)
            savefile = os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + 'ShapCoeff.csv')
            if not os.path.exists(savefile):
                try:
                    model = load(os.path.join(savefolder, f'{modelname}_kFoldCrossValidation_AnovaFS{anovaPercentile}percent_' + suffices[i] + '.joblib'))
                    X_test_reduced = model[0].transform(X_test[:5, :])  # we test on a subsample of data
                    X_train_reduced = model[0].transform(X_train)
                    X100 = shap.utils.sample(X_train_reduced, 50)  # we subsample 50 samples to speed up
                    ke = shap.KernelExplainer(model[1].predict_proba, X100, approximate=True, check_additivity=False)
                    sv = ke.shap_values(X_test_reduced)
                    if sv.ndim == 3:
                        class1Coeff = model[0].inverse_transform(sv[:, :, 0])
                    else:
                        if len(sv) > 1:
                            class1Coeff = model[0].inverse_transform(sv[0])
                        else:
                            class1Coeff = model[0].inverse_transform(sv)
                    pd.DataFrame(class1Coeff.T).to_csv(savefile)
                except FileNotFoundError:
                    pass
