# Hyperparameter optimisation for RF, XGBoost, MLP, Rocket and SVC models


We perform a parameter search for each model, optimising for recall, F1 score or balanced accuracy. At the moment, the F1 score is the metric used for parameter tuning.

In [None]:
%pylab
%matplotlib inline
%reload_ext autoreload
import pandas as pd
import sys
import seaborn as sns
sys.path.append('../src')
import abrTools as at
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report
from collections import Counter
import pretty_confusion_matrix as pcm
from scipy.signal import savgol_filter


# Sampling rate used by the acquisition system: half of 195000.
# NOTE(review): units are presumably Hz -- confirm against the recording setup.
fs = 195000.0/2.0 # Acquisition sampling rate

from datetime import date
# savefolder = os.path.join('..','results',str(date.today()))

# if not os.path.exists(savefolder):
#     os.makedirs(savefolder)

In [None]:
# Build the train/test split through the project helper (25% held out,
# no oversampling, ages=[1], the listed stimulus frequencies).
X_train, X_test, y_train, y_test, dataVersion = at.createClassificationDataset(
    test_size=0.25,
    oversample=False,
    ages=[1,],
    frequencies=[100, 3000, 6000, 12000, 18000, 24000, 30000, 36000, 42000],
)

# Re-assemble the complete dataset (train rows first, then test rows).
X = np.vstack((X_train, X_test))
y = np.hstack((y_train, y_test))

In [None]:

import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_selection import f_classif,mutual_info_classif, SelectFpr, SelectPercentile
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV2

### Parameter search for XGBOOST

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer,recall_score,balanced_accuracy_score,f1_score

# Pipeline: keep the top 10% of features ranked by the ANOVA F-test, then
# classify with XGBoost.
anova_fs = SelectPercentile(f_classif, percentile=10)

# One weight per training sample, balancing the classes of y_train.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# The weights are NOT given to the constructor: `sample_weight` is an argument
# of `fit`, and as a constructor keyword it is not applied during training.
# They are forwarded at fit time instead (see the call at the end of the cell).
# n_estimators / max_depth set here are overridden by the grid below.
reg = xgb.XGBClassifier(use_label_encoder=False, n_estimators=795, verbosity=0,
                        n_jobs=-1, random_state=42, max_depth=21)
xg_pip = make_pipeline(anova_fs, reg)


# Define the parameter grid (3 * 3 * 4 * 3 * 3 = 324 combinations)
param_grid = {
        'xgbclassifier__n_estimators': [50, 100, 200],                     # 3 values
        'xgbclassifier__learning_rate': [0.01, 0.05, 0.1],                 # 3 values
        'xgbclassifier__max_depth': [3, 5, 7, 10],                         # 4 values
        'xgbclassifier__subsample': [0.6, 0.8, 1.0],                      # 3 values
        'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],                # 3 values
    }


# Integer-encode the class labels: '6N' -> 0, 'Repaired' -> 1.
# Any other label is left untouched and makes astype(int) fail loudly.
y_train2 = y_train.copy()
y_train2[y_train=='6N']=0
y_train2[y_train=='Repaired']=1
y_test2 = y_test.copy()
y_test2[y_test=='6N']=0
y_test2[y_test=='Repaired']=1

y_train2 = y_train2.astype(int)
y_test2 = y_test2.astype(int)

# Perform GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=xg_pip,
                           param_grid=param_grid,
                           cv=5,                          # 5-fold cross-validation
                           #scoring=make_scorer(recall_score,pos_label=0),              # Optimize for recall \make_scorer(balanced_accuracy_score)
                           scoring = make_scorer(f1_score,pos_label=0),   # F1 of class 0 ('6N')
                           #scoring = make_scorer(balanced_accuracy_score),

                           verbose=1,                     # Output progress
                           n_jobs=-1)                     # Use all processors

# Fit the model to the training data.
# The `<step>__sample_weight` fit parameter is routed by the pipeline to
# XGBClassifier.fit; GridSearchCV splits array-like fit parameters of length
# n_samples across the CV folds together with X and y.
grid_search.fit(X_train, y_train2, xgbclassifier__sample_weight=sample_weights)

In [None]:
# Display the parameter combination selected by the search fitted above.
grid_search.best_params_

In [None]:
# Display the mean cross-validated score of the selected combination.
grid_search.best_score_

In [None]:
# Summary of the XGBoost search results (values copied from earlier runs).

# Best parameters using recall as the scoring metric.
best_params_recall = {
    'xgbclassifier__colsample_bytree': 0.8,
    'xgbclassifier__learning_rate': 0.1,
    'xgbclassifier__max_depth': 3,
    'xgbclassifier__n_estimators': 200,
    'xgbclassifier__subsample': 0.6,
}

# Best parameters recorded under the F1-score name; identical to the recall
# result above.
best_params_f1_score = {
    'xgbclassifier__colsample_bytree': 0.8,
    'xgbclassifier__learning_rate': 0.1,
    'xgbclassifier__max_depth': 3,
    'xgbclassifier__n_estimators': 200,
    'xgbclassifier__subsample': 0.6,
}

### Parameter search for Random forest

In [None]:
# Pipeline: keep the top 10% of features ranked by the ANOVA F-test, then
# classify with a random forest. n_estimators / class_weight set here are
# overridden by the grid below.
anova_fs = SelectPercentile(f_classif, percentile=10)
forest_cl = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight='balanced', n_jobs=-1)
forest_pip = make_pipeline(anova_fs, forest_cl)

param_grid = {
    'randomforestclassifier__n_estimators': [50, 100, 200, 300, 1000],  # Number of trees
    'randomforestclassifier__max_depth': [None, 5, 10, 30, 50],          # Maximum depth of the tree
    'randomforestclassifier__min_samples_split': [2, 5, 10],             # Minimum number of samples required to split a node
    'randomforestclassifier__min_samples_leaf': [1, 2,  6, 8],             # Minimum number of samples required to be at a leaf node
    'randomforestclassifier__bootstrap': [True, False],                      # Whether bootstrap samples are used when building trees
    # 'auto' is intentionally absent: for classifiers it was an alias of
    # 'sqrt', and scikit-learn >= 1.3 rejects it, so every fit using it would
    # fail and be scored as NaN.
    'randomforestclassifier__max_features': ['sqrt', 'log2'],            # Number of features to consider for the best split
    'randomforestclassifier__class_weight': [None, 'balanced'],              # Adjusting weights for handling class imbalance
}

# Perform GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=forest_pip,
                           param_grid=param_grid,
                           cv=5,                          # 5-fold cross-validation
                           #scoring=make_scorer(recall_score,pos_label='6N'),              # Optimize for recall \make_scorer(balanced_accuracy_score)
                           scoring = make_scorer(f1_score,pos_label='6N'),   # F1 of the '6N' class
                           # scoring = make_scorer(balanced_accuracy_score),
                           verbose=1,                     # Output progress
                           n_jobs=-1)                     # Use all processors

# Fit the model to the training data
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Random forest search results copied from earlier runs, one group per
# scoring metric.

# Scoring metric: recall.
best_params_recall = {
    'randomforestclassifier__bootstrap': True,
    'randomforestclassifier__class_weight': 'balanced',
    'randomforestclassifier__max_depth': None,
    'randomforestclassifier__max_features': 'sqrt',
    'randomforestclassifier__min_samples_leaf': 1,
    'randomforestclassifier__min_samples_split': 2,
    'randomforestclassifier__n_estimators': 100,
}
best_score = 0.7928571428571429

# Scoring metric: F1 score.
best_params_f1_score = {
    'randomforestclassifier__bootstrap': True,
    'randomforestclassifier__class_weight': None,
    'randomforestclassifier__max_depth': None,
    'randomforestclassifier__max_features': 'sqrt',
    'randomforestclassifier__min_samples_leaf': 1,
    'randomforestclassifier__min_samples_split': 5,
    'randomforestclassifier__n_estimators': 100,
}
best_score_f1_score = 0.81

# Scoring metric: balanced accuracy.
best_params_balanced_accuracy = {
    'randomforestclassifier__bootstrap': True,
    'randomforestclassifier__class_weight': None,
    'randomforestclassifier__max_depth': None,
    'randomforestclassifier__max_features': 'sqrt',
    'randomforestclassifier__min_samples_leaf': 1,
    'randomforestclassifier__min_samples_split': 5,
    'randomforestclassifier__n_estimators': 100,
}
# This value used to be a bare literal (a no-op statement); it is now bound to
# a name so it can be referenced later.
# NOTE(review): presumably the balanced-accuracy score of the parameters
# above -- confirm against the original run.
best_score_balanced_accuracy = 0.8335317460317461

### Parameter search for SVC model

In [None]:
# Grid search for the SVC model.
# Pipeline: keep the top 10% of features ranked by the ANOVA F-test, then
# classify with a support-vector classifier. kernel / C / class_weight set
# here are overridden by the grid below.
anova_fs = SelectPercentile(f_classif, percentile=10)
svc_cl = SVC(
    probability=True,
    kernel='linear',
    C=0.2,
    class_weight='balanced',
)
svc_pip = make_pipeline(anova_fs, svc_cl)

param_grid = {
    # Regularization parameter
    'svc__C': [0.01, 0.1, 1],
    # Kernel type
    'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # Kernel coefficient (for rbf, poly, sigmoid)
    'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    # Degree of the polynomial kernel (used if kernel='poly')
    'svc__degree': [2, 3, 4],
    # Whether to use the shrinking heuristic
    'svc__shrinking': [True, False],
    # Adjusting weights for handling class imbalance
    'svc__class_weight': [None, 'balanced'],
}

# Alternative scorers that can be swapped in:
#   make_scorer(recall_score, pos_label='6N')
#   make_scorer(balanced_accuracy_score)
grid_search = GridSearchCV(
    estimator=svc_pip,
    param_grid=param_grid,
    cv=5,                                             # 5-fold cross-validation
    scoring=make_scorer(f1_score, pos_label='6N'),    # F1 of the '6N' class
    verbose=1,                                        # Output progress
    n_jobs=-1,                                        # Use all processors
)

# Run the search on the training data and report the winner.
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# SVC search results copied from earlier runs, one group per scoring metric.

# Scoring metric: recall.
best_params_recall = {
    'svc__C': 1,
    'svc__class_weight': 'balanced',
    'svc__degree': 2,
    'svc__gamma': 0.01,
    'svc__kernel': 'sigmoid',
    'svc__shrinking': True,
}
best_score = 0.9464285714285714

# Scoring metric: F1 score.
best_params_f1_score = {
    'svc__C': 0.01,
    'svc__class_weight': None,
    'svc__degree': 2,
    'svc__gamma': 0.01,
    'svc__kernel': 'poly',
    'svc__shrinking': True,
}
best_score_f1_score = 0.833

# Scoring metric: balanced accuracy (no score was recorded for this run).
best_params_balanced_accuracy = {
    'svc__C': 0.01,
    'svc__class_weight': None,
    'svc__degree': 2,
    'svc__gamma': 'scale',
    'svc__kernel': 'linear',
    'svc__shrinking': True,
}


### Rocket parameter search

In [None]:
# Pipeline: keep the top 10% of features ranked by the ANOVA F-test, then
# classify with sktime's RocketClassifier. num_kernels set here is overridden
# by the grid below.
anova_fs = SelectPercentile(f_classif, percentile=10)
rocket = RocketClassifier(num_kernels=1000, n_jobs=-1, random_state=42)
rocket_pip = make_pipeline(anova_fs, rocket)

# `max_dilations_per_kernel` and `n_features_per_kernel` are deliberately not
# searched: sktime documents them as MiniRocket/MultiRocket-only, and the
# classifier above uses the default rocket_transform="rocket". Varying them
# only repeated identical fits (9x the work for the same scores).
param_grid = {
        'rocketclassifier__num_kernels': [100, 1000, 5000],   # Number of random convolution kernels
        'rocketclassifier__use_multivariate': ['yes', 'no'],  # Whether to use multivariate data or not
    }

grid_search = GridSearchCV(estimator=rocket_pip,
                           param_grid=param_grid,
                           cv=5,                          # 5-fold cross-validation
                           #scoring=make_scorer(recall_score,pos_label='6N'),              # Optimize for recall \make_scorer(balanced_accuracy_score)
                           scoring = make_scorer(f1_score,pos_label='6N'),   # F1 of the '6N' class
                           # scoring = make_scorer(balanced_accuracy_score),
                           verbose=3,                     # Output progress
                           n_jobs=-1)                     # Use all processors

# Fit the model to the training data
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Rocket search results copied from earlier runs, one group per scoring metric.

# Scoring metric: recall.
best_params_recall = {
    'rocketclassifier__max_dilations_per_kernel': 16,
    'rocketclassifier__n_features_per_kernel': 2,
    'rocketclassifier__num_kernels': 5000,
    'rocketclassifier__use_multivariate': 'yes',
}
best_score_recall = 0.8392857142857142

# Scoring metric: F1 score.
best_params_f1_score = {
    'rocketclassifier__max_dilations_per_kernel': 16,
    'rocketclassifier__n_features_per_kernel': 2,
    'rocketclassifier__num_kernels': 5000,
    'rocketclassifier__use_multivariate': 'yes',
}
# Named like the matching variables of the other models (`..._f1_score`);
# the original mixed-case spelling is kept as an alias for existing users.
best_score_f1_score = 0.87
best_score_f1_Score = best_score_f1_score

# Scoring metric: balanced accuracy (no score was recorded for this run).
best_params_balanced_accuracy = {
    'rocketclassifier__max_dilations_per_kernel': 16,
    'rocketclassifier__n_features_per_kernel': 2,
    'rocketclassifier__num_kernels': 5000,
    'rocketclassifier__use_multivariate': 'yes',
}


### Grid search for MLP

In [None]:
# Pipeline: keep the top 10% of features ranked by the ANOVA F-test, then
# classify with a multi-layer perceptron trained with the L-BFGS solver.
anova_fs = SelectPercentile(f_classif, percentile=10)
# NOTE: per the scikit-learn documentation, early_stopping is only effective
# for the 'sgd' and 'adam' solvers, so it has no effect with 'lbfgs'.
mlp = MLPClassifier(solver = 'lbfgs',random_state=42, early_stopping=True)
mlp_pip = make_pipeline(anova_fs,mlp)

# `learning_rate_init` is deliberately not searched: scikit-learn only uses it
# with the 'sgd' and 'adam' solvers, so with 'lbfgs' varying it just repeated
# every fit three times with identical results.
param_grid = {
        'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (150,), (100, 50)],  # 4 values
        'mlpclassifier__activation': ['relu', 'tanh'],                            # 2 values
        'mlpclassifier__alpha': [0.001, 0.01, 0.05, 0.1],                         # 4 values
        'mlpclassifier__max_iter': [100, 200, 300]                                # 3 values
    }

grid_search = GridSearchCV(estimator=mlp_pip,
                           param_grid=param_grid,
                           cv=5,                          # 5-fold cross-validation
                           #scoring=make_scorer(recall_score,pos_label='6N'),              # Optimize for recall \make_scorer(balanced_accuracy_score)
                           scoring = make_scorer(f1_score,pos_label='6N'),   # F1 of the '6N' class
                           #scoring = make_scorer(balanced_accuracy_score),
                           verbose=3,                     # Output progress
                           n_jobs=-1)                     # Use all processors

# Fit the model to the training data
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# MLP search results copied from earlier runs, one group per scoring metric.

# Scoring metric: recall.
# Keys use the pipeline-prefixed parameter names, like every other results
# dict in this notebook. The former key 'hidden_layer_size' was not a valid
# MLPClassifier parameter; the value is written as the tuple (100,) because
# that is the form the search grid used.
best_params_recall = {
    'mlpclassifier__activation': 'relu',
    'mlpclassifier__alpha': 0.1,
    'mlpclassifier__hidden_layer_sizes': (100,),
    'mlpclassifier__learning_rate_init': 0.001,
    'mlpclassifier__max_iter': 100,
}
best_score_recall = 0.89

# Scoring metric: F1 score.
best_params_f1_score = {
    'mlpclassifier__activation': 'tanh',
    'mlpclassifier__alpha': 0.05,
    'mlpclassifier__hidden_layer_sizes': (150,),
    'mlpclassifier__learning_rate_init': 0.001,
    'mlpclassifier__max_iter': 100,
}
# Named like the matching variables of the other models (`..._f1_score`);
# the original mixed-case spelling is kept as an alias for existing users.
best_score_f1_score = 0.8667948717948718
best_score_f1_Score = best_score_f1_score

# Scoring metric: balanced accuracy (no score was recorded for this run).
best_params_balanced_accuracy = {
    'mlpclassifier__activation': 'tanh',
    'mlpclassifier__alpha': 0.1,
    'mlpclassifier__hidden_layer_sizes': (50,),
    'mlpclassifier__learning_rate_init': 0.001,
    'mlpclassifier__max_iter': 200,
}