In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [2]:
import os
import glob
import torch
import torchaudio
import torchaudio.transforms as T
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

train_healthy_paths = list(glob.glob('./SVD/train/healthy/*.wav'))
train_pathology_paths = list(glob.glob('./SVD/train/pathology/*.wav'))
test_healthy_paths = list(glob.glob('./SVD/test/healthy/*.wav'))
test_pathology_paths = list(glob.glob('./SVD/test/pathology/*.wav'))
print(f'train healthy   : {len(train_healthy_paths)} audios')
print(f'train_pathology : {len(train_pathology_paths)} audios')
print(f'test_healthy    : {len(test_healthy_paths)} audios')
print(f'test_pathology  : {len(test_pathology_paths)} audios')

train healthy   : 535 audios
train_pathology : 762 audios
test_healthy    : 100 audios
test_pathology  : 100 audios


### Class extract features jitter, shimmer, etc... from wav file
- ref: https://github.com/hyyoka/Acoustic-Features/blob/main/praat_features.py

In [3]:
# need installlation to define class below
!pip install textgrid praat-parselmouth

import parselmouth
from parselmouth import Sound
from parselmouth.praat import call
import numpy as np
import pandas as pd

class FeatureGenerator:
    def __init__(self, sound): # sound = Sound(wav_file)
        self.sample_rate = 50000
        self.fft_size = 512
        self.window_size = 1024
        self.hop_size = 512
        self.time_step = self.hop_size/self.sample_rate
        self.call = parselmouth.praat.call
        self.high_resolution_desired = False


    def _get_wv_feats(self, sound):
        pre_emphasis = call(sound, 'Filter (pre-emphasis)', 80)
        spectrum = call(pre_emphasis, 'To Spectrum', 'yes')
        cog = call(spectrum, 'Get centre of gravity', 2) # Center of gravity'
        std = call(spectrum, 'Get standard deviation', 2)
        skw = call(spectrum, 'Get skewness', 2)
        kur = call(spectrum, 'Get kurtosis', 2)
        return {'KUR':kur,'SkW':skw,'COG':cog,'SD':std}

    def _get_mfcc(self, sound, start, end):
        """mfcc extract: pre-emphasis => window => DFT => MelFilterBank => log => IDFT """
        mfcc = sound.to_mfcc(number_of_coefficients=12, time_step=self.time_step, window_length=self.window_size/self.sample_rate, maximum_frequency=7600)
        mfcc_arr = mfcc.to_array()[1:] # 0 index is the energy of cepstrum
        mfcc_bins = mfcc.x_bins()[:, 0]

        def to_frame(time):
            frame = np.searchsorted(mfcc_bins, time)-1
            return frame if frame >= 0 else 0

        start = to_frame(start)
        end = to_frame(end)

        _mfcc = mfcc_arr[:, start:end+1]
        _mfcc = np.mean(_mfcc, axis=-1)
        return _mfcc

    def _get_formant(self, sound, start, end, get_part = False):
        """
        formant extract
        get_part: extraction of one third of the center
        """
        formant = sound.to_formant_burg(time_step=self.time_step, max_number_of_formants=4.5, maximum_formant=4700.0, window_length=self.window_size/self.sample_rate, pre_emphasis_from=50)
        duration = end - start
        _formant = {}
        for f in range(3):  # f1~f3
            formant_ls = []
            for i in range(3,100,3): # total 33 sections
                formant_ls.append(formant.get_value_at_time(formant_number=f+1, time=start + i/100*duration))
            if get_part:
                formant_ls = formant_ls[10:-11]
            formant_name = "f"+str(f+1)
            _formant[formant_name] = formant_ls
        return _formant


    def _get_pitch(self, sound, start, end, get_part = False):
        """f0 extract"""

        duration = end - start
        pitch = sound.to_pitch(time_step=self.time_step, pitch_floor=75.0, pitch_ceiling=600.0)

        _ff = []
        for i in range(3,100,3): # total 24 sections 8-8-8
          _ff.append(pitch.get_value_at_time(time=start + i/100*duration))
        if get_part:
          _ff = _ff[10:-11]

        return _ff

    def _get_intensity(self, sound, start, end, get_part = False):
        """intensity extract"""

        duration = end - start
        intensity = sound.to_intensity(minimum_pitch=100.0, time_step=self.time_step, subtract_mean=True)
        _int = []
        for i in range(3,100,3):
            _int.append(intensity.get_value(time=start + i/100*duration))
        if get_part:
          _int = _int[10:-11]

        return _int

    def _get_jitter(self, sound, start, end):
        """jitter extract"""
        point_process = self.call(sound, "To PointProcess (periodic, cc)", 75, 600)  # pitch_floor=75, pitch_ceiling=600
        local_jitter = self.call(point_process, "Get jitter (local)", start, end, 0.0001, 0.02, 1.3)
        localabsolute_jitter = self.call(point_process, "Get jitter (local, absolute)", start, end, 0.0001, 0.02, 1.3)
        rap_jitter = self.call(point_process, "Get jitter (rap)", start, end, 0.0001, 0.02, 1.3)
        ppq5_jitter = self.call(point_process, "Get jitter (ppq5)", start, end, 0.0001, 0.02, 1.3)
        ddp_jitter = self.call(point_process, "Get jitter (ddp)", start, end, 0.0001, 0.02, 1.3)

        _jitter = [local_jitter, localabsolute_jitter, rap_jitter, ppq5_jitter, ddp_jitter]

        return _jitter

    def _get_shimmer(self, sound, start, end):
        """shimmer extract"""
        point_process = self.call(sound, "To PointProcess (periodic, cc)", 75, 600)  # pitch_floor=75, pitch_ceiling=600
        local_shimmer = self.call([sound, point_process], "Get shimmer (local)", start, end, 0.0001, 0.02, 1.3, 1.6)
        localdb_shimmer = self.call([sound, point_process], "Get shimmer (local_dB)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq3_shimmer = self.call([sound, point_process], "Get shimmer (apq3)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq5_shimmer = self.call([sound, point_process], "Get shimmer (apq5)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq11_shimmer = self.call([sound, point_process], "Get shimmer (apq11)", start, end, 0.0001, 0.02, 1.3, 1.6)
        dda_shimmer = self.call([sound, point_process], "Get shimmer (dda)", start, end, 0.0001, 0.02, 1.3, 1.6)
        _shimmer = [local_shimmer, localdb_shimmer, apq3_shimmer, apq5_shimmer, apq11_shimmer, dda_shimmer]

        return _shimmer

    def _get_hnr(self, sound, start, end, method='cc', get_part=True):
        """
        Calculate Harmonics-to-Noise Ratio (HNR); represents the degree of acoustic periodicity and Voice quality
        """
        if method == 'ac':
            hnr= sound.to_harmonicity_ac(time_step=self.time_step) # cross-correlation method (preferred).
        else:
            hnr= sound.to_harmonicity_cc(time_step=self.time_step) # cross-correlation method (preferred).

        duration = end - start
        _hnr= []

        for i in range(3,100,3):
            _hnr.append(hnr.get_value(time=start + i/100*duration))
        if get_part:
          _hnr = _hnr[10:-11]
        _hnr = [h for h in _hnr if h != -200]
        return _hnr



In [4]:
!pip install textgrid praat-parselmouth

import parselmouth
from parselmouth.praat import call

class FeatureGenerator:
    def __init__(self, sound):  # sound = Sound(wav_file)
        self.sample_rate = 50000
        self.fft_size = 2048
        self.window_size = 1200
        self.hop_size = 600
        self.time_step = self.hop_size / self.sample_rate
        self.call = parselmouth.praat.call
        self.high_resolution_desired = False

    def _get_jitter(self, sound, start, end):
        """extract jitter"""
        point_process = self.call(sound, "To PointProcess (periodic, cc)", 75, 600)  # pitch_floor=75, pitch_ceiling=600
        local_jitter = self.call(point_process, "Get jitter (local)", start, end, 0.0001, 0.02, 1.3)
        localabsolute_jitter = self.call(point_process, "Get jitter (local, absolute)", start, end, 0.0001, 0.02, 1.3)
        rap_jitter = self.call(point_process, "Get jitter (rap)", start, end, 0.0001, 0.02, 1.3)
        ppq5_jitter = self.call(point_process, "Get jitter (ppq5)", start, end, 0.0001, 0.02, 1.3)
        ddp_jitter = self.call(point_process, "Get jitter (ddp)", start, end, 0.0001, 0.02, 1.3)

        _jitter = [local_jitter, localabsolute_jitter, rap_jitter, ppq5_jitter, ddp_jitter]

        return _jitter

    def _get_shimmer(self, sound, start, end):
        """extract shimmer"""
        point_process = self.call(sound, "To PointProcess (periodic, cc)", 75, 600)  # pitch_floor=75, pitch_ceiling=600
        local_shimmer = self.call([sound, point_process], "Get shimmer (local)", start, end, 0.0001, 0.02, 1.3, 1.6)
        localdb_shimmer = self.call([sound, point_process], "Get shimmer (local_dB)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq3_shimmer = self.call([sound, point_process], "Get shimmer (apq3)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq5_shimmer = self.call([sound, point_process], "Get shimmer (apq5)", start, end, 0.0001, 0.02, 1.3, 1.6)
        apq11_shimmer = self.call([sound, point_process], "Get shimmer (apq11)", start, end, 0.0001, 0.02, 1.3, 1.6)
        dda_shimmer = self.call([sound, point_process], "Get shimmer (dda)", start, end, 0.0001, 0.02, 1.3, 1.6)
        _shimmer = [local_shimmer, localdb_shimmer, apq3_shimmer, apq5_shimmer, apq11_shimmer, dda_shimmer]

        return _shimmer

    def _get_jitter_shimmer(self, sound, start, end, num_intervals = 1):
        # return dictionary
        # Divide jitters and shimmers into num_intervals & save in one dict
        duration = end - start # total speech length
        interval_length = duration / num_intervals  # num_intervals equal parts
        mat = []
        for i in range(num_intervals):
            interval_start = start + i * interval_length
            interval_end = interval_start + interval_length
            jitter_list = self._get_jitter(sound, interval_start, interval_end)
            shimmer_list = self._get_shimmer(sound, interval_start, interval_end)
            feature_list = jitter_list + shimmer_list
            mat.append(feature_list)
        df = pd.DataFrame(mat)
        df_t = df.T
        return df_t.values.tolist()

def extract_features(path, n_intervals=10):
    sound = Sound(path)
    fg = FeatureGenerator(sound)
    start_time = 0.0  # sec
    end_time = sound.get_total_duration()  # sec
    feature_list = fg._get_jitter_shimmer(sound, start_time, end_time, n_intervals)
    words = ["local_jitter_", "abs_jitter_", "rap_jitter_", "ppq5_", "ddp_",
              "local_shimmer_", "abs_shimmer_", "apq3_", "apq5_", "dda_"]
    features ={} # dictionary contains instances
    for i in range(len(words)):
        for j in range(n_intervals):
            features[words[i]+str(j)] = feature_list[i][j]
    return features
# create Dataframe function
def create_dataframe(paths, label):
    data = []
    for file_path in paths:
        features = extract_features(file_path)
        features["IsPathology"] = label
        data.append(features)
    return pd.DataFrame(data)
# make Train dataset
train_healthy_df = create_dataframe(train_healthy_paths, 0)
train_pathology_df = create_dataframe(train_pathology_paths, 1)
train_df = pd.concat([train_healthy_df, train_pathology_df], ignore_index=True)

# make test dataset
test_healthy_df = create_dataframe(test_healthy_paths, 0)
test_pathology_df = create_dataframe(test_pathology_paths, 1)
test_df = pd.concat([test_healthy_df, test_pathology_df], ignore_index=True)

# Datasets to csv file (save)
train_df.to_csv('train_100features.csv', index=False)
test_df.to_csv('test_100features.csv', index=False)

print("Train and test 100feature csvfiles have been saved.")

Train and test 100feature csvfiles have been saved.


## Analysis through Sci-kit Learn

In [5]:
import pandas as pd #
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# use 7 classifier and finally ensemble them with voting(soft, hard both)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, f1_score

In [6]:
# load CSV
train_df = pd.read_csv('train_100features.csv')
test_df = pd.read_csv('test_100features.csv')

In [7]:
print((train_df.isnull().sum()))
print((test_df.isnull().sum()))

# check a lot of missing values

local_jitter_0      8
local_jitter_1      3
local_jitter_2      4
local_jitter_3     11
local_jitter_4    168
                 ... 
dda_6             467
dda_7             955
dda_8             278
dda_9             486
IsPathology         0
Length: 101, dtype: int64
local_jitter_0      1
local_jitter_1      1
local_jitter_2      1
local_jitter_3      1
local_jitter_4     19
                 ... 
dda_6              73
dda_7             152
dda_8              45
dda_9              78
IsPathology         0
Length: 101, dtype: int64


In [8]:
from sklearn.impute import KNNImputer

# imputation by KNN Imputer
# It does not change the result significantly.
imputer = KNNImputer(n_neighbors=5) # caution on parameter
train_df[:] = imputer.fit_transform(train_df)
test_df[:] = imputer.fit_transform(test_df)

# check result(no missing values)
print((train_df.isnull().sum()))
print((test_df.isnull().sum()))

local_jitter_0    0
local_jitter_1    0
local_jitter_2    0
local_jitter_3    0
local_jitter_4    0
                 ..
dda_6             0
dda_7             0
dda_8             0
dda_9             0
IsPathology       0
Length: 101, dtype: int64
local_jitter_0    0
local_jitter_1    0
local_jitter_2    0
local_jitter_3    0
local_jitter_4    0
                 ..
dda_6             0
dda_7             0
dda_8             0
dda_9             0
IsPathology       0
Length: 101, dtype: int64


In [9]:
# Label
# Seperate feature cols and Label col(IsPathology)
X_train = train_df.drop('IsPathology', axis=1)
y_train = train_df['IsPathology']
X_test = test_df.drop('IsPathology', axis=1)
y_test = test_df['IsPathology']

In [10]:
# Scaler
# I've tried several scalers, but I think the basic standard is the best, so I annotate it.
#from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Apply SMOTE 적용
smote = SMOTE(random_state=42)
X_train_scaled, y_train = smote.fit_resample(X_train_scaled, y_train)

### Adjust individual classifier parameters

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [13]:
#1.LogisticRegression
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


log_reg_param_dist = {
    'C': uniform(0.01, 10),
    'solver': ['lbfgs', 'newton-cg', 'liblinear'],
    'max_iter': [100, 200, 500]
}

log_reg_search = RandomizedSearchCV(LogisticRegression(random_state=42),
                                    param_distributions=log_reg_param_dist,
                                    n_iter=10, cv=5, scoring='accuracy', random_state=42)
log_reg_search.fit(X_train_scaled, y_train)
print('best score at train : ', round(log_reg_search.best_score_, 4))

y_pred_log = log_reg_search.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)
print(f'Logistic Regression Accuracy: {log_accuracy}, F1 Score: {log_f1}')

best score at train :  0.6969
Logistic Regression Accuracy: 0.67, F1 Score: 0.6373626373626374


In [14]:
#2.RandomForestClassifier
#rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)

rf_param_dist = {
    'n_estimators': randint(10, 100),
    'max_depth': [None, 3, 10, 20, 30],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2']
}

rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                               param_distributions=rf_param_dist,
                               n_iter=10, cv=5, scoring='accuracy', random_state=42)


rf_search.fit(X_train_scaled, y_train)
print('best score at train : ', round(rf_search.best_score_, 4))

y_pred_rf = rf_search.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {rf_accuracy}, F1 Score: {rf_f1}')

best score at train :  0.7107
Random Forest Accuracy: 0.67, F1 Score: 0.6796116504854369


In [15]:
#3.SupportVectorMachine(Classifier)
#svm_clf = SVC(gamma="scale", probability=True, random_state=42)
#svm_clf = SVC(kernel='linear', gamma=10, C=1.0, probability=True, random_state=42)
#svm_clf = SVC(kernel='rbf', gamma='auto', C=10.0, probability=True, random_state=42)
#svm_clf = SVC(kernel='rbf', gamma='auto', C=1.0, class_weight={0: 1, 1: 2}, probability=True, random_state=42)
svm_clf = SVC(gamma=10, probability=True, random_state=42)

svm_param_dist = {
    'C': uniform(0.1, 10),
    'gamma': uniform(0.001, 1),
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

svm_search = RandomizedSearchCV(SVC(probability=True, random_state=42),
                                param_distributions=svm_param_dist,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)

svm_search.fit(X_train_scaled, y_train)
print('best score at train : ', round(svm_search.best_score_, 4))

y_pred_svm = svm_search.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
print(f'Random Forest Accuracy: {svm_accuracy}, F1 Score: {svm_f1}')

best score at train :  0.7435
Random Forest Accuracy: 0.545, F1 Score: 0.5844748858447488


In [16]:
#4.XGBoost Classifier
from sklearn.exceptions import FitFailedWarning
from sklearn.utils._testing import ignore_warnings
# Ignore FitFailedWarning, UserWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)
xgb_clf = XGBClassifier(n_estimators=100, max_depth=30, random_state=42)

xgb_param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 1.0),
    'colsample_bytree': uniform(0.5, 1.0)
}

xgb_search = RandomizedSearchCV(XGBClassifier(random_state=42),
                                param_distributions=xgb_param_dist,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)

xgb_search.fit(X_train_scaled, y_train)
print('best score at trai : ', round(xgb_search.best_score_, 4))

y_pred_xgb = xgb_search.predict(X_test_scaled)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
print(f'Random Forest Accuracy: {xgb_accuracy}, F1 Score: {xgb_f1}')

best score at trai :  0.756
Random Forest Accuracy: 0.67, F1 Score: 0.6796116504854369


In [17]:
#5 Gaussian.Naive Bayes
from scipy.stats import loguniform

nb_param_dist = {
    'var_smoothing': loguniform(1e-12, 1e-1)
}

nb_search = RandomizedSearchCV(GaussianNB(),
                                param_distributions=nb_param_dist,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)

nb_search.fit(X_train_scaled, y_train)
print('best score at trai : ', round(nb_search.best_score_, 4))

y_pred_nb = nb_search.predict(X_test_scaled)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_f1 = f1_score(y_test, y_pred_nb)
print(f'Naive Bayes model Accuracy: {nb_accuracy}, F1 Score: {nb_f1}')

best score at trai :  0.6345
Naive Bayes model Accuracy: 0.585, F1 Score: 0.47798742138364775


In [18]:
#6. AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(random_state=42)
adaboost_param_dist = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 2.0),
    'algorithm': ['SAMME', 'SAMME.R']
}

adaboost_search = RandomizedSearchCV(AdaBoostClassifier(random_state=42),
                                     param_distributions=adaboost_param_dist,
                                     n_iter=10, cv=5, scoring='accuracy', random_state=42)

adaboost_search.fit(X_train_scaled, y_train)

print('Best score at train:', round(adaboost_search.best_score_, 4))

y_pred_adaboost = adaboost_search.predict(X_test_scaled)

adaboost_accuracy = accuracy_score(y_test, y_pred_adaboost)
adaboost_f1 = f1_score(y_test, y_pred_adaboost)
print(f'AdaBoost Classifier Accuracy: {adaboost_accuracy}, F1 Score: {adaboost_f1}')

Best score at train: 0.6969
AdaBoost Classifier Accuracy: 0.67, F1 Score: 0.6597938144329897


In [19]:
# This part causes errors, but does not matter to performance.
'''#7.BaggingClassifier
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)

bagging_clf = BaggingClassifier(estimator= log_reg_search, n_estimators=50, random_state=42)
bagging_param_dist = {
    'n_estimators': randint(10, 100),
    'max_samples': uniform(0.5, 1.0),
    'max_features': uniform(0.5, 1.0),
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

bagging_search = RandomizedSearchCV(BaggingClassifier(estimator=log_reg_search),
                                    param_distributions=bagging_param_dist,
                                    n_iter=10, cv=5, scoring='accuracy', random_state=42)

bagging_search.fit(X_train_scaled, y_train)
print('best score at train: ', round(bagging_search.best_score_, 4))

y_pred_bagging = bagging_search.predict(X_test_scaled)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)
bagging_f1 = f1_score(y_test, y_pred_bagging)
print(f'Bagging Classifier Accuracy: {bagging_accuracy}, F1 Score: {bagging_f1}')'''



### 앙상블

In [20]:
# Define multiple different classifiers to fit into an ensemble

print('Performance of individual models')
print(f'Logistic Regression Accuracy: {log_accuracy}, F1 Score: {log_f1}')
print(f'Random Forest Accuracy: {rf_accuracy}, F1 Score: {rf_f1}')
print(f'Support Vector Machine Accuracy: {svm_accuracy}, F1 Score: {svm_f1}')
print(f'XGBoost Classifier Accuracy: {xgb_accuracy}, F1 Score: {xgb_f1}')
print(f'Naive Bayes model Accuracy: {nb_accuracy}, F1 Score: {nb_f1}')
print(f'adaboost Classifier Accuracy: {adaboost_accuracy}, F1 Score: {nb_f1}')
#print(f'bagging Classifier Accuracy: {bagging_accuracy}, F1 Score: {bagging_f1}')
print('-------------------------------------------------------------------')

## Defining ensemble learning model (I've tried bagging, boosting, and voting, and I think voting is the best in my experience)
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg_search.best_estimator_),
                ('rf', rf_search.best_estimator_),
                ('xgb', xgb_search.best_estimator_),
                ('ada', adaboost_search.best_estimator_),
                #('svc', svm_search.best_estimator_),
                #('bagging', bagging_search.best_estimator_),
                #('nb', nb_search.best_estimator_)
                ],
    voting='soft' # To hard voting, change to 'hard'
)
# train model
voting_clf.fit(X_train_scaled, y_train)
y_pred = voting_clf.predict(X_test_scaled)

voting_accuracy = accuracy_score(y_test, y_pred)
voting_f1 = f1_score(y_test, y_pred)

print('Performance of Voting model')
print(f'Accuracy: {voting_accuracy}')
print(f'F1 Score: {voting_f1}')


print('-------------------------------------------------------------------')

Performance of individual models
Logistic Regression Accuracy: 0.67, F1 Score: 0.6373626373626374
Random Forest Accuracy: 0.67, F1 Score: 0.6796116504854369
Support Vector Machine Accuracy: 0.545, F1 Score: 0.5844748858447488
XGBoost Classifier Accuracy: 0.67, F1 Score: 0.6796116504854369
Naive Bayes model Accuracy: 0.585, F1 Score: 0.47798742138364775
adaboost Classifier Accuracy: 0.67, F1 Score: 0.47798742138364775
-------------------------------------------------------------------
Performance of Voting model
Accuracy: 0.695
F1 Score: 0.6903553299492386
-------------------------------------------------------------------
