In [None]:
# Colab-only: mount Google Drive under /content/gdrive so the CSV paths used below resolve.
# force_remount=False asks for no forced remount.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [None]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Suppress every UserWarning raised from here on (process-wide filter),
# presumably to keep the grid-search cell output readable.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
def _clean_feature_frame(df):
    """Drop all-NaN columns, then replace remaining NaN and +/-inf values with 0."""
    df = df.dropna(axis=1, how='all')
    df = df.fillna(0)
    return df.replace([np.inf, -np.inf], 0)


def load_data(train_path, test_path):
    """
    Read the train and test feature tables and sanitise their values.

    Both CSV files are parsed with a two-row header, so the resulting frames
    have two-level columns. Each frame is cleaned independently:
    columns that are entirely NaN are dropped, and every remaining NaN,
    +inf and -inf is replaced with 0.

    Parameters:
    train_path: Path (or file-like object) of the training CSV.
    test_path: Path (or file-like object) of the test CSV.

    Returns:
    train_df: Cleaned training DataFrame.
    test_df: Cleaned test DataFrame.
    """
    train_df = pd.read_csv(train_path, header=[0, 1])
    # NOTE(review): only the test file is read with index_col=0; presumably the
    # two CSVs were exported differently -- confirm against the files.
    test_df = pd.read_csv(test_path, header=[0, 1], index_col=0)

    return _clean_feature_frame(train_df), _clean_feature_frame(test_df)


def to_array(train_df, test_df, group=False):
    """
    Split the train and test frames into feature matrices and label vectors.

    The input frames are never modified: when the group level has to be
    removed, it is removed on a new frame rather than by overwriting
    `.columns` on the caller's object.

    Parameters:
    train_df: Training DataFrame containing 'label' and 'filename' columns.
    test_df: Test DataFrame containing 'label' and 'filename' columns.
    group: If False (default), the outer (level-0) column level is dropped
           before the columns are looked up. If True, the frames are used
           with their columns as given.

    Returns:
    X_train: Training features (all columns except 'label' and 'filename').
    y_train: Training labels.
    X_val: Test features (all columns except 'label' and 'filename').
    y_val: Test labels.
    """
    if not group:
        # droplevel returns a new frame, so the caller's two-level columns survive.
        train_df = train_df.droplevel(0, axis=1)
        test_df = test_df.droplevel(0, axis=1)

    non_feature_cols = ['label', 'filename']

    # Prepare the datasets
    X_train = train_df.drop(non_feature_cols, axis=1).values
    y_train = train_df['label'].values
    X_val = test_df.drop(non_feature_cols, axis=1).values
    y_val = test_df['label'].values

    return X_train, y_train, X_val, y_val

In [None]:
# Locations of the feature tables on the mounted Drive.
train_path = '/content/gdrive/MyDrive/speech_analysis/train_with_groups.csv'
test_path = '/content/gdrive/MyDrive/speech_analysis/test_with_groups1.csv'

train_df, test_df = load_data(train_path, test_path)

# Copies are passed so train_df / test_df keep their two-level columns for the
# group and feature-name lookups below.
X_train_all_features, y_train, X_test_all_features, y_test = to_array(train_df.copy(), test_df.copy(), group=False)

# Feature-group names are the level-0 column labels, minus the "Info" group
# that holds the filename/label columns.
group_names = set(train_df.columns.droplevel(level=1))
group_names.discard("Info")

# Select feature names by name rather than by position, so the list lines up
# with the columns of X_train_all_features (built by dropping 'label' and
# 'filename') wherever those two columns sit in the frame.
feature_names = [
    name for name in train_df.columns.droplevel(level=0)
    if name not in ('label', 'filename')
]


# Normalize features: the scaler is fitted on the training matrix only and the
# same transform is then applied to the test matrix.
scaler = MinMaxScaler()
X_train_all_features = scaler.fit_transform(X_train_all_features)
X_test_all_features = scaler.transform(X_test_all_features)

In [None]:
# Show the cleaned training frame; the rendering below is truncated ("...") in both directions.
train_df

Unnamed: 0_level_0,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,Pausing behavior,...,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Complexity,Info,Info
Unnamed: 0_level_1,count_pause_segments,hesitation_rate,num_words_to_pauses,pasue_speech_ratio,pause_length,pause_lengths_avg,pause_speech_duration_ratio,pause_to_syllable,pause_to_tokens,pause_totallength_ratio,...,AMP_ENTROPY_sma_de_iqr1_2,AMP_ENTROPY_sma_de_iqr2_3,AMP_ENTROPY_sma_de_iqr1_3,AMP_ENTROPY_sma_de_percentile1,AMP_ENTROPY_sma_de_percentile99,AMP_ENTROPY_sma_de_pctlrange0_1,AMP_ENTROPY_sma_de_upleveltime75,AMP_ENTROPY_sma_de_upleveltime90,filename,label
0,0.439404,0.030303,300.406955,1.000000,410.999615,935.357143,0.697792,0.002729,0.002663,0.006450,...,0.118040,0.128968,0.247008,-1.917533,1.926447,3.843980,25.009815,1.452689,adrso018.wav,0
1,0.482767,0.025000,82.855682,1.100000,447.012193,925.937500,0.808358,0.009109,0.008940,0.019618,...,0.122531,0.122757,0.245288,-1.960159,1.794788,3.754947,25.054945,1.428571,adrso010.wav,0
2,0.501897,0.052239,266.986895,1.033333,322.211991,641.987903,0.475388,0.002868,0.002970,0.005217,...,0.125259,0.127236,0.252495,-1.978139,1.901130,3.879269,24.989874,1.579587,adrso005.wav,0
3,0.382228,0.031746,164.822963,1.111111,299.299089,783.037500,0.427142,0.004838,0.004838,0.011440,...,0.115967,0.091518,0.207486,-1.737854,1.830432,3.568286,24.976077,1.435407,adrso007.wav,0
4,0.569401,0.064516,272.215678,1.000000,244.009350,428.536585,0.322768,0.003061,0.003061,0.003389,...,0.104113,0.106256,0.210369,-1.952743,1.888825,3.841568,25.008684,1.354637,adrso017.wav,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0.219805,0.091954,395.805412,1.041667,831.471127,3782.767500,4.933701,0.002155,0.001963,0.007310,...,0.112802,0.104624,0.217426,-0.497967,0.470122,0.968089,25.000000,0.065963,adrso212.wav,1
162,0.148088,0.095238,850.843350,1.000000,889.260755,6004.933333,8.030222,0.001058,0.000841,0.008779,...,0.151142,0.151171,0.302313,-1.544396,1.517685,3.062082,25.012346,0.543210,adrso046.wav,1
163,0.311652,0.050000,256.696667,1.043478,757.690660,2431.208333,3.126956,0.003213,0.003026,0.009839,...,0.106125,0.106622,0.212746,-0.556243,0.539525,1.095768,25.008120,0.032478,adrso035.wav,1
164,0.299072,0.000000,117.028607,1.043478,765.078806,2558.174479,3.256747,0.007121,0.006797,0.009534,...,0.106628,0.110117,0.216746,-0.650028,0.606313,1.256341,25.000000,0.436409,adrso033.wav,1


In [None]:
# Dimensions of the scaled test matrix (rows, columns).
X_test_all_features.shape

(71, 6846)

In [None]:
# 129 150 166  (presumably the component counts tried so far -- confirm)
# random_state pins the projection when a randomized SVD solver is selected;
# it has no effect when the exact solver is used.
pca = PCA(n_components=150, random_state=42)
# Fit the projection on the training matrix only, then apply it to the test matrix.
X_train_pca = pca.fit_transform(X_train_all_features)
X_test_pca = pca.transform(X_test_all_features)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

def train_and_evaluate(clf, params, rs, X_train, y_train, X_test, y_test):
    """
    Tune a classifier with 5-fold GridSearchCV and evaluate the refit best model
    on the test set, printing accuracy, F1-score, AUC-ROC and a classification
    report for binary classification.

    Parameters:
    clf: Classifier model (e.g., RandomForestClassifier(), ExtraTreesClassifier(), SVC).
         Its random_state parameter, when it has one, is overwritten with `rs`.
    params: Hyperparameter grid passed to GridSearchCV as `param_grid`.
    rs: Random seed for reproducibility.
    X_train: Training feature set.
    y_train: Training labels.
    X_test: Test feature set.
    y_test: Test labels.

    Returns:
    best_model: The best model found by GridSearchCV.
    test_accuracy: Accuracy score of the best model on the test set.

    F1-score and AUC-ROC are printed but not returned.
    """

    # Seed through the estimator's parameter API, and only when the estimator
    # actually exposes a random_state parameter.
    if 'random_state' in clf.get_params():
        clf.set_params(random_state=rs)

    # Set up GridSearchCV with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=2)

    # Fit the model on the training data
    grid_search.fit(X_train, y_train)

    # Get the best model based on cross-validation
    best_model = grid_search.best_estimator_
    print("Best parameters found: ", grid_search.best_params_)

    # Predict on the test set
    y_pred_test = best_model.predict(X_test)

    # AUC-ROC needs a continuous score: use the positive-class probability when
    # available, otherwise fall back to the decision function.
    if hasattr(best_model, "predict_proba"):
        y_score_test = best_model.predict_proba(X_test)[:, 1]
    else:
        y_score_test = best_model.decision_function(X_test)

    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)
    auc_roc = roc_auc_score(y_test, y_score_test)

    # Display metrics
    print(f"Test Accuracy: {test_accuracy}")
    print(f"F1-Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")
    print("Classification Report:\n", classification_report(y_test, y_pred_test))

    return best_model, test_accuracy


In [None]:
# Random forest: grid search on the PCA-projected features.
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid for the RandomForest (4 x 4 x 2 = 32 candidates)
params = {
    'n_estimators': [10, 100, 500, 1000],
    'max_features': [2, 4, 6, 8],
    'criterion': ["gini", "entropy"],
}

# Unseeded classifier; train_and_evaluate assigns the seed `rs` to it
clf = RandomForestClassifier()
rs = 42

# Tune on the PCA-projected training set and report metrics on the PCA-projected test set
best_model, test_accuracy = train_and_evaluate(clf, params, rs, X_train_pca, y_train, X_test_pca, y_test)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters found:  {'criterion': 'gini', 'max_features': 6, 'n_estimators': 500}
Test Accuracy: 0.6338028169014085
F1-Score: 0.6388888888888888
AUC-ROC: 0.7158730158730159
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.61      0.63        36
           1       0.62      0.66      0.64        35

    accuracy                           0.63        71
   macro avg       0.63      0.63      0.63        71
weighted avg       0.63      0.63      0.63        71



In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Hyperparameter grid for ExtraTreesClassifier (3 x 3 x 4 = 36 candidates)
extra_trees_params = {
    "min_samples_split": [2, 5, 15],
    "max_features": [1, 5, 10],
    "n_estimators": [50, 500, 1000, 5000],
}

# Instantiate ExtraTreesClassifier (the same seed, 42, is passed again as `rs` below)
clf1 = ExtraTreesClassifier(random_state=42)

# Tune on the PCA-projected training set and report metrics on the PCA-projected test set
best_model_et, test_accuracy_et = train_and_evaluate(clf1, extra_trees_params, 42, X_train_pca, y_train, X_test_pca, y_test)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_features': 5, 'min_samples_split': 2, 'n_estimators': 50}
Test Accuracy: 0.5774647887323944
F1-Score: 0.605263157894737
AUC-ROC: 0.653968253968254
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.50      0.55        36
           1       0.56      0.66      0.61        35

    accuracy                           0.58        71
   macro avg       0.58      0.58      0.58        71
weighted avg       0.58      0.58      0.57        71



In [None]:
from xgboost import XGBClassifier

# Hyperparameter grid for XGBClassifier (3 x 5 x 3 x 3 x 3 = 405 candidates)
xgb_params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [1.0, 0.8, 0.6],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}

# The labels are binary (0/1), so use the binary log-loss metric; 'mlogloss'
# is the multiclass variant and does not match this task.
clf3 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Tune on the PCA-projected training set and report metrics on the PCA-projected test set
best_model_xgb, test_accuracy_xgb = train_and_evaluate(clf3, xgb_params, 42, X_train_pca, y_train, X_test_pca, y_test)


Fitting 5 folds for each of 405 candidates, totalling 2025 fits
Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 5, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.8}
Test Accuracy: 0.6338028169014085
F1-Score: 0.606060606060606
AUC-ROC: 0.6222222222222222
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.69      0.66        36
           1       0.65      0.57      0.61        35

    accuracy                           0.63        71
   macro avg       0.64      0.63      0.63        71
weighted avg       0.63      0.63      0.63        71



In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameter grid for AdaBoostClassifier (4 x 3 = 12 candidates)
adaboost_params = {
    'n_estimators': [500, 1000, 2000, 5000],
    'learning_rate': [0.001, 0.01, 0.1],
}

# Instantiate AdaBoostClassifier (the same seed, 42, is passed again as `rs` below)
clf4 = AdaBoostClassifier(random_state=42)

# Tune on the PCA-projected training set and report metrics on the PCA-projected test set
best_model_ada, test_accuracy_ada = train_and_evaluate(clf4, adaboost_params, 42, X_train_pca, y_train, X_test_pca, y_test)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'learning_rate': 0.01, 'n_estimators': 2000}
Test Accuracy: 0.7323943661971831
F1-Score: 0.6885245901639345
AUC-ROC: 0.7555555555555555
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.86      0.77        36
           1       0.81      0.60      0.69        35

    accuracy                           0.73        71
   macro avg       0.75      0.73      0.73        71
weighted avg       0.75      0.73      0.73        71



In [None]:
from sklearn.svm import SVC

# Hyperparameter grid for SVM, one sub-grid per kernel so that only the
# parameters a kernel actually uses are varied:
#   linear -> neither gamma nor degree applies      (1 candidate)
#   rbf    -> gamma only                            (2 candidates)
#   poly   -> degree and gamma                      (6 candidates)
# A single crossed dict would fit 18 candidates, half of them duplicates.
svm_params = [
    {"kernel": ["linear"]},
    {"kernel": ["rbf"], "gamma": ["scale", "auto"]},
    {"kernel": ["poly"], "degree": [2, 4, 8], "gamma": ["scale", "auto"]},
]

# probability=True is required because train_and_evaluate scores AUC-ROC from predict_proba
clf5 = SVC(random_state=42, probability=True)

# Tune on the PCA-projected training set and report metrics on the PCA-projected test set
best_model_svm, test_accuracy_svm = train_and_evaluate(clf5, svm_params, 42, X_train_pca, y_train, X_test_pca, y_test)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters found:  {'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Test Accuracy: 0.704225352112676
F1-Score: 0.7123287671232877
AUC-ROC: 0.7793650793650794
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.67      0.70        36
           1       0.68      0.74      0.71        35

    accuracy                           0.70        71
   macro avg       0.71      0.70      0.70        71
weighted avg       0.71      0.70      0.70        71

