## IMPORTS

In [None]:
# downloading data
from google.colab import drive
import zipfile

# saving data
import csv

# some basics
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import os

# for sound processing
!pip install librosa
import librosa
import librosa.display
!pip install soundfile
import soundfile as sf
from IPython.display import Audio

# whisper
!pip install git+https://github.com/openai/whisper.git
import whisper

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-c5lfs9mo
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-c5lfs9mo
  Resolved https://github.com/openai/whisper.git to commit 8bc8860694949db53c42ba47ddc23786c2e02a8b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20231117-py3-none-a

## DOWNLOADING DATA

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


### ANDROID

ANDROID DATASET (preprocessed, from google drive):

reading_hc + interview_hc -> health control, non-depresed

reading_pt + interview_pt -> patients, depressed

all files have the same length

In [None]:
!gdown 'https://drive.google.com/uc?id=1OQ87c6vEKkTuLu2Z3jYz0P6-pvCytojz'

Downloading...
From: https://drive.google.com/uc?id=1OQ87c6vEKkTuLu2Z3jYz0P6-pvCytojz
To: /content/android_segmented_5s.zip
100% 757M/757M [00:11<00:00, 67.5MB/s]


In [None]:
with zipfile.ZipFile('/content/android_segmented_5s.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/android_dataset')

In [None]:
# paths to folders with data (the folder structure is not 100% straightforward, so maybe will be useful)
android_interview_depressed_path = '/content/android_dataset/interview_pt/kaggle/working/segmented_files/interview_pt/'
android_reading_depressed_path = '/content/android_dataset/reading_pt/'
android_interview_healthy_path = '/content/android_dataset/interview_hc/'
android_reading_healthy_path = '/content/android_dataset/reading_hc/'

In [None]:
android_interview_depressed_count = sum(1 for item in os.listdir(android_interview_depressed_path) if os.path.isfile(os.path.join(android_interview_depressed_path, item)))
android_reading_depressed_count = sum(1 for item in os.listdir(android_reading_depressed_path) if os.path.isfile(os.path.join(android_reading_depressed_path, item)))
android_interview_healthy_count = sum(1 for item in os.listdir(android_interview_healthy_path) if os.path.isfile(os.path.join(android_interview_healthy_path, item)))
android_reading_healthy_count = sum(1 for item in os.listdir(android_reading_healthy_path) if os.path.isfile(os.path.join(android_reading_healthy_path, item)))

print('android_interview_depressed_count: ', android_interview_depressed_count)
print('android_reading_depressed_count: ', android_reading_depressed_count)
print('android_interview_healthy_count: ', android_interview_healthy_count)
print('android_reading_healthy_count: ', android_reading_healthy_count)

android_count = android_interview_depressed_count + android_reading_depressed_count + android_interview_healthy_count + android_reading_healthy_count

print('all samples: ', android_count)

android_interview_depressed_count:  2208
android_reading_depressed_count:  553
android_interview_healthy_count:  2659
android_reading_healthy_count:  479
all samples:  5899


In [None]:
# random sample, just to check what's going on
audio_path = '/content/android_dataset/interview_pt/kaggle/working/segmented_files/interview_pt/01_PM58_2_0'
data, sr = sf.read(audio_path, channels=2, samplerate=44100, format='RAW', subtype='PCM_16')

In [None]:
# if stereo convert to mono
if data.shape[1] == 2:
    data = librosa.to_mono(data.T)

duration = librosa.get_duration(y=data, sr=sr)
print("duration of file: ", duration)

# I've testes the duration of a few random files, and it is always the same

duration of file:  0.9072789115646258


In [None]:
# check if recording is not silence (also done on a few random samples)
frame_gains = np.abs(librosa.effects.preemphasis(data))
silence_removed = any(frame_gains > 0)

print(silence_removed)

True


## E_DAIC

E_DAIC DATASET (preprocessed, from drive)

already splitted into train, test, validation sets

labels in csv (https://drive.google.com/drive/folders/17jjD-cIZXS5EnqpvUNdosh6LCwDPYstX)

labels meaning:  0 is non-depressed, 1 is depressed

all files have the same length (also the same as the Android ones)

In [None]:
!gdown 'https://drive.google.com/uc?id=1PT9Iij7DJOB1s4i0T4gT3jZxpoZqSpzU'

Downloading...
From: https://drive.google.com/uc?id=1PT9Iij7DJOB1s4i0T4gT3jZxpoZqSpzU
To: /content/edaic_segmented_5second.zip
100% 2.03G/2.03G [00:39<00:00, 51.4MB/s]


In [None]:
with zipfile.ZipFile('/content/edaic_segmented_5second.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/edaic_dataset')

In [None]:
# paths to folders with data
edaic_train_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/train'
edaic_test_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/test'
edaic_validation_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/val'

In [None]:
edaic_train_count = sum(1 for item in os.listdir(edaic_train_path) if os.path.isfile(os.path.join(edaic_train_path, item)))
edaic_test_count = sum(1 for item in os.listdir(edaic_test_path) if os.path.isfile(os.path.join(edaic_test_path, item)))
edaic_validation_count = sum(1 for item in os.listdir(edaic_validation_path) if os.path.isfile(os.path.join(edaic_validation_path, item)))

print('edaic_train_count: ', edaic_train_count)
print('edaic_test_count: ', edaic_test_count)
print('edaic_validation_count: ', edaic_validation_count)

edaic_count = edaic_train_count + edaic_test_count+ edaic_validation_count

print('all samples: ', edaic_count)

edaic_train_count:  12547
edaic_test_count:  2446
edaic_validation_count:  2987
all samples:  17980


In [None]:
# downloading labels

# validation
!gdown 'https://drive.google.com/uc?id=13PDjse2cjgT9Ns4s7v21DRbHB4nA2EtC'

# training
!gdown 'https://drive.google.com/uc?id=1LAEwPM3XPcDV3dh2XaxJKyKNdilBGVNK'

# test
!gdown 'https://drive.google.com/uc?id=1UQywlWldvqriiYDvNcj2iSq-fwvpIjA6'

Downloading...
From: https://drive.google.com/uc?id=13PDjse2cjgT9Ns4s7v21DRbHB4nA2EtC
To: /content/edaic_validation_labels.csv
100% 321/321 [00:00<00:00, 1.80MB/s]
Downloading...
From: https://drive.google.com/uc?id=1LAEwPM3XPcDV3dh2XaxJKyKNdilBGVNK
To: /content/edaic_training_labels.csv
100% 975/975 [00:00<00:00, 5.09MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UQywlWldvqriiYDvNcj2iSq-fwvpIjA6
To: /content/edaic_testing_labels.csv
100% 339/339 [00:00<00:00, 1.64MB/s]


In [None]:
edaic_train_labels = pd.read_csv('/content/edaic_training_labels.csv')
edaic_test_labels = pd.read_csv('/content/edaic_testing_labels.csv')
edaic_validation_labels = pd.read_csv('/content/edaic_validation_labels.csv')

In [None]:
edaic_train_labels.head(25)

Unnamed: 0,ID,Value
0,335,0
1,424,0
2,399,0
3,364,0
4,330,1
5,489,0
6,383,0
7,410,1
8,426,1
9,341,0


In [None]:
# numbers of samples in each set
print("train: ", len(edaic_train_labels), " test: ", len(edaic_test_labels), " validation: ", len(edaic_validation_labels))
print("all labels: ", len(edaic_train_labels) + len(edaic_test_labels) + len(edaic_validation_labels))

train:  161  test:  55  validation:  52
all labels:  268


The total number of labels differs from the number of all samples because there are multiple recordings of each individual (each person's recording is divided into many parts, with each part being identified by the individual's ID in the labels)

In [None]:
# labeling each recording

# train set
edaic_train_labels_dict = edaic_train_labels.set_index('ID')['Value'].to_dict()
edaic_train_recordings_names = os.listdir(edaic_train_path)
edaic_train_recordings_labels = []
for file in edaic_train_recordings_names:
    file_id = file.split('_')[0]
    if int(file_id) in edaic_train_labels_dict:
        edaic_train_recordings_labels.append((file, edaic_train_labels_dict[int(file_id)]))
    else:
        edaic_train_recordings_labels.append((file, None))

# test set
edaic_test_labels_dict = edaic_test_labels.set_index('ID')['Value'].to_dict()
edaic_test_recordings_names = os.listdir(edaic_test_path)
edaic_test_recordings_labels = []
for file in edaic_test_recordings_names:
    file_id = file.split('_')[0]
    if int(file_id) in edaic_test_labels_dict:
        edaic_test_recordings_labels.append((file, edaic_test_labels_dict[int(file_id)]))
    else:
        edaic_test_recordings_labels.append((file, None))

# validation set
edaic_validation_labels_dict = edaic_validation_labels.set_index('ID')['Value'].to_dict()
edaic_validation_recordings_names = os.listdir(edaic_validation_path)
edaic_validation_recordings_labels = []

for file in edaic_validation_recordings_names:
    file_id = file.split('_')[0]
    if int(file_id) in edaic_validation_labels_dict:
        edaic_validation_recordings_labels.append((file, edaic_validation_labels_dict[int(file_id)]))
    else:
        edaic_validation_recordings_labels.append((file, None))

In [None]:
audio_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/test/600_AUDIO_0'
data, sr = sf.read(audio_path, channels=2, samplerate=44100, format='RAW', subtype='PCM_16')

In [None]:
# if stereo convert to mono
if data.shape[1] == 2:
    data = librosa.to_mono(data.T)

duration = librosa.get_duration(y=data, sr=sr)
print("duration of file: ", duration)

# I've testes the duration of a few random files, and it is always the same (also the same as for the android dataset)

duration of file:  0.9072789115646258


In [None]:
# check if recording is not silence
frame_gains = np.abs(librosa.effects.preemphasis(data))
silence_removed = any(frame_gains > 0)

print(silence_removed)

True


In [None]:
import librosa
import torch
import librosa.display
import warnings
warnings.filterwarnings("ignore")
# to play the audio files
from IPython.display import Audio
plt.style.use('seaborn-white')

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
# Whisper- Base
from transformers import AutoFeatureExtractor, WhisperModel
# from datasets import load_dataset

model = WhisperModel.from_pretrained("openai/whisper-base")
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
model.to(device)
import torchaudio
def extract_features(path):
    sample_rate = 16000
    array, fs = torchaudio.load(path)
    input = feature_extractor(array.squeeze(), sampling_rate = sample_rate, return_tensors = 'pt')
    input = input.to(device)
    input = input.input_features
    with torch.no_grad():
        outputs = model.encoder(input)
    last_hidden_states = outputs.last_hidden_state.squeeze().mean(axis = 0).to("cpu").numpy()
    return last_hidden_states

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

## Features extraction - WHISPER

Extracting features from files, data saved as an array. 1st "column" is a file's name, second target value (0 - healthy, 1 - depressed), the rest are extracted features. Datasets are saved as csv files:

- android_reading_healthy_whisper.csv
- android_reading_depressed_whisper.csv
- android_interview_healthy_whisper.csv
- android_interview_depressed_whisper.csv

## ANDROID

In [None]:
android_reading_healthy_files = [os.path.join(android_reading_healthy_path, file) for file in os.listdir(android_reading_healthy_path)]

android_reading_healthy_features_whisper = []

for file in android_reading_healthy_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_reading_healthy_features_whisper.append([file_name, 0] + list(features))



In [None]:
output_csv_path = "android_reading_healthy_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in android_reading_healthy_features_whisper:
        writer.writerow(row)


In [None]:
android_reading_depressed_files = [os.path.join(android_reading_depressed_path, file) for file in os.listdir(android_reading_depressed_path)]

android_reading_depressed_features_whisper = []

for file in android_reading_depressed_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_reading_depressed_features_whisper.append([file_name, 1] + list(features))


In [None]:
#ignore when executing code
output_csv_path = "android_reading_depressed_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in android_reading_depressed_features_whisper:
        writer.writerow(row)

In [None]:
android_interview_healthy_files = [os.path.join(android_interview_healthy_path, file) for file in os.listdir(android_interview_healthy_path)]

android_interview_healthy_features_whisper = []

for file in android_interview_healthy_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_interview_healthy_features_whisper.append([file_name, 0] + list(features))

In [None]:
#ignore when executing code
output_csv_path = "android_interview_healthy_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in android_interview_healthy_features_whisper:
        writer.writerow(row)

In [None]:
android_interview_depressed_files = [os.path.join(android_interview_depressed_path, file) for file in os.listdir(android_interview_depressed_path)]

android_interview_depressed_features_whisper = []

for file in android_interview_depressed_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_interview_depressed_features_whisper.append([file_name, 1] + list(features))


In [None]:
#ignore when executing code
output_csv_path = "android_interview_depressed_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in android_interview_depressed_features_whisper:
        writer.writerow(row)

## EDAIC

In [None]:
def extract_features_with_labels(recordings_labels, base_path):
    features_labels_dataset = []
    for file, label in recordings_labels:
        file_path = os.path.join(base_path, file)
        features = extract_features(file_path)

        features_labels_dataset.append([file, label] + list(features))
    return features_labels_dataset


edaic_train_features_whisper = extract_features_with_labels(edaic_train_recordings_labels, edaic_train_path)

edaic_test_features_whisper = extract_features_with_labels(edaic_test_recordings_labels, edaic_test_path)

edaic_validation_whisper = extract_features_with_labels(edaic_validation_recordings_labels, edaic_validation_path)


In [None]:
#ignore when executing code
output_csv_path = "edaic_train_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in edaic_train_features_whisper:
        writer.writerow(row)


output_csv_path = "edaic_test_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in edaic_test_features_whisper:
        writer.writerow(row)


output_csv_path = "edaic_validation_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for row in edaic_validation_whisper:
        writer.writerow(row)

Android baseline

In [None]:
#edaic_train_features_whisper[1]
#android_reading_healthy_features_whisper[1]
android_reading_healthy_features_whisper_df = pd.DataFrame(android_reading_healthy_features_whisper)
android_reading_depressed_features_whisper_df = pd.DataFrame(android_reading_depressed_features_whisper)
android_interview_healthy_features_whisper_df = pd.DataFrame(android_interview_healthy_features_whisper)
android_interview_depressed_features_whisper_df = pd.DataFrame(android_interview_depressed_features_whisper)

android = android_reading_healthy_features_whisper_df.append([ android_reading_depressed_features_whisper_df,android_interview_healthy_features_whisper_df,android_interview_depressed_features_whisper_df])
android.reset_index(inplace = True)
android = android.drop('index', axis=1)
android




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,512,513
0,38_CM27_3_4,0,-0.325004,-0.773003,0.336232,-0.826059,0.200828,1.663229,-0.456727,0.036573,...,-0.180605,-0.337708,-0.404180,-0.125415,0.475406,-0.407302,0.561507,0.133164,-0.709229,0.359743
1,44_CF37_3_5,0,-0.323559,-0.755329,0.209577,-0.853966,0.209243,1.502434,-0.385898,-0.245348,...,-0.106871,-0.339001,-0.415959,-0.284999,0.334237,-0.568364,0.536553,0.201802,-0.622588,0.429866
2,41_CM71_2_8,0,-0.316730,-0.754254,0.186741,-0.993816,0.174278,1.494282,-0.547302,0.042305,...,-0.115851,-0.209866,-0.392283,-0.190826,0.583642,-0.446115,0.402427,0.183371,-0.567524,0.552812
3,10_CF51_2_8,0,-0.251689,-0.507035,0.329449,-0.812264,0.245561,1.537125,-0.494353,-0.107126,...,-0.235331,-0.199536,-0.453336,-0.275341,0.271637,-0.371649,0.289454,0.106774,-0.372619,0.465858
4,38_CM27_3_2,0,-0.385444,-0.721874,0.129210,-1.014975,0.111982,1.407310,-0.593221,-0.001678,...,-0.205863,-0.259184,-0.323361,-0.197869,0.380636,-0.510807,0.488610,0.217727,-0.565809,0.391385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5894,03_PF66_3_44,1,-0.334607,-0.785543,0.057590,-0.919029,0.169060,1.408214,-0.606477,-0.110251,...,-0.089612,-0.291604,-0.416818,-0.154152,0.514873,-0.417331,0.625526,0.244014,-0.647574,0.480804
5895,26_PM54_3_37,1,-0.326652,-0.920170,0.069577,-0.910188,0.164791,1.423738,-0.465127,-0.045132,...,-0.151777,-0.498567,-0.325020,-0.156849,0.648424,-0.258233,0.632452,0.171573,-0.745716,0.365148
5896,54_PF48_3_4,1,-0.412105,-0.870238,0.116980,-0.853752,0.319272,1.507318,-0.529454,-0.161949,...,-0.176099,-0.358167,-0.458347,-0.289491,0.492805,-0.536066,0.574084,0.235399,-0.660443,0.340653
5897,22_PF40_3_18,1,-0.308228,-0.791172,0.221217,-0.936032,0.158140,1.376904,-0.450771,-0.199768,...,-0.116831,-0.425178,-0.404799,-0.238075,0.471417,-0.332177,0.438441,0.139048,-0.671979,0.407255


With Android we have one baseline: train/test

In [None]:
X = android.iloc[:,2:]
Y = android[1]


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.33, random_state=42)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

classifier_accuracies = {}

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Voting": VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ]),
    "Stacking": StackingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ], final_estimator=RandomForestClassifier()),
    "MLP": MLPClassifier(max_iter=1000)
}

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    classifier_accuracies[name] = accuracy_score(y_test, y_pred)
    print(f"{classifier.__class__.__name__} had an accuraccy of {classifier_accuracies[name]}")

print(classifier_accuracies)


RandomForestClassifier had an accuraccy of 0.7452491011813046
DecisionTreeClassifier had an accuraccy of 0.6224961479198767
SVC had an accuraccy of 0.7170005136106831
KNeighborsClassifier had an accuraccy of 0.731381612737545
LogisticRegression had an accuraccy of 0.844889573703133
GaussianNB had an accuraccy of 0.6640986132511556
AdaBoostClassifier had an accuraccy of 0.724191063174114
GradientBoostingClassifier had an accuraccy of 0.7652799178222907
BaggingClassifier had an accuraccy of 0.7015921931176169
ExtraTreesClassifier had an accuraccy of 0.7416538263995891
VotingClassifier had an accuraccy of 0.7632254750898819
StackingClassifier had an accuraccy of 0.8376990241397021
MLPClassifier had an accuraccy of 0.8397534668721109
{'Random Forest': 0.7452491011813046, 'Decision Tree': 0.6224961479198767, 'SVM': 0.7170005136106831, 'K-Nearest Neighbors': 0.731381612737545, 'Logistic Regression': 0.844889573703133, 'Naive Bayes': 0.6640986132511556, 'AdaBoost': 0.724191063174114, 'Gradien

In [None]:
!pip install hyperopt==0.2.5
!pip install hpsklearn
import hpsklearn
dir(hpsklearn)



Android hyperopt

In [None]:
from hyperopt import tpe, hp
from hpsklearn import (
    HyperoptEstimator, any_preprocessing,
    sklearn_DecisionTreeClassifier,
    sklearn_AdaBoostClassifier,
    sklearn_DecisionTreeClassifier,
    sklearn_ExtraTreesClassifier,
    sklearn_GradientBoostingClassifier,
    sklearn_KNeighborsClassifier,
    sklearn_OneVsOneClassifier,
    sklearn_OneVsRestClassifier,
    sklearn_OutputCodeClassifier,
    sklearn_PassiveAggressiveClassifier,
    sklearn_RandomForestClassifier,
    sklearn_SGDClassifier,
    sklearn_BernoulliRBM,
    sklearn_ColumnKMeans,
    sklearn_LinearSVC,
    sklearn_MultinomialNB,
    sklearn_XGBClassifier,
    any_sparse_classifier,
    any_classifier,
    sklearn_SVC
)

classifiers =  hp.choice(name, [
    sklearn_DecisionTreeClassifier(),
    sklearn_KNeighborsClassifier(),
    sklearn_PassiveAggressiveClassifier(),
    sklearn_SGDClassifier(),
    sklearn_XGBClassifier(),
    sklearn_LinearSVC(),
    sklearn_SVC(),
    sklearn_DecisionTreeClassifier(),
    sklearn_KNeighborsClassifier(),
    sklearn_PassiveAggressiveClassifier(),
    sklearn_SGDClassifier(),
    sklearn_XGBClassifier()
])

classifier = HyperoptEstimator(classifier=classifiers,
                              algo=tpe.suggest,
                              max_evals=40,
                              trial_timeout=120)

classifier.fit(X_train.values, y_train.values)
y_pred = classifier.predict(X_test.values)
accuracy = accuracy_score(y_test.values, y_pred)

print(accuracy)

100%|██████████| 1/1 [00:15<00:00, 15.47s/trial, best loss: 0.22756005056890016]
100%|██████████| 2/2 [00:00<00:00,  2.82trial/s, best loss: 0.22756005056890016]
100%|██████████| 3/3 [00:11<00:00, 11.76s/trial, best loss: 0.22756005056890016]
100%|██████████| 4/4 [00:00<00:00,  2.00trial/s, best loss: 0.22756005056890016]
100%|██████████| 5/5 [00:00<00:00,  2.95trial/s, best loss: 0.1479140328697851]
100%|██████████| 6/6 [00:02<00:00,  2.06s/trial, best loss: 0.1479140328697851]
100%|██████████| 7/7 [00:00<00:00,  2.80trial/s, best loss: 0.1479140328697851]
100%|██████████| 8/8 [00:00<00:00,  2.61trial/s, best loss: 0.1479140328697851]
100%|██████████| 9/9 [00:01<00:00,  1.92s/trial, best loss: 0.1479140328697851]
100%|██████████| 10/10 [00:11<00:00, 11.89s/trial, best loss: 0.1479140328697851]
100%|██████████| 11/11 [00:00<00:00,  2.95trial/s, best loss: 0.1479140328697851]
100%|██████████| 12/12 [00:01<00:00,  1.88s/trial, best loss: 0.1479140328697851]
100%|██████████| 13/13 [00:00<

note: preprocessing made result worse

In [None]:
classifier.best_model()

{'learner': SGDClassifier(),
 'preprocs': (StandardScaler(with_std=False),),
 'ex_preprocs': ()}

In [None]:
classifier._best_learner

In [None]:
plt.bar(range(len(classifier._best_learner.feature_importances_)), classifier._best_learner.feature_importances_)
plt.xticks(range(len(X)), X, rotation=90)
plt.show()

Edaic baseline


In [None]:
edaic_train_features_whisper_df = pd.DataFrame(edaic_train_features_whisper)

In [None]:
edaic_test_features_whisper_df = pd.DataFrame(edaic_test_features_whisper)

In [None]:
edaic_validation_whisper_df = pd.DataFrame(edaic_validation_whisper)

With E-daic, we have 2 baselines: train/test and train/val

In [None]:
X_train =edaic_train_features_whisper_df.iloc[:,2:]
Y_train =edaic_train_features_whisper_df[1]

X_test =edaic_test_features_whisper_df.iloc[:,2:]
Y_test =edaic_test_features_whisper_df[1]

X_vali = edaic_validation_whisper_df.iloc[:,2:]
Y_vali = edaic_validation_whisper_df[1]

In [None]:
classifier_accuracies = {}

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Voting": VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ]),
    "Stacking": StackingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ], final_estimator=RandomForestClassifier()),
    "MLP": MLPClassifier(max_iter=1000)
}

for name, classifier in classifiers.items():
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    classifier_accuracies[name] = accuracy_score(Y_test, y_pred)
    print(f"{classifier.__class__.__name__} had an accuraccy of {classifier_accuracies[name]}")

print(classifier_accuracies)

RandomForestClassifier had an accuraccy of 0.7747342600163533
DecisionTreeClassifier had an accuraccy of 0.6618969746524939
SVC had an accuraccy of 0.7755519215044971
KNeighborsClassifier had an accuraccy of 0.7334423548650858
LogisticRegression had an accuraccy of 0.7694194603434178
GaussianNB had an accuraccy of 0.42804578904333607
AdaBoostClassifier had an accuraccy of 0.7641046606704824
GradientBoostingClassifier had an accuraccy of 0.7751430907604252
BaggingClassifier had an accuraccy of 0.7661488143908421
ExtraTreesClassifier had an accuraccy of 0.7759607522485691
VotingClassifier had an accuraccy of 0.7755519215044971
StackingClassifier had an accuraccy of 0.7547015535568274
MLPClassifier had an accuraccy of 0.6888798037612428
{'Random Forest': 0.7747342600163533, 'Decision Tree': 0.6618969746524939, 'SVM': 0.7755519215044971, 'K-Nearest Neighbors': 0.7334423548650858, 'Logistic Regression': 0.7694194603434178, 'Naive Bayes': 0.42804578904333607, 'AdaBoost': 0.7641046606704824, 

E-daic train/val baseline

In [None]:
for name, classifier in classifiers.items():
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_vali)
    classifier_accuracies[name] = accuracy_score(Y_vali, y_pred)
    print(f"{classifier.__class__.__name__} had an accuraccy of {classifier_accuracies[name]}")

print(classifier_accuracies)

RandomForestClassifier had an accuraccy of 0.8101774355540676
DecisionTreeClassifier had an accuraccy of 0.6906595246066287
SVC had an accuraccy of 0.8105122196183462
KNeighborsClassifier had an accuraccy of 0.7760294609976565
LogisticRegression had an accuraccy of 0.8028121861399398
GaussianNB had an accuraccy of 0.5038500167392033
AdaBoostClassifier had an accuraccy of 0.7833947104117844
GradientBoostingClassifier had an accuraccy of 0.8068295949112823
BaggingClassifier had an accuraccy of 0.7937730164044191
ExtraTreesClassifier had an accuraccy of 0.8098426514897891
VotingClassifier had an accuraccy of 0.8101774355540676
StackingClassifier had an accuraccy of 0.7820555741546702
MLPClassifier had an accuraccy of 0.792433880147305
{'Random Forest': 0.8101774355540676, 'Decision Tree': 0.6906595246066287, 'SVM': 0.8105122196183462, 'K-Nearest Neighbors': 0.7760294609976565, 'Logistic Regression': 0.8028121861399398, 'Naive Bayes': 0.5038500167392033, 'AdaBoost': 0.7833947104117844, 'Gr

In [None]:
from hyperopt import tpe, hp
from hpsklearn import (
    HyperoptEstimator, any_preprocessing,
    sklearn_DecisionTreeClassifier,
    sklearn_AdaBoostClassifier,
    sklearn_DecisionTreeClassifier,
    sklearn_ExtraTreesClassifier,
    sklearn_GradientBoostingClassifier,
    sklearn_KNeighborsClassifier,
    sklearn_OneVsOneClassifier,
    sklearn_OneVsRestClassifier,
    sklearn_OutputCodeClassifier,
    sklearn_PassiveAggressiveClassifier,
    sklearn_RandomForestClassifier,
    sklearn_SGDClassifier,
    sklearn_BernoulliRBM,
    sklearn_ColumnKMeans,
    sklearn_LinearSVC,
    sklearn_MultinomialNB,
    sklearn_XGBClassifier,
    any_sparse_classifier,
    any_classifier,
    sklearn_SVC
)

classifiers =  hp.choice(name, [
    sklearn_DecisionTreeClassifier(),
    sklearn_KNeighborsClassifier(),
    sklearn_PassiveAggressiveClassifier(),
    sklearn_SGDClassifier(),
    sklearn_XGBClassifier(),
    sklearn_LinearSVC(),
    sklearn_SVC(),
    sklearn_DecisionTreeClassifier(),
    sklearn_KNeighborsClassifier(),
    sklearn_PassiveAggressiveClassifier(),
    sklearn_SGDClassifier(),
    sklearn_XGBClassifier()
])


classifier = HyperoptEstimator(classifier=classifiers,
                              algo=tpe.suggest,
                              max_evals=40,
                              trial_timeout=120)

classifier.fit(X_train.values, Y_train.values)
y_pred = classifier.predict(X_vali.values)
accuracy = accuracy_score(Y_vali.values, y_pred)

print(accuracy)

100%|██████████| 1/1 [00:02<00:00,  2.47s/trial, best loss: 0.6195219123505976]
100%|██████████| 2/2 [00:01<00:00,  1.16s/trial, best loss: 0.17091633466135459]
100%|██████████| 3/3 [00:01<00:00,  1.19s/trial, best loss: 0.17091633466135459]
100%|██████████| 4/4 [00:12<00:00, 12.14s/trial, best loss: 0.17091633466135459]
100%|██████████| 5/5 [00:18<00:00, 18.22s/trial, best loss: 0.17091633466135459]
100%|██████████| 6/6 [00:23<00:00, 23.82s/trial, best loss: 0.17091633466135459]
100%|██████████| 7/7 [00:01<00:00,  1.15s/trial, best loss: 0.17091633466135459]
100%|██████████| 8/8 [00:01<00:00,  1.15s/trial, best loss: 0.17091633466135459]
100%|██████████| 9/9 [00:00<00:00,  1.22trial/s, best loss: 0.17091633466135459]
100%|██████████| 10/10 [00:11<00:00, 11.52s/trial, best loss: 0.17091633466135459]
100%|██████████| 11/11 [00:02<00:00,  2.22s/trial, best loss: 0.17091633466135459]
100%|██████████| 12/12 [00:18<00:00, 18.06s/trial, best loss: 0.17091633466135459]
100%|██████████| 13/13 

In [None]:
classifier.best_model()

{'learner': SGDClassifier(),
 'preprocs': (MinMaxScaler(feature_range=(0.0, 1.0)),),
 'ex_preprocs': ()}

In [None]:
classifier._best_learner