## IMPORTS

In [None]:
# downloading data
from google.colab import drive
import zipfile

# saving data
import csv

# some basics
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import os

# for sound processing
!pip install librosa
import librosa
import librosa.display
!pip install soundfile
import soundfile as sf
from IPython.display import Audio

# whisper
!pip install git+https://github.com/openai/whisper.git
import whisper

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-q394h_8l
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-q394h_8l
  Resolved https://github.com/openai/whisper.git to commit 8bc8860694949db53c42ba47ddc23786c2e02a8b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## DOWNLOADING DATA

In [None]:
# Mount Google Drive into the Colab filesystem (asks for authorization).
# NOTE(review): the archives used below are fetched with gdown and opened from
# /content, so nothing visible in this notebook reads from the mount — confirm
# it is still needed.
drive.mount('/content/drive')

KeyboardInterrupt: ignored

### ANDROID

ANDROID DATASET (preprocessed, from google drive):

reading_hc + interview_hc -> healthy controls, non-depressed

reading_pt + interview_pt -> patients, depressed

all files have the same length

In [None]:
# Download the pre-segmented Android archive from Google Drive; the next cell
# opens it as /content/android_segmented_5s.zip.
!gdown 'https://drive.google.com/uc?id=1OQ87c6vEKkTuLu2Z3jYz0P6-pvCytojz'

In [None]:
# Unpack the downloaded Android archive into its own dataset folder.
android_zip_path = '/content/android_segmented_5s.zip'
android_extract_dir = '/content/android_dataset'
with zipfile.ZipFile(android_zip_path, mode='r') as android_archive:
    android_archive.extractall(path=android_extract_dir)

In [None]:
# Paths to the four extracted Android folders (each ends with a trailing slash).
# Folder naming: *_pt = patients (depressed), *_hc = healthy controls.
# The folder structure is not uniform: the interview_pt segments sit in a nested
# kaggle/working/segmented_files/ subfolder, the other three directly under
# android_dataset/.
android_interview_depressed_path = '/content/android_dataset/interview_pt/kaggle/working/segmented_files/interview_pt/'
android_reading_depressed_path = '/content/android_dataset/reading_pt/'
android_interview_healthy_path = '/content/android_dataset/interview_hc/'
android_reading_healthy_path = '/content/android_dataset/reading_hc/'

In [None]:
def count_files(folder):
    """Return the number of regular files directly inside `folder`.

    Sub-directories are not counted and are not descended into.
    Defined in this cell so the cell runs without relying on any other cell.
    """
    return sum(
        1
        for item in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, item))
    )


# One count per Android subset (the same expression was previously pasted four times).
android_interview_depressed_count = count_files(android_interview_depressed_path)
android_reading_depressed_count = count_files(android_reading_depressed_path)
android_interview_healthy_count = count_files(android_interview_healthy_path)
android_reading_healthy_count = count_files(android_reading_healthy_path)

print('android_interview_depressed_count: ', android_interview_depressed_count)
print('android_reading_depressed_count: ', android_reading_depressed_count)
print('android_interview_healthy_count: ', android_interview_healthy_count)
print('android_reading_healthy_count: ', android_reading_healthy_count)

# Total number of segments across the four subsets.
android_count = (
    android_interview_depressed_count
    + android_reading_depressed_count
    + android_interview_healthy_count
    + android_reading_healthy_count
)

print('all samples: ', android_count)

In [None]:
# Load one arbitrary segment, just to check what the data looks like.
# The file is read as headerless audio, so its layout is passed explicitly.
# NOTE(review): 2 channels / 44.1 kHz / 16-bit PCM is taken from this call only —
# confirm it matches how the segments were written.
raw_pcm_layout = {'channels': 2, 'samplerate': 44100, 'format': 'RAW', 'subtype': 'PCM_16'}
audio_path = '/content/android_dataset/interview_pt/kaggle/working/segmented_files/interview_pt/01_PM58_2_0'
data, sr = sf.read(audio_path, **raw_pcm_layout)

In [None]:
# If the array still holds two channels on its second axis, downmix to mono.
# The ndim guard keeps this cell re-runnable: after the first run `data` is
# 1-D, and reading data.shape[1] would raise IndexError.
if data.ndim == 2 and data.shape[1] == 2:
    data = librosa.to_mono(data.T)

duration = librosa.get_duration(y=data, sr=sr)
print("duration of file: ", duration)

# I've tested the duration of a few random files, and it is always the same

In [None]:
# Check that the recording is not pure silence (also done on a few random samples).
# np.any reduces the whole array in one vectorised call and accepts any array
# shape; the builtin any() iterates element by element in Python and raises on
# 2-D input.
frame_gains = np.abs(librosa.effects.preemphasis(data))
# True when at least one pre-emphasised sample has non-zero magnitude.
silence_removed = bool(np.any(frame_gains > 0))

print(silence_removed)

## E_DAIC

E_DAIC DATASET (preprocessed, from drive)

already split into train, test, and validation sets

labels in csv (https://drive.google.com/drive/folders/17jjD-cIZXS5EnqpvUNdosh6LCwDPYstX)

labels meaning:  0 is non-depressed, 1 is depressed

all files have the same length (also the same as the Android ones)

In [None]:
# Download the pre-segmented E-DAIC archive from Google Drive; the next cell
# opens it as /content/edaic_segmented_5second.zip.
!gdown 'https://drive.google.com/uc?id=1PT9Iij7DJOB1s4i0T4gT3jZxpoZqSpzU'

In [None]:
# Unpack the downloaded E-DAIC archive into its own dataset folder.
edaic_zip_path = '/content/edaic_segmented_5second.zip'
edaic_extract_dir = '/content/edaic_dataset'
with zipfile.ZipFile(edaic_zip_path, mode='r') as edaic_archive:
    edaic_archive.extractall(path=edaic_extract_dir)

In [None]:
# Paths to the extracted E-DAIC split folders (no trailing slash).
# Note that the validation folder on disk is named 'val'.
edaic_train_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/train'
edaic_test_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/test'
edaic_validation_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/val'

In [None]:
def count_files(folder):
    """Return the number of regular files directly inside `folder`.

    Sub-directories are not counted and are not descended into.
    Defined in this cell so the cell runs without relying on any other cell.
    """
    return sum(
        1
        for item in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, item))
    )


# One count per E-DAIC split (the same expression was previously pasted three times).
edaic_train_count = count_files(edaic_train_path)
edaic_test_count = count_files(edaic_test_path)
edaic_validation_count = count_files(edaic_validation_path)

print('edaic_train_count: ', edaic_train_count)
print('edaic_test_count: ', edaic_test_count)
print('edaic_validation_count: ', edaic_validation_count)

# Total number of segments across the three splits.
edaic_count = edaic_train_count + edaic_test_count + edaic_validation_count

print('all samples: ', edaic_count)

In [None]:
# Downloading the label CSVs, one gdown call per split.
# The next cell reads them as /content/edaic_{training,testing,validation}_labels.csv.

# validation labels
!gdown 'https://drive.google.com/uc?id=13PDjse2cjgT9Ns4s7v21DRbHB4nA2EtC'

# training labels
!gdown 'https://drive.google.com/uc?id=1LAEwPM3XPcDV3dh2XaxJKyKNdilBGVNK'

# test labels
!gdown 'https://drive.google.com/uc?id=1UQywlWldvqriiYDvNcj2iSq-fwvpIjA6'

In [None]:
# Load the three label tables (train, test, validation) in one pass.
edaic_label_csv_paths = [
    '/content/edaic_training_labels.csv',
    '/content/edaic_testing_labels.csv',
    '/content/edaic_validation_labels.csv',
]
edaic_train_labels, edaic_test_labels, edaic_validation_labels = map(
    pd.read_csv, edaic_label_csv_paths
)

In [None]:
# Preview the first 25 rows of the training label table
# (a later cell uses its 'ID' and 'Value' columns).
edaic_train_labels.head(25)

In [None]:
# Number of label rows in each split, followed by their total.
n_train_labels = len(edaic_train_labels)
n_test_labels = len(edaic_test_labels)
n_validation_labels = len(edaic_validation_labels)

print("train: ", n_train_labels, " test: ", n_test_labels, " validation: ", n_validation_labels)
print("all labels: ", n_train_labels + n_test_labels + n_validation_labels)

The total number of labels differs from the number of samples because each label row describes one individual, while each individual's recording is divided into many segments. Every segment is matched to its label through the individual's ID, which is the first part of the segment's file name.

In [None]:
# labeling each recording

def label_recordings(file_names, labels_by_id):
    """Pair every recording file name with the label of its participant.

    The participant ID is the part of the file name before the first '_'
    (e.g. '600_AUDIO_0' -> 600).

    Parameters
    ----------
    file_names : iterable of str
        Segment file names (not full paths).
    labels_by_id : dict
        Maps an integer participant ID to its label value.

    Returns
    -------
    list of (str, label or None)
        One tuple per file, in input order. The label is None when the ID has
        no entry in `labels_by_id`, or when the file-name prefix is not an
        integer (previously such a stray file raised ValueError).
    """
    labelled = []
    for name in file_names:
        prefix = name.split('_')[0]
        try:
            label = labels_by_id.get(int(prefix))
        except ValueError:
            label = None
        labelled.append((name, label))
    return labelled


# File names are sorted because os.listdir returns them in arbitrary order;
# sorting makes the row order reproducible between runs.

# train set
edaic_train_labels_dict = edaic_train_labels.set_index('ID')['Value'].to_dict()
edaic_train_recordings_names = sorted(os.listdir(edaic_train_path))
edaic_train_recordings_labels = label_recordings(edaic_train_recordings_names, edaic_train_labels_dict)

# test set
edaic_test_labels_dict = edaic_test_labels.set_index('ID')['Value'].to_dict()
edaic_test_recordings_names = sorted(os.listdir(edaic_test_path))
edaic_test_recordings_labels = label_recordings(edaic_test_recordings_names, edaic_test_labels_dict)

# validation set
edaic_validation_labels_dict = edaic_validation_labels.set_index('ID')['Value'].to_dict()
edaic_validation_recordings_names = sorted(os.listdir(edaic_validation_path))
edaic_validation_recordings_labels = label_recordings(edaic_validation_recordings_names, edaic_validation_labels_dict)

In [None]:
# Load one E-DAIC test segment as a sanity check.
# The file is read as headerless audio, so its layout is passed explicitly.
# NOTE(review): 2 channels / 44.1 kHz / 16-bit PCM is taken from this call only —
# confirm it matches how the segments were written.
raw_pcm_layout = {'channels': 2, 'samplerate': 44100, 'format': 'RAW', 'subtype': 'PCM_16'}
audio_path = '/content/edaic_dataset/edaic_segmented_5second/segmented_files/test/600_AUDIO_0'
data, sr = sf.read(audio_path, **raw_pcm_layout)

In [None]:
# If the array still holds two channels on its second axis, downmix to mono.
# The ndim guard keeps this cell re-runnable: after the first run `data` is
# 1-D, and reading data.shape[1] would raise IndexError.
if data.ndim == 2 and data.shape[1] == 2:
    data = librosa.to_mono(data.T)

duration = librosa.get_duration(y=data, sr=sr)
print("duration of file: ", duration)

# I've tested the duration of a few random files, and it is always the same (also the same as for the android dataset)

In [None]:
# Check that the recording is not pure silence.
# np.any reduces the whole array in one vectorised call and accepts any array
# shape; the builtin any() iterates element by element in Python and raises on
# 2-D input.
frame_gains = np.abs(librosa.effects.preemphasis(data))
# True when at least one pre-emphasised sample has non-zero magnitude.
silence_removed = bool(np.any(frame_gains > 0))

print(silence_removed)

In [None]:
import librosa
import torch
import librosa.display
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from every library — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# to play the audio files
from IPython.display import Audio
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old 'seaborn-white' alias was later removed, so pick whichever name this
# installation provides instead of failing on newer versions.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

In [None]:
# Run on the first GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Whisper- Base
from transformers import AutoFeatureExtractor, WhisperModel
# from datasets import load_dataset

# Load the pretrained Whisper-base checkpoint together with its matching
# feature extractor, and place the model on the selected device.
whisper_checkpoint = "openai/whisper-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_checkpoint)
model = WhisperModel.from_pretrained(whisper_checkpoint).to(device)
import torchaudio
def extract_features(path):
    """Return a mean-pooled Whisper encoder embedding for one audio file.

    The file is loaded with torchaudio, downmixed to mono, resampled to 16 kHz
    when its native rate differs, passed through the Whisper feature extractor
    and encoder, and the encoder's last hidden state is averaged over its
    first (frame) axis.

    Parameters
    ----------
    path : str
        Path to an audio file that `torchaudio.load` can read.

    Returns
    -------
    numpy.ndarray
        1-D array holding one value per encoder hidden dimension.
    """
    sample_rate = 16000  # rate declared to the Whisper feature extractor
    waveform, native_rate = torchaudio.load(path)  # (channels, frames) by default

    # Average the channels into one waveform. With a multi-channel file,
    # squeeze() left a 2-D tensor, so the extractor did not receive a single
    # mono signal and the final mean(axis=0) no longer pooled over frames.
    mono = waveform.mean(dim=0)

    # The native rate used to be ignored: audio recorded at another rate was
    # fed to the extractor while being declared as 16 kHz.
    if native_rate != sample_rate:
        mono = torchaudio.functional.resample(mono, native_rate, sample_rate)

    inputs = feature_extractor(mono.numpy(), sampling_rate=sample_rate, return_tensors='pt')
    input_features = inputs.input_features.to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model.encoder(input_features)

    # [0] drops the batch axis (one clip per call); the mean pools over frames.
    return outputs.last_hidden_state[0].mean(dim=0).to("cpu").numpy()

## Features extraction - WHISPER

Features are extracted from every file and stored as rows: the first column is the file name, the second is the target value (0 - healthy, 1 - depressed), and the remaining columns are the extracted features. The datasets are saved as CSV files:

- android_reading_healthy_whisper.csv
- android_reading_depressed_whisper.csv
- android_interview_healthy_whisper.csv
- android_interview_depressed_whisper.csv

## ANDROID

In [None]:
# Collect the healthy-control reading segments in a stable (sorted) order and
# keep regular files only: os.listdir also returns sub-directories, which
# cannot be loaded as audio, and its ordering is arbitrary.
android_reading_healthy_files = sorted(
    os.path.join(android_reading_healthy_path, file)
    for file in os.listdir(android_reading_healthy_path)
    if os.path.isfile(os.path.join(android_reading_healthy_path, file))
)

# One row per file: [file name, target (0 = healthy), *features]
android_reading_healthy_features_whisper = []

for file in android_reading_healthy_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_reading_healthy_features_whisper.append([file_name, 0] + list(features))



In [None]:
# Persist the healthy-reading feature rows: one CSV line per file, no header row.
output_csv_path = "android_reading_healthy_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(android_reading_healthy_features_whisper)


In [None]:
# Collect the depressed-patient reading segments in a stable (sorted) order and
# keep regular files only: os.listdir also returns sub-directories, which
# cannot be loaded as audio, and its ordering is arbitrary.
android_reading_depressed_files = sorted(
    os.path.join(android_reading_depressed_path, file)
    for file in os.listdir(android_reading_depressed_path)
    if os.path.isfile(os.path.join(android_reading_depressed_path, file))
)

# One row per file: [file name, target (1 = depressed), *features]
android_reading_depressed_features_whisper = []

for file in android_reading_depressed_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_reading_depressed_features_whisper.append([file_name, 1] + list(features))


In [None]:
# Persist the depressed-reading feature rows: one CSV line per file, no header row.
output_csv_path = "android_reading_depressed_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(android_reading_depressed_features_whisper)

In [None]:
# Collect the healthy-control interview segments in a stable (sorted) order and
# keep regular files only: os.listdir also returns sub-directories, which
# cannot be loaded as audio, and its ordering is arbitrary.
android_interview_healthy_files = sorted(
    os.path.join(android_interview_healthy_path, file)
    for file in os.listdir(android_interview_healthy_path)
    if os.path.isfile(os.path.join(android_interview_healthy_path, file))
)

# One row per file: [file name, target (0 = healthy), *features]
android_interview_healthy_features_whisper = []

for file in android_interview_healthy_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_interview_healthy_features_whisper.append([file_name, 0] + list(features))

In [None]:
# Persist the healthy-interview feature rows: one CSV line per file, no header row.
output_csv_path = "android_interview_healthy_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(android_interview_healthy_features_whisper)

In [None]:
# Collect the depressed-patient interview segments in a stable (sorted) order
# and keep regular files only: os.listdir also returns sub-directories, which
# cannot be loaded as audio, and its ordering is arbitrary.
android_interview_depressed_files = sorted(
    os.path.join(android_interview_depressed_path, file)
    for file in os.listdir(android_interview_depressed_path)
    if os.path.isfile(os.path.join(android_interview_depressed_path, file))
)

# One row per file: [file name, target (1 = depressed), *features]
android_interview_depressed_features_whisper = []

for file in android_interview_depressed_files:
    features = extract_features(file)
    file_name = os.path.basename(file)
    android_interview_depressed_features_whisper.append([file_name, 1] + list(features))


In [None]:
# Persist the depressed-interview feature rows: one CSV line per file, no header row.
output_csv_path = "android_interview_depressed_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(android_interview_depressed_features_whisper)

## EDAIC

In [None]:
def extract_features_with_labels(recordings_labels, base_path):
    """Build one feature row per labelled recording.

    Parameters
    ----------
    recordings_labels : iterable of (str, label)
        Pairs of file name and label.
    base_path : str
        Folder that contains the named files.

    Returns
    -------
    list of list
        Rows of the form [file name, label, *features], in input order, where
        the features come from `extract_features` applied to the joined path.
    """
    return [
        [file_name, label, *extract_features(os.path.join(base_path, file_name))]
        for file_name, label in recordings_labels
    ]


# Extract Whisper features for every labelled E-DAIC recording, one split at a
# time. Each call runs extract_features once per file in that split.
edaic_train_features_whisper = extract_features_with_labels(edaic_train_recordings_labels, edaic_train_path)

edaic_test_features_whisper = extract_features_with_labels(edaic_test_recordings_labels, edaic_test_path)

# NOTE(review): unlike the two names above, this one has no "_features" part;
# the cell that writes the CSV files refers to it by this exact name.
edaic_validation_whisper = extract_features_with_labels(edaic_validation_recordings_labels, edaic_validation_path)


KeyboardInterrupt: ignored

In [None]:
# Persist the E-DAIC feature rows, one CSV file per split
# (one line per recording, no header row).

# train split
output_csv_path = "edaic_train_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(edaic_train_features_whisper)


# test split
output_csv_path = "edaic_test_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(edaic_test_features_whisper)


# validation split
output_csv_path = "edaic_validation_features_whisper.csv"

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(edaic_validation_whisper)