In [None]:
import os
import numpy as np
import torch
import torchaudio
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
from google.colab import drive

# Mount Google Drive at a fixed, absolute location inside the Colab VM.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Drive directory for colab, will be changed when switched to local.
# Built from the mount point (instead of a path relative to the current
# working directory) so that a `%cd` earlier in the session cannot break it.
data_dir = os.path.join(MOUNT_POINT, 'MyDrive', 'Colab', 'Depression Dataset')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the input processor and the pretrained encoder from the same checkpoint
# so the two always stay in sync.
checkpoint_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(checkpoint_name)
model = Wav2Vec2Model.from_pretrained(checkpoint_name)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_audio(file_path, layer_num):
    """Turn one audio file into a fixed-length feature vector.

    The file is loaded at 16 kHz (the rate Wav2Vec expects), passed through
    the module-level ``processor`` and ``model`` on the module-level
    ``device``, and the hidden state at index ``layer_num`` is averaged over
    the frame axis.

    Args:
        file_path: Path of the audio file to load.
        layer_num: Index into the tuple of hidden states returned by the model.
            NOTE(review): for Hugging Face models index 0 is normally the
            embedding output rather than the first transformer layer --
            confirm that callers pass the index they intend.

    Returns:
        A NumPy array holding the frame-averaged hidden state.
    """
    target_sr = 16000  # Wav2Vec needs 16 kHz input, so resample on load

    waveform, _ = librosa.load(file_path, sr=target_sr)
    encoded = processor(waveform, return_tensors="pt", sampling_rate=target_sr)
    model_input = encoded.input_values.to(device)

    # Inference only: no gradients needed for feature extraction.
    with torch.no_grad():
        layer_outputs = model(model_input, output_hidden_states=True).hidden_states

    frame_embeddings = layer_outputs[layer_num]

    # Mean over the frame axis collapses a variable-length recording into a
    # single vector; squeeze drops the singleton batch axis.
    pooled = frame_embeddings.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

In [None]:
# Prefer the first CUDA GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the encoder's weights onto the selected device.
model.to(device)

# Number of layers in the model's encoder stack.
num_layers = len(model.encoder.layers)

In [None]:
# Classifiers to experiment with
# TODO: (Try with different C and Gamma hyperparameters as well)
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM (linear)", SVC(kernel='linear')),
    ("SVM (rbf)", SVC(kernel='rbf')),
    ("SVM (poly)", SVC(kernel='poly')),
    ("SVM (sigmoid)", SVC(kernel='sigmoid'))
]


def _label_from_filename(file_name):
    """Return 1 for depressed ('_P'), 0 for healthy ('_C'), None if neither marker is present."""
    if '_P' in file_name:  # label for depressed is 1 (they have _P in filename)
        return 1
    if '_C' in file_name:
        return 0
    return None


def _extract_all_layer_features(file_path):
    """Run the model once on a file and mean-pool every hidden state over frames.

    Returns an array of shape (n_hidden_states, hidden_dim). Row 0 is the
    embedding output and row k (k >= 1) is the output of transformer layer k.
    """
    # Wav2Vec needs resampling (16kHz)
    audio, _ = librosa.load(file_path, sr=16000)
    input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values
    input_values = input_values.to(device)

    with torch.no_grad():
        hidden_states = model(input_values, output_hidden_states=True).hidden_states

    # Average over frames -> one fixed-length vector per hidden state.
    return np.stack([h.mean(dim=1).squeeze(0).cpu().numpy() for h in hidden_states])


# Collect the labelled recordings once. Sorted so that the seeded split below
# is reproducible regardless of the order os.listdir happens to return.
audio_paths = []
labels = []
for file_name in sorted(os.listdir(data_dir)):  # for each audio file in dataset
    if not file_name.endswith('.wav'):
        continue
    label = _label_from_filename(file_name)
    if label is None:
        # An unlabelled file would otherwise become a bogus third class (-1).
        print(f"Skipping {file_name}: no '_P' or '_C' marker in filename")
        continue
    audio_paths.append(os.path.join(data_dir, file_name))
    labels.append(label)

if not audio_paths:
    raise ValueError(f"No labelled .wav files found in {data_dir!r}")

y = np.array(labels)  # labels

# One forward pass per file yields every layer's features, instead of
# re-running the model over the whole dataset once per layer.
# Shape: (n_files, n_hidden_states, hidden_dim).
all_features = np.stack([_extract_all_layer_features(path) for path in audio_paths])
assert all_features.shape[1] == num_layers + 1, "expected embedding output + one hidden state per layer"

for layer in range(num_layers):  # for each layer in transformer
    print(f"\nProcessing Layer {layer + 1}")

    # Index 0 is the embedding output, so transformer layer k lives at index k.
    X = all_features[:, layer + 1]  # features

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for clf_name, classifier in classifiers:   # Try each classifier
        print(f"\nClassifier: {clf_name}")

        # fit() retrains from scratch, so reusing the estimator across layers is safe.
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # Pin the label order so target_names always lines up with 0/1.
        report = classification_report(
            y_test, y_pred, labels=[0, 1], target_names=["Healthy", "Depressed"]
        )

        print(f"Accuracy: {accuracy}")
        print("Classification Report:")
        print(report)


Processing Layer 1

Classifier: Logistic Regression
Accuracy: 0.8260869565217391
Classification Report:
              precision    recall  f1-score   support

     Healthy       0.77      0.91      0.83        11
   Depressed       0.90      0.75      0.82        12

    accuracy                           0.83        23
   macro avg       0.83      0.83      0.83        23
weighted avg       0.84      0.83      0.83        23


Classifier: SVM (linear)
Accuracy: 0.8260869565217391
Classification Report:
              precision    recall  f1-score   support

     Healthy       0.77      0.91      0.83        11
   Depressed       0.90      0.75      0.82        12

    accuracy                           0.83        23
   macro avg       0.83      0.83      0.83        23
weighted avg       0.84      0.83      0.83        23


Classifier: SVM (rbf)
Accuracy: 0.782608695652174
Classification Report:
              precision    recall  f1-score   support

     Healthy       0.71      0.91 