## Training on mfcc features extracted by librosa 

### Extracting features using librosa

In [2]:
# standard 
import numpy as np
import pandas as pd

# libs for audio
from scipy.io import wavfile as wav
import librosa

# Processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import utils
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, \
        f1_score, precision_score, recall_score


In [3]:
def extract_features_librosa(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40) # 40
    mfccs_processed = np.mean(mfccs.T,axis=0)
     
    return mfccs_processed

In [None]:
# Read Data
df = pd.read_csv("D:/SUTD/ISTD/Term 6/cds/Project/FULL DATASET_edited.csv")
y = df['PHQ_Binary']
x = df['Participant_ID']


features = []

# Audio files with lots of static
prob = [300, 305, 306, 308, 315, 316, 343, 354, 362, 375, 378, 381, 382, 385, 387, 388, 390, 392, 393, 395, 408, 413, 421, 438, 473, 476, 479, 490, 492]
for index, audio_num in enumerate(x):
    if audio_num not in prob:
        try:
            fn = 'D:/SUTD/ISTD/Term 6/cds/Project/Audio/processed/{}_AUDIO_p.wav'.format(audio_num)
            print(fn)
            features.append([audio_num,extract_features_librosa(fn),y[index]])
        except Exception as e:
            print(e)
        """ librosa_audio, librosa_sample_rate = librosa.load(fn)
        scipy_sample_rate, scipy_audio = wav.read(fn) """

featuresdf = pd.DataFrame(features,columns = ['Participant_ID','feature','PHQ8_Binary'])
featuresdf.to_csv('librosa_mfcc_feature.csv')

### Training

#### Read data

In [11]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

data = pd.read_csv("librosa_mfcc_feature_1.csv")
#print(data.head(5))
X_train, X_test, y_train, y_test = train_test_split(np.array(data['feature']), 
                                                    data['PHQ8_Binary'],
                                                    test_size=0.3,
                                                    random_state=RANDOM_STATE)
print(X_train)


-01  6.52974784e-01\n  2.52961349e+00  1.05071634e-01  1.25963783e+00  8.14714730e-01\n  2.47260615e-01  1.05277812e+00  9.96869028e-01  1.35152566e+00\n  1.93907097e-01 -1.01792149e-01  1.33925915e-01  2.35384583e-01\n  3.89108956e-01 -8.48656818e-02  1.56922072e-01  6.57889247e-01]'
 '[-5.6178265e+02  1.2825723e+02 -1.6314333e+01  3.1222893e+01\n -1.1403203e+01  7.3447132e+00 -3.1940978e+00 -1.1936405e+00\n  8.5313978e+00 -2.9090800e+00  6.2141933e+00 -4.8472276e+00\n  7.5399983e-01  9.8049909e-01 -2.6537273e+00 -6.2214500e-01\n -2.1775460e+00 -1.1632501e+00 -2.0155172e+00  1.2791947e+00\n -1.9853412e+00  1.5186614e-02  2.6772969e+00 -1.3134743e+00\n  5.6662363e-01  2.6006474e+00  1.1555395e+00  1.6069045e+00\n  1.8088884e+00  6.9903207e-01  1.7133214e+00  8.6264199e-01\n  1.2269332e+00  1.8012236e+00  2.9132953e+00  5.2156739e+00\n  6.2323027e+00  7.2210612e+00  6.2674260e+00  5.6347022e+00]'
 '[-5.1199222e+02  8.5943642e+01  1.7893383e+01  2.4990393e+01\n  9.0411396e+00  8.0131073e

#### Evaluation

In [6]:
def evaluate_on_training_set(y_test, y_pred):
    print(y_pred)
    print(y_test)
    
    # Calculate AUC
    print("AUC is: ", roc_auc_score(y_test, y_pred))

    # print out recall and precision
    print(classification_report(y_test, y_pred))

    # print out confusion matrix
    print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

    # # calculate points for ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc_score(y_test, y_pred))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')

In [7]:
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluate_on_training_set(y_test, y_pred)
    
    return model

#### K cross

In [8]:
def k_cross(model, X=X_train, y=y_train, k=10, n=1, random_state=RANDOM_STATE):
    f1_scores = []
    recall_scores = []
    rkf = RepeatedKFold(n_splits=k, n_repeats=n, random_state=RANDOM_STATE)
    for train_index, val_index in rkf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        model.fit(X_train, y_train) 
        
        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        f1_scores.append(f1)
        recall = recall_score(y_val, y_pred)
        recall_scores.append(recall)
        
    return f1_scores, recall_scores

### Models implementation

#### Logistic Regression

In [9]:
def find_best_logreg_model(power):
    best_f1_model = None
    best_recall_model = None
    best_f1 = -1
    best_recall = -1
    
    for i in range(power + 1):
        model = LogisticRegression(n_jobs=3, C=10**i)
        
        f1_scores, recall_scores = k_cross(model)
        f1 = np.mean(f1_scores)
        recall = np.mean(recall_scores)

        if f1 > best_f1:
            print(f"power: {i}: f1 = {f1} > best f1 = {best_f1}")
            best_f1 = f1
            best_f1_model = model

        if recall > best_recall:
            print(f"power: {i}: recall = {recall} > best recall = {best_recall}")
            best_recall = recall
            best_recall_model = model
    
    return best_f1_model, best_recall_model

best_logreg_f1_model, best_logreg_recall_model = find_best_logreg_model(6)

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [None]:
# Based on f1
evaluate_model(best_logreg_f1_model)

In [None]:
# Based on recall
evaluate_model(best_logreg_recall_model)

#### Decision Tree

In [None]:
def find_best_tree_model(upper_depth, upper_leaf):
    best_f1_model = None
    best_recall_model = None
    best_f1 = -1
    best_recall = -1
    
    for depth in range(1, upper_depth + 1):
        for leaf in range(1, upper_leaf + 1):
            model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=depth, min_samples_leaf=leaf) 
            
            f1_scores, recall_scores = k_cross(model)
            f1 = np.mean(f1_scores)
            recall = np.mean(recall_scores)
            
            if f1 > best_f1:
                print(f"depth: {depth}, leaf: {leaf}: f1 = {f1} > best f1 = {best_f1}")
                best_f1 = f1
                best_f1_model = model
                
            if recall > best_recall:
                print(f"depth: {depth}, leaf: {leaf}: recall = {recall} > best recall = {best_recall}")
                best_recall = recall
                best_recall_model = model
    
    return best_f1_model, best_recall_model

best_tree_f1_model, best_tree_recall_model = find_best_tree_model(20, 30)

In [None]:
# Based on f1
evaluate_model(best_tree_f1_model)

In [None]:
# Based on recall
evaluate_model(best_tree_recall_model)

#### Random Forest

In [None]:
def find_best_forest_model(n_estimators):
    best_f1_model = None
    best_recall_model = None
    best_f1 = -1
    best_recall = -1
    
    for estimator in range(1, n_estimators + 1):
        model = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=estimator) 
        
        f1_scores, recall_scores = k_cross(model)
        f1 = np.mean(f1_scores)
        recall = np.mean(recall_scores)

        if f1 > best_f1:
            print(f"estimator: {estimator}: f1 = {f1} > best f1 = {best_f1}")
            best_f1 = f1
            best_f1_model = model

        if recall > best_recall:
            print(f"estimator: {estimator}: recall = {recall} > best recall = {best_recall}")
            best_recall = recall
            best_recall_model = model
    
    return best_f1_model, best_recall_model

best_forest_f1_model, best_forest_recall_model = find_best_forest_model(30)

In [None]:
# Based on f1
evaluate_model(best_forest_f1_model)

In [None]:
# Based on recall
evaluate_model(best_forest_recall_model)

#### SVM with grid search

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
                    {'kernel': ['poly'], 'degree': [3, 4, 5], 'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]

svm_model_cv = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring='f1', verbose=1, n_jobs=4)

best_svm_model = evaluate_model(svm_model_cv)
print(f"Best SVM parameters: {best_svm_model.best_params_}")