# Import necessary modules

In [1]:
!pip install gensim

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Processing
from sklearn.base import clone
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedKFold
from tqdm import tqdm
import random

# gensim
import gensim
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument

# Metrics
from sklearn import utils
from sklearn.metrics import f1_score, recall_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



# Prepare data

In [2]:
df = pd.read_csv("clean_compiled_transcripts.csv", index_col = "Participant_ID")
df.head()

Unnamed: 0_level_0,Transcript,PHQ_Score,PHQ_Binary
Participant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
300,good atlanta georgia um my parents um i love i...,2,0
301,thank you mmm k i good thank you i los angeles...,3,0
302,i fine yourself i los angeles california part ...,4,0
303,okay bout yourself california yeah oh well it ...,0,0
304,i good um los angeles california um cool weath...,6,0


## Split the dataset (skip if you have the numpy data) 

In [3]:
X = df.Transcript
y = df.PHQ_Binary

In [4]:
def train_test(X, y, testfile='test_split_Depression_AVEC2017.csv'):
    test_participants = pd.read_csv(testfile)['participant_ID'].values
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    
    for i in range(y.shape[0]):
        participant_no = y.index[i]
        
        if participant_no in test_participants:
            X_test.append(X[participant_no])
            y_test.append(y[participant_no])
        else:
            X_train.append(X[participant_no])
            y_train.append(y[participant_no])
    
    return np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

X_train, X_test, y_train, y_test = train_test(X, y)

In [5]:
# shuffle the train data in unison because data is in order
# reduces poor performance during k-cross validation when sampling data

RANDOM_STATE = 42

X_train, y_train = shuffle(X_train, y_train, random_state=RANDOM_STATE)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((136,), (45,), (136,), (45,))

## Label sentences (skip if you have the numpy data)

In [6]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [7]:
X_train = label_sentences(X_train, 'train')
X_test = label_sentences(X_test, 'test')
all_data = X_train + X_test

all_data[:10]

[TaggedDocument(words=['sure', 'i', 'good', 'i', 'caught', 'cold', 'morning', 'la', 'born', 'raised', 'born', 'raised', 'it', 'home', 'food', 'people', 'culture', 'traffic', 'uh', 'lot', 'us', 'many', 'us', 'probably', 'oh', 'i', 'past', 'i', 'traveled', 'lately', 'uh', 'destination', 'i', 'enjoy', 'traveling', 'anymore', 'i', 'i', 'young', 'longer', 'it', 'destination', 'point', 'mm', 'okay', 'uh', 'ninety', 'two', 'i', 'went', 'spain', 'my', 'grandma', 'my', 'fathers', 'mother', 'born', 'spain', 'i', 'went', 'back', 'trace', 'her', 'roots', 'i', 'spent', 'week', 'madrid', 'week', 'barcelona', 'week', 'majorca', 'great', 'time', 'i', 'enjoyed', 'it', 'ooh', 'it', 'everyday', 'thing', 'people', 'friendly', 'uh', 'finding', 'things', 'my', 'grandmother', 'her', 'family', 'interesting', 'uh', 'i', 'studied', 'uh', 'poly', 'sci', 'uh', 'sociology', 'uh', 'well', 'time', 'it', 'seemed', 'like', 'easiest', 'thing', 'uh', 'well', 'i', 'my', 'dream', 'job', 'i', 'sports', 'research', 'uh', 'n

# Train doc2vec model (skip if you have the numpy data)
Parameters:
* dm=0: distributed bag of words (DBOW) is used
* vector_size=300: 300 vector dimensional feature vectors
* negative=5: specifies how many "noise words" should be drawn
* min_count=3: ignores all words with total frequency lower than this
* alpha=0.065: the initial learning rate

In [8]:
model_dbow = doc2vec.Doc2Vec(dm=0, vector_size=300, negative=5, min_count=3, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 181/181 [00:00<00:00, 480486.72it/s]


In [9]:
for epoch in range(30):
    model_dbow.train(X_train, total_examples=len(X_train), epochs=1)
    model_dbow.alpha -= 0.002 #set the learning rate
    model_dbow.min_alpha = model_dbow.alpha

In [10]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

X_train = get_vectors(model_dbow, len(X_train), 300, 'train')
X_test = get_vectors(model_dbow, len(X_test), 300, 'test')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((136, 300), (45, 300), (136,), (45,))

# More preprocessing

## Evaluation metrics
Some considerations in evaluation metrics when deciding our model.
1. In our use case, it is more important to have high sensitivity as want to correctly identify as many depression cases out of all actual depression cases for early intervention. Predicting a non-depressed person as depressed is comparatively less severe, meaning we prioritize TPR (sensitivity) over FPR.
2. Hence we will focus on `f1 score` and `recall` for the positive class.

Note:
* Sensitivity = true positive rate = recall = TP / (TP + FN)
* Specificity = true negative rate = TN / (TN + FP)
* Fall out = false positive rate = FP / (FP + TN)
* Miss rate = false negative rate = FN / (FN + TP)

Metrics can be found on this [website](https://scikit-learn.org/stable/modules/model_evaluation.html).

## k cross 

In [11]:
def k_cross(input_model, X=X_train, y=y_train, k=4, n=3, random_state=RANDOM_STATE):
    f1_scores = []
    recall_scores = []
    rkf = RepeatedKFold(n_splits=k, n_repeats=n, random_state=RANDOM_STATE)
        
    for train_index, val_index in rkf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
                
        model = clone(input_model) # prevents incremental fitting
        model.fit(X_train, y_train) 
        
        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        f1_scores.append(f1)
        recall = recall_score(y_val, y_pred)
        recall_scores.append(recall)
        
    return f1_scores, recall_scores

# Model Implementation

## Logistic Regression 

In [12]:
def find_best_logreg_model(power):
    best_f1_model = None
    best_f1 = -1
    best_recall = -1
    
    for i in range(power + 1):
        model = LogisticRegression(n_jobs=3, C=10**i)
        
        f1_scores, recall_scores = k_cross(model)
        f1 = np.mean(f1_scores)
        recall = np.mean(recall_scores)

        if f1 > best_f1:
            best_f1 = f1
            best_recall = recall
            best_f1_model = model
    
    print(f"best scores: f1 = {best_f1}, recall = {best_recall}")
    
    return best_f1_model

find_best_logreg_model(6)

best scores: f1 = 0.4021975725662292, recall = 0.41012159137159143


LogisticRegression(C=100000, n_jobs=3)

## Decision Tree

In [13]:
def find_best_tree_model(upper_depth, upper_leaf):
    best_f1_model = None
    best_f1 = -1
    best_recall = -1
    
    for depth in range(1, upper_depth + 1):
        for leaf in range(1, upper_leaf + 1):
            model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=depth, min_samples_leaf=leaf) 
            
            f1_scores, recall_scores = k_cross(model)
            f1 = np.mean(f1_scores)
            recall = np.mean(recall_scores)
            
            if f1 > best_f1:
                best_f1 = f1
                best_recall = recall
                best_f1_model = model
    
    print(f"best scores: f1 = {best_f1}, recall = {best_recall}")
    
    return best_f1_model

find_best_tree_model(20, 30)

best scores: f1 = 0.28061796563193114, recall = 0.28481634106634107


DecisionTreeClassifier(max_depth=7, min_samples_leaf=4, random_state=42)

## Random Forest

In [14]:
def find_best_forest_model(n_estimators):
    best_f1_model = None
    best_f1 = -1
    best_recall = -1
    
    for estimator in range(1, n_estimators + 1):
        model = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=estimator) 
        
        f1_scores, recall_scores = k_cross(model)
        f1 = np.mean(f1_scores)
        recall = np.mean(recall_scores)

        if f1 > best_f1:
            best_f1 = f1
            best_recall = recall
            best_f1_model = model
    
    print(f"best scores: f1 = {best_f1}, recall = {best_recall}")
    
    return best_f1_model

find_best_forest_model(30)

best scores: f1 = 0.3893653413698171, recall = 0.466211334961335


RandomForestClassifier(n_estimators=1, random_state=42)

## SVM with grid search

In [15]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
                    {'kernel': ['poly'], 'degree': [3, 4, 5], 'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]

svm_model_cv = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring='f1', verbose=1, n_jobs=4)

f1_scores, recall_scores = k_cross(svm_model_cv)

print(f"f1 mean score: {np.mean(f1_scores)}")
print(f"recall mean score: {np.mean(recall_scores)}")

svm_model_cv.fit(X_train, y_train)
print(svm_model_cv.best_params_)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 158 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.7s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.7s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished


Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


f1 mean score: 0.3764527981291677
recall mean score: 0.3898834961334961
Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done 123 tasks      | elapsed:    0.6s


{'C': 1, 'kernel': 'linear'}


[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    0.9s finished
