In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from random import seed, randint
from utils import *

In [2]:
### load train data
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
data_dict = {}
# 2020
with open('../Data/dict2020_allposts.pkl', 'rb') as pickle_in:
    loaded_dict2020 = pickle.load(pickle_in)
# 2019
with open('../Data/dict2019_allposts.pkl', 'rb') as pickle_in:
    loaded_dict2019 = pickle.load(pickle_in)
# concat (if a key appears in both years, the 2019 entry overwrites the 2020 one)
data_dict.update(loaded_dict2020)
data_dict.update(loaded_dict2019)
# one row per dict key after the transpose; the two index levels are named below
X_train = pd.DataFrame.from_dict(data_dict).T
X_train.index.names = ['dummy', 'subject']  # set names to indexes
X_train = X_train.groupby('subject').mean() # group by 'subject' (mean)
X_train = X_train.sort_index()              # sort by subject's name


### load gt
# 2020
csv_path20 = '../Data/golden_truth20.csv'
y_2020 = pd.read_csv(csv_path20, index_col=1)
# 2019
csv_path19 = '../Data/golden_truth19.csv'
y_2019 = pd.read_csv(csv_path19, index_col=1)
# concat, drop an unnecessary column and sort by index so that the row order
# follows the same ordering rule as X_train
# NOTE(review): X_train and y_train are each sorted by their own index; nothing
# here verifies that both hold the same subjects in the same order -- confirm.
y_train = (
    pd.concat([y_2020, y_2019])
    .drop(columns='Unnamed: 0')
    .sort_index()
)

In [3]:
# load test data
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# 2021
with open('../Data/dict2021_allposts.pkl', 'rb') as pickle_in:
    loaded_dict2021 = pickle.load(pickle_in)

# one row per dict key after the transpose; the two index levels are named below
X_test = pd.DataFrame.from_dict(loaded_dict2021).T
X_test.index.names = ['dummy', 'subject']  # set names to indexes
X_test = X_test.groupby('subject').mean() # group by 'subject' (mean)
X_test = X_test.sort_index()              # sort by subject's name

# subject ids in the same order as the rows of X_test
subjects2021 = X_test.index.tolist()

In [4]:
# Fit every candidate classifier on the training data, predict the test
# subjects, write one prediction file per classifier and evaluate that file.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

# NOTE(review): none of the estimators below is given a random_state, so the
# randomised ones will presumably produce slightly different scores on every
# run -- consider seeding them if reproducible numbers are required.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=1000),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB()]

# number of answers written per subject
N_ITEMS = 21
# 0-based positions of the answers that are written as lettered codes
LETTERED_ITEMS = (15, 17)
# numeric prediction -> lettered code; any other value is written unchanged
LETTERED_CODES = {1: '1a', 2: '1b', 3: '2a', 4: '2b', 5: '3a', 6: '3b'}


for name, clf in zip(names, classifiers):
    ### training
    multi_target_clf = MultiOutputClassifier(clf, n_jobs=-1)
    y_pred_arr = multi_target_clf.fit(X_train, y_train).predict(X_test)

    # rows of y_pred_arr follow the row order of X_test, i.e. subjects2021
    y_pred = pd.DataFrame(y_pred_arr, index=pd.Index(subjects2021, name='subject'))

    ### export: one line per subject -> "<subject> <answer 0> ... <answer 20> "
    filename = str(name)+'.txt'
    # 'w' truncates a file left over from a previous run; the context manager
    # closes the file even if a write fails
    with open(filename, 'w') as f:
        for sub, row in zip(subjects2021, y_pred_arr):
            fields = [str(sub)]
            for i in range(N_ITEMS):
                ans = row[i]
                if i in LETTERED_ITEMS:
                    ans = LETTERED_CODES.get(ans, ans)
                fields.append(str(ans))
            # every field, including the last one, is followed by a space
            f.write(' '.join(fields) + ' \n')

    ### evaluation
    print(name)
    evaluation(filename)
    print('\n')

Nearest Neighbors
Average Hit Rate: 27.5%
Average Closeness Rate: 62.96%
Average Difference between Overall Depression Levels: 73.95%
Depression Category Hit Rate: 22.5%


Linear SVM
Average Hit Rate: 35.36%
Average Closeness Rate: 67.18%
Average Difference between Overall Depression Levels: 73.97%
Depression Category Hit Rate: 15.0%


RBF SVM
Average Hit Rate: 32.44%
Average Closeness Rate: 63.69%
Average Difference between Overall Depression Levels: 67.3%
Depression Category Hit Rate: 7.5%


Gaussian Process
Average Hit Rate: 31.19%
Average Closeness Rate: 63.51%
Average Difference between Overall Depression Levels: 71.77%
Depression Category Hit Rate: 10.0%


Decision Tree
Average Hit Rate: 31.37%
Average Closeness Rate: 66.23%
Average Difference between Overall Depression Levels: 83.1%
Depression Category Hit Rate: 36.25%


Random Forest
Average Hit Rate: 35.06%
Average Closeness Rate: 67.8%
Average Difference between Overall Depression Levels: 75.81%
Depression Category Hit Rate: 