In [120]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import logging
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
import random
import torch

In [124]:
def set_seed(seed: int):
    """Seed every random number generator this notebook imports.

    Applies the same seed to Python's ``random`` module, NumPy's legacy
    global generator and PyTorch (via ``torch.manual_seed``).

    Args:
        seed: Integer seed passed unchanged to all three libraries.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported by this notebook, so seed it as well; otherwise any
    # torch-based sampling stays non-reproducible.
    torch.manual_seed(seed)


set_seed(1)

In [None]:
# Load the regular Dreaddit train/test splits; the first LR model is trained on these.
train_data = pd.read_csv("dreaddit-train.csv")
test_data = pd.read_csv("dreaddit-test.csv")

In [126]:
# Preview the columns from position 8 onward -- the features the model will use.
test = train_data.iloc[:, 8:]
test

Unnamed: 0,social_karma,syntax_ari,lex_liwc_WC,lex_liwc_Analytic,lex_liwc_Clout,lex_liwc_Authentic,lex_liwc_Tone,lex_liwc_WPS,lex_liwc_Sixltr,lex_liwc_Dic,...,lex_dal_min_activation,lex_dal_min_imagery,lex_dal_avg_activation,lex_dal_avg_imagery,lex_dal_avg_pleasantness,social_upvote_ratio,social_num_comments,syntax_fk_grade,sentiment,roberta_prediction
0,5,1.806818,116,72.64,15.04,89.26,1.00,29.00,12.93,87.07,...,1.1250,1.0,1.77000,1.52211,1.89556,0.86,1,3.253573,-0.002742,1
1,4,9.429737,109,79.08,76.85,56.75,98.18,27.25,21.10,87.16,...,1.0000,1.0,1.69586,1.62045,1.88919,0.65,2,8.828316,0.292857,0
2,2,7.769821,167,33.80,76.38,86.24,25.77,33.40,17.37,91.02,...,1.1429,1.0,1.83088,1.58108,1.85828,0.67,0,7.841667,0.011894,1
3,0,2.667798,273,2.98,15.25,95.42,79.26,54.60,8.06,98.90,...,1.1250,1.0,1.75356,1.52114,1.98848,0.50,5,4.104027,0.141671,1
4,24,7.554238,89,32.22,28.71,84.01,1.00,17.80,31.46,88.76,...,1.1250,1.0,1.77644,1.64872,1.81456,1.00,1,7.910952,-0.204167,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2833,13,-1.369333,89,19.41,99.00,37.57,99.00,17.80,5.62,97.75,...,1.0000,1.0,1.71133,1.45301,2.00304,0.84,16,0.254444,0.552066,0
2834,33,9.425478,135,40.97,4.45,98.01,1.00,27.00,17.78,96.30,...,1.0000,1.0,1.65003,1.56842,1.81527,0.96,6,8.640664,-0.220370,1
2835,2,11.060675,134,79.52,97.34,2.27,80.01,22.33,25.37,84.33,...,1.1250,1.0,1.79768,1.49074,1.92286,1.00,1,9.951524,0.045455,0
2836,4,2.421912,68,29.74,61.58,21.06,25.77,13.60,16.18,92.65,...,1.1429,1.0,1.71642,1.57627,1.89972,0.75,7,4.036765,0.159722,0


In [68]:
def get_examples_features_as_np(df, col_start = 0, col_end = None):
    """Return a positional slice of a DataFrame's columns as a NumPy array.

    Every row is kept; columns from ``col_start`` (inclusive) to ``col_end``
    (exclusive) are selected, so a Roberta prediction column is included
    whenever it lies inside that range.

    Args:
        df: DataFrame to slice.
        col_start: Position of the first column to keep.
        col_end: Position one past the last column to keep. ``None`` keeps
            everything through the last column.

    Returns:
        The result of ``to_numpy()`` on the selected columns.
    """
    if col_end is None:
        # The bound is a column position, so default to the column count
        # (len(df) would be the number of rows).
        col_end = df.shape[1]
    return df.iloc[:, col_start:col_end].to_numpy()


train_data2 = get_examples_features_as_np(train_data, 8)
test_data2 = get_examples_features_as_np(test_data, 8)



In [125]:
# Experiment labels for the all-features runs. The three lists are
# index-aligned: entry k of each list describes the same experiment.
model_names = [
    "LR - all features, no Roberta predictions",
    "LR - all features + base Roberta predictions",
    "LR - all features + pretrained Roberta (our dataset) predictions",
    "LR - all features + pretrained Roberta (mental health) predictions",
]

# Training CSV for each experiment.
model_train_datasets = [
    "dreaddit-train.csv",
    "train_pred_base_roberta.csv",
    "train_pred_roberta_our_scraped_data.csv",
    "train_pred_mental_health.csv",
]

# Test CSV for each experiment.
model_test_datasets = [
    "dreaddit-test.csv",
    "test_pred_base_roberta.csv",
    "test_pred_roberta_our_scraped_data.csv",
    "test_pred_mental_health.csv",
]

def test_lr_roberta_all_features(model_name, dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train_data = pd.read_csv(dataset_path_train)
    test_data = pd.read_csv(dataset_path_test)
    
    train_labels = train_data[["label"]].to_numpy().transpose()[0]
    test_labels = test_data[["label"]].to_numpy().transpose()[0]
    
    # include all features in tranining, including roberta predictions
    train_data_np = get_examples_features_as_np(train_data, 8)
    test_data_np = get_examples_features_as_np(test_data, 8)
    
    model = LogisticRegression(max_iter=1000000, solver="lbfgs")
    model.fit(train_data_np, train_labels)
    
    y_pred = model.predict(test_data_np)
    
    acc = accuracy_score(test_labels, y_pred)
    rec = recall_score(test_labels, y_pred, zero_division=1)
    prec = precision_score(test_labels, y_pred, zero_division=1)
    f1 = f1_score(test_labels, y_pred, zero_division=1)

    print("-----------------------------")
    print(model_name)
    print("Accuracy: ", acc)
    print("Recall: ", rec)
    print("Precision: ", prec)
    print("F1: ", f1)
    print("\n")

# Run the all-features experiment once per (label, train CSV, test CSV) triple.
# Iterating over the lists avoids a hard-coded experiment count.
for name, train_csv, test_csv in zip(model_names, model_train_datasets, model_test_datasets):
    test_lr_roberta_all_features(name, train_csv, test_csv)

-----------------------------
LR - all features, no Roberta predictions
Accuracy:  0.7482517482517482
Recall:  0.7859078590785907
Precision:  0.7416879795396419
F1:  0.763157894736842


-----------------------------
LR - all features + base Roberta predictions
Accuracy:  0.8111888111888111
Recall:  0.8807588075880759
Precision:  0.78125
F1:  0.8280254777070063


-----------------------------
LR - all features + pretrained Roberta (our dataset) predictions
Accuracy:  0.820979020979021
Recall:  0.907859078590786
Precision:  0.7808857808857809
F1:  0.8395989974937343


-----------------------------
LR - all features + pretrained Roberta (mental health) predictions
Accuracy:  0.7440559440559441
Recall:  0.8292682926829268
Precision:  0.7183098591549296
F1:  0.769811320754717




In [90]:
# interesting - model works better when we include all 108 features than
# when we include only the best LIWC features -> comment on this in the paper

In [113]:
# Experiment labels for the best-LIWC-feature runs. The three lists are
# index-aligned: entry k of each list describes the same experiment.
model_names = [
    "LR - best LIWC features, no Roberta predictions",
    "LR - best LIWC features + base Roberta predictions",
    "LR - best LIWC features + pretrained Roberta (our dataset) predictions",
    "LR - best LIWC features + pretrained Roberta (mental health) predictions",
]

# Training CSV for each experiment.
model_train_datasets = [
    "dreaddit-train.csv",
    "train_pred_base_roberta.csv",
    "train_pred_roberta_our_scraped_data.csv",
    "train_pred_mental_health.csv",
]

# Test CSV for each experiment.
model_test_datasets = [
    "dreaddit-test.csv",
    "test_pred_base_roberta.csv",
    "test_pred_roberta_our_scraped_data.csv",
    "test_pred_mental_health.csv",
]

# test the LR on best performing LIWC features and Roberta predictions (if applicable)
# features are: (Work, Adverb, Shehe, Drives, Ipron, Period, Swear) 
def test_lr_roberta_best_features(model_name, dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train_data = pd.read_csv(dataset_path_train)
    test_data = pd.read_csv(dataset_path_test)
    
    train_labels = train_data[["label"]].to_numpy().transpose()[0]
    test_labels = test_data[["label"]].to_numpy().transpose()[0]
    
    if (model_name != "LR - best LIWC features, no Roberta predictions"):
        train_data = train_data[['lex_liwc_work', 'lex_liwc_adverb','lex_liwc_shehe',
                             'lex_liwc_drives', 'lex_liwc_ipron', 'lex_liwc_Period',
                            'lex_liwc_swear', 'roberta_prediction']]
    
        test_data = test_data[['lex_liwc_work', 'lex_liwc_adverb','lex_liwc_shehe',
                             'lex_liwc_drives', 'lex_liwc_ipron', 'lex_liwc_Period',
                            'lex_liwc_swear', 'roberta_prediction']]
    else:
        train_data = train_data[['lex_liwc_work', 'lex_liwc_adverb','lex_liwc_shehe',
                                 'lex_liwc_drives', 'lex_liwc_ipron', 'lex_liwc_Period',
                                'lex_liwc_swear']]

        test_data = test_data[['lex_liwc_work', 'lex_liwc_adverb','lex_liwc_shehe',
                                 'lex_liwc_drives', 'lex_liwc_ipron', 'lex_liwc_Period',
                                'lex_liwc_swear']]

    # include all features in tranining, including roberta predictions
    
    train_data_np = train_data.to_numpy()
    test_data_np = test_data.to_numpy()
    
    model = LogisticRegression(max_iter=1000000, solver="lbfgs")
    model.fit(train_data_np, train_labels)
    
    y_pred = model.predict(test_data_np)
    
    acc = accuracy_score(test_labels, y_pred)
    rec = recall_score(test_labels, y_pred, zero_division=1)
    prec = precision_score(test_labels, y_pred, zero_division=1)
    f1 = f1_score(test_labels, y_pred, zero_division=1)

    print("-----------------------------")
    print(model_name)
    print("Accuracy: ", acc)
    print("Recall: ", rec)
    print("Precision: ", prec)
    print("F1: ", f1)
    print("\n")
    
# Run the best-LIWC-features experiment once per (label, train CSV, test CSV) triple.
# Iterating over the lists avoids a hard-coded experiment count.
for name, train_csv, test_csv in zip(model_names, model_train_datasets, model_test_datasets):
    test_lr_roberta_best_features(name, train_csv, test_csv)

-----------------------------
LR - best LIWC features, no Roberta predictions
Accuracy:  0.5986013986013986
Recall:  0.6476964769647696
Precision:  0.6035353535353535
F1:  0.6248366013071894


-----------------------------
LR - best LIWC features + base Roberta predictions
Accuracy:  0.8111888111888111
Recall:  0.8861788617886179
Precision:  0.7785714285714286
F1:  0.8288973384030418


-----------------------------
LR - best LIWC features + pretrained Roberta (our dataset) predictions
Accuracy:  0.8167832167832167
Recall:  0.9105691056910569
Precision:  0.7741935483870968
F1:  0.8368617683686178


-----------------------------
LR - best LIWC features + pretrained Roberta (mental health) predictions
Accuracy:  0.7034965034965035
Recall:  0.94579945799458
Precision:  0.6451016635859519
F1:  0.767032967032967


