# Stress Detection Models:
#### RoBERTa Classifier Adapted on Scraped Data with LIWC/Empath 

In [1]:
train_scr = r'domain-adaptation\predictions\train-roberta-scraped.csv'
test_scr = r'domain-adaptation\predictions\test-roberta-scraped.csv'

In [2]:
import pandas as pd
import numpy as np
import random
import phik
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from empath import Empath

In [3]:
# for reproducibility:
seed = 1
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(seed)

## Data preparation

In [4]:
# Prepares training and testing data for classification from given paths
def prepare_data(dataset_path_train = None, dataset_path_test = None, features=None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    X_train = pd.read_csv(dataset_path_train)
    X_test = pd.read_csv(dataset_path_test)
    
    y_train = X_train['label']
    y_test = X_test['label']
        
    X_train = X_train.drop(columns=['label'])
    X_test = X_test.drop(columns=['label'])
    
    
    if features is not None:
        X_train = X_train[features]
        X_test = X_test[features]
    
    return X_train, y_train, X_test, y_test

# Calculates Empath features and adds them to the given dataframe
def empath(X):
    rows = []
    for t in X['text']:
        empath = Empath().analyze(t, normalize=True)
        rows.append(pd.DataFrame([empath]))
    df = pd.concat(rows, ignore_index=True)
    X = pd.concat([X,df], axis=1)
    return X.drop(columns=['text'])

# Returns only lexical features from a given dataframe
def lex(X):
    features = []
    for feat in X.columns.values:
        if 'lex' in feat:
            features.append(feat)
    return features

In [5]:
basic = ['roberta_prediction', 'text'] # for creating Empath features
lexic = lex(pd.read_csv(train_scr))
lexic.extend(['roberta_prediction']) # for getting LIWC features

In [6]:
# Evaluates performance of a model given its predictions on the test set
def evaluate(y_test, y_pred):
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred, zero_division=1))
    print("Recall: ", recall_score(y_test, y_pred, zero_division=1))
    print("F1: ", f1_score(y_test, y_pred, zero_division=1))

# LIWC

In [7]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train=train_scr, \
                                                dataset_path_test=test_scr, \
                                                features=lexic)

## Model training and testing
### Logistic Regression
#### Grid search

In [8]:
warnings.filterwarnings('ignore')
hp = {
        'clf__solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'clf__penalty': ['l2', 'elasticnet'],
        'clf__C': [0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

#### Testing model with best hyperparameters

In [28]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, C=10, penalty='l2', solver='liblinear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8223776223776224
Precision:  0.7880952380952381
Recall:  0.8970189701897019
F1:  0.8390367553865653


In [10]:
cdf = pd.DataFrame(model.get_params()['clf'].coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                          coefficient
roberta_prediction           1.941677
lex_liwc_function            0.364690
lex_liwc_feel                0.340625
lex_liwc_social             -0.294864
lex_liwc_OtherP             -0.267551
lex_liwc_swear               0.241359
lex_liwc_auxverb            -0.233335
lex_liwc_tentat              0.231733
lex_liwc_Dic                 0.226625
lex_liwc_Apostro             0.208648
lex_liwc_ingest             -0.205350
lex_liwc_Tone               -0.204162
lex_liwc_percept            -0.199847
lex_liwc_death               0.180748
lex_liwc_Clout              -0.172249
lex_liwc_AllPunc            -0.164019
lex_liwc_informal           -0.159971
lex_liwc_prep               -0.159533
lex_liwc_certain             0.156124
lex_liwc_ipron              -0.149457
lex_liwc_anx                 0.144994
lex_dal_avg_pleasantness    -0.139682
lex_dal_min_pleasantness    -0.138055
lex_liwc_focuspresent        0.137293
lex_liwc_friend              0.137167
lex_liwc_com

### Support Vector Machine
#### Grid search

In [11]:
hp = {
        'clf__kernel': ['linear'],
        'clf__gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'clf__C': [0.01, 0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.01, 'clf__gamma': 1000, 'clf__kernel': 'linear'}

#### Testing model with best hyperparameters

In [12]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(C=0.01, gamma=1000, kernel='linear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8167832167832167
Precision:  0.7741935483870968
Recall:  0.9105691056910569
F1:  0.8368617683686178


### Random Forest Classifier
#### Grid search

In [13]:
hp = {
        'clf__n_estimators': [50, 200],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_split': [2, 4]
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf',  RandomForestClassifier(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
gs_model.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'clf__criterion': 'gini',
 'clf__max_features': 'auto',
 'clf__min_samples_split': 4,
 'clf__n_estimators': 200}

#### Testing model with best hyperparameters

In [27]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier(criterion='gini', max_features='auto', min_samples_split=4, n_estimators=200, random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8083916083916084
Precision:  0.7697674418604651
Recall:  0.8970189701897019
F1:  0.8285356695869837


# Empath

In [15]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train=train_scr, \
                                                dataset_path_test=test_scr, \
                                                features=basic)
X_train = empath(X_train)
X_test = empath(X_test)

## Features Correlation Analysis and Removal

In [16]:
# Determines which features to drop based on the given threshold from phik correlation matrix
def cutoff_features(corr_matrix, threshold):
    upper_tri = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > float(threshold))]
    return to_drop

In [17]:
corrs = X_train.phik_matrix(interval_cols=X_train.columns)

In [18]:
to_drop = cutoff_features(corrs, 0.85)
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
to_drop

['health',
 'banking',
 'night',
 'college',
 'exasperation',
 'reading',
 'worship',
 'eating',
 'water',
 'legend',
 'neglect',
 'swimming',
 'love',
 'sympathy',
 'vehicle',
 'disgust',
 'sound',
 'sailing',
 'warmth',
 'fun',
 'joy',
 'affection',
 'lust',
 'shame',
 'anger',
 'car',
 'technology',
 'power',
 'white_collar_job',
 'party',
 'cleaning',
 'competing',
 'ocean',
 'contentment',
 'musical']

## Model training and testing

### Logistic Regression

#### Grid search

In [19]:
warnings.filterwarnings('ignore')
hp = {
        'clf__solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'clf__penalty': ['l2', 'elasticnet'],
        'clf__C': [0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

#### Testing model with best hyperparameters

In [20]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, C=0.1, penalty='l2', solver='liblinear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8223776223776224
Precision:  0.7880952380952381
Recall:  0.8970189701897019
F1:  0.8390367553865653


In [21]:
cdf = pd.DataFrame(model.get_params()['clf'].coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                       coefficient
roberta_prediction        2.221256
beauty                   -0.231650
domestic_work             0.229742
childish                 -0.219444
tourism                  -0.209199
order                     0.199517
science                   0.186974
negative_emotion          0.183572
rage                     -0.180759
nervousness               0.159488
heroic                   -0.156076
toy                       0.151255
tool                     -0.147449
prison                   -0.143879
cheerfulness              0.142850
smell                    -0.141665
irritability              0.141119
wealthy                   0.130571
money                    -0.130535
torment                   0.130257
dance                     0.130222
listen                   -0.125616
economics                 0.120398
family                   -0.119987
music                    -0.115232
politeness               -0.108916
restaurant               -0.102343
violence            

### Support Vector Machine

#### Grid search

In [22]:
hp = {
        'clf__kernel': ['linear'],
        'clf__gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'clf__C': [0.01, 0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.01, 'clf__gamma': 1000, 'clf__kernel': 'linear'}

#### Testing model with best hyperparameters

In [23]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(C=0.01, gamma=1000, kernel='linear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8167832167832167
Precision:  0.7741935483870968
Recall:  0.9105691056910569
F1:  0.8368617683686178


### Random Forest Classifier

#### Grid search

In [24]:
hp = {
        'clf__n_estimators': [50, 200],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_split': [2, 4]
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf',  RandomForestClassifier(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
gs_model.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'clf__criterion': 'entropy',
 'clf__max_features': 'auto',
 'clf__min_samples_split': 4,
 'clf__n_estimators': 200}

#### Testing model with best hyperparameters

In [26]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier(criterion='entropy', max_features='auto', min_samples_split=4, n_estimators=200, random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.813986013986014
Precision:  0.7757009345794392
Recall:  0.8997289972899729
F1:  0.8331242158092848
