# Stress Detection Models:
#### Base RoBERTa Classifier with LIWC/Empath 

In [1]:
train_base = r'domain-adaptation\predictions\train-roberta-base.csv'
test_base = r'domain-adaptation\predictions\test-roberta-base.csv'

In [2]:
import pandas as pd
import numpy as np
import random
import phik
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from empath import Empath

In [3]:
# for reproducibility:
seed = 1
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(seed)

## Data preparation

In [4]:
# Prepares training and testing data for classification from given paths
def prepare_data(dataset_path_train = None, dataset_path_test = None, features=None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    X_train = pd.read_csv(dataset_path_train)
    X_test = pd.read_csv(dataset_path_test)
    
    y_train = X_train['label']
    y_test = X_test['label']
        
    X_train = X_train.drop(columns=['label'])
    X_test = X_test.drop(columns=['label'])
    
    
    if features is not None:
        X_train = X_train[features]
        X_test = X_test[features]
    
    return X_train, y_train, X_test, y_test

# Calculates Empath features and adds them to the given dataframe
def empath(X):
    rows = []
    for t in X['text']:
        empath = Empath().analyze(t, normalize=True)
        rows.append(pd.DataFrame([empath]))
    df = pd.concat(rows, ignore_index=True)
    X = pd.concat([X,df], axis=1)
    return X.drop(columns=['text'])

# Returns only lexical features from a given dataframe
def lex(X):
    features = []
    for feat in X.columns.values:
        if 'lex' in feat:
            features.append(feat)
    return features

In [5]:
basic = ['roberta_prediction', 'text'] # for creating Empath features
lexic = lex(pd.read_csv(train_base))
lexic.extend(['roberta_prediction']) # for getting LIWC features

In [6]:
# Evaluates performance of a model given its predictions on the test set
def evaluate(y_test, y_pred):
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred, zero_division=1))
    print("Recall: ", recall_score(y_test, y_pred, zero_division=1))
    print("F1: ", f1_score(y_test, y_pred, zero_division=1))

# LIWC

In [7]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train=train_base, \
                                                dataset_path_test=test_base, \
                                                features=lexic)

## Model training and testing
### Logistic Regression
#### Grid search

In [8]:
warnings.filterwarnings('ignore')
hp = {
        'clf__solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'clf__penalty': ['l2', 'elasticnet'],
        'clf__C': [0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

#### Testing model with best hyperparameters

In [9]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, C=0.1, penalty='l2', solver='liblinear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8125874125874126
Precision:  0.7804295942720764
Recall:  0.8861788617886179
F1:  0.8299492385786803


In [10]:
cdf = pd.DataFrame(model.get_params()['clf'].coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                          coefficient
roberta_prediction           2.188519
lex_liwc_function            0.370191
lex_liwc_Dic                 0.319057
lex_liwc_feel                0.294599
lex_liwc_percept            -0.267339
lex_liwc_OtherP             -0.263127
lex_liwc_auxverb            -0.260417
lex_dal_avg_imagery          0.257474
lex_liwc_tentat              0.252739
lex_liwc_focuspresent        0.228942
lex_liwc_cogproc            -0.216422
lex_liwc_certain             0.215301
lex_liwc_Clout              -0.208607
lex_liwc_death               0.206854
lex_liwc_ipron              -0.181219
lex_liwc_differ             -0.180870
lex_dal_min_activation      -0.178761
lex_liwc_AllPunc            -0.168612
lex_liwc_Apostro             0.168567
lex_liwc_swear               0.164655
lex_liwc_ingest             -0.164429
lex_liwc_sexual              0.159023
lex_liwc_Authentic          -0.148617
lex_liwc_prep               -0.146460
lex_liwc_compare            -0.144820
lex_liwc_soc

### Support Vector Machine
#### Grid search

In [11]:
hp = {
        'clf__kernel': ['linear'],
        'clf__gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'clf__C': [0.01, 0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.01, 'clf__gamma': 1000, 'clf__kernel': 'linear'}

#### Testing model with best hyperparameters

In [12]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(C=0.01, gamma=1000, kernel='linear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8111888111888111
Precision:  0.7785714285714286
Recall:  0.8861788617886179
F1:  0.8288973384030418


### Random Forest Classifier
#### Grid search

In [13]:
hp = {
        'clf__n_estimators': [50, 200],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_split': [2, 4]
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf',  RandomForestClassifier(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
gs_model.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'clf__criterion': 'entropy',
 'clf__max_features': 'auto',
 'clf__min_samples_split': 4,
 'clf__n_estimators': 50}

#### Testing model with best hyperparameters

In [14]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier(criterion='entropy', max_features='auto', min_samples_split=4, n_estimators=50, random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8083916083916084
Precision:  0.777511961722488
Recall:  0.8807588075880759
F1:  0.8259212198221093


# Empath

In [15]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train=train_base, \
                                                dataset_path_test=test_base, \
                                                features=basic)
X_train = empath(X_train)
X_test = empath(X_test)

## Features Correlation Analysis and Removal

In [16]:
# Determines which features to drop based on the given threshold from phik correlation matrix
def cutoff_features(corr_matrix, threshold):
    upper_tri = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > float(threshold))]
    return to_drop

In [17]:
corrs = X_train.phik_matrix(interval_cols=X_train.columns)

In [18]:
to_drop = cutoff_features(corrs, 0.85)
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
to_drop

['health',
 'banking',
 'night',
 'college',
 'exasperation',
 'reading',
 'worship',
 'eating',
 'water',
 'legend',
 'neglect',
 'swimming',
 'love',
 'sympathy',
 'vehicle',
 'disgust',
 'sound',
 'sailing',
 'warmth',
 'fun',
 'joy',
 'affection',
 'lust',
 'shame',
 'anger',
 'car',
 'technology',
 'power',
 'white_collar_job',
 'party',
 'cleaning',
 'competing',
 'ocean',
 'contentment',
 'musical']

## Model training and testing

### Logistic Regression

#### Grid search

In [19]:
warnings.filterwarnings('ignore')
hp = {
        'clf__solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'clf__penalty': ['l2', 'elasticnet'],
        'clf__C': [0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

#### Testing model with best hyperparameters

In [20]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000000, C=0.1, penalty='l2', solver='liblinear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8111888111888111
Precision:  0.782608695652174
Recall:  0.8780487804878049
F1:  0.8275862068965518


In [21]:
cdf = pd.DataFrame(model.get_params()['clf'].coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                       coefficient
roberta_prediction        2.436719
tourism                  -0.245269
heroic                   -0.214066
irritability              0.205909
order                     0.179457
domestic_work             0.178059
negative_emotion          0.177279
rage                     -0.176513
smell                    -0.175169
toy                       0.163679
journalism               -0.158093
cheerfulness              0.157703
childish                 -0.155344
business                  0.137923
politeness               -0.133528
messaging                -0.133029
prison                   -0.132982
beauty                   -0.130893
internet                 -0.128963
real_estate              -0.124794
writing                   0.123634
hearing                   0.122055
kill                      0.121010
divine                    0.120069
positive_emotion         -0.119736
science                   0.114320
ugliness                  0.113554
urban               

### Support Vector Machine

#### Grid search

In [22]:
hp = {
        'clf__kernel': ['linear'],
        'clf__gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'clf__C': [0.01, 0.1, 1, 10],
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1')
gs_model.fit(X_train, y_train)
gs_model.best_params_

{'clf__C': 0.01, 'clf__gamma': 1000, 'clf__kernel': 'linear'}

#### Testing model with best hyperparameters

In [23]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC(C=0.01, gamma=1000, kernel='linear', random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8111888111888111
Precision:  0.7785714285714286
Recall:  0.8861788617886179
F1:  0.8288973384030418


### Random Forest Classifier

#### Grid search

In [24]:
hp = {
        'clf__n_estimators': [50, 200],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_split': [2, 4]
}

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf',  RandomForestClassifier(random_state=seed))])

gs_model = GridSearchCV(pipe, hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
gs_model.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'clf__criterion': 'gini',
 'clf__max_features': 'auto',
 'clf__min_samples_split': 4,
 'clf__n_estimators': 200}

#### Testing model with best hyperparameters

In [26]:
model = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier(criterion='gini', max_features='auto', min_samples_split=4, n_estimators=200, random_state=seed))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Accuracy:  0.8083916083916084
Precision:  0.777511961722488
Recall:  0.8807588075880759
F1:  0.8259212198221093
