In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
import random
import matplotlib.pyplot as plt

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(1)

In [2]:
def prepare_data(dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train = pd.read_csv(dataset_path_train)
    test = pd.read_csv(dataset_path_test)
    
    y_train = train['label']
    y_test = test['label']
    
    X_train = train[['roberta_prediction', 'text']]
    X_test = test[['roberta_prediction', 'text']]
    
    return X_train, y_train, X_test, y_test

def evaluate(test_labels, y_pred):
    acc = accuracy_score(test_labels, y_pred)
    rec = recall_score(test_labels, y_pred, zero_division=1)
    prec = precision_score(test_labels, y_pred, zero_division=1)
    f1 = f1_score(test_labels, y_pred, zero_division=1)
    print("Accuracy: ", acc)
    print("Recall: ", rec)
    print("Precision: ", prec)
    print("F1: ", f1)

In [3]:
from empath import Empath
lexicon = Empath()

def empath(X):
    rows = []
    for t in X['text']:
        empath = lexicon.analyze(t, normalize=True)
        rows.append(pd.DataFrame([empath]))
    df = pd.concat(rows, ignore_index=True)
    X = pd.concat([X,df], axis=1)
    return X.drop(columns=['text'])

In [4]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train = 'train_pred_roberta_our_scraped_data.csv', dataset_path_test = 'test_pred_roberta_our_scraped_data.csv')
X_train = empath(X_train)
X_test = empath(X_test)

In [5]:
import phik
def pairs(X, threshold):
    corr =  X.phik_matrix(interval_cols=X.columns)
    columns = corr.columns
    high_corr = dict()
    feature_corr_n = dict.fromkeys(X.columns, 0)
    for column1 in columns:
        for column2 in columns:
            value = corr[column1][column2]
            if value > threshold and column1 != column2 and (column2, column1) not in high_corr:
                high_corr[(column1,column2)] = value
    return high_corr

In [6]:
def cutoff_features(threshold, X):
    corr_matrix =  X.phik_matrix(interval_cols=X.columns)
    upper_tri = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > float(threshold))]
    return to_drop
to_drop = cutoff_features(0.85, X_train)

In [7]:
to_drop

['health',
 'banking',
 'night',
 'college',
 'exasperation',
 'reading',
 'worship',
 'eating',
 'water',
 'legend',
 'neglect',
 'swimming',
 'love',
 'sympathy',
 'vehicle',
 'disgust',
 'sound',
 'sailing',
 'warmth',
 'fun',
 'joy',
 'affection',
 'lust',
 'shame',
 'anger',
 'car',
 'technology',
 'power',
 'white_collar_job',
 'party',
 'cleaning',
 'competing',
 'ocean',
 'contentment',
 'musical']

In [8]:
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

hp = {
        'solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'penalty': ['l2', 'elasticnet'],
        'C': [0.1, 1, 10],
}

sclr = StandardScaler()
X_train_s = sclr.fit_transform(X_train)
X_test_s = sclr.transform(X_test)
gs_model = GridSearchCV(LogisticRegression(max_iter=1000000), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train_s, y_train)

model = LogisticRegression(max_iter=1000000, **gs_model.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test_s)
evaluate(y_test, y_pred)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


 0.88832486 0.88832486 0.88832486        nan        nan        nan
 0.88832486 0.88832486 0.88832486        nan        nan        nan]


Accuracy:  0.8167832167832167
Recall:  0.8943089430894309
Precision:  0.7819905213270142
F1:  0.8343868520859671


In [11]:
cdf = pd.DataFrame(model.coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                       coefficient
roberta_prediction        3.788742
nervousness               0.132881
negative_emotion          0.118690
pain                      0.105791
violence                  0.090998
fear                      0.080961
suffering                 0.070243
sadness                   0.063180
body                      0.056020
communication            -0.052343
hate                      0.047315
friends                  -0.043832
children                 -0.043718
internet                 -0.042569
cold                      0.040574
meeting                  -0.040475
listen                   -0.039278
phone                    -0.035543
messaging                -0.035529
speaking                 -0.032002
help                     -0.029985
childish                 -0.029703
traveling                -0.029033
money                    -0.027654
horror                    0.026969
computer                 -0.026148
family                   -0.025500
medical_emergency   