In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
import random
import matplotlib.pyplot as plt

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(1)

In [2]:
def prepare_data(dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train = pd.read_csv(dataset_path_train)
    test = pd.read_csv(dataset_path_test)
    
    y_train = train['label']
    y_test = test['label']
    
    X_train = train[['roberta_prediction', 'text']]
    X_test = test[['roberta_prediction', 'text']]
    
    return X_train, y_train, X_test, y_test

def prepare_lex_data(dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train = pd.read_csv(dataset_path_train)
    test = pd.read_csv(dataset_path_test)
    
    y_train = train['label']
    y_test = test['label']
    
    features = []
    for feat in train.columns.values:
        if 'lex' in feat:
            features.append(feat)
    features.extend(['roberta_prediction', 'syntax_ari', 'social_karma'])
    
    X_train = train[features]
    X_test = test[features]
    
    return X_train, y_train, X_test, y_test

def evaluate(test_labels, y_pred):
    acc = accuracy_score(test_labels, y_pred)
    rec = recall_score(test_labels, y_pred, zero_division=1)
    prec = precision_score(test_labels, y_pred, zero_division=1)
    f1 = f1_score(test_labels, y_pred, zero_division=1)
    print("Accuracy: ", acc)
    print("Recall: ", rec)
    print("Precision: ", prec)
    print("F1: ", f1)

In [3]:
from empath import Empath
lexicon = Empath()

def empath(X):
    rows = []
    for t in X['text']:
        empath = lexicon.analyze(t, normalize=True)
        rows.append(pd.DataFrame([empath]))
    df = pd.concat(rows, ignore_index=True)
    X = pd.concat([X,df], axis=1)
    return X.drop(columns=['text'])

In [4]:
X_train, y_train, X_test, y_test = prepare_data(dataset_path_train = 'train_pred_roberta_our_scraped_data.csv', dataset_path_test = 'test_pred_roberta_our_scraped_data.csv')
X_train = empath(X_train)
X_test = empath(X_test)

In [5]:
import phik
def pairs(X, threshold):
    corr =  X.phik_matrix(interval_cols=X.columns)
    columns = corr.columns
    high_corr = dict()
    feature_corr_n = dict.fromkeys(X.columns, 0)
    for column1 in columns:
        for column2 in columns:
            value = corr[column1][column2]
            if value > threshold and column1 != column2 and (column2, column1) not in high_corr:
                high_corr[(column1,column2)] = value
    return high_corr

In [6]:
def cutoff_features(threshold, X):
    corr_matrix =  X.phik_matrix(interval_cols=X.columns)
    upper_tri = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > float(threshold))]
    return to_drop
to_drop = cutoff_features(0.85, X_train)

In [7]:
to_drop

['health',
 'banking',
 'night',
 'college',
 'exasperation',
 'reading',
 'worship',
 'eating',
 'water',
 'legend',
 'neglect',
 'swimming',
 'love',
 'sympathy',
 'vehicle',
 'disgust',
 'sound',
 'sailing',
 'warmth',
 'fun',
 'joy',
 'affection',
 'lust',
 'shame',
 'anger',
 'car',
 'technology',
 'power',
 'white_collar_job',
 'party',
 'cleaning',
 'competing',
 'ocean',
 'contentment',
 'musical']

In [8]:
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

In [9]:
'''from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

hp = {
        'solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'penalty': ['l2', 'elasticnet'],
        'C': [0.1, 1, 10],
}

sclr = StandardScaler()
X_train_s = sclr.fit_transform(X_train)
X_test_s = sclr.transform(X_test)
gs_model = GridSearchCV(LogisticRegression(max_iter=1000000), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train_s, y_train)

model = LogisticRegression(max_iter=1000000, **gs_model.best_params_)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
evaluate(y_test, y_pred)'''

"from sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\n\nhp = {\n        'solver': ['liblinear', 'newton-cg', 'lbfgs'],\n        'penalty': ['l2', 'elasticnet'],\n        'C': [0.1, 1, 10],\n}\n\nsclr = StandardScaler()\nX_train_s = sclr.fit_transform(X_train)\nX_test_s = sclr.transform(X_test)\ngs_model = GridSearchCV(LogisticRegression(max_iter=1000000), hp, n_jobs=-1, scoring='f1', verbose=4)\ngs_model.fit(X_train_s, y_train)\n\nmodel = LogisticRegression(max_iter=1000000, **gs_model.best_params_)\nmodel.fit(X_train_s, y_train)\ny_pred = model.predict(X_test_s)\nevaluate(y_test, y_pred)"

In [10]:

from sklearn.ensemble import RandomForestClassifier

hp = {
        'n_estimators': [50, 200],
        'max_features': ['auto', 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 4]
    }
gs_model = GridSearchCV(RandomForestClassifier(), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
    
model = RandomForestClassifier(**gs_model.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

NameError: name 'GridSearchCV' is not defined

In [None]:
cdf = pd.DataFrame(model.coef_.transpose(), X_train.columns, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

In [None]:
from sklearn.svm import SVC

sclr = StandardScaler()
X_train_s = sclr.fit_transform(X_train)
X_test_s = sclr.transform(X_test)
hp = {
        'kernel': ['linear'],
        'gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'C': [0.01, 0.1, 1, 10],
    }
gs_model = GridSearchCV(SVC(), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train_s, y_train)
    
model = SVC(**gs_model.best_params_)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
evaluate(y_test, y_pred)

In [None]:
X_train_l, y_train, X_test_l, y_test = prepare_lex_data(dataset_path_train = 'train_pred_roberta_our_scraped_data.csv', dataset_path_test = 'test_pred_roberta_our_scraped_data.csv')

In [None]:
model_l = LogisticRegression(max_iter=1000000)
model_l.fit(X_train_l, y_train)
y_pred_l = model_l.predict(X_test_l)
evaluate(y_test, y_pred_l)

In [None]:
X_train_b, y_train, X_test_b, y_test = prepare_lex_data(dataset_path_train = 'train_pred_base_roberta.csv', dataset_path_test = 'test_pred_base_roberta.csv')

In [None]:
model_b = LogisticRegression(max_iter=1000000)
model_b.fit(X_train_b, y_train)
y_pred_b = model_b.predict(X_test_b)
evaluate(y_test, y_pred_b)

In [None]:
from statsmodels.stats.contingency_tables import mcnemar
from mlxtend.evaluate import mcnemar_table
from mlxtend.plotting import checkerboard_plot

def testsig(y_test, y_model1, y_model2, alpha=0.05):
    tb = mcnemar_table(y_target=y_test, 
                   y_model1=y_pred_b, 
                   y_model2=y_pred)
    brd = checkerboard_plot(tb,
                        figsize=(4, 4),
                        fmt='%d',
                        col_labels=['model 2 correct', 'model 2 incorrect'],
                        row_labels=['model 1 correct', 'model 1 incorrect'])
    
    result = mcnemar(tb, exact=False)
    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')

In [None]:
print('Base + LIWC vs Fine tuned + Empath')
testsig(y_test, y_pred_b, y_pred, alpha=0.05)

Our final model is not better than the one Knežević et al. used.

In [None]:
print('Fine tuned + LIWC vs Fine tuned + Empath')
testsig(y_test, y_pred_l, y_pred, alpha=0.05)

Empath features can replace LIWC features.

### Error analysis

In [None]:
indexes = []
predictions = []
for index, (first, second) in enumerate(zip(y_test, y_pred)):
    if first != second:
        indexes.append(index)
        predictions.append(second)
        
indexes2 = []
predictions2 = []
for index, (first, second) in enumerate(zip(y_test, y_pred_l)):
    if first != second:
        indexes2.append(index)
        predictions2.append(second)

In [None]:
sum(predictions)/len(predictions)

In [None]:
sum(predictions2)/len(predictions2)

In [None]:
test_ex = pd.read_csv('dreaddit-test.csv')[['text', 'label']]
for i in indexes:
    if i not in indexes2:
        print(test_ex.loc[[i]].to_string())

In [None]:
test_ex = pd.read_csv('dreaddit-test.csv')[['text', 'label']]
for i in indexes2:
    if i not in indexes:
        print(test_ex.loc[[i]].to_string())

In [None]:
inter = np.intersect1d(indexes, indexes2)
suma = 0
for i in inter:
    print(test_ex.loc[[i]].to_string())
    suma += test_ex['label'].iloc[i]