In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
import random
import matplotlib.pyplot as plt

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(1)

In [2]:
def prepare_data(dataset_path_train = None,dataset_path_test = None):
    if (dataset_path_train is None or dataset_path_test is None):
        raise RuntimeException("Error! Dataset must be provided")
    train = pd.read_csv(dataset_path_train)
    test = pd.read_csv(dataset_path_test)
    
    y_train = train['label']
    y_test = test['label']
    
    features = []
    for feat in train.columns.values:
        if 'lex' in feat:
            features.append(feat)
    features.extend(['roberta_prediction', 'syntax_ari', 'social_karma'])
    
    X_train = train[features]
    X_test = test[features]
    
    return X_train, y_train, X_test, y_test

def evaluate(test_labels, y_pred):
    acc = accuracy_score(test_labels, y_pred)
    rec = recall_score(test_labels, y_pred, zero_division=1)
    prec = precision_score(test_labels, y_pred, zero_division=1)
    f1 = f1_score(test_labels, y_pred, zero_division=1)
    print("Accuracy: ", acc)
    print("Recall: ", rec)
    print("Precision: ", prec)
    print("F1: ", f1)

## Grid Search

### Logistic regression

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

X_train, y_train, X_test, y_test = prepare_data(dataset_path_train = 'train_pred_mental_health.csv', dataset_path_test = 'test_pred_mental_health.csv')
#sclr = StandardScaler()
#X_train = sclr.fit_transform(X_train)
#X_test = sclr.transform(X_test)
hp = {
        'solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'penalty': ['l2', 'elasticnet'],
        'C': [0.1, 1, 10],
    }
gs_model = GridSearchCV(LogisticRegression(max_iter=1000000), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
    
model = LogisticRegression(max_iter=1000000,**gs_model.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\38599\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\38599\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\38599\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 71, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

----------------------------

Accuracy:  0.8195804195804196
Recall:  0.907859078590786
Precision:  0.7790697674418605
F1:  0.8385481852315394


In [4]:
gs_model.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [5]:
cdf = pd.DataFrame(model.coef_.transpose(), X_train.columns.values, columns=['coefficient'])
print(cdf.sort_values(by='coefficient',  key=abs, ascending=False).to_string())

                          coefficient
roberta_prediction           4.202665
lex_dal_min_activation      -1.354658
lex_dal_min_pleasantness    -1.198125
lex_dal_avg_pleasantness    -1.139446
lex_dal_avg_imagery          0.779881
lex_dal_max_pleasantness    -0.637963
lex_liwc_death               0.414601
lex_liwc_swear               0.367543
lex_liwc_negemo             -0.349047
lex_liwc_feel                0.324575
lex_liwc_nonflu              0.308234
lex_liwc_affect              0.299307
lex_dal_max_activation      -0.254480
lex_dal_max_imagery          0.239398
lex_liwc_posemo             -0.233798
lex_liwc_informal           -0.219533
lex_dal_min_imagery         -0.214619
lex_liwc_hear                0.212610
lex_liwc_ingest             -0.201360
lex_liwc_percept            -0.195031
lex_liwc_assent              0.193342
lex_liwc_anx                 0.175069
lex_liwc_netspeak            0.167057
lex_liwc_auxverb            -0.160698
lex_liwc_tentat              0.146209
lex_liwc_pro

### Support Vector Machine

In [6]:
from sklearn.svm import SVC

X_train, y_train, X_test, y_test = prepare_data(dataset_path_train = 'train_pred_mental_health.csv', dataset_path_test = 'test_pred_mental_health.csv')
sclr = StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
hp = {
        'kernel': ['linear'],
        'gamma': [1000, 100, 10, 1, 0.1, 0.01],
        'C': [0.01, 0.1, 1, 10],
    }
gs_model = GridSearchCV(SVC(), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
    
model = SVC(**gs_model.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy:  0.8181818181818182
Recall:  0.9214092140921409
Precision:  0.7709750566893424
F1:  0.8395061728395061


In [7]:
gs_model.best_params_

{'C': 0.01, 'gamma': 1000, 'kernel': 'linear'}

In [8]:
from sklearn.ensemble import RandomForestClassifier

X_train, y_train, X_test, y_test = prepare_data(dataset_path_train = 'train_pred_mental_health.csv', dataset_path_test = 'test_pred_mental_health.csv')

hp = {
        'n_estimators': [50, 200],
        'max_features': ['auto', 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 4]
    }
gs_model = GridSearchCV(RandomForestClassifier(), hp, n_jobs=-1, scoring='f1', verbose=4)
gs_model.fit(X_train, y_train)
    
model = RandomForestClassifier(**gs_model.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate(y_test, y_pred)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  warn(
  warn(


Accuracy:  0.8125874125874126
Recall:  0.8997289972899729
Precision:  0.7738927738927739
F1:  0.8320802005012531


In [9]:
gs_model.best_params_

{'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_split': 4,
 'n_estimators': 200}