In [11]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim import corpora
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, fbeta_score

In [3]:
# read cleaned and processed data
# docs = pd.read_csv("preprocessed_data.csv", converters={'lemmatized_processed_text': pd.eval})
docs = pd.read_csv("preprocessed_data_new_negation.csv")

In [4]:
docs.head(3)

Unnamed: 0,text,class,stemmed_processed_text,lemmatized_processed_text
0,ex wife threatening suicide recently i left my...,suicide,"['ex', 'wife', 'threaten', 'suicid', 'recent',...","['ex', 'wife', 'threatening', 'suicide', 'rece..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'not', 'get', 'affect', 'compliment'...","['weird', 'not', 'get', 'affected', 'complimen..."
2,finally is almost over so i can never hear has...,non-suicide,"['final', 'almost', 'never', 'hear', 'bad', 'y...","['finally', 'almost', 'never', 'hear', 'bad', ..."


In [5]:
# encode suicide as 1 and non-suicide as 0
docs['class_encoded'] = docs['class'].apply(lambda x:1 if x == 'suicide' else 0)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(docs['lemmatized_processed_text'],docs['class_encoded'],test_size=0.3, random_state=101)

In [7]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(docs['lemmatized_processed_text'])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

### Get Metrics

In [8]:
def get_metrics(y_test, y_pred):
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(conf_matrix)
    TP = conf_matrix[0][0]
    FN = conf_matrix[1][0]
    FP = conf_matrix[0][1]
    TN = conf_matrix[1][1]

    accuracy = accuracy_score(y_test, y_pred)*100
    precision = precision_score(y_test, y_pred)*100
    recall = recall_score(y_test, y_pred)*100

    print('TP:',TP); print('FN:',FN); print('FP:',FP) ;print('TN:',TN)
    print("Accuracy: ", accuracy)
    print('Precision:', precision)
    print('Recall:', recall)

    f1_score = 2*((precision * recall) / ((precision + recall)))
    print('F1 Score:', f1_score)

    f2score = ((1 + 2**2) * precision * recall) / (2**2 * precision + recall)
    print('F2 Score:', f2score)

### sklearn Naive Bayes

In [12]:
# fit the training dataset on the NB classifier
nb = naive_bayes.MultinomialNB()
nb.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_NB = nb.predict(X_test_Tfidf)

get_metrics(y_test, predictions_NB)

[[28664  6046]
 [ 1549 33364]]
TP: 28664
FN: 1549
FP: 6046
TN: 33364
Accuracy:  89.09124858164687
Precision: 84.65871606191322
Recall: 95.56325723942372
F1 Score: 89.78109064488785
F2 Score: 93.16326188694418


In [22]:
print(metrics.classification_report(y_test, predictions_NB, target_names=['Non-Suicide', 'Suicide'], digits=4))

              precision    recall  f1-score   support

 Non-Suicide     0.9487    0.8258    0.8830     34710
     Suicide     0.8466    0.9556    0.8978     34913

    accuracy                         0.8909     69623
   macro avg     0.8977    0.8907    0.8904     69623
weighted avg     0.8975    0.8909    0.8904     69623



### Logistic Regression

In [18]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=300)
logreg.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_logreg = logreg.predict(X_test_Tfidf)

get_metrics(y_test, predictions_logreg)

[[32794  1916]
 [ 2645 32268]]
TP: 32794
FN: 2645
FP: 1916
TN: 32268
Accuracy:  93.44900392111802
Precision: 94.39503861455651
Recall: 92.42402543465185
F1 Score: 93.39913454998047
F2 Score: 92.81161554568673


In [21]:
print(metrics.classification_report(y_test, predictions_logreg, target_names=['Non-Suicide', 'Suicide'], digits=4))

              precision    recall  f1-score   support

 Non-Suicide     0.9254    0.9448    0.9350     34710
     Suicide     0.9440    0.9242    0.9340     34913

    accuracy                         0.9345     69623
   macro avg     0.9347    0.9345    0.9345     69623
weighted avg     0.9347    0.9345    0.9345     69623



### sklearn SVM

In [16]:
# fit the training dataset on the SVM classifier
# SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
linearSVC = LinearSVC()
linearSVC.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_SVC = linearSVC.predict(X_test_Tfidf)

get_metrics(y_test, predictions_SVC)

[[32699  2011]
 [ 2554 32359]]
TP: 32699
FN: 2554
FP: 2011
TN: 32359
Accuracy:  93.44325869324793
Precision: 94.14896712249055
Recall: 92.68467333085097
F1 Score: 93.41108208362803
F2 Score: 92.97387686614336


In [20]:
print(metrics.classification_report(y_test, predictions_SVC, target_names=['Non-Suicide', 'Suicide'], digits=4))

              precision    recall  f1-score   support

 Non-Suicide     0.9276    0.9421    0.9348     34710
     Suicide     0.9415    0.9268    0.9341     34913

    accuracy                         0.9344     69623
   macro avg     0.9345    0.9345    0.9344     69623
weighted avg     0.9345    0.9344    0.9344     69623



### IGNORE BELOW

### NLTK Naive Bayes

In [20]:
suicide_rows = docs[docs["class"] == "suicide"]
non_suicide_rows = docs[docs["class"] == "non-suicide"]
print(f"Number of suicide observations: {len(suicide_data)}.", f"Number of non-suicide observations: {len(non_suicide_data)}")

Number of suicide observations: 116037. Number of non-suicide observations: 116037


In [26]:
# use gensim to convert these documents into raw term frequency-based vectors
dictionary = corpora.Dictionary(doc_text)

train_test_split(X, y, test_size=0.33, random_state=42)

suicide_rows = docs[docs["class"] == "suicide"]['lemmatized_processed_text'].tolist()
non_suicide_rows = docs[docs["class"] == "non-suicide"]['lemmatized_processed_text'].tolist()


suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in suicide_rows]
non_suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in non_suicide_rows]

In [31]:
suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in suicide_tf_vectors]
suicide_data = [(d, 'suicide') for d in suicide_data_dict]

non_suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in non_suicide_tf_vectors]
non_suicide_data = [(d, 'non-suicide') for d in non_suicide_data_dict]

all_data = suicide_data + non_suicide_data

#### train classifer

In [33]:
classifier = nltk.NaiveBayesClassifier.train(all_data)

In [39]:
print(nltk.classify.accuracy(classifier, all_data))

0.5749114506579798
