In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, fbeta_score

In [2]:
np.random.seed(500)

In [3]:
# read cleaned and processed data
# docs = pd.read_csv("preprocessed_data.csv", converters={'lemmatized_processed_text': pd.eval})
docs = pd.read_csv("preprocessed_data_new.csv")

In [4]:
docs.head(3)

Unnamed: 0,text,class,lemmatized_processed_text,stemmed_processed_text
0,ex wife threatening suicide recently i left my...,suicide,"['ex', 'wife', 'threatening', 'suicide', 'rece...","['ex', 'wife', 'threaten', 'suicid', 'recent',..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affected', 'compliment', 'co...","['weird', 'get', 'affect', 'compliment', 'come..."
2,finally is almost over so i can never hear has...,non-suicide,"['finally', 'almost', 'never', 'hear', 'bad', ...","['final', 'almost', 'never', 'hear', 'bad', 'y..."


In [5]:
# encode suicide as 1 and non-suicide as 0
docs['class_encoded'] = docs['class'].apply(lambda x:1 if x == 'suicide' else 0)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(docs['lemmatized_processed_text'],docs['class_encoded'],test_size=0.3)

In [7]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(docs['lemmatized_processed_text'])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

### Get Metrics

In [8]:
def get_metrics(y_test, y_pred):
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(conf_matrix)
    TP = conf_matrix[0][0]
    FN = conf_matrix[1][0]
    FP = conf_matrix[0][1]
    TN = conf_matrix[1][1]

    accuracy = accuracy_score(y_test, y_pred)*100
    precision = precision_score(y_test, y_pred)*100
    recall = recall_score(y_test, y_pred)*100

    print('TP:',TP); print('FN:',FN); print('FP:',FP) ;print('TN:',TN)
    print("Accuracy: ", accuracy)
    print('Precision:', precision)
    print('Recall:', recall)

    f1_score = 2*((precision * recall) / ((precision + recall)))
    print('F1 Score:', f1_score)

    f2score = ((1 + 2**2) * precision * recall) / (2**2 * precision + recall)
    print('F2 Score:', f2score)

### sklearn Naive Bayes

In [9]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_Tfidf)

get_metrics(y_test, predictions_NB)

[[28693  6009]
 [ 1580 33341]]
TP: 28693
FN: 1580
FP: 6009
TN: 33341
Accuracy:  89.09986642345203
Precision: 84.72935196950445
Recall: 95.47550184702615
F1 Score: 89.78201451441343
F2 Score: 93.11359853435661


### sklearn SVM

In [10]:
# fit the training dataset on the SVM classifier
# SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
linearSVC = LinearSVC()
linearSVC.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_SVC = linearSVC.predict(X_test_Tfidf)

get_metrics(y_test, predictions_SVC)

[[32677  2025]
 [ 2554 32367]]
TP: 32677
FN: 2554
FP: 2025
TN: 32367
Accuracy:  93.42315039570256
Precision: 94.11200279134682
Recall: 92.68634918816758
F1 Score: 93.39373566286268
F2 Score: 92.96801397090925


### Logistic Regression

In [11]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=300)
logreg.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
predictions_logreg = logreg.predict(X_test_Tfidf)

get_metrics(y_test, predictions_logreg)

[[32781  1921]
 [ 2581 32340]]
TP: 32781
FN: 2581
FP: 1921
TN: 32340
Accuracy:  93.533746032202
Precision: 94.39304165085666
Recall: 92.6090318146674
F1 Score: 93.49252695787922
F2 Score: 92.9604185230964


### IGNORE BELOW

### NLTK Naive Bayes

In [20]:
suicide_rows = docs[docs["class"] == "suicide"]
non_suicide_rows = docs[docs["class"] == "non-suicide"]
print(f"Number of suicide observations: {len(suicide_data)}.", f"Number of non-suicide observations: {len(non_suicide_data)}")

Number of suicide observations: 116037. Number of non-suicide observations: 116037


In [26]:
# use gensim to convert these documents into raw term frequency-based vectors
dictionary = corpora.Dictionary(doc_text)

train_test_split(X, y, test_size=0.33, random_state=42)

suicide_rows = docs[docs["class"] == "suicide"]['lemmatized_processed_text'].tolist()
non_suicide_rows = docs[docs["class"] == "non-suicide"]['lemmatized_processed_text'].tolist()


suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in suicide_rows]
non_suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in non_suicide_rows]

In [31]:
suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in suicide_tf_vectors]
suicide_data = [(d, 'suicide') for d in suicide_data_dict]

non_suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in non_suicide_tf_vectors]
non_suicide_data = [(d, 'non-suicide') for d in non_suicide_data_dict]

all_data = suicide_data + non_suicide_data

#### train classifer

In [33]:
classifier = nltk.NaiveBayesClassifier.train(all_data)

In [39]:
print(nltk.classify.accuracy(classifier, all_data))

0.5749114506579798
