In [1]:
#importing libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
#read data
data = pd.read_csv("pos_tagged_new.csv")
data.dropna(inplace=True)

In [3]:
import ast
data['tagged_tokens'] = data['tagged_tokens'].apply(lambda x: [str(i) for i in ast.literal_eval(x)])

In [4]:
data.drop(columns = ["text","lemmatized_processed_text","tokens"], inplace = True)
data.head()

Unnamed: 0,class,tagged_tokens
0,suicide,"[ex_NN, wife_NN, threatening_VBG, suicide_NN, ..."
1,non-suicide,"[weird_JJ, get_VB, affected_VBN, compliment_NN..."
2,non-suicide,"[finally_RB, almost_RB, never_RB, hear_VB, bad..."
3,suicide,"[need_NN, help_NN, help_NN, cry_NN, hard_RB]"
4,suicide,"[lost_VBN, hello_UH, name_NN, adam_NN, struggl..."


In [5]:
#convert to string of words
data["tagged_tokens"] = data["tagged_tokens"].apply(lambda x: " ".join(x))
data.head()

Unnamed: 0,class,tagged_tokens
0,suicide,ex_NN wife_NN threatening_VBG suicide_NN recen...
1,non-suicide,weird_JJ get_VB affected_VBN compliment_NN com...
2,non-suicide,finally_RB almost_RB never_RB hear_VB bad_JJ y...
3,suicide,need_NN help_NN help_NN cry_NN hard_RB
4,suicide,lost_VBN hello_UH name_NN adam_NN struggling_V...


In [6]:
train_X, test_X, train_y, test_y = train_test_split(data['tagged_tokens'],data['class'],test_size=0.3, random_state = 42)

In [7]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_X)
#applying tf idf to training data
X_train_tf = tf_idf.transform(train_X)

In [8]:
#print dimension of data
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 162411, n_features: 44044


In [9]:
#transforming test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)

#print dimension of data
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 69606, n_features: 44044


In [10]:
#Naive Bayes Classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)
#predicted y
y_pred_nb = naive_bayes_classifier.predict(X_test_tf)

In [11]:
print(metrics.classification_report(test_y, y_pred_nb, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.95      0.83      0.88     34757
 Non-Suicide       0.85      0.95      0.90     34849

    accuracy                           0.89     69606
   macro avg       0.90      0.89      0.89     69606
weighted avg       0.90      0.89      0.89     69606



In [12]:
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_pred_nb, test_y)*100)

Naive Bayes Accuracy Score ->  89.1388673390225


In [13]:
#Naive Bayes Confusion Matrix
print("Naive Bayes Confusion Matrix:")
print(metrics.confusion_matrix(test_y, y_pred_nb))

Naive Bayes Confusion Matrix:
[[28797  5960]
 [ 1600 33249]]


In [14]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train_tf,train_y)
# predict the labels on validation dataset
y_pred_logreg = logreg.predict(X_test_tf)

In [15]:
print(metrics.classification_report(test_y, y_pred_logreg, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.93      0.95      0.94     34757
 Non-Suicide       0.95      0.93      0.94     34849

    accuracy                           0.94     69606
   macro avg       0.94      0.94      0.94     69606
weighted avg       0.94      0.94      0.94     69606



In [16]:
print("Logistic Regression Accuracy Score -> ",accuracy_score(y_pred_logreg, test_y)*100)

Logistic Regression Accuracy Score ->  93.73329885354711


In [17]:
#Log Reg Confusion Matrix
print("Logistic Regression Confusion Matrix:")
print(metrics.confusion_matrix(test_y, y_pred_logreg))

Logistic Regression Confusion Matrix:
[[32920  1837]
 [ 2525 32324]]


In [44]:
#sample prediction
test = ['i want to jump off a building. i dont want to live anymore. i want to die']

In [45]:
review = re.sub('[^a-zA-Z]', ' ', test[0])
review = review.lower()
review = review.split()
stop_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
review = [lemmatizer.lemmatize(word) for word in review if not word in stop_list]

In [46]:
# Create a function to add POS tags in the form of word_POS
def add_pos_tag(sentences):
    tagged_sentences = []
    for sentence in sentences:
        # Tokenize sentence into words
        words = nltk.word_tokenize(sentence)
        # Get POS tags for words
        pos_tags = nltk.pos_tag(words)
        # Join word and POS tag into word_POS format
        tagged_words = [f"{word}_{pos}" for word, pos in pos_tags]
        # Join tagged words into sentence
        tagged_sentence = " ".join(tagged_words)
        # Add tagged sentence to list
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences

test_tagged = add_pos_tag(review)
print(test_tagged)

['want_NN', 'jump_NN', 'building_NN', 'dont_NN', 'want_NN', 'live_JJ', 'anymore_RB', 'want_NN', 'die_NN']


In [47]:
test_processed =[ ' '.join(test_tagged)]
test_processed

['want_NN jump_NN building_NN dont_NN want_NN live_JJ anymore_RB want_NN die_NN']

In [48]:
test_input = tf_idf.transform(test_processed)
test_input.shape

(1, 44044)

In [49]:
#Naive Bayes
nb_result = naive_bayes_classifier.predict(test_input)[0]
nb_result

'suicide'

In [50]:
#LogReg
logreg_result = logreg.predict(test_input)[0]
logreg_result

'suicide'