In [1]:
#importing libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
#read data
data = pd.read_csv("pos_tagged.csv")
data.dropna(inplace=True)

In [4]:
import ast
data['tagged_tokens'] = data['tagged_tokens'].apply(lambda x: [str(i) for i in ast.literal_eval(x)])

In [5]:
data.drop(columns = ["text","lemmatized_processed_text","tokens"], inplace = True)
data.head()

Unnamed: 0,class,tagged_tokens
0,suicide,"[ex_NN, wife_NN, threatening_VBG, suiciderecen..."
1,non-suicide,"[weird_JJ, get_VB, affected_VBN, compliment_NN..."
2,non-suicide,"[finally_RB, almost_RB, never_RB, hear_VB, bad..."
3,suicide,"[need_NN, helpjust_RB, help_NN, cry_NN, hard_RB]"
4,suicide,"[losthello_NN, name_NN, adam_NN, struggling_VB..."


In [6]:
#convert to string of words
data["tagged_tokens"] = data["tagged_tokens"].apply(lambda x: " ".join(x))
data.head()

Unnamed: 0,class,tagged_tokens
0,suicide,ex_NN wife_NN threatening_VBG suiciderecently_...
1,non-suicide,weird_JJ get_VB affected_VBN compliment_NN com...
2,non-suicide,finally_RB almost_RB never_RB hear_VB bad_JJ y...
3,suicide,need_NN helpjust_RB help_NN cry_NN hard_RB
4,suicide,losthello_NN name_NN adam_NN struggling_VBG ye...


In [7]:
train_X, test_X, train_y, test_y = train_test_split(data['tagged_tokens'],data['class'],test_size=0.3, random_state = 42)

In [8]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_X)
#applying tf idf to training data
X_train_tf = tf_idf.transform(train_X)

In [9]:
#print dimension of data
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 162411, n_features: 115290


In [10]:
#transforming test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)

#print dimension of data
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 69606, n_features: 115290


In [11]:
#Naive Bayes Classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)
#predicted y
y_pred_nb = naive_bayes_classifier.predict(X_test_tf)

In [12]:
print(metrics.classification_report(test_y, y_pred_nb, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.96      0.80      0.87     34757
 Non-Suicide       0.83      0.97      0.89     34849

    accuracy                           0.88     69606
   macro avg       0.90      0.88      0.88     69606
weighted avg       0.90      0.88      0.88     69606



In [13]:
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_pred_nb, test_y)*100)

Naive Bayes Accuracy Score ->  88.40473522397494


In [14]:
#Naive Bayes Confusion Matrix
print("Naive Bayes Confusion Matrix:")
print(metrics.confusion_matrix(test_y, y_pred_nb))

Naive Bayes Confusion Matrix:
[[27711  7046]
 [ 1025 33824]]


In [15]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train_tf,train_y)
# predict the labels on validation dataset
y_pred_logreg = logreg.predict(X_test_tf)

In [16]:
print(metrics.classification_report(test_y, y_pred_logreg, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.93      0.94      0.94     34757
 Non-Suicide       0.94      0.93      0.94     34849

    accuracy                           0.94     69606
   macro avg       0.94      0.94      0.94     69606
weighted avg       0.94      0.94      0.94     69606



In [17]:
print("Logistic Regression Accuracy Score -> ",accuracy_score(y_pred_logreg, test_y)*100)

Logistic Regression Accuracy Score ->  93.61405625951785


In [18]:
#Log Reg Confusion Matrix
print("Logistic Regression Confusion Matrix:")
print(metrics.confusion_matrix(test_y, y_pred_logreg))

Logistic Regression Confusion Matrix:
[[32795  1962]
 [ 2483 32366]]


In [26]:
#sample prediction
test = ['i want to jump off a building. i dont feel like living anymore. everyone is against me and i feel so lonely']

In [27]:
review = re.sub('[^a-zA-Z]', ' ', test[0])
review = review.lower()
review = review.split()
stop_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
review = [lemmatizer.lemmatize(word) for word in review if not word in stop_list]

In [28]:
# Create a function to add POS tags in the form of word_POS
def add_pos_tag(sentences):
    tagged_sentences = []
    for sentence in sentences:
        # Tokenize sentence into words
        words = nltk.word_tokenize(sentence)
        # Get POS tags for words
        pos_tags = nltk.pos_tag(words)
        # Join word and POS tag into word_POS format
        tagged_words = [f"{word}_{pos}" for word, pos in pos_tags]
        # Join tagged words into sentence
        tagged_sentence = " ".join(tagged_words)
        # Add tagged sentence to list
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences

test_tagged = add_pos_tag(review)
print(test_tagged)

['want_NN', 'jump_NN', 'building_NN', 'dont_NN', 'feel_NN', 'like_IN', 'living_NN', 'anymore_RB', 'everyone_NN', 'feel_NN', 'lonely_RB']


In [29]:
test_processed =[ ' '.join(test_tagged)]
test_processed

['want_NN jump_NN building_NN dont_NN feel_NN like_IN living_NN anymore_RB everyone_NN feel_NN lonely_RB']

In [30]:
test_input = tf_idf.transform(test_processed)
test_input.shape

(1, 115290)

In [31]:
#Naive Bayes
nb_result = naive_bayes_classifier.predict(test_input)[0]
nb_result

'suicide'

In [32]:
#LogReg
logreg_result = logreg.predict(test_input)[0]
logreg_result

'suicide'