In [11]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

In [3]:
# read cleaned and processed data
# docs = pd.read_csv("preprocessed_data.csv", converters={'lemmatized_processed_text': pd.eval})
docs = pd.read_csv("preprocessed_data.csv")

In [4]:
docs.head(3)

Unnamed: 0,text,class,stemmed_processed_text,lemmatized_processed_text
0,ex wife threatening suiciderecently i left my ...,suicide,"['ex', 'wife', 'threaten', 'suiciderec', 'left...","['ex', 'wife', 'threatening', 'suiciderecently..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affect', 'compliment', 'come...","['weird', 'get', 'affected', 'compliment', 'co..."
2,finally is almost over so i can never ...,non-suicide,"['final', 'almost', 'never', 'hear', 'bad', 'y...","['finally', 'almost', 'never', 'hear', 'bad', ..."


In [5]:
train_X, test_X, train_y, test_y = train_test_split(docs['lemmatized_processed_text'],docs['class'],test_size=0.3)

In [6]:
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

In [7]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(docs['lemmatized_processed_text'])
train_X_Tfidf = Tfidf_vect.transform(train_X)
test_X_Tfidf = Tfidf_vect.transform(test_X)

### sklearn Naive Bayes

In [8]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_X_Tfidf,train_y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, test_y)*100)

Naive Bayes Accuracy Score ->  87.95082084943195


### sklearn SVM

In [9]:
# fit the training dataset on the SVM classifier
# SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
linearSVC = LinearSVC()
linearSVC.fit(train_X_Tfidf,train_y)
# predict the labels on validation dataset
predictions_SVC = linearSVC.predict(test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVC Accuracy Score -> ",accuracy_score(predictions_SVC, test_y)*100)

SVC Accuracy Score ->  93.36569811700156


### Logistic Regression

In [14]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=200)
logreg.fit(train_X_Tfidf,train_y)
# predict the labels on validation dataset
predictions_logreg = logreg.predict(test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Logistic Regression Accuracy Score -> ",accuracy_score(predictions_logreg, test_y)*100)

Logistic Regression Accuracy Score ->  93.0870545653017


In [4]:
doc_text = docs['lemmatized_processed_text'].tolist()

### NLTK Naive Bayes

In [20]:
suicide_rows = docs[docs["class"] == "suicide"]
non_suicide_rows = docs[docs["class"] == "non-suicide"]
print(f"Number of suicide observations: {len(suicide_data)}.", f"Number of non-suicide observations: {len(non_suicide_data)}")

Number of suicide observations: 116037. Number of non-suicide observations: 116037


In [26]:
# use gensim to convert these documents into raw term frequency-based vectors
dictionary = corpora.Dictionary(doc_text)

train_test_split(X, y, test_size=0.33, random_state=42)

suicide_rows = docs[docs["class"] == "suicide"]['lemmatized_processed_text'].tolist()
non_suicide_rows = docs[docs["class"] == "non-suicide"]['lemmatized_processed_text'].tolist()


suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in suicide_rows]
non_suicide_tf_vectors = [dictionary.doc2bow(doc) for doc in non_suicide_rows]

In [31]:
suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in suicide_tf_vectors]
suicide_data = [(d, 'suicide') for d in suicide_data_dict]

non_suicide_data_dict = [{id:1 for (id, tf_value) in vec} for vec in non_suicide_tf_vectors]
non_suicide_data = [(d, 'non-suicide') for d in non_suicide_data_dict]

all_data = suicide_data + non_suicide_data

#### train classifer

In [33]:
classifier = nltk.NaiveBayesClassifier.train(all_data)

In [39]:
print(nltk.classify.accuracy(classifier, all_data))

0.5749114506579798
