In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora, models

### Domain Dictionary

In [2]:
dict1 = pd.read_csv("suicidal_indicator.csv", header=None).T
dict2 = pd.read_csv("suicidal_ideation.csv", header=None).T
dict3 = pd.read_csv("suicidal_behavior.csv", header=None).T
dict4 = pd.read_csv("suicidal_attempt.csv", header=None).T


In [3]:
###combined dictionary into 1
domain_dict = pd.concat([dict1, dict2, dict3,dict4], ignore_index=True)
domain_dict = domain_dict.rename(columns={0: 'lexicons'})
domain_dict.head()

Unnamed: 0,lexicons
0,Pessimistic character
1,Suicide of relative
2,Family history of suicide
3,Suicide of close relative
4,Suicide risk assessment


In [4]:
def lower_text(text):
    text = str(text).lower()
    return text

In [5]:
domain_dict['lexicons'] = domain_dict['lexicons'].apply(lower_text)

In [6]:
domain_dict

Unnamed: 0,lexicons
0,pessimistic character
1,suicide of relative
2,family history of suicide
3,suicide of close relative
4,suicide risk assessment
...,...
2272,went in the freezer
2273,jumped from bridge
2274,jumped from roof
2275,bag around head


In [7]:
def stem_preprocess_text(sentence):
    stop_words = set(stopwords.words('english'))
#     stemmer = PorterStemmer()
    stemmer = SnowballStemmer('english')
    sentence = str(sentence)
    words = nltk.word_tokenize(sentence.lower())
    tagged_words = nltk.pos_tag(words)
    tagged_words = [(stemmer.stem(word), tag) for word, tag in tagged_words if word not in stop_words]
    return tagged_words

In [8]:
domain_dict["stem"] = domain_dict["lexicons"].apply(stem_preprocess_text)

In [9]:
domain_dict

Unnamed: 0,lexicons,stem
0,pessimistic character,"[(pessimist, JJ), (charact, NN)]"
1,suicide of relative,"[(suicid, NN), (relat, JJ)]"
2,family history of suicide,"[(famili, NN), (histori, NN), (suicid, NN)]"
3,suicide of close relative,"[(suicid, NN), (close, JJ), (relat, JJ)]"
4,suicide risk assessment,"[(suicid, NN), (risk, NN), (assess, NN)]"
...,...,...
2272,went in the freezer,"[(went, VBD), (freezer, NN)]"
2273,jumped from bridge,"[(jump, NN), (bridg, NN)]"
2274,jumped from roof,"[(jump, NN), (roof, NN)]"
2275,bag around head,"[(bag, NN), (around, IN), (head, NN)]"


In [10]:
def lem_preprocess_text(sentence):
    stop_words = set(stopwords.words('english'))
#     stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    sentence = str(sentence)
    words = nltk.word_tokenize(sentence.lower())
    tagged_words = nltk.pos_tag(words)
    tagged_words = [(lemmatizer.lemmatize(word), tag) for word, tag in tagged_words if word not in stop_words]
    return tagged_words

In [11]:
domain_dict["lem"] = domain_dict["lexicons"].apply(lem_preprocess_text)

In [12]:
domain_dict

Unnamed: 0,lexicons,stem,lem
0,pessimistic character,"[(pessimist, JJ), (charact, NN)]","[(pessimistic, JJ), (character, NN)]"
1,suicide of relative,"[(suicid, NN), (relat, JJ)]","[(suicide, NN), (relative, JJ)]"
2,family history of suicide,"[(famili, NN), (histori, NN), (suicid, NN)]","[(family, NN), (history, NN), (suicide, NN)]"
3,suicide of close relative,"[(suicid, NN), (close, JJ), (relat, JJ)]","[(suicide, NN), (close, JJ), (relative, JJ)]"
4,suicide risk assessment,"[(suicid, NN), (risk, NN), (assess, NN)]","[(suicide, NN), (risk, NN), (assessment, NN)]"
...,...,...,...
2272,went in the freezer,"[(went, VBD), (freezer, NN)]","[(went, VBD), (freezer, NN)]"
2273,jumped from bridge,"[(jump, NN), (bridg, NN)]","[(jumped, NN), (bridge, NN)]"
2274,jumped from roof,"[(jump, NN), (roof, NN)]","[(jumped, NN), (roof, NN)]"
2275,bag around head,"[(bag, NN), (around, IN), (head, NN)]","[(bag, NN), (around, IN), (head, NN)]"


In [13]:
def nested_list_to_string(nested_list):
    return ' '.join(['_'.join(tup) for tup in nested_list])

In [14]:
domain_dict["stem"] = domain_dict["stem"].apply(nested_list_to_string)
domain_dict["lem"] = domain_dict["lem"].apply(nested_list_to_string)
domain_dict

Unnamed: 0,lexicons,stem,lem
0,pessimistic character,pessimist_JJ charact_NN,pessimistic_JJ character_NN
1,suicide of relative,suicid_NN relat_JJ,suicide_NN relative_JJ
2,family history of suicide,famili_NN histori_NN suicid_NN,family_NN history_NN suicide_NN
3,suicide of close relative,suicid_NN close_JJ relat_JJ,suicide_NN close_JJ relative_JJ
4,suicide risk assessment,suicid_NN risk_NN assess_NN,suicide_NN risk_NN assessment_NN
...,...,...,...
2272,went in the freezer,went_VBD freezer_NN,went_VBD freezer_NN
2273,jumped from bridge,jump_NN bridg_NN,jumped_NN bridge_NN
2274,jumped from roof,jump_NN roof_NN,jumped_NN roof_NN
2275,bag around head,bag_NN around_IN head_NN,bag_NN around_IN head_NN


In [15]:
## get list of domain dictionary that are stemmed
list_of_stem = []
for i in range (len(domain_dict["stem"])):
    list_of_stem.append(domain_dict["stem"].iloc[i])
len(list_of_stem)

2277

In [16]:
## get list of domain dictionary that are lemmatized
list_of_lem = []
for j in range (len(domain_dict["lem"])):
    list_of_lem.append(domain_dict["lem"].iloc[j])
len(list_of_lem)

2277

### POS Tagging


In [15]:
data = pd.read_csv("preprocessed_data.csv")
data.head()

Unnamed: 0,text,class,stemmed_processed_text,lemmatized_processed_text
0,ex wife threatening suiciderecently i left my ...,suicide,"['ex', 'wife', 'threaten', 'suiciderec', 'left...","['ex', 'wife', 'threatening', 'suiciderecently..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affect', 'compliment', 'come...","['weird', 'get', 'affected', 'compliment', 'co..."
2,finally is almost over so i can never ...,non-suicide,"['final', 'almost', 'never', 'hear', 'bad', 'y...","['finally', 'almost', 'never', 'hear', 'bad', ..."
3,i need helpjust help me i am crying so hard,suicide,"['need', 'helpjust', 'help', 'cri', 'hard']","['need', 'helpjust', 'help', 'cry', 'hard']"
4,i m so losthello my name is adam and i v...,suicide,"['losthello', 'name', 'adam', 'struggl', 'year...","['losthello', 'name', 'adam', 'struggling', 'y..."


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 4 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   text                       232017 non-null  object
 1   class                      232074 non-null  object
 2   stemmed_processed_text     232074 non-null  object
 3   lemmatized_processed_text  232074 non-null  object
dtypes: object(4)
memory usage: 7.1+ MB


In [16]:
#drop stemmed_processed_text column
data.drop(columns = ["stemmed_processed_text"], inplace = True)

In [17]:
#clean and tokenize lemmatized_processed_text as it is intepreted as an entire string 
def clean_and_tokenize(text):
    # Remove the brackets and commas using a regular expression
    cleaned_text = re.sub(r"[\[\],']", "", text)
    # Tokenize the string using word_tokenize
    tokens = word_tokenize(cleaned_text)
    return tokens

In [18]:
# Apply the function to the 'lemmatized_processed_text' column of the DataFrame
data['tokens'] = data['lemmatized_processed_text'].apply(clean_and_tokenize)

In [19]:
# Define a function to add POS tags to a list of tokens
def add_pos_tags(tokens):
    tagged_tokens = []
    for token in tokens:
        pos_tag = nltk.pos_tag([token])[0][1]
        tagged_token = f"{token}_{pos_tag}"
        tagged_tokens.append(tagged_token)
    return tagged_tokens

# Apply the function to the 'tokens' column of the DataFrame
data['tagged_tokens'] = data['tokens'].map(add_pos_tags)


In [20]:
data.head()

Unnamed: 0,text,class,lemmatized_processed_text,tokens,tagged_tokens
0,ex wife threatening suiciderecently i left my ...,suicide,"['ex', 'wife', 'threatening', 'suiciderecently...","[ex, wife, threatening, suiciderecently, left,...","[ex_NN, wife_NN, threatening_VBG, suiciderecen..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affected', 'compliment', 'co...","[weird, get, affected, compliment, coming, som...","[weird_NN, get_VB, affected_JJ, compliment_NN,..."
2,finally is almost over so i can never ...,non-suicide,"['finally', 'almost', 'never', 'hear', 'bad', ...","[finally, almost, never, hear, bad, year, ever...","[finally_RB, almost_RB, never_RB, hear_NN, bad..."
3,i need helpjust help me i am crying so hard,suicide,"['need', 'helpjust', 'help', 'cry', 'hard']","[need, helpjust, help, cry, hard]","[need_NN, helpjust_NN, help_NN, cry_NN, hard_JJ]"
4,i m so losthello my name is adam and i v...,suicide,"['losthello', 'name', 'adam', 'struggling', 'y...","[losthello, name, adam, struggling, year, afra...","[losthello_NN, name_NN, adam_NN, struggling_VB..."


In [21]:
data.to_csv('pos_tagged.csv', index=False)

### combining domain dictionary + POS and creating TF-IDF model


In [17]:
data = pd.read_csv("pos_tagged.csv")
data.dropna(inplace=True)

In [18]:
data

Unnamed: 0,text,class,lemmatized_processed_text,tokens,tagged_tokens
0,ex wife threatening suiciderecently i left my ...,suicide,"['ex', 'wife', 'threatening', 'suiciderecently...","['ex', 'wife', 'threatening', 'suiciderecently...","['ex_NN', 'wife_NN', 'threatening_VBG', 'suici..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affected', 'compliment', 'co...","['weird', 'get', 'affected', 'compliment', 'co...","['weird_NN', 'get_VB', 'affected_JJ', 'complim..."
2,finally is almost over so i can never ...,non-suicide,"['finally', 'almost', 'never', 'hear', 'bad', ...","['finally', 'almost', 'never', 'hear', 'bad', ...","['finally_RB', 'almost_RB', 'never_RB', 'hear_..."
3,i need helpjust help me i am crying so hard,suicide,"['need', 'helpjust', 'help', 'cry', 'hard']","['need', 'helpjust', 'help', 'cry', 'hard']","['need_NN', 'helpjust_NN', 'help_NN', 'cry_NN'..."
4,i m so losthello my name is adam and i v...,suicide,"['losthello', 'name', 'adam', 'struggling', 'y...","['losthello', 'name', 'adam', 'struggling', 'y...","['losthello_NN', 'name_NN', 'adam_NN', 'strugg..."
...,...,...,...,...,...
232069,if you do not like rock then your not going to...,non-suicide,"['like', 'rock', 'going', 'get', 'anything', '...","['like', 'rock', 'going', 'get', 'anything', '...","['like_IN', 'rock_NN', 'going_VBG', 'get_VB', ..."
232070,you how you can tell i have so many friends an...,non-suicide,"['tell', 'many', 'friend', 'lonely', 'everythi...","['tell', 'many', 'friend', 'lonely', 'everythi...","['tell_NN', 'many_JJ', 'friend_NN', 'lonely_RB..."
232071,pee probably tastes like salty tea can som...,non-suicide,"['pee', 'probably', 'taste', 'like', 'salty', ...","['pee', 'probably', 'taste', 'like', 'salty', ...","['pee_NN', 'probably_RB', 'taste_NN', 'like_IN..."
232072,the usual stuff you find herei'm not posting t...,suicide,"['usual', 'stuff', 'find', 'herei', ""'m"", 'pos...","['usual', 'stuff', 'find', 'herei', '``', 'm',...","['usual_JJ', 'stuff_NN', 'find_VB', 'herei_NN'..."


In [19]:
import ast
data['tagged_tokens'] = data['tagged_tokens'].apply(lambda x: [str(i) for i in ast.literal_eval(x)])


In [20]:
data.drop(columns = ["text","lemmatized_processed_text","tokens"], inplace = True)
data

Unnamed: 0,class,tagged_tokens
0,suicide,"[ex_NN, wife_NN, threatening_VBG, suiciderecen..."
1,non-suicide,"[weird_NN, get_VB, affected_JJ, compliment_NN,..."
2,non-suicide,"[finally_RB, almost_RB, never_RB, hear_NN, bad..."
3,suicide,"[need_NN, helpjust_NN, help_NN, cry_NN, hard_JJ]"
4,suicide,"[losthello_NN, name_NN, adam_NN, struggling_VB..."
...,...,...
232069,non-suicide,"[like_IN, rock_NN, going_VBG, get_VB, anything..."
232070,non-suicide,"[tell_NN, many_JJ, friend_NN, lonely_RB, every..."
232071,non-suicide,"[pee_NN, probably_RB, taste_NN, like_IN, salty..."
232072,suicide,"[usual_JJ, stuff_NN, find_VB, herei_NN, ``_``,..."


In [23]:
## get all words from the document with pos_tag

tagged_tokens_list = []
for i in range (len(data["tagged_tokens"])):
    sen = ' '.join(data["tagged_tokens"].iloc[i])
    tagged_tokens_list.append(sen)
len(tagged_tokens_list)

232017

In [24]:
# combining the document and dictionary
tagged_tokens_list.extend(list_of_lem)
len(tagged_tokens_list)

234294

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit(tagged_tokens_list)

In [26]:
data.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 232017 entries, 0 to 232073
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   class          232017 non-null  object
 1   tagged_tokens  232017 non-null  object
dtypes: object(2)
memory usage: 5.3+ MB


### Modelling

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [28]:
train_X, test_X, train_y, test_y = train_test_split(data['tagged_tokens'],data['class'],test_size=0.3)

In [29]:
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

In [30]:
train_X_Tfidf = tfidf_vector.transform(train_X.apply(lambda x: ' '.join(x)))
test_X_Tfidf = tfidf_vector.transform(test_X.apply(lambda x: ' '.join(x)))

In [31]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_X_Tfidf,train_y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, test_y)*100)

Naive Bayes Accuracy Score ->  88.10591041002212


In [32]:
# fit the training dataset on the Logistic Regression classifier
logreg = LogisticRegression(max_iter=200)
logreg.fit(train_X_Tfidf,train_y)
# predict the labels on validation dataset
predictions_logreg = logreg.predict(test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Logistic Regression Accuracy Score -> ",accuracy_score(predictions_logreg, test_y)*100)

Logistic Regression Accuracy Score ->  93.17874895842313
