In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

In [2]:
# Load the four lexicon files. Each CSV is read without a header row and then
# transposed, so what was a row in the file runs down a column of the frame.
# NOTE(review): presumably each file stores its terms on a single row - confirm.
dict1, dict2, dict3, dict4 = [
    pd.read_csv(csv_name, header=None).T
    for csv_name in (
        "suicidal_indicator.csv",
        "suicidal_ideation.csv",
        "suicidal_behavior.csv",
        "suicidal_attempt.csv",
    )
]


In [3]:
### Combine the four lexicon frames into one table.
# ignore_index=True renumbers the rows 0..n-1; the column labelled 0 is renamed
# to 'lexicons'.
domain_dict = (
    pd.concat([dict1, dict2, dict3, dict4], ignore_index=True)
    .rename(columns={0: 'lexicons'})
)
domain_dict.head()

Unnamed: 0,lexicons
0,Pessimistic character
1,Suicide of relative
2,Family history of suicide
3,Suicide of close relative
4,Suicide risk assessment


In [4]:
def lower_text(text):
    """Return ``text`` converted to ``str`` and lower-cased.

    Any input is accepted: it is passed through ``str()`` first, so
    non-string values (numbers, ``None``, NaN) come back as their lower-cased
    string representation rather than raising.
    """
    return str(text).lower()

In [5]:
# Lower-case every lexicon entry, overwriting the column in place.
# lower_text stringifies its input first, so non-string cells become text too.
domain_dict['lexicons'] = domain_dict['lexicons'].apply(lower_text)

In [6]:
# Display the frame to check the lower-cased 'lexicons' column.
domain_dict

Unnamed: 0,lexicons
0,pessimistic character
1,suicide of relative
2,family history of suicide
3,suicide of close relative
4,suicide risk assessment
...,...
2272,went in the freezer
2273,jumped from bridge
2274,jumped from roof
2275,bag around head


In [7]:
def stem_preprocess_text(sentence):
    """Tokenise, POS-tag and stem a phrase, dropping English stop words.

    The input is converted to ``str`` and lower-cased, tokenised with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``. Tagging runs on
    the full token sequence; stop words are removed only afterwards.

    Parameters
    ----------
    sentence : any
        Phrase to process; passed through ``str()`` first.

    Returns
    -------
    list of tuple
        ``(stem, pos_tag)`` pairs in token order, where ``stem`` comes from
        the English Snowball stemmer and ``pos_tag`` is the tag assigned to
        the unstemmed token.
    """
    english_stops = set(stopwords.words('english'))
    snowball = SnowballStemmer('english')
    tokens = nltk.word_tokenize(str(sentence).lower())

    stemmed_pairs = []
    for token, pos in nltk.pos_tag(tokens):
        # Stop-word membership is tested on the raw token, before stemming.
        if token in english_stops:
            continue
        stemmed_pairs.append((snowball.stem(token), pos))
    return stemmed_pairs

In [8]:
# Add a 'stem' column: each lexicon phrase becomes a list of (stem, POS tag) tuples.
domain_dict["stem"] = domain_dict["lexicons"].apply(stem_preprocess_text)

In [9]:
# Display the frame to compare each phrase with its stemmed, tagged tokens.
domain_dict

Unnamed: 0,lexicons,stem
0,pessimistic character,"[(pessimist, JJ), (charact, NN)]"
1,suicide of relative,"[(suicid, NN), (relat, JJ)]"
2,family history of suicide,"[(famili, NN), (histori, NN), (suicid, NN)]"
3,suicide of close relative,"[(suicid, NN), (close, JJ), (relat, JJ)]"
4,suicide risk assessment,"[(suicid, NN), (risk, NN), (assess, NN)]"
...,...,...
2272,went in the freezer,"[(went, VBD), (freezer, NN)]"
2273,jumped from bridge,"[(jump, NN), (bridg, NN)]"
2274,jumped from roof,"[(jump, NN), (roof, NN)]"
2275,bag around head,"[(bag, NN), (around, IN), (head, NN)]"


In [10]:
def lem_preprocess_text(sentence):
    """Tokenise, POS-tag and lemmatise a phrase, dropping English stop words.

    The input is converted to ``str`` and lower-cased, tokenised with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``. Tagging runs on
    the full token sequence; stop words are removed only afterwards.

    Note that ``WordNetLemmatizer.lemmatize`` is called without a POS
    argument, so the computed tag is carried along in the result but does not
    steer lemmatisation. Inflected verbs such as "went" or "jumped" are
    returned unchanged, as the notebook output shows.

    Parameters
    ----------
    sentence : any
        Phrase to process; passed through ``str()`` first.

    Returns
    -------
    list of tuple
        ``(lemma, pos_tag)`` pairs in token order.
    """
    english_stops = set(stopwords.words('english'))
    wn_lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(str(sentence).lower())

    lemma_pairs = []
    for token, pos in nltk.pos_tag(tokens):
        # Stop-word membership is tested on the raw token, before lemmatising.
        if token not in english_stops:
            lemma_pairs.append((wn_lemmatizer.lemmatize(token), pos))
    return lemma_pairs

In [11]:
# Add a 'lem' column: each lexicon phrase becomes a list of (lemma, POS tag) tuples.
domain_dict["lem"] = domain_dict["lexicons"].apply(lem_preprocess_text)

In [12]:
# Display the frame to compare the 'stem' and 'lem' columns side by side.
domain_dict

Unnamed: 0,lexicons,stem,lem
0,pessimistic character,"[(pessimist, JJ), (charact, NN)]","[(pessimistic, JJ), (character, NN)]"
1,suicide of relative,"[(suicid, NN), (relat, JJ)]","[(suicide, NN), (relative, JJ)]"
2,family history of suicide,"[(famili, NN), (histori, NN), (suicid, NN)]","[(family, NN), (history, NN), (suicide, NN)]"
3,suicide of close relative,"[(suicid, NN), (close, JJ), (relat, JJ)]","[(suicide, NN), (close, JJ), (relative, JJ)]"
4,suicide risk assessment,"[(suicid, NN), (risk, NN), (assess, NN)]","[(suicide, NN), (risk, NN), (assessment, NN)]"
...,...,...,...
2272,went in the freezer,"[(went, VBD), (freezer, NN)]","[(went, VBD), (freezer, NN)]"
2273,jumped from bridge,"[(jump, NN), (bridg, NN)]","[(jumped, NN), (bridge, NN)]"
2274,jumped from roof,"[(jump, NN), (roof, NN)]","[(jumped, NN), (roof, NN)]"
2275,bag around head,"[(bag, NN), (around, IN), (head, NN)]","[(bag, NN), (around, IN), (head, NN)]"


In [13]:
## Build the set of unique "stem_TAG" keys from the stemmed domain dictionary
# explode() gives one entry per (stem, tag) tuple across all phrases.
stem_list = domain_dict['stem'].explode().tolist()
# Keep only real tuples; this drops any non-tuple placeholders (e.g. the NaN
# that explode() emits for a phrase whose token list was empty).
tuple_list = list(filter(lambda item: isinstance(item, tuple), stem_list))
set_of_stem = set("{}_{}".format(pair[0], pair[1]) for pair in tuple_list)

In [14]:
## Build the set of unique "lemma_TAG" keys from the lemmatized domain dictionary
# explode() gives one entry per (lemma, tag) tuple across all phrases.
lem_list = domain_dict.explode('lem')['lem'].tolist()
# Fix: filter lem_list here. The original iterated stem_list, so set_of_lem was
# built from stems and was identical to set_of_stem.
# The isinstance check drops non-tuple placeholders (e.g. the NaN that explode()
# emits for a phrase whose token list was empty).
tuple_list = [t for t in lem_list if isinstance(t, tuple)]
set_of_lem = {f"{t[0]}_{t[1]}" for t in tuple_list}

In [15]:
# Load the already-preprocessed posts and preview the first rows.
# NOTE(review): the stemmed/lemmatized columns come from a CSV, so they load as
# strings that look like Python lists (e.g. "['ex', 'wife', ...]"), not as real
# lists - they presumably need parsing (e.g. ast.literal_eval) before
# token-level use; confirm against the downstream cells.
docs = pd.read_csv("preprocessed_data.csv")
docs.head()

Unnamed: 0,text,class,stemmed_processed_text,lemmatized_processed_text
0,ex wife threatening suiciderecently i left my ...,suicide,"['ex', 'wife', 'threaten', 'suiciderec', 'left...","['ex', 'wife', 'threatening', 'suiciderecently..."
1,am i weird i do not get affected by compliment...,non-suicide,"['weird', 'get', 'affect', 'compliment', 'come...","['weird', 'get', 'affected', 'compliment', 'co..."
2,finally is almost over so i can never ...,non-suicide,"['final', 'almost', 'never', 'hear', 'bad', 'y...","['finally', 'almost', 'never', 'hear', 'bad', ..."
3,i need helpjust help me i am crying so hard,suicide,"['need', 'helpjust', 'help', 'cri', 'hard']","['need', 'helpjust', 'help', 'cry', 'hard']"
4,i m so losthello my name is adam and i v...,suicide,"['losthello', 'name', 'adam', 'struggl', 'year...","['losthello', 'name', 'adam', 'struggling', 'y..."
