# 1. Dataset preparation

## 1.1 Import of libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import joblib as jb
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## 1.2 Load of dataset
<br>

- Source: https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch

In [20]:
df = pd.read_csv("Suicide_Detection.csv")

df

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
...,...,...,...
232069,348103,If you don't like rock then your not going to ...,non-suicide
232070,348106,You how you can tell i have so many friends an...,non-suicide
232071,348107,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide
232072,348108,The usual stuff you find hereI'm not posting t...,suicide


# 2. Dataset cleaning

## 2.1 Removal of unused columns

In [21]:
df.drop(["Unnamed: 0"], axis = 1, inplace = True)

df

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
...,...,...
232069,If you don't like rock then your not going to ...,non-suicide
232070,You how you can tell i have so many friends an...,non-suicide
232071,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide
232072,The usual stuff you find hereI'm not posting t...,suicide


## 2.2 Functions for text cleaning

### 2.2.1 Removal of stopwords

In [4]:
gist_file = open("gist_stopwords.txt", "r")
try:
    content = gist_file.read()
    stopwords = content.split(",")
finally:
    gist_file.close()

In [5]:
stop = set(stopwords)

In [6]:
def remove_stopwords(text):
    text = [w.lower() for w in text.split() if w.lower() not in stop]

    return " ".join(text)    

In [22]:
df["text"] = df.text.map(lambda x: remove_stopwords(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,"finally 2020 over... hear ""2020 bad year"" agai...",non-suicide
3,helpjust crying hard,suicide
4,"i’m losthello, adam (16) i’ve struggling years...",suicide
...,...,...
232069,rock https://musictaste.space/match/lavish-wal...,non-suicide
232070,friends lonely deprived? pre-bought nightmares...,non-suicide
232071,pee tastes salty tea😏💦‼️ drank pee confirm thi...,non-suicide
232072,usual stuff herei'm posting sympathy pity wors...,suicide


### 2.2.2 Removal of URL's

In [7]:
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S")

    return url.sub(r"", text)

In [23]:
df["text"] = df.text.map(lambda x: remove_URL(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,"finally 2020 over... hear ""2020 bad year"" agai...",non-suicide
3,helpjust crying hard,suicide
4,"i’m losthello, adam (16) i’ve struggling years...",suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived? pre-bought nightmares...,non-suicide
232071,pee tastes salty tea😏💦‼️ drank pee confirm thi...,non-suicide
232072,usual stuff herei'm posting sympathy pity wors...,suicide


### 2.2.3 Removal of html codes

In [8]:
def remove_html(text):
    html = re.compile(r"<.*?>")

    return html.sub("", text)

In [24]:
df["text"] = df.text.map(lambda x: remove_html(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,"finally 2020 over... hear ""2020 bad year"" agai...",non-suicide
3,helpjust crying hard,suicide
4,"i’m losthello, adam (16) i’ve struggling years...",suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived? pre-bought nightmares...,non-suicide
232071,pee tastes salty tea😏💦‼️ drank pee confirm thi...,non-suicide
232072,usual stuff herei'm posting sympathy pity wors...,suicide


### 2.2.4 Removal of emojies

In [9]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags = re.UNICODE)
    
    return emoji_pattern.sub(r"", text)

In [25]:
df["text"] = df.text.map(lambda x: remove_emoji(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,"finally 2020 over... hear ""2020 bad year"" agai...",non-suicide
3,helpjust crying hard,suicide
4,"i’m losthello, adam (16) i’ve struggling years...",suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived? pre-bought nightmares...,non-suicide
232071,pee tastes salty tea‼ drank pee confirm this‼,non-suicide
232072,usual stuff herei'm posting sympathy pity wors...,suicide


### 2.2.5 Removal of puncts

In [10]:
def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)

    return text.translate(table)

In [26]:
df["text"] = df.text.map(lambda x: remove_punct(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,finally 2020 over hear 2020 bad year again swe...,non-suicide
3,helpjust crying hard,suicide
4,i’m losthello adam 16 i’ve struggling years i’...,suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived prebought nightmares 2...,non-suicide
232071,pee tastes salty tea‼ drank pee confirm this‼,non-suicide
232072,usual stuff hereim posting sympathy pity worse...,suicide


### 2.2.6 Removal of numbers

In [11]:
def remove_numbers(text):
    numbers = re.compile(r"[0-9]+")

    return numbers.sub("", text)

In [27]:
df["text"] = df.text.map(lambda x: remove_numbers(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,finally over hear bad year again swear fucki...,non-suicide
3,helpjust crying hard,suicide
4,i’m losthello adam i’ve struggling years i’m ...,suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived prebought nightmares ...,non-suicide
232071,pee tastes salty tea‼ drank pee confirm this‼,non-suicide
232072,usual stuff hereim posting sympathy pity worse...,suicide


### 2.2.7 Removal of special characters

In [32]:
special_characters = """!()-[]{};:'"\,<>./?@#$%^&*_~’"""

def remove_special_characters(text):
    new_string = ""
    for x in text:
        if x not in special_characters:
            new_string = new_string + x

    return new_string

In [33]:
df["text"] = df.text.map(lambda x: remove_special_characters(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,finally over hear bad year again swear fucki...,non-suicide
3,helpjust crying hard,suicide
4,im losthello adam ive struggling years im afr...,suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived prebought nightmares ...,non-suicide
232071,pee tastes salty tea‼ drank pee confirm this‼,non-suicide
232072,usual stuff hereim posting sympathy pity worse...,suicide


### 2.2.8 Final stopwords removal

In [44]:
df["text"] = df.text.map(lambda x: remove_stopwords(x))

df

Unnamed: 0,text,class
0,wife threatening suiciderecently left wife goo...,suicide
1,weird compliments coming irl feel good interne...,non-suicide
2,finally hear bad year swear fucking god annoying,non-suicide
3,helpjust crying hard,suicide
4,losthello adam ive struggling years afraid yea...,suicide
...,...,...
232069,rock,non-suicide
232070,friends lonely deprived prebought nightmares c...,non-suicide
232071,pee tastes salty tea‼ drank pee confirm this‼,non-suicide
232072,usual stuff hereim posting sympathy pity worse...,suicide


# 3. Setting the target

## 3.1 Binary target

In [45]:
df["target"] = np.where(df["class"] == "suicide", 1, 0)

df.drop("class", axis = 1, inplace = True)

df

Unnamed: 0,text,target
0,wife threatening suiciderecently left wife goo...,1
1,weird compliments coming irl feel good interne...,0
2,finally hear bad year swear fucking god annoying,0
3,helpjust crying hard,1
4,losthello adam ive struggling years afraid yea...,1
...,...,...
232069,rock,0
232070,friends lonely deprived prebought nightmares c...,0
232071,pee tastes salty tea‼ drank pee confirm this‼,0
232072,usual stuff hereim posting sympathy pity worse...,1


# 4. Saving the cleaned dataset

In [46]:
jb.dump(df, "df_cleaned.pkl.z")

['df_cleaned.pkl.z']