# Using Natural Language Processing to predict suicidal ideation on Reddit

## Setup

In [1]:
## Uncomment if using google colab 
%%capture
from google.colab import drive
drive.mount('/content/drive')     # connect to google drive

!cp /content/drive/MyDrive/Colab\ Notebooks/utils.py /content/    # import utils.py to the working folder
!cp /content/drive/MyDrive/Colab\ Notebooks/clean_reddit.csv /content/   # import data to the working folder

!pip install contractions
!pip install langdetect
!python -m spacy download en_core_web_lg

In [2]:
# Import libraries and helper functions
import warnings

import numpy as np

warnings.filterwarnings('ignore')
from utils import *

In [3]:
## Load the cleaned Reddit posts
data_path = "/content/clean_reddit.csv"      # CSV location inside the Colab working folder

# The first CSV column is read as the index and then replaced by a fresh 0..n-1 range
reddit_data = pd.read_csv(data_path, index_col=0)
reddit_data = reddit_data.reset_index(drop=True)

# peek at a reproducible random sample of 5 rows
reddit_data.sample(n=5, random_state=1)

Unnamed: 0,class,word_count,char_count,sentence_count,avg_word_len,avg_sent_len,text_clean,sentiment
120988,suicide,35,144,2,4.114286,17.5,doi overwhelmed suicidal ideation today get da...,0.2
186134,suicide,28,107,1,3.821429,28.0,bye worldi tried commiting suicide multiple ti...,-0.052083
102650,suicide,13,63,1,4.846154,13.0,many pills take overdose specifically ibuprofe...,0.5
23371,non-suicide,53,176,5,3.320755,10.6,status girl get get invited boys sleepover eve...,-0.05
119358,non-suicide,11,35,1,3.181818,11.0,give dms want send people horny jail,-0.1


## Preprocessing

### Encoding target variable

In [4]:
# Encode the target: 1 = suicidal post, 0 = non-suicidal post
label_codes = {'suicide': 1, 'non-suicide': 0}

# Series.map turns every value that is not a key of the dict into NaN, so mapping
# a column that already holds 0/1 (i.e. re-running this cell) would erase all labels.
# Only map while the column still contains the original string labels.
if not pd.api.types.is_numeric_dtype(reddit_data['class']):
    reddit_data['class'] = reddit_data['class'].map(label_codes)

# Fail loudly instead of carrying silently un-mapped (NaN) labels forward
assert reddit_data['class'].notna().all(), "found labels other than 'suicide' / 'non-suicide'"

In [5]:
# show the first five rows of the frame
reddit_data.head(n=5)

Unnamed: 0,class,word_count,char_count,sentence_count,avg_word_len,avg_sent_len,text_clean,sentiment
0,1,156,573,7,3.673077,22.285714,ex wife threatening suiciderecently left wife ...,0.116667
1,0,29,112,1,3.862069,29.0,weird get affected compliments coming someone ...,0.1
2,0,31,106,3,3.419355,10.333333,finally almost never hear bad year ever swear ...,-0.525
3,1,9,33,1,3.666667,9.0,need helpjust help crying hard,-0.245833
4,1,524,1985,21,3.788168,24.952381,losthello name adam struggling years afraid pa...,-0.20965


### Data partitioning

In [7]:
# Separate the target label from the predictor columns
y = reddit_data["class"]
X = reddit_data.drop(columns="class")

# Split into train and test sets: 30% of the rows are held out for testing,
# and the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

In [8]:
# Define bag of words using TF-IDF
stemmer = nltk.stem.porter.PorterStemmer()


def stem_tokens(text):
    """Split ``text`` on single spaces and return the Porter stem of every piece.

    A named function is used instead of a lambda so that the fitted vectorizer
    can still be pickled (lambdas cannot be pickled by the standard library).
    """
    return [stemmer.stem(token) for token in text.split(" ")]


bow = feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    max_df=2000,            # integer => drop terms that occur in more than 2000 documents
    tokenizer=stem_tokens,
)

# Learn vocabulary + idf from the training text only and transform it in the same
# pass; fit() followed by transform() would tokenize and stem the corpus twice.
X_train_transformed = bow.fit_transform(X_train["text_clean"])

# The test text is only transformed, using the vocabulary learned on the train set
X_test_transformed = bow.transform(X_test["text_clean"])

In [None]:
# Total TF-IDF weight of every term across the training documents.
# The sparse matrix is summed directly, which avoids allocating the dense
# (documents x vocabulary) array that .toarray() would build first.
term_totals = np.asarray(X_train_transformed.sum(axis=0)).ravel()

# one row per term, highest total weight first
word_counts = pd.DataFrame(
    {"counts": term_totals}, index=bow.get_feature_names_out()
).sort_values("counts", ascending=False)

# visualize the 20 highest-weighted terms in the train set
# (the values are summed TF-IDF weights, not raw occurrence counts)
ax = word_counts.head(20).plot(kind="bar", figsize=(15, 5), legend=False)
ax.set_title("Top 20 terms by total TF-IDF weight")
ax.set_ylabel("Summed TF-IDF weight")
ax.tick_params(axis="x", rotation=45)
plt.show()

In [None]:
def tfidf_to_frame(tfidf_matrix, row_index):
    """Return the TF-IDF matrix as a DataFrame with one column per vocabulary term.

    Parameters
    ----------
    tfidf_matrix : sparse matrix produced by ``bow``
    row_index : index of the frame the matrix was computed from, so that the
        rows can later be aligned with that frame.
    """
    # NOTE(review): toarray() materialises the full dense matrix; if memory becomes
    # a problem, pd.DataFrame.sparse.from_spmatrix is a drop-in candidate -- confirm
    # that downstream cells accept sparse columns before switching.
    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=bow.get_feature_names_out(),
        index=row_index,
    )


# join vectorized words to the original training dataset
# DataFrame.merge without keys matches rows on the columns the two frames share
# (or raises MergeError when they share none), not on the index. The rows are
# aligned by index, so use an index join. A term that has the same name as an
# existing column gets the "_tfidf" suffix instead of clashing with it.
ds = tfidf_to_frame(X_train_transformed, X_train.index)
dtf_train = X_train.drop(columns='text_clean').join(ds, rsuffix='_tfidf')

# join transformed data to original test data
test_ds = tfidf_to_frame(X_test_transformed, X_test.index)
dtf_test = X_test.drop(columns='text_clean').join(test_ds, rsuffix='_tfidf')

# every input row must appear exactly once in the joined frames
assert len(dtf_train) == len(X_train) and len(dtf_test) == len(X_test)