In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the dataset and output files below are reachable;
# force_remount=True asks Colab to remount even if the drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from time import time
from collections import Counter


In [None]:
# Load the dataset CSV from the mounted Drive folder; header=0 takes column names from the first row.
df = pd.read_csv('/content/gdrive/MyDrive/Suicide_Detection/reddit_suicide_detection_final_clean.csv', header=0)

In [None]:
# Replace the raw 'text' column with the pre-cleaned one.
# The swap is guarded (and no longer done in place) so that re-running this cell
# does not drop the already-renamed 'text' column and break the cells below.
if "cleaned_text" in df.columns:
    df = df.drop(columns=['text']).rename(columns={"cleaned_text": "text"})
# Encode the string labels as integers; values that are not keys of `classes`
# (e.g. labels already mapped to 0/1 on a re-run) are left untouched.
classes = {"suicide": 1, "non-suicide": 0}
df = df.replace({"class": classes})
# Discard rows whose cleaned text is missing
df = df[df['text'].notnull()]
df

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know girl but...
2,0,finally never hear bad year swear fucking god ...
3,1,need help help cry hard
4,1,end tonight not anymore quit
...,...,...
174170,0,today went sled friend not like but pretty big...
174171,0,not like rock not go but
174172,0,tell friend not lonely deprive buy little nigh...
174173,0,pee probably taste like salty tea drink pee co...


### Split dataset into training and test sets

In [None]:
# Hold out 20% of the rows as the test set, stratified on the class label,
# with a fixed random_state so the split is repeatable.
train_text, test_text, train_labels, test_labels = train_test_split(
    df['text'],
    df['class'],
    test_size=0.2,
    random_state=0,
    stratify=df['class'],
)

### Building Vocabulary

In [None]:
# tokenise every training sentence on whitespace
tokens_list = [sentence.split() for sentence in train_text]
# count how often each token appears across the training sentences
vocab = Counter()
vocab.update(word for sentence_tokens in tokens_list for word in sentence_tokens)
# keep only the tokens seen at least `min_occurance` times
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))


18859


In [None]:
# save list to file
def save_list(lines, filename):
    """Write `lines` to `filename`, one item per line.

    Args:
        lines: iterable of strings; the items are joined with newlines
            (no trailing newline is written).
        filename: path of the file to create or overwrite.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # the context manager closes the file even if the write raises
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# save the frequency-filtered tokens to a vocabulary file
# (passing `vocab` here would write every counted word and bypass the min_occurance filter)
save_list(tokens, '/content/gdrive/MyDrive/Suicide_Detection/vocab.txt')

In [None]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of the text file at `filename` as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text.
    """
    # the context manager closes the file even if the read raises
    with open(filename, 'r') as file:
        return file.read()

# read the saved vocabulary back, split it on whitespace and keep it as a set
# (the set is what the `in vocab` membership checks below run against)
vocab_filename = '/content/gdrive/MyDrive/Suicide_Detection/vocab.txt'
vocab = set(load_doc(vocab_filename).split())


### Removing out-of-vocab words

In [None]:
# clean each line
def clean_line(line, vocab):
  """Drop every whitespace-separated token of `line` that is not in `vocab`.

  Returns a one-element list wrapping the list of kept tokens, so the caller
  can append it to a list of sentences with `+=`.
  """
  kept = []
  for word in line.split():
    if word in vocab:
      kept.append(word)
  return [kept]

# clean entire dataset
def process_lines(data, vocab):
  """Apply `clean_line` to every item of `data`.

  Returns a list with one token list per input item, in input order.
  """
  return [tokens for text in data for tokens in clean_line(text, vocab)]

In [None]:
# filter both splits against the same vocabulary
train_clean, test_clean = [process_lines(texts, vocab) for texts in (train_text, test_text)]

### Training the Model
     

In [None]:
# Set up the parameters of the model. No corpus is passed here, so the
# vocabulary-building and training steps are run explicitly below.
# NOTE(review): the gensim docs state that a fixed seed only gives fully
# reproducible vectors with workers=1 (and a fixed PYTHONHASHSEED) - confirm
# whether exact reproducibility matters here.
model = Word2Vec(vector_size=300, window=10, min_count=1, epochs=5, seed=0)

# build the vocabulary from the cleaned training sentences, which initialises the model
t = time()
model.build_vocab(train_clean, progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# train the model; reuse the epoch count configured on the model so the two
# values cannot drift apart when one of them is edited
t = time()
model.train(train_clean, total_examples=model.corpus_count, epochs=model.epochs, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))


Time to build vocab: 0.02 mins
Time to train the model: 0.67 mins


In [None]:
# Write the learned word vectors (model.wv) to Drive in the text (non-binary)
# word2vec format, as selected by binary=False.
filename = '/content/gdrive/MyDrive/Suicide_Detection/embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [None]:
# Sanity check of the embedding: list the words whose vectors are most similar to 'suicide'
model.wv.most_similar('suicide')

[('blackwell', 0.6158646941184998),
 ('involuntarily', 0.577850878238678),
 ('awry', 0.567089319229126),
 ('arson', 0.5341591835021973),
 ('unsuccessful', 0.5330076217651367),
 ('suicidal', 0.5307183861732483),
 ('git', 0.5295685529708862),
 ('frustum', 0.5262470245361328),
 ('fuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuck',
  0.5226901769638062),
 ('reoccur', 0.5203526020050049)]