In [1]:
#IMPORTING LIBRARIES
import numpy as np
import pandas as pd
#import model data
model_data = pd.read_csv('https://raw.githubusercontent.com/gegeli638/Capstone/master/data_for_model.csv', keep_default_na=False)
model_data.head(1)

Unnamed: 0,title,selftext,author,num_comments,is_suicide,url,selftext_clean,title_clean,author_clean,selftext_length,title_length,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,133,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,broken least understood rule helper may invite...,sql witch,4792,144,sql witch understand people reply immediately ...


In [2]:
#DEFINING X and y
X = model_data['megatext_clean'].tolist()
y = model_data['is_suicide'].tolist()

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV       
#TRAIN-TEST SPLIT
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
#Tokenizing the data

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100 #Cuts off reviews after 100 words
training_samples = 1500
validation_samples = 50
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
#print(tokenizer.word_index)


data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(Y_train)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 10524 unique tokens.
Shape of data tensor: (1517, 100)
Shape of label tensor: (1517,)


In [5]:
import os
glove_dir = 'glove.6B'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.50d.txt'), encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
max_words = 10000
embedding_dim = 50
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [7]:
embedding_matrix[0]
embedding_matrix[1]
embedding_matrix.shape
print(embedding_matrix[1])

[-0.71187001 -0.34548     0.25773999  1.11580002 -0.45910001 -1.13559997
 -0.49160001 -0.41088    -0.82639998  0.14788     0.017755    0.4738
  0.4341     -0.75437999 -1.1415      0.32315999 -0.10246     0.27882999
  0.98781002  1.87709999 -0.85609001 -0.072251    0.79453999  0.32765999
 -0.29482999 -0.38997999 -0.67232001 -0.18064    -0.57815999 -0.85960001
  0.43899    -0.086074   -0.95765001  0.71298999  0.80085999  0.048109
  0.09286    -1.01240003  0.13322    -0.25224    -0.26030999 -0.28819001
 -0.67439002 -1.15820003  0.28542     0.44405001 -0.72180998  0.24398001
  1.42970002 -0.2545    ]


In [8]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
#model.add(LSTM(32))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#model.add(Flatten())
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           500000    
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                160032    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 660,065
Trainable params: 660,065
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = True

In [10]:
#Compile and train the model

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
sequences = tokenizer.texts_to_sequences(X_test)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(Y_test)

In [12]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[1.3880292177200317, 0.5947368144989014]

In [13]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.layers import LSTM

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
#model.add(Embedding(10000,32))
#model.add(Dense(64, activation='relu'))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.add(Flatten())

model.summary()

#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
#history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           500000    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                10624     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
_________________________________________________________________
flatten_1 (Flatten)          (None, 1)                 0         
Total params: 510,657
Trainable params: 510,657
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [15]:
#Compile and train the model

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size= 1517, validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
sequences = tokenizer.texts_to_sequences(X_test)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(Y_test)

In [17]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[0.664656937122345, 0.5789473652839661]