# Import necessary libraries

<b>Note:</b> The `drive.mount("/content/drive")` call is only needed on Google Colab. If you're running this in a local Jupyter Notebook, you can skip it.

In [None]:
# Google Drive can only be mounted from Colab. Guard the import so the notebook
# also starts under a plain Jupyter kernel, where google.colab is not installed.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive', force_remount=True)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import gensim

Mounted at /content/drive


<b>Note:</b> If you're working from a local directory, change the path passed to `read_excel` to wherever you saved the dataset.

In [None]:
# Read the labelled dataset from the mounted Drive folder into a DataFrame.
df = pd.read_excel(
    io="/content/drive/MyDrive/Colab Notebooks/IS424 Depression Project/model_data_v2.xlsx",
)

In [None]:
# This step drops the unnecessary column and the empty rows.

df = df.rename(columns={"text_cleaned": "text"})
# errors="ignore" keeps the cell re-runnable once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
# Drop missing rows BEFORE casting to str: astype("str") turns a missing value
# into the literal string "nan", which dropna() no longer recognises as empty.
df = df.dropna()
df = df.astype({"text": "str"})

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000

# Common length for every thread: the largest whitespace-delimited word count
# found in df['text'].
MAX_SEQUENCE_LENGTH = max(len(s.split()) for s in df['text'])

# Hyperparameter. We tested with different parameters and 128 returns the best results.
EMBEDDING_DIM = 128

# The filter set contains a literal backslash. Written as a raw string because
# "\]" in a normal string literal is an invalid escape sequence (a warning on
# current Python versions); the resulting characters are unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 53660 unique tokens.


In [None]:
# Encode every thread as a sequence of integer word ids, then pad/truncate all
# sequences to MAX_SEQUENCE_LENGTH.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

# One column per class of the 'depression' label.
Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=99
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

Shape of data tensor: (38362, 2521)
Shape of label tensor: (38362, 2)
(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
# Used by the metric cells later on: the test labels as a flat list of class
# ids. A 1 in the first one-hot column maps to class 0, anything else to class 1.
Y_test_arr = [0 if one_hot[0] == 1 else 1 for one_hot in Y_test]

# LSTM + Keras' Word Embeddings


In [None]:
# LSTM
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow import keras
from keras.regularizers import l2

# Embedding learned from scratch -> LSTM emitting one vector per timestep ->
# max over the time axis -> small dense layer -> two-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
model.add(Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))
model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split is not touched here.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25


In [None]:
# Score the trained model on the held-out split; the first two entries of the
# result are reported as loss and accuracy.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

Test set
  Loss: 0.158
  Accuracy: 0.962


In [None]:
# Loss / accuracy on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Turn each row of class probabilities into the id of its most likely class,
# then compare against the true ids with class-frequency-weighted averaging.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Precision is:  0.9615153440042628
Recall is:  0.9615083847423755
F1 Score is:  0.9615106347249784


# Bi-LSTM + Keras' Word Embeddings

In [None]:
#Bi-LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2
# Bidirectional variant: the embedding is still learned from scratch, but the
# LSTM is wrapped in Bidirectional before max-pooling over the time axis.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
model.add(Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001))))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))
model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split is not touched here.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25


In [None]:
# Loss / accuracy of the Bi-LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.161
  Accuracy: 0.958
Precision is:  0.9582008817589148
Recall is:  0.9579459553392997
F1 Score is:  0.9579097474929152


# LSTM + CBOW

- Choose where you want to save the `embedding_word2vec` txt file (the `filename` variable in the cell below).
- The code cell after that reads the file back, so make sure it opens the same path!

In [None]:
# CBOW Model
#
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# renamed them to `vector_size` and `wv.key_to_index`) - confirm the installed
# gensim version before re-running.

# One list of whitespace-separated tokens per thread.
text_sentences = df['text'].apply(str.split)

model = gensim.models.Word2Vec(
    sentences=text_sentences,
    size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    workers=4,
)
words = list(model.wv.vocab)
print(len(words))

# Where you want to save.
filename = 'depression_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)


53756


In [None]:
import numpy as np

# Fit the tokenizer FIRST so that the word ids used to fill the embedding matrix
# are exactly the ids that end up in X. Filling the matrix from a word_index
# left over from a differently configured tokenizer can pair rows with the
# wrong words.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Read back the CBOW vectors saved by the Word2Vec cell (same relative path it
# wrote to). Each data line is "<word> <v1> ... <vN>".
embeddings_index = {}
with open("depression_embedding_word2vec.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Skip the "<vocab size> <dimension>" header line (and anything else
        # that is not a full vector) instead of storing it as if it were a word.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector of the word with tokenizer id i; words without a
# pre-trained vector keep an all-zero row.
num_words = len(word_index) + 1
word2vec_embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word2vec_embedding_matrix[i] = embedding_vector
print(num_words)

Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

# Same test_size and random_state as the first split in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)



53661
Shape of label tensor: (38362, 2)
(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
# CBOW LSTM

from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# Frozen Word2Vec embedding -> LSTM emitting one vector per timestep ->
# max over the time axis -> small dense layer -> two-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# Both embedding dimensions are read from the weight matrix itself, so the layer
# cannot disagree with the weights it is initialised from.
model.add(Embedding(word2vec_embedding_matrix.shape[0], word2vec_embedding_matrix.shape[1],
                    weights=[word2vec_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split must
# stay unseen until evaluation, so it is deliberately not used as validation data.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25


In [None]:
# Loss / accuracy of the CBOW-initialised LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.158
  Accuracy: 0.954
Precision is:  0.955613322188781
Recall is:  0.9536015292379877
F1 Score is:  0.9536323512888436


# LSTM + Skip-Gram

- Choose where you want to save the skip-gram `embedding_word2vec` txt file (the `filename` variable in the cell below).
- The code cell after that reads the file back, so make sure it opens the same path!

In [None]:
# Skip-Gram Model
#
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# renamed them to `vector_size` and `wv.key_to_index`) - confirm the installed
# gensim version before re-running.

# One list of whitespace-separated tokens per thread.
text_sentences = df['text'].apply(str.split)

# Same settings as the CBOW cell, plus sg=1.
model = gensim.models.Word2Vec(
    sentences=text_sentences,
    size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
words = list(model.wv.vocab)
print(len(words))

# Where you want to save.
filename = 'depression_embedding_word2vec_skipgram.txt'
model.wv.save_word2vec_format(filename, binary=False)


53756


In [None]:
# SkipGram word2vec
import numpy as np

# Fit the tokenizer FIRST so that the word ids used to fill the embedding matrix
# are exactly the ids that end up in X, whatever word_index was left behind by
# earlier cells.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Read back the skip-gram vectors saved by the Word2Vec cell (same relative path
# it wrote to). Each data line is "<word> <v1> ... <vN>".
embeddings_index = {}
with open("depression_embedding_word2vec_skipgram.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Skip the "<vocab size> <dimension>" header line (and anything else
        # that is not a full vector) instead of storing it as if it were a word.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector of the word with tokenizer id i; words without a
# pre-trained vector keep an all-zero row.
num_words = len(word_index) + 1
word2vec_embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word2vec_embedding_matrix[i] = embedding_vector
print(num_words)

Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

# Same test_size and random_state as the first split in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)



53661
Shape of label tensor: (38362, 2)
(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2


# Frozen skip-gram embedding -> LSTM emitting one vector per timestep ->
# max over the time axis -> small dense layer -> two-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# Both embedding dimensions are read from the weight matrix itself, so the layer
# cannot disagree with the weights it is initialised from.
model.add(Embedding(word2vec_embedding_matrix.shape[0], word2vec_embedding_matrix.shape[1],
                    weights=[word2vec_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split must
# stay unseen until evaluation, so it is deliberately not used as validation data.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25


In [None]:
# Loss / accuracy of the skip-gram-initialised LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.138
  Accuracy: 0.961
Precision is:  0.9618126254414924
Recall is:  0.9609870536102181
F1 Score is:  0.9610126628791377


# LSTM + Glove

- `glove.6B.300d.txt` is included in the submission folder. Update the path in the cell below to wherever you stored it.

In [None]:
# Glove
import numpy as np

# Fit the tokenizer FIRST so that the word ids used to fill the embedding matrix
# are exactly the ids that end up in X, whatever word_index was left behind by
# earlier cells.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Row i will hold the 300-dimensional vector of the word with tokenizer id i;
# words missing from the GloVe file keep an all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))

# Use your own directory.
embeddings_index = {}
with open("/content/drive/MyDrive/Colab Notebooks/IS424 Depression Project/Classification/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Keep only lines that carry a complete vector of the expected width.
        if len(values) != embedding_matrix.shape[1] + 1:
            continue
        word = values[0]
        # Parse to floats once here instead of storing the raw strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

for word, i in word_index.items():
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(num_words)

Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

53661
Shape of label tensor: (38362, 2)


In [None]:
# 70/30 train/test split with the same seed used for every other split in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=99
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Frozen GloVe embedding -> LSTM emitting one vector per timestep ->
# max over the time axis -> small dense layer -> two-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# Both embedding dimensions are read from the weight matrix itself, so the layer
# cannot disagree with the weights it is initialised from.
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split must
# stay unseen until evaluation, so it is deliberately not used as validation data.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25


In [None]:
# Loss / accuracy of the GloVe-initialised LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.157
  Accuracy: 0.960
Precision is:  0.9601075235415188
Recall is:  0.9597706143018507
F1 Score is:  0.9597907993001087


# LSTM + Pre-trained Word2Vec

In [None]:
# Pre-trained Word2Vec Model
# Fetching "word2vec-google-news-300" through the gensim downloader takes a
# really long time, please be patient!

from gensim import downloader as api

model = api.load("word2vec-google-news-300")


In [None]:
# Pre-trained Word2Vec Model: report how many words the loaded vectors cover.

text_sentences = df['text'].apply(lambda x: x.split())

# `model` is the object returned by api.load(). Read its vocabulary directly,
# the same way the following cell reads model.index2word / model.vectors,
# instead of going through the extra `.wv` hop.
# NOTE(review): `.wv` on an already-loaded vectors object is understood to be a
# deprecated self-alias in gensim 3.x - confirm against the installed gensim.
words = list(model.vocab)
print(len(words))

In [None]:
import numpy as np

# Fit the tokenizer FIRST so that the word ids used to fill the embedding matrix
# are exactly the ids that end up in X, whatever word_index was left behind by
# earlier cells.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# word -> vector lookup over the pre-trained model. The word list and vector
# array are used as they are: wrapping the full pre-trained vocabulary in
# np.asarray first would only build a second, fixed-width copy of every key.
labels = model.index2word
vectors = model.vectors
word_embeddings = dict(zip(labels, vectors))

# Row i holds the 300-dimensional vector of the word with tokenizer id i; words
# the pre-trained model does not know keep an all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))

for word, i in word_index.items():
    if i >= num_words:
        continue

    embedding_vector = word_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(num_words)

Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

# Same test_size and random_state as the first split in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

53661
Shape of label tensor: (38362, 2)
(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Frozen pre-trained Word2Vec embedding -> LSTM emitting one vector per timestep
# -> max over the time axis -> small dense layer -> two-way softmax.
# Note that `model` is rebound here from the loaded word vectors to the Keras model.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# Both embedding dimensions are read from the weight matrix itself, so the layer
# cannot disagree with the weights it is initialised from.
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split must
# stay unseen until evaluation, so it is deliberately not used as validation data.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Loss / accuracy of the pre-trained-Word2Vec LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.143
  Accuracy: 0.963
Precision is:  0.9632444777731072
Recall is:  0.9632461551829004
F1 Score is:  0.9632449591810578


# Bi-LSTM + Glove
- `glove.6B.300d.txt` is included in the submission folder. Update the path in the cell below to wherever you stored it.

In [None]:
# Glove
import numpy as np

# Fit the tokenizer FIRST so that the word ids used to fill the embedding matrix
# are exactly the ids that end up in X, whatever word_index was left behind by
# earlier cells.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Row i will hold the 300-dimensional vector of the word with tokenizer id i;
# words missing from the GloVe file keep an all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 300))

# Use your own directory.
embeddings_index = {}
with open("/content/drive/MyDrive/Colab Notebooks/IS424 Depression Project/Finalised Submission Folder/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Keep only lines that carry a complete vector of the expected width.
        if len(values) != embedding_matrix.shape[1] + 1:
            continue
        word = values[0]
        # Parse to floats once here instead of storing the raw strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

for word, i in word_index.items():
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(num_words)

Y = pd.get_dummies(df['depression']).values
print('Shape of label tensor:', Y.shape)

# Same test_size and random_state as the first split in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 99)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)



53661
Shape of label tensor: (38362, 2)
(26853, 2521) (26853, 2)
(11509, 2521) (11509, 2)


In [None]:
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf 
from tensorflow import keras
from keras.regularizers import l2

# Frozen GloVe embedding -> bidirectional LSTM emitting one vector per timestep
# -> max over the time axis -> small dense layer -> two-way softmax.
model = Sequential()
adam = tf.optimizers.Adam(learning_rate=0.001)
# Both embedding dimensions are read from the weight matrix itself, so the layer
# cannot disagree with the weights it is initialised from.
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001))))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001)))
model.add(Dense(2, activation="softmax"))

model.compile(optimizer=adam, loss="binary_crossentropy",
              metrics=['accuracy'])

epochs = 25
batch_size = 64

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best epoch's weights, so the model evaluated afterwards is not the one left
# over from the last, non-improving epoch.
es = EarlyStopping(monitor='val_loss', mode="min", patience=3, restore_best_weights=True)

# The validation set is carved out of the training split; the test split must
# stay unseen until evaluation, so it is deliberately not used as validation data.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.3, callbacks=[es])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25


In [None]:
# Loss / accuracy of the GloVe-initialised Bi-LSTM on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

# Most likely class per test thread, scored against the true class ids.
predictions = model.predict(X_test)
predictions = list(np.argmax(predictions, axis=1))
precision, recall, f1_score, none = precision_recall_fscore_support(
    Y_test_arr, predictions, average='weighted'
)
print("Precision is: ", precision)
print("Recall is: ", recall)
print("F1 Score is: ", f1_score)

Test set
  Loss: 0.157
  Accuracy: 0.961
Precision is:  0.9614523505560858
Recall is:  0.9614214962203493
F1 Score is:  0.9614274979874109
