In [31]:
import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, SpatialDropout1D, LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Activation, Dropout
# import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt

In [32]:
# Auxiliary function for graphing a training metric (e.g. loss or accuracy) per epoch
def graph_plots(history, string):
  """Plot one training metric per epoch, plus its validation curve if recorded.

  Args:
    history: object exposing a ``history`` dict that maps metric names to
      per-epoch value lists (e.g. the object returned by Keras ``Model.fit``).
    string: name of the metric to plot, e.g. ``"loss"`` or ``"accuracy"``.

  Raises:
    KeyError: if ``string`` itself is not a key of ``history.history``.
  """
  val_key = 'val_' + string
  legend_labels = [string]

  plt.plot(history.history[string])
  # The 'val_*' series is absent when training ran without validation data;
  # skip it instead of failing with a KeyError.
  if val_key in history.history:
    plt.plot(history.history[val_key])
    legend_labels.append(val_key)

  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(legend_labels)
  plt.show()

In [33]:
# Load the dataset; later cells read its 'cleaned_posts' and 'type' columns
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [34]:
# Fit a word-level tokenizer on the posts to learn the vocabulary
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['cleaned_posts'])
word_index = tokenizer.word_index  # Learned vocabulary (word -> integer index)

# NOTE(review): the +1 looks like it reserves index 0 (padding) on top of the
# learned words - confirm against the Tokenizer docs.
VOCAB_SIZE = len(word_index) + 1  # Total words

In [35]:
# Turn every post into its sequence of vocabulary indices
X = tokenizer.texts_to_sequences(df['cleaned_posts'])

# The longest sequence defines the common length used for padding
MAX_SEQ_LENGTH = max(map(len, X))

# Pad all sequences to MAX_SEQ_LENGTH so every row has the same length
X = tf.keras.preprocessing.sequence.pad_sequences(
    X,
    maxlen=MAX_SEQ_LENGTH,
)

In [36]:
# Encode the values of the 'type' column as integer class ids.
# The fitted encoder is kept (instead of being discarded) so that predicted ids
# can later be mapped back with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['type'])
n_classes = len(label_encoder.classes_)  # number of distinct classes seen during fit

In [37]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Simple RNN - baseline model
model_rnn = Sequential()
model_rnn.add(Embedding(VOCAB_SIZE, 50, input_length=MAX_SEQ_LENGTH))
model_rnn.add(SimpleRNN(100))
model_rnn.add(Dense(n_classes, activation='softmax'))
# The targets are integer class ids (output of LabelEncoder), hence the sparse loss
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# EarlyStopping monitors 'val_loss', which Keras only reports when fit() is given
# validation data. Without it the callback can never trigger and no 'val_*'
# series is recorded for plotting, so part of the training data is held out here
# (validation_split takes it from the end of X_train / y_train).
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3,
                               min_delta=1e-5)
history = model_rnn.fit(X_train, y_train,
                        epochs=100,
                        validation_split=0.2,
                        verbose=0,
                        callbacks=[early_stopping])



In [None]:
# Show the training curves, one figure per tracked metric
for metric in ("accuracy", "loss"):
  graph_plots(history, metric)