In [None]:
from snippets import unpack_file, load_text_data, plot_graphs, generate_text, create_n_grams
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np


In [None]:
# Archive name and the directory handed to the project helper `unpack_file`
# (presumably the extraction target -- confirm against snippets.unpack_file).
FILE, DIR = "sarcasm.zip", "sarcasm"

unpack_file(
    FILE,
    DIR,
)

In [None]:
# Path of the CSV inside the unpacked directory.
DATA_FILE = DIR + "/train-balanced-sarcasm.csv"

# NOTE(review): the positional arguments 1 and 0 are presumably the text and
# label column indices -- confirm against snippets.load_text_data.
corpus, labels = load_text_data(DATA_FILE, 1, 0)

# Keep only the sentences whose label equals 1; labels are looked up by the
# sentence's position in the corpus.
corpus = list(
    sentence
    for position, sentence in enumerate(corpus)
    if int(labels[position]) == 1
)

In [None]:
# Build the vocabulary and the (predictor, predictand) training pairs.

MAX_WORDS = 1000    # largest word index kept in the vocabulary (falsy disables the cut)
SEQUENCE_LEN = 50   # forwarded to create_n_grams; also sizes the model input downstream
MAX_DOCS = 5000     # forwarded to create_n_grams (presumably a cap on documents used -- confirm)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

if MAX_WORDS:
    # Drop every vocabulary entry whose index is above MAX_WORDS.
    tokenizer.word_index = dict(
        (word, index)
        for word, index in tokenizer.word_index.items()
        if index <= MAX_WORDS
    )

# One extra class beyond the kept vocabulary entries.
total_words = 1 + len(tokenizer.word_index)

predictors, predictands = create_n_grams(corpus, tokenizer, SEQUENCE_LEN, MAX_DOCS)

# One-hot encode the targets so they match the softmax output width.
predictands = tf.keras.utils.to_categorical(
    predictands,
    num_classes=total_words,
    dtype=int,
)

In [None]:
# Next-word model: embedding -> bidirectional LSTM -> dropout -> LSTM ->
# regularised dense layer -> softmax over the vocabulary.
model = tf.keras.models.Sequential()
# NOTE(review): input_length assumes create_n_grams yields predictor rows of
# SEQUENCE_LEN - 1 tokens -- confirm against snippets.create_n_grams.
model.add(tf.keras.layers.Embedding(total_words, 100, input_length=SEQUENCE_LEN - 1))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(50))
# Dense `units` must be an integer; `total_words / 2` is a float in Python 3,
# so floor division is used to get a valid layer width.
model.add(tf.keras.layers.Dense(
    total_words // 2,
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
))
# One output per class; matches the one-hot width of the targets.
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Train for a single epoch with progress output; the returned object is kept
# so the recorded metrics can be plotted afterwards.
history = model.fit(
    x=predictors,
    y=predictands,
    epochs=1,
    verbose=1,
)

In [None]:
# Plot the "accuracy" metric from the fit history. No validation data was
# passed to fit(), hence plot_validation=False.
plot_graphs(
    history,
    "accuracy",
    plot_validation=False,
)

In [None]:
# Prompt handed to the generator and the word count passed alongside it
# (presumably the number of words to append -- confirm against snippets.generate_text).
seed_text = "I have never seen such a good movie!"
next_words = 50

# Left as the last expression so the notebook displays whatever the helper returns.
generate_text(
    model,
    tokenizer,
    seed_text,
    next_words,
)