In [22]:
import json

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from   tensorflow.keras.preprocessing.text import Tokenizer
from   tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# List the contents of the data directory (IPython shell escape).
!ls ../../../data

aclImdb_v1.tar.gz bbc-text.csv      nmt               surnames
ag_news           books             sarcasm.json      yelp


In [6]:
# Directory holding the dataset files; used below to locate sarcasm.json.
DATA = '../../../data'

In [15]:
# --- Hyperparameters and preprocessing settings ---

# Vocabulary cap given to the tokenizer/embedding, and embedding vector width.
vocab_size, embedding_dim = 1000, 32

# Fixed sequence length; padding and truncation both use the 'post' mode.
max_len = 120
padding_type = trunc_type = 'post'

# Token the tokenizer substitutes for out-of-vocabulary words.
oov = '<OOV>'

# Number of leading examples used for training; the remainder is held out.
training_size = 20000

In [8]:
# Load the sarcasm dataset into memory.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding, RFC 8259)
# so decoding does not depend on the platform's locale default.
with open(f'{DATA}/sarcasm.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [9]:
# Pull the headline text and the sarcasm flag out of every record, producing
# two parallel lists that share the order of `datastore`.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
# NOTE: `urls` is created empty and is not populated in this cell.
urls = []

In [11]:
# The first `training_size` examples form the training split and everything
# after them the test split. Sentences and labels are cut at the same index
# so they stay aligned.
training_sentences, testing_sentences = (
    sentences[:training_size], sentences[training_size:])
training_labels, testing_labels = (
    labels[:training_size], labels[training_size:])

In [12]:
# Fit the tokenizer on the training split only. `vocab_size` caps the
# vocabulary used for encoding and `oov` is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token=oov, num_words=vocab_size)
tokenizer.fit_on_texts(texts=training_sentences)

In [13]:
# Keep a reference to the word index built by the fitted tokenizer.
word_index = tokenizer.word_index

In [16]:
# Padding options shared by both splits so they end up with the same shape.
pad_options = dict(maxlen=max_len, padding=padding_type, truncating=trunc_type)

# Encode the text of each split as integer sequences ...
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# ... then pad/truncate every sequence to `max_len`.
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [19]:
# Binary classifier: embedding -> bidirectional LSTM -> dense head.
layers = tf.keras.layers
mod = tf.keras.Sequential([
    layers.Embedding(
        input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    layers.Bidirectional(layers.LSTM(units=32)),
    layers.Dense(units=24, activation='relu'),
    # A single sigmoid unit pairs with the binary cross-entropy loss below.
    layers.Dense(units=1, activation='sigmoid'),
])
mod.summary()
mod.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 32)           32000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                16640     
_________________________________________________________________
dense_2 (Dense)              (None, 24)                1560      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 50,225
Trainable params: 50,225
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Number of training epochs; passed as `epochs` to mod.fit below.
EPX = 50

In [23]:
# Convert the features and labels of both splits to NumPy arrays before training.
training_padded, training_labels, testing_padded, testing_labels = map(
    np.array,
    (training_padded, training_labels, testing_padded, testing_labels),
)

In [None]:
# Train for EPX epochs, using the held-out split as validation data.
history = mod.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=EPX,
    verbose=1,
)

Train on 20000 samples, validate on 6709 samples
Epoch 1/50


In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value sequences (here, the value returned by ``mod.fit``).
        string: Name of the metric to plot, e.g. ``'loss'``. ``'acc'`` and
            ``'accuracy'`` are treated as aliases of each other: if the
            requested one is absent but the other is present, the one that
            exists is plotted.

    Raises:
        KeyError: If the metric (after alias resolution) or its ``'val_'``
            counterpart is missing from ``history.history``.
    """
    records = history.history
    metric = string
    if metric not in records:
        # Keras has recorded accuracy under either 'acc' or 'accuracy'
        # depending on the version, so accept whichever spelling is present.
        alias = {'acc': 'accuracy', 'accuracy': 'acc'}.get(metric)
        if alias in records:
            metric = alias
    val_metric = 'val_' + metric

    plt.plot(records[metric])
    plt.plot(records[val_metric])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()

In [None]:
# Draw the accuracy curve first, then the loss curve.
for metric_name in ('acc', 'loss'):
    plot_graphs(history, metric_name)