In [102]:
# Import packages
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
# Convert JSON file to array of JSON objects
with open('./data/Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")

In [4]:
# Create DataFrame from array of JSON objects
df = pd.DataFrame(data)

In [5]:
# Create predictor and target from DataFrame
X = df['headline']
y = df['is_sarcastic']

In [6]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [89]:
# Set tokenizer and padder parameters
num_words = 1000
oov_token = '<OOV>'
pad_type = 'pre'
trunc_type = 'post'

In [90]:
# Fit tokenizer on the training set
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [91]:
# Tokenize the training and test sets
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

maxlen = max([len(x) for x in X_train_sequences])

In [92]:
# Pad the training and test sets
X_train_padded = pad_sequences(X_train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

In [93]:
print(X_train[1])

the 'roseanne' revival catches up to our thorny political mood, for better and worse


In [94]:
print(X_train_sequences[1])

[1, 927, 736, 695, 429, 1, 41, 1, 2, 45, 1, 5, 7, 302, 736]


In [95]:
print(X_train_padded[1])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   1 927 736 695 429   1  41   1   2  45   1
   5   7 302 736]


In [96]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=64),
    tf.keras.layers.GRU(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, 'sigmoid')
])

In [97]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [98]:
len(y_train.values.reshape(len(X_train_padded), 1))

20031

In [99]:
history = model.fit(X_train_padded, y_train.values.reshape(-1, 1), epochs=10, validation_data=(X_test_padded, y_test.values.reshape(-1, 1)))

Train on 20031 samples, validate on 6678 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [100]:
test_loss, test_acc = model.evaluate(X_test_padded, y_test.values.reshape(-1, 1))

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))



Test Loss: 0.47563465652297304
Test Accuracy: 0.823150634765625
