In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Import packages
import json
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the dataset into a list of dicts.
# Assumes the file stores one JSON object per line (JSON Lines); each non-blank
# line is parsed on its own instead of splicing commas into the raw text, which
# avoids a second full copy of the file and tolerates blank lines.
# The encoding is explicit so decoding does not depend on the platform default.
with open('./data/Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Create DataFrame from array of JSON objects: one row per parsed record,
# with the JSON keys as columns (the 'headline' and 'is_sarcastic' columns
# are the ones read later in the notebook).
df = pd.DataFrame(data)

In [None]:
# Separate the model input (headline text) from the label column
X, y = df['headline'], df['is_sarcastic']

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Set tokenizer and padder parameters
# Vocabulary cap passed to the Tokenizer; also used as the Embedding input_dim.
num_words = 1000
# Placeholder token the Tokenizer substitutes for out-of-vocabulary words.
oov_token = '<OOV>'
# `padding` argument of pad_sequences: 'pre' puts the zeros before the sequence.
pad_type = 'pre'
# `truncating` argument of pad_sequences: 'post' drops tokens from the end.
trunc_type = 'post'
# Size of each embedding vector (Embedding output_dim).
embedding_dim = 8

In [None]:
# Build the vocabulary from the training headlines only, so no information
# from the test split leaks into the word index.
tokenizer = Tokenizer(
    oov_token=oov_token,
    num_words=num_words,
)
tokenizer.fit_on_texts(X_train)

# Full word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index

In [None]:
# Convert every headline of both splits into its list of integer token ids
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Length of the longest tokenized training headline; used as the padded length
maxlen = max(map(len, X_train_sequences))

In [None]:
# Bring both splits to the same fixed length using identical padding settings
pad_kwargs = dict(maxlen=maxlen, padding=pad_type, truncating=trunc_type)
X_train_padded = pad_sequences(X_train_sequences, **pad_kwargs)
X_test_padded = pad_sequences(X_test_sequences, **pad_kwargs)

In [None]:
# Show the raw headline at *position* 1 of the training split.
# .iloc is required: after the shuffled train_test_split the Series keeps its
# original index labels, so X_train[1] would look up label 1, which may be
# missing from the training split (KeyError) or refer to a different row than
# the positional X_train_sequences[1] / X_train_padded[1].
print(X_train.iloc[1])

In [None]:
# Token-id list of the training example at position 1
print(X_train_sequences[1])

In [None]:
# Row 1 of the padded training matrix (the padded form of X_train_sequences[1])
print(X_train_padded[1])

## Hyperparameter tuning

In [None]:
# Remove TensorBoard logs left over from earlier runs.
# shutil.rmtree replaces the `rm -rf` shell call so the cell also works where
# no POSIX shell is available; ignore_errors=True keeps the "no error if the
# directory is missing" behaviour of `rm -rf`.
shutil.rmtree('./logs/', ignore_errors=True)

In [None]:
# Hyperparameter search space for the TensorBoard HParams dashboard.

# Initializer of the Embedding weights. Both the class-name and the snake_case
# identifiers are listed. The abstract base class "Initializer" is deliberately
# not included (it cannot create weights), and "RandomNormal" is spelled out in
# full so Keras can resolve it.
HP_EMBED_INITIALIZER = hp.HParam('embeddings_initializer', hp.Discrete([
    "Constant",
    "GlorotNormal",
    "GlorotUniform",
    "HeNormal",
    "HeUniform",
    "Identity",
    "LecunNormal",
    "LecunUniform",
    "Ones",
    "Orthogonal",
    "RandomNormal",
    "RandomUniform",
    "TruncatedNormal",
    "VarianceScaling",
    "Zeros",
    "constant",
    "glorot_normal",
    "glorot_uniform",
    "he_normal",
    "he_uniform",
    "identity",
    "lecun_normal",
    "lecun_uniform",
    "ones",
    "orthogonal",
    "random_normal",
    "random_uniform",
    "truncated_normal",
    "variance_scaling",
    "zeros",
]))

# L2 penalty factor applied to the embedding weights.
HP_EMBEDDINGS_REGULARIZER = hp.HParam('l2 regularizer', hp.RealInterval(0.0, 1.0))

# Constraint applied to the embedding weights. RadialConstraint /
# radial_constraint are left out: they only accept rank-4 convolution kernels
# and raise on the 2-D embedding matrix.
HP_EMBEDDINGS_CONSTRAINT = hp.HParam('embeddings_constraint', hp.Discrete([
    "MaxNorm",
    "MinMaxNorm",
    "NonNeg",
    "UnitNorm",
    "max_norm",
    "min_max_norm",
    "non_neg",
    "unit_norm",
]))

# Optimizer identifier passed to model.compile.
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete([
    "sgd",
    "rmsprop",
    "adam",
    "adadelta",
    "adagrad",
    "adamax",
    "nadam",
    "ftrl",
]))

# Activation of the hidden Dense layer.
HP_ACTIVATION = hp.HParam('activation', hp.Discrete([
    "relu",
    "sigmoid",
    "softmax",
    "softplus",
    "softsign",
    "tanh",
    "selu",
    "elu",
    "exponential",
]))

# Name of the scalar summary that stores each trial's test accuracy.
METRIC_ACCURACY = 'accuracy'

# Register the hyperparameters and the metric once, at the root of the log
# directory, so the HParams dashboard knows which columns to display.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[
            HP_EMBED_INITIALIZER,
            HP_EMBEDDINGS_REGULARIZER,
            HP_EMBEDDINGS_CONSTRAINT,
            HP_ACTIVATION,
            HP_OPTIMIZER,
        ],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )

In [None]:
def train_test_model(hparams, epochs=3):
    """Build, train and evaluate one classifier for a hyperparameter set.

    The network is Embedding -> Bidirectional(GRU(8)) -> Dense(8) ->
    Dense(1, sigmoid), compiled with binary cross-entropy. It is trained on
    the notebook-level ``X_train_padded`` / ``y_train`` and scored on
    ``X_test_padded`` / ``y_test``.

    Args:
        hparams: Mapping keyed by the ``HP_*`` objects; must contain
            HP_EMBED_INITIALIZER, HP_EMBEDDINGS_REGULARIZER,
            HP_EMBEDDINGS_CONSTRAINT, HP_ACTIVATION and HP_OPTIMIZER.
        epochs: Number of training epochs. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test split.
    """
    model = Sequential()
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        # Index 0 is the padding value, so mask it out.
                        mask_zero=True,
                        embeddings_initializer=hparams[HP_EMBED_INITIALIZER],
                        embeddings_regularizer=tf.keras.regularizers.l2(hparams[HP_EMBEDDINGS_REGULARIZER]),
                        embeddings_constraint=hparams[HP_EMBEDDINGS_CONSTRAINT]))
    model.add(Bidirectional(GRU(8)))
    model.add(Dense(8, activation=hparams[HP_ACTIVATION]))
    model.add(Dense(1, 'sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=hparams[HP_OPTIMIZER], metrics=['accuracy'])

    # Labels are reshaped to a column vector to match the (batch, 1) output.
    model.fit(X_train_padded, y_train.values.reshape(-1, 1), epochs=epochs)

    # evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
    _, accuracy = model.evaluate(X_test_padded, y_test.values.reshape(-1, 1))
    return accuracy

In [None]:
def run(run_dir, hparams):
    """Execute one trial and write its results to ``run_dir``.

    Records the hyperparameter values, trains and evaluates a model with
    them, and logs the resulting accuracy as a scalar summary at step 1.

    Args:
        run_dir: Directory the TensorBoard summaries of this trial go to.
        hparams: Mapping of ``HP_*`` objects to the values used in this trial.
    """
    writer = tf.summary.create_file_writer(run_dir)
    with writer.as_default():
        # Log the hyperparameter values before training starts.
        hp.hparams(hparams)
        tf.summary.scalar(METRIC_ACCURACY, train_test_model(hparams), step=1)

In [None]:
# Random search: draw NUM_TRIALS hyperparameter sets and run one trial for each.
NUM_TRIALS = 10

# Seeded generator so the same hyperparameter sets are drawn on every
# Restart & Run All (42 matches the random_state of the train/test split).
rng = np.random.default_rng(42)

for session_num in range(NUM_TRIALS):
    # NumPy scalars are cast to built-in str/float so the values print cleanly
    # and are logged / passed to Keras as plain Python types.
    hparams = {
        HP_EMBED_INITIALIZER: str(rng.choice(HP_EMBED_INITIALIZER.domain.values)),
        HP_EMBEDDINGS_REGULARIZER: float(rng.uniform(
            HP_EMBEDDINGS_REGULARIZER.domain.min_value,
            HP_EMBEDDINGS_REGULARIZER.domain.max_value,
        )),
        HP_EMBEDDINGS_CONSTRAINT: str(rng.choice(HP_EMBEDDINGS_CONSTRAINT.domain.values)),
        HP_ACTIVATION: str(rng.choice(HP_ACTIVATION.domain.values)),
        HP_OPTIMIZER: str(rng.choice(HP_OPTIMIZER.domain.values)),
    }

    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    # Each trial writes to its own sub-directory of the hparam_tuning log dir.
    run('logs/hparam_tuning/' + run_name, hparams)

In [None]:
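# TODO: train each candidate model for 10 epochs and keep the History object returned by
# model.fit, so the best model can be chosen from its validation accuracy/loss.
# Draft call below (note: it still uses epochs=3, and `model` only exists inside train_test_model):

#history = model.fit(X_train_padded, y_train.values.reshape(-1, 1), epochs=3, validation_data=(X_test_padded, y_test.values.reshape(-1, 1)))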