# import libraries

In [0]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import json
import numpy as np
import matplotlib.pyplot as plt


# define helper functions

In [0]:
def plot_graphs(history, metric):
  """Plot a training metric and, when recorded, its validation counterpart.

  Args:
    history: object exposing a `history` dict that maps metric names to
      lists of values (e.g. the object returned by `model.fit`).
    metric: name of the metric to plot, e.g. 'accuracy'.

  Raises:
    KeyError: if `metric` is not present in `history.history`.
  """
  val_metric = 'val_' + metric
  plt.plot(history.history[metric], label=metric)
  # the validation series only exists when validation data was supplied
  if val_metric in history.history:
    plt.plot(history.history[val_metric], label=val_metric)
  plt.xlabel('epoch')
  plt.ylabel(metric)
  plt.grid(True)
  # a legend names each curve without assuming the default color cycle
  plt.legend()

# define hyperparameters

In [0]:
vocab_size = 1000 # max number of word tokens kept, ordered by frequency
max_length = 120 # length every sequence is padded/truncated to
trunc_type = 'post' # if sentence exceeds max_length, cut from end
pad_type = 'post' # if sentence is short, pad with 0's on end
oov_token = '<OOV>' # token substitute for words not found in word_index
train_size = 20000 # split index: first train_size items train, the rest test
num_epochs = 50 # upper bound on training epochs passed to model.fit()

as an NLP neural network trains, it learns a vector for each word, guided by the labels of the sentences the word appears in; these learned vectors are called an EMBEDDING
- embedding= learned vector for each word, shaped by the associated labels
- embedding layer returns a 2D array per sentence= (sentence_length, embedding_size)

In [0]:
# number of values in the vector learned for each word token
embedding_dimensions = 16

# get data

In [0]:
!wget --no-check-certificate \
https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
-O /tmp/sarcasm.json

# split dataset into training & testing 

In [0]:
# read the downloaded JSON file and parse it into a list of items
with open('/tmp/sarcasm.json') as json_file:
  datastore = json.load(json_file)

# collect the headline text (x) and the sarcasm flag (y) of every item
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]

# items 0 .. train_size-1 form the training set
train_sentences, train_labels = sentences[:train_size], labels[:train_size]

# items from index train_size to the end form the testing set
test_sentences, test_labels = sentences[train_size:], labels[train_size:]



# preprocess

**tokenize sentences**

tokenize means to split the corpus into words and encode each word as a number (its index in the word_index dictionary)


In [0]:
# instantiate the tokenizer that will build the word index dictionary
  # num_words= max number of words kept, ranked by frequency in the corpus
  # oov_token= token substituted for out-of-vocabulary words
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# .fit_on_texts() builds the vocabulary from the training sentences only,
# so the test sentences stay unseen
tokenizer.fit_on_texts(train_sentences)

# get word_index dictionary= word:index
word_index = tokenizer.word_index
# print('word_index: \n{}'.format(word_index))

In [0]:
# texts_to_sequences() turns each training sentence into a list of word indices
  # uses the word_index fitted on the training sentences
sequences = tokenizer.texts_to_sequences(train_sentences)


In [0]:
# pad_sequences() brings every sequence to the same length so the model gets a uniform input_shape
  # maxlen= length every sequence ends up with (max_length)
  # padding= 'post' adds 0's at the end; the default adds them at the beginning
  # truncating= 'post' cuts sequences longer than maxlen from the end
train_padded = pad_sequences(sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)


In [0]:
# tokenize & pad test sequences
# word_index is derived from training set, testing will probably generate more <OOV>
test_sequences = tokenizer.texts_to_sequences(test_sentences)
# pad/truncate exactly like train_padded (same maxlen, padding and truncating)
# so the validation inputs have the same length and layout as the training inputs
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

convert into numpy arrays for model processing

In [0]:
# training inputs and labels as numpy arrays
# (train_labels is a slice of a Python list up to this point)
train_padded = np.array(train_padded)
train_labels = np.array(train_labels)

# same conversion for the testing inputs and labels
test_padded = np.array(test_padded)
test_labels = np.array(test_labels)

# define the model

**embedding process**

words are mapped in higher dimensional space, and semantics of the words are then learned when those words are labeled with similar meaning. 

*movie review examples:*
- in movie reviews with positive sentiment, the vectors of their words ended up 'pointing' in a particular direction

- movie reviews with negative sentiment 'pointed' in a different direction

---

after model training, words in future sentences could have their direction established as positive or negative (inferred sentiment)

In [0]:
model = keras.Sequential([
  # input layer: maps every word index to a learned vector
  # over time, words cluster together due to the training labels (word meaning)
    # embedding= words used in similarly labeled sentences get similar vectors
  # embedding output_shape per sentence=(max_length, embedding_dimensions)
    # embedding_dimensions= number of values in each word vector
  keras.layers.Embedding(vocab_size, embedding_dimensions, input_length=max_length),
  # 128 one-dimensional filters, each spanning 5 consecutive word positions
  # relu= return x if x > 0, else return 0
  keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
  # keeps only the max value of each filter across the sequence,
  # collapsing the sequence axis; commonly used for nlp
  keras.layers.GlobalMaxPool1D(),
  # fully-connected dense layer that maps the pooled features to 24 units
  keras.layers.Dense(units=24, activation='relu'),
  # binary classification output layer
  # sigmoid= squashes the output into a value between 0 and 1, read as the
  # probability of label 1 (is_sarcastic)
  keras.layers.Dense(units=1, activation='sigmoid')
])

# print the layers with their output shapes and parameter counts
model.summary()

# compile the model 

build the model by compiling it with a loss, optimizer, and objective metric
- loss= measure of prediction error (lower is better)
- the optimizer uses the loss to adjust the weights and improve prediction performance
- metric= value reported to judge performance (here, accuracy)

In [0]:
# binary classification (single sigmoid output) uses loss: binary_crossentropy
# 'accuracy' is the metric reported during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# define callbacks

In [0]:
# stop training early once the training accuracy reaches 99%
# NOTE: this watches the training metric, not the validation metric, so it
# shortens the run rather than detecting overfitting
class myCallback(keras.callbacks.Callback):
  """Keras callback that halts training when train accuracy reaches 99%."""

  def on_epoch_end(self, epoch, logs=None):
    """Request a stop when the finished epoch's 'accuracy' is >= 0.99.

    Args:
      epoch: index of the epoch that just ended (unused).
      logs: dict of metric results for the epoch; may be None.
    """
    # no shared mutable default, and a missing 'accuracy' entry is skipped
    # instead of raising a TypeError on the comparison
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy >= .99:
      print('\nstopping training, train accuracy > 99%')
      self.model.stop_training = True

callbacks = myCallback()

# train the model

fit the model to train & learn the optimal weights/relationships


In [0]:
%%capture
# %%capture hides this cell's output, including the verbose=1 progress log
# model.fit() returns a history object; keep it for performance querying
# test_padded/test_labels are used as the validation data after each epoch
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(test_padded, test_labels), callbacks=[callbacks], verbose=1)

# visualize performance 

In [0]:
# draw training vs. validation accuracy on a 10x6 figure
plt.figure(figsize=(10,6))
plot_graphs(history, 'accuracy')
plt.show()

# clean up

terminate the kernel process to free up memory and other resources

In [0]:
import os, signal

# send SIGKILL to the current process (the notebook kernel); all variables
# held in memory are lost and the kernel has to be restarted afterwards
# NOTE(review): signal.SIGKILL is not available on Windows - confirm target platform
os.kill(os.getpid(), signal.SIGKILL)