# import libraries

In [0]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.image as mpimg 

import tensorflow.keras.utils as ku


2.2.0-rc3


# define helper functions

In [0]:
def plot_graphs(history, metric):
  """Plot a recorded training metric, plus its validation counterpart when present.

  Args:
    history: result of `model.fit`; `history.history` is indexed by metric name.
    metric: key of the metric to plot, e.g. 'acc' or 'loss'.
  """
  plt.plot(history.history[metric])
  # the 'val_' series is only recorded when fit() is given validation data;
  # indexing it unconditionally raises KeyError for training-only runs
  val_metric = 'val_'+metric
  has_validation = val_metric in history.history
  if has_validation:
    plt.plot(history.history[val_metric])
  plt.xlabel('time')
  plt.ylabel(metric)
  plt.grid(True)
  print('blue: {}'.format(metric))
  if has_validation:
    print('orange: {}'.format(val_metric))

# get dataset

In [0]:
# download the sonnets corpus; TLS certificate verification is left enabled
!wget \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt

--2020-04-19 05:02:16--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 2607:f8b0:400e:c04::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘/tmp/sonnets.txt’


2020-04-19 05:02:16 (117 MB/s) - ‘/tmp/sonnets.txt’ saved [93578/93578]



In [0]:
# open & read data; the context manager closes the file handle after reading
with open('/tmp/sonnets.txt') as sonnets_file:
  data = sonnets_file.read()

# preprocess

**tokenize sentences**

tokenizing splits the corpus into individual words and encodes each word as an integer index (a numeric representation of the word)


In [0]:
# lowercase the whole text, then break it into sentences at every newline
corpus = data.lower().split('\n')

In [0]:
# the tokenizer builds the word index dictionary from the sentences it is fitted on
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# word_index dictionary= word:index
word_index = tokenizer.word_index
print('word_index: \n{}'.format(word_index))

In [0]:
# vocab_size= number of entries in the word index, plus one
vocab_size = 1 + len(tokenizer.word_index)
print('vocab_size: \n{}'.format(vocab_size))

vocab_size: 
3211


In [0]:
# build n-gram training sequences from every sentence using the fitted word_index
input_sequences = []
for sentence in corpus:
  # texts_to_sequences() takes a list of sentences; one sentence goes in,
    # so the single encoded result is taken back out
  encoded = tokenizer.texts_to_sequences([sentence])[0]
  # every prefix of at least two tokens becomes one training sequence
  input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [0]:
# pad_sequences() transforms every sentence/sequence into a uniform input_shape
  # padding= 'pre' (the Keras default) adds 0's to the beginning, which keeps the
    # last real token of each sequence in the final column, where the label is read from
  # maxlen= length of the longest sequence
max_sequences_len = max(len(x) for x in input_sequences)
# pad_sequences() already returns a numpy array, so no extra conversion is needed
padded = pad_sequences(input_sequences, maxlen=max_sequences_len, padding='pre')

In [0]:
# every column except the last is the predictor; the last column is the label
train_padded = padded[:, :-1]
labels = padded[:, -1]

# one-hot encode the labels over vocab_size classes
labels = ku.to_categorical(labels, num_classes=vocab_size)

print('train_padded: \n{}'.format(train_padded))
print('labels: \n{}'.format(labels))

# define hyperparameters

In [0]:
# number of passes over the training data
num_epochs = 100
# size of each word vector produced by the embedding layer
embedding_dimensions = 100
# model input length: one less than the longest padded sequence
max_length = max_sequences_len - 1

# define the model

**embedding process**

words are mapped to vectors in a higher-dimensional space, and the semantics of the words are learned during training: words that appear with similar labels end up with similar vectors.

*movie review examples:*
- the word vectors of movie reviews with positive sentiment ended up 'pointing' in a particular direction

- movie reviews with negative sentiment 'pointed' in a different direction

after model training, words in future sentences could have their direction established as positive or negative (inferred sentiment)

In [0]:
model = keras.Sequential([
  # input_layer
  # embedding= each word index is mapped to a trainable vector; over time, words
    # found together are given similar vectors (shape & direction)
  # embedding output_shape=(sentence_length, embedding_size)
  keras.layers.Embedding(vocab_size, embedding_dimensions, input_length=max_length),
  # bidirectional= flowing 'cell state' goes both directions
  # return_sequences= when stacking LSTM layers, feeding an LSTM into another
    # requires the output of LSTM(current) to match the desired input of LSTM(next)
  keras.layers.Bidirectional(keras.layers.LSTM(units=150, return_sequences=True)),
  # randomly deactivate % of neurons in hidden layer
  keras.layers.Dropout(0.2),
  keras.layers.Bidirectional(keras.layers.LSTM(units=100)),
  # units must be an integer: vocab_size/2 is a float in Python 3, so use floor division
  # L2 regularization= penalize large weights to reduce overfitting
  # relu= return x if x > 0, else return 0
  keras.layers.Dense(units=vocab_size//2, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
  # output_layer
  # softmax= organize prediction neurons into valid distribution of 100%
  keras.layers.Dense(units=vocab_size, activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 100)           321100    
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 300)           301200    
_________________________________________________________________
dropout (Dropout)            (None, 10, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense (Dense)                (None, 1605)              322605    
_________________________________________________________________
dense_1 (Dense)              (None, 3211)              5156866   
Total params: 6,422,571
Trainable params: 6,422,571
Non-trainable params: 0
______________________________________________

# compile the model

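build the model by compiling it with a loss, optimizer, and objective metrics
- loss= prediction error (how far the predictions are from the true labels)
- the optimizer uses the loss to adjust the weights & improve prediction performance per epoch
- metrics= the quantities reported to judge performance (here, accuracy)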

In [0]:
# categorical cross-entropy loss, adam optimizer, accuracy reported under the key 'acc'
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['acc'],
)

# define callbacks

In [0]:
# enable early stopping
class myCallback(keras.callbacks.Callback):
  """Callback that stops training once the 'acc' metric exceeds 0.97."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the 'acc' value in `logs` and request a stop above the threshold.

    Args:
      epoch: index of the epoch that just finished (unused).
      logs: dict of metric values for the epoch; may be None or lack 'acc'.
    """
    # avoid a shared mutable default, and avoid comparing None with a float
    # when 'acc' is not present in the logs
    acc = (logs or {}).get('acc')
    if acc is not None and acc > .97:
      self.model.stop_training = True

callbacks = myCallback()

# train the model

fit the model to train & learn the optimal weights/relationships

In [0]:
# keep the result of fit() in a history var so performance can be queried afterwards
history = model.fit(
  train_padded,
  labels,
  epochs=num_epochs,
  callbacks=[callbacks],
  verbose=1,
)

# visualize performance

In [0]:
# accuracy and loss are on different scales, so each metric gets its own subplot
# (drawing both on one axes mixes the curves and overwrites the y-axis label)
plt.figure(figsize=(10,6))
plt.subplot(1, 2, 1)
plot_graphs(history, 'acc')
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.show()

# test the model

In [0]:
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 100

for _ in range(next_words):
  # preprocess with tokenizer: encode the text so far, then pad/truncate it
    # to the input length the model was built with
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_length, padding='pre')
  # get label/class probabilities and keep the most probable class
    # (argmax over predict() replaces predict_classes(), which newer TF releases removed)
  probabilities = model.predict(token_list, verbose=0)
  predicted = int(np.argmax(probabilities, axis=-1)[0])
  # map label to word with the tokenizer's reverse lookup;
    # an index with no word (e.g. 0, the padding value) yields ""
  output_word = tokenizer.index_word.get(predicted, "")
  # generate text from rnn predictions
  seed_text += " " + output_word
print(seed_text)


# clean up

terminate the kernel to free up memory 

In [0]:
import os, signal

# os.kill(os.getpid(), signal.SIGKILL)