In [None]:
# Importing Libraries
import tensorflow as tf
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time
import random
import nltk

# Enabling eager execution where it is not already the default
try:
    # TF 1.x requires eager mode to be switched on explicitly.
    tf.enable_eager_execution()
except (AttributeError, ValueError):
    # AttributeError: TF 2.x no longer exposes this top-level function
    # (eager execution is the default there).
    # ValueError: TF 1.x refuses to switch modes after graph ops were created.
    # Anything else is unexpected and should surface instead of being swallowed.
    pass
print('Tensorflow Version: ', tf.__version__)
print('Using Eager Execution?: ', tf.executing_eagerly())

In [None]:
# First lets make sure we are operating on GPU
# Query the device name once and reuse it for both the check and the message.
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print('Default GPU Device: {}'.format(gpu_name))
else:
    print('No GPU found. Please use a GPU to train your neural network.')

In [None]:
# Defining our reading function for pulling in data
def load_data(path):
    """Read the text file at ``path`` as UTF-8 and return its whole content.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file content as a single string.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [None]:
# Looks like we need to whitespace the punctuation
def whitespace_punct(sent_list):
    """Pad punctuation with spaces so every mark becomes its own token.

    Each of ``. , ! ? ; ( ) " -`` is surrounded by single spaces, runs of two
    or more whitespace characters are collapsed to one space, and leading and
    trailing whitespace is removed.

    Args:
        sent_list: Iterable of sentence strings.

    Returns:
        A new list with one cleaned string per input sentence.
    """
    cleaned = []
    for sent in sent_list:
        # The hyphen is padded together with the other marks, i.e. *before*
        # whitespace is collapsed, so 'a - b' does not end up as 'a  -  b'.
        sent = re.sub(r'([.,!?;()"-])', r' \1 ', sent)
        # Raw string: '\s' in a normal string literal is an invalid escape.
        sent = re.sub(r'\s{2,}', ' ', sent).strip()
        cleaned.append(sent)
    return cleaned

In [None]:
# Defining our data preprocessing function
def preprocess_mobydick(path, max_length=None):
    """Load the book text at ``path`` and return cleaned, tagged sentences.

    The text is stripped of lines containing 'CHAPTER', normalised with a
    fixed list of character replacements, split into sentences with NLTK's
    Punkt tokenizer, lower-cased, cut off at the Project Gutenberg end marker,
    punctuation-spaced via ``whitespace_punct`` and optionally truncated.

    Args:
        path: Path of the raw text file (read through ``load_data``).
        max_length: If not None, sentences with ``max_length`` or more
            whitespace-separated tokens are cut to their first ``max_length``
            tokens. The '<start>'/'<end>' tags are added afterwards and are
            not counted.

    Returns:
        A list of strings, each of the form '<start> ... <end>'.
    """
    # Loading our raw text data
    raw_data = load_data(path)

    # Dropping lines that have chapter in them
    raw_data = '\n'.join(line for line in raw_data.split('\n') if 'CHAPTER' not in line)

    # Getting rid of some special characters.
    # The pairs are applied strictly in this order; later rules see the output
    # of earlier ones (e.g. curly quotes become '"' before the '?"' rule runs).
    replacements = (
        ('\ufeff', ''),
        ('\n', ' '),
        ('-', ','),
        ('—', ' , '),
        ('”', '"'),
        ('“', '"'),
        ('  ', ' '),
        ('_', ''),
        ('?"', '? "'),
        ('!"', '! "'),
        ('."', '. "'),
        ('Oh!', 'Oh,'),
        ('’', "'"),
        (';', ' . '),
    )
    for old, new in replacements:
        raw_data = raw_data.replace(old, new)

    # Prepping our sentence tokenizer
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Setting exceptions to tokenization.
    # Punkt compares the lower-cased token (without its final period) against
    # abbrev_types, so the entries must be lower case to have any effect.
    extra_abbreviations = {'dr', 'mr', 'mrs', 'prof', 'ms'}
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

    # Tokenizing text
    sentences = sentence_tokenizer.tokenize(raw_data)

    # Rejoining
    sentences = '\n'.join(sentences).lower()

    # Getting rid of endnotes
    sentences = sentences.split('end of the project gutenberg ebook')[0]

    # Splitting back up
    sentences = sentences.split('\n')

    # Whitespacing
    sentences = whitespace_punct(sentences)

    # Removing empty sentences
    sentences = [x for x in sentences if len(x) > 0]

    # Cutting down long sentences (short ones are kept untouched)
    if max_length is not None:
        sentences = [x if len(x.split()) < max_length else ' '.join(x.split()[0:max_length])
                     for x in sentences]

    # Joining sentences
    sentences = '\n'.join(sentences)

    # Replacing titles: whitespace_punct turned 'mr.' into 'mr .', undo that here.
    # NOTE(review): the '--' rule cannot match because every '-' was already
    # replaced by ',' above; it is kept unchanged for parity.
    sentences = sentences.replace('mr . ','mr. ').replace('ms . ','ms. ').replace('mrs . ', 'mrs. ').replace('--',' : ')

    # Resplitting sentences
    sentences = sentences.split('\n')

    # Making sure trailing, leading whitespaces removed
    # (text is already lower case at this point, so only stripping is needed)
    sentences = [x.strip() for x in sentences]

    # Adding our start and stop tokens
    sentences = ['<start> ' + x + ' <end>' for x in sentences]

    # Returning sentences
    return sentences

In [None]:
# Loading text
sents = preprocess_mobydick('MobyDick.txt', max_length=30)

# How many sentences in our corpus?
print('Number of sentences in corpus: ',len(sents))

# How many distinct tokens?
# Each sentence is split on its own: joining with '\n' and splitting on ' '
# would glue '<end>' and the following '<start>' into one bogus token.
print('Number of tokens: ', len({token for sent in sents for token in sent.split()}))

# What is the max sentence length?
print('Max sentence length: ', max(len(x.split()) for x in sents))

# What does our dataset look like?
# Only a short preview is printed; dumping every sentence floods the output.
print(*sents[:10], sep = '\n\n')

In [None]:
# Defining our tokenizer
def tokenize(lang, vocab_size):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Sequence of text strings to fit on and to encode.
        vocab_size: Forwarded to the tokenizer as ``num_words``.

    Returns:
        A ``(tensor, lang_tokenizer)`` tuple: the post-padded integer
        sequences and the fitted tokenizer.
    """
    preprocessing = tf.keras.preprocessing

    # filters='' turns off the tokenizer's character filtering.
    lang_tokenizer = preprocessing.text.Tokenizer(num_words=vocab_size, filters='')
    lang_tokenizer.fit_on_texts(lang)

    # Encode every text, then pad at the end of each sequence.
    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Creating our dataset function
def load_dataset(path, max_len=None, vocab_size=None):
    """Preprocess the text file at ``path`` and tokenize the resulting sentences.

    Args:
        path: Text file handed to ``preprocess_mobydick``.
        max_len: Passed to ``preprocess_mobydick`` as its ``max_length``.
        vocab_size: Passed to ``tokenize``.

    Returns:
        The ``(tensor, tokenizer)`` pair produced by ``tokenize``.
    """
    # Cleaned, '<start>'/'<end>'-tagged sentences
    sentences = preprocess_mobydick(path, max_len)

    # Integer-encode and pad them
    return tokenize(sentences, vocab_size)

In [None]:
# Defining our tensor length function
def max_length(tensor):
    """Return the length of the longest element of ``tensor``."""
    longest = max(map(len, tensor))
    return longest
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Loading our data
corpus, corpus_tokenizer = load_dataset('MobyDick.txt', max_len=30, vocab_size=1000)

# Calculate max_length of the target tensors
max_length_corpus = max_length(corpus)

# Printing the data's shape
print('Tokenized Corpus Shape: ', corpus.shape)

# Printing the first 3 lines (rows 0-2; the previous [1:4] slice skipped row 0)
print(' First 3 lines of data:\n',corpus[:3])

In [None]:
# Creating our skip-thought data creator
def skip_thought_data(corpus):
    """Build aligned (current, previous, next) slices of ``corpus``.

    Element ``i`` of the three returned slices corresponds to items
    ``i + 1``, ``i`` and ``i + 2`` of the input, so the first and last items
    of ``corpus`` never appear as a "current" item.

    Args:
        corpus: Any sliceable sequence.

    Returns:
        A ``(current, previous, following)`` tuple of slices.
    """
    # Neighbours on either side of each current item
    previous = corpus[:-2]
    following = corpus[2:]

    # Current items: everything except the first and the last
    current = corpus[1:-1]

    return current, previous, following

In [None]:
# Splitting the corpus into current / previous / next sentence tensors.
# Note: this rebinds `corpus`, so re-running the cell trims it again.
corpus, pre_corpus, post_corpus = skip_thought_data(corpus)
for label, tensor in (('Encoding shape: ', corpus),
                      ('Previous shape: ', pre_corpus),
                      ('Post shape: ', post_corpus)):
    print(label, tensor.shape)

In [None]:
# Making sure it works: show the first four token ids of the first triple
for label, tensor in (('Previous line: ', pre_corpus),
                      ('Current line: ', corpus),
                      ('Next line: ', post_corpus)):
    print(label, tensor[0][0:4])
# Looks like it works great!

In [None]:
# Define the buffer size
BUFFER_SIZE = 1024

# Setting our batch size
BATCH_SIZE = 64

# Number of epochs to train over
EPOCHS = 5

# Number of rounds with no improvement to stop after
early_stopping_rounds = 5

# How many steps do we need to take per epoch?
steps_per_epoch = len(corpus)//BATCH_SIZE

# The dimension of our word embeddings
embedding_dim = 256

# The number of RNN cells to include in the recurrent layer
units = 256

# The dropout rate of the recurrent cells to help generalize
dropout = 0

# Determine the clipping threshold for our gradients to ease training
gradient_clip = 1

# Define the learning rate of our optimizer
learning_rate = 0.005

# Defining the momentum
moment = 0.9

# Setting vocab sizes
# word_index lists every word seen while fitting, but a tokenizer created with
# num_words only emits ids below num_words. Sizing the embedding and output
# layers by the full index would allocate rows/logits that can never be used.
full_vocab_size = len(corpus_tokenizer.word_index)+1
vocab_size = min(full_vocab_size, corpus_tokenizer.num_words or full_vocab_size)

In [None]:
# Defining our encoder
class Encoder(tf.keras.Model):
    """Sentence encoder: embedding layer followed by a single GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        enc_units: Number of GRU units.
        batch_sz: Batch size used when building the initial hidden state.
        dropout: Rate of the dropout applied to the GRU outputs in training.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, dropout):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        gru_options = dict(return_sequences=True,
                           return_state=True,
                           recurrent_initializer='he_uniform')
        self.gru = tf.keras.layers.GRU(self.enc_units, **gru_options)
        self.drop = tf.keras.layers.Dropout(rate=dropout)

    def call(self, x, hidden, training=False):
        """Encode token ids ``x`` starting from GRU state ``hidden``.

        Returns:
            ``(output, state)``: the per-step GRU outputs (with dropout
            applied only when ``training`` is truthy) and the final GRU state.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        if not training:
            return output, state
        return self.drop(output, training=training), state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Defining our Decoder
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that predicts the next token id.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embedding_dim: Width of each embedding vector.
        dec_units: Number of GRU units.
        batch_sz: Batch size (stored for reference).
        dropout: Rate of the dropout applied to the GRU output in training.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, dropout):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='he_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.drop = tf.keras.layers.Dropout(rate=dropout)

    def call(self, x, hidden, enc_output, training=False):
        """Run one decoding step.

        Previously the token input ``x`` and ``self.embedding`` were never
        used: the GRU was re-run over the whole ``enc_output`` on every step,
        so teacher forcing had no effect on the predictions. The step now
        embeds ``x`` and advances the GRU from ``hidden``.

        Args:
            x: Token ids fed into this step; the train steps pass a
                ``(batch, 1)`` tensor.
            hidden: GRU state to start from.
            enc_output: Encoder outputs. Accepted for call-site compatibility
                but not used; the encoder reaches the decoder only through
                the initial ``hidden`` state.
            training: Enables dropout when truthy.

        Returns:
            ``(logits, state)``: vocabulary logits and the new GRU state.
        """
        # Token ids -> embedding vectors
        x = self.embedding(x)

        # Advance the GRU from the supplied state
        output, state = self.gru(x, initial_state=hidden)

        # Applying dropout
        if training:
            output = self.drop(output, training=training)

        # Merge the batch and time axes so the Dense layer sees (rows, units)
        output = tf.reshape(output, (-1, output.shape[2]))

        # Project onto the vocabulary
        output = self.fc(output)

        return output, state
    
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Defining our loss function
def loss_function(real, pred):
    """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring padding.

    The previous version called a global ``loss_object`` that is never defined
    in this notebook. The loss objects that are defined use Keras' default
    reduction, which yields a single scalar, so a per-example padding mask
    could not be applied to them. The loss is therefore computed per example
    here, with no dependency on a global.

    Args:
        real: Integer target ids; id 0 is treated as padding.
        pred: Unnormalised logits with one more trailing axis than ``real``.

    Returns:
        Scalar tensor: the mean of the per-example losses, with padded
        positions contributing 0.
    """
    # One loss value per element of `real`
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)

    # 1.0 for real tokens, 0.0 for padding
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss_.dtype)

    return tf.reduce_mean(loss_ * mask)
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Defining our NMT training step
@tf.function
def nmt_train_step(inp, out, enc_hidden):
    """Run one optimisation step of the NMT encoder/decoder on a batch.

    Uses the module-level ``nmt_encoder``, ``nmt_decoder``, ``nmt_optimizer``,
    ``corpus_tokenizer``, ``BATCH_SIZE`` and ``loss_function``.

    Args:
        inp: Batch of encoder input token ids.
        out: Batch of target token ids; column ``t`` is the target of step ``t``.
        enc_hidden: Initial encoder hidden state.

    Returns:
        The summed step losses divided by the target sequence length.
    """
    loss = 0
    start_id = corpus_tokenizer.word_index['<start>']

    with tf.GradientTape() as tape:

        enc_output, enc_hidden = nmt_encoder(inp, enc_hidden, training=True)

        # Current sentence pass, decoder starts from the encoder's final state
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)
        # Iterate over the *target* length (the old bound used an undefined `pre`)
        for t in range(1, out.shape[1]):
            predictions, dec_hidden = nmt_decoder(dec_input, dec_hidden, enc_output, training=True)
            loss += loss_function(out[:, t], predictions)
            # Teacher forcing: feed the true token as the next input
            dec_input = tf.expand_dims(out[:, t], 1)

    batch_loss = (loss / int(out.shape[1]))

    variables = nmt_encoder.trainable_variables + nmt_decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    # The notebook defines `nmt_optimizer`; a bare `optimizer` does not exist
    nmt_optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
# Defining our training step for pre and post data for skip_thoughts
def _teacher_forced_loss(decoder, target, dec_hidden, enc_output):
    """Sum ``decoder``'s per-step losses over ``target`` using teacher forcing."""
    total = 0
    dec_input = tf.expand_dims([corpus_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
    for t in range(1, target.shape[1]):
        predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output, training=True)
        total += loss_function(target[:, t], predictions)
        # Feed the true token as the next input
        dec_input = tf.expand_dims(target[:, t], 1)
    return total


@tf.function
def skip_thoughts_train_step(inp, pre, post, enc_hidden):
    """Run one optimisation step of the skip-thoughts model on a batch.

    The current sentence is encoded once; two decoders, both started from the
    encoder's final state, are trained to reproduce the preceding and the
    following sentence. Uses the module-level ``st_encoder``,
    ``st_pre_decoder``, ``st_post_decoder`` and ``st_optimizer``.

    Args:
        inp: Batch of token ids of the current sentences.
        pre: Batch of token ids of the preceding sentences.
        post: Batch of token ids of the following sentences.
        enc_hidden: Initial encoder hidden state.

    Returns:
        The summed step losses of both decoders divided by the combined length
        of ``pre`` and ``post`` (previously divided by ``pre``'s length only,
        which roughly doubled the reported value).
    """
    with tf.GradientTape() as tape:

        enc_output, enc_hidden = st_encoder(inp, enc_hidden, training=True)

        # Preceding sentence pass + following sentence pass
        loss = (_teacher_forced_loss(st_pre_decoder, pre, enc_hidden, enc_output)
                + _teacher_forced_loss(st_post_decoder, post, enc_hidden, enc_output))

    batch_loss = loss / (int(pre.shape[1]) + int(post.shape[1]))

    variables = st_encoder.trainable_variables + st_pre_decoder.trainable_variables + st_post_decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    # The notebook defines `st_optimizer`; a bare `optimizer` does not exist
    st_optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Setting up our NMT Model
# Creating our dataset (input and target are the same sentence)
nmt_dataset = tf.data.Dataset.from_tensor_slices((corpus, corpus)).shuffle(BUFFER_SIZE)
# Batch the NMT dataset itself; a bare `dataset` is not defined anywhere
nmt_dataset = nmt_dataset.batch(BATCH_SIZE, drop_remainder=True)
# Defining encoder
nmt_encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE, dropout)
# Defining decoder
nmt_decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, dropout)
# Defining optimizer (`learning_rate` replaces the deprecated `lr` keyword)
nmt_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum = moment, clipvalue=gradient_clip)
# Defining loss function
nmt_loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Defining our checkpoint
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt/nmt")
checkpoint = tf.train.Checkpoint(optimizer=nmt_optimizer,
                                 encoder=nmt_encoder,
                                 decoder=nmt_decoder)
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Training our NMT Model
for epoch in range(EPOCHS):
    start = time.time()

    # Use the NMT encoder and dataset: the skip-thoughts ones are only created
    # in a later cell and their dataset yields three tensors per batch.
    enc_hidden = nmt_encoder.initialize_hidden_state()
    loss = 0

    # Calculation loss and applying gradients on training batches
    for (batch, (sent, out)) in enumerate(nmt_dataset.take(steps_per_epoch)):
        batch_loss = nmt_train_step(sent, out, enc_hidden)
        print('Batch {}/{} - {:.4f}'.format(batch, steps_per_epoch, batch_loss))
        loss += batch_loss

    # Creating our meaned losses
    loss = loss/steps_per_epoch

    # Printing out our progress
    print('NMT: Epoch = {} | Training Loss = {:.4f} | Train Time = {:.2f} sec\n'.format(epoch + 1,
                                                                                       loss,
                                                                                       time.time() - start))
# Saving the trained NMT model once all epochs are done
checkpoint.save(file_prefix = checkpoint_prefix)

In [None]:
# Setting up our Skip-thoughts model
# Creating our dataset of (current, previous, next) sentence triples
st_dataset = tf.data.Dataset.from_tensor_slices((corpus, pre_corpus, post_corpus)).shuffle(BUFFER_SIZE)
# Batch the skip-thoughts dataset itself; a bare `dataset` is not defined anywhere
st_dataset = st_dataset.batch(BATCH_SIZE, drop_remainder=True)
# Defining encoder
st_encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE, dropout)
# Defining decoder
st_pre_decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, dropout)
st_post_decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, dropout)
# Defining optimizer (`learning_rate` replaces the deprecated `lr` keyword)
st_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum = moment, clipvalue=gradient_clip)
# Defining loss function
st_loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Defining our checkpoint
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt/skip-thoughts")
# A comma was missing after the `pre_decoder` argument, which made this cell
# a SyntaxError.
checkpoint = tf.train.Checkpoint(optimizer=st_optimizer,
                                 encoder=st_encoder,
                                 pre_decoder=st_pre_decoder,
                                 post_decoder=st_post_decoder)
# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Training our skip-thoughts model
for epoch in range(EPOCHS):
    epoch_start = time.time()

    # Fresh zero state for the encoder, reused for every batch of this epoch
    enc_hidden = st_encoder.initialize_hidden_state()
    running_loss = 0

    # One optimisation step per (current, previous, next) batch
    for batch, (sent, pre, post) in enumerate(st_dataset.take(steps_per_epoch)):
        batch_loss = skip_thoughts_train_step(sent, pre, post, enc_hidden)
        running_loss += batch_loss
        print('Batch {}/{} - {:.4f}'.format(batch, steps_per_epoch, batch_loss))

    # Average batch loss over the epoch
    mean_loss = running_loss/steps_per_epoch
    elapsed = time.time() - epoch_start

    # Reporting progress for this epoch
    print('Skip-thoughts: Epoch = {} | Training Loss = {:.4f} | Train Time = {:.2f} sec\n'.format(epoch + 1,
                                                                                                  mean_loss,
                                                                                                  elapsed))
# Saving the trained skip-thoughts model once all epochs are done
checkpoint.save(file_prefix = checkpoint_prefix)

# Some code adapted from
# https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/sequences/nmt_with_attention.ipynb

In [None]:
# Now we need to reimport our models to get an estimate on cosine similarity to compare
# We can use T-SNE or PCA to get down to 2 dimensions