## Homework 10

In [1]:
# useful imports 
import tensorflow as tf
from tensorflow_text import RegexSplitter
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Conv2D, AveragePooling2D, TimeDistributed, LSTM, GlobalAvgPool2D, AbstractRNNCell, MaxPooling2D, RNN
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import defaultdict
import datetime
import tqdm
from heapq import nlargest
from numpy.linalg import norm

### 2.2 Word Embeddings

In [2]:
# read the whole corpus into memory; the context manager makes sure the
# file handle is closed as soon as reading is done
with open("bible.txt", "r") as bible:
    data = bible.read()

In [3]:
# global hyperparameters
VOCAB_SIZE = 10_000  # number of most frequent words kept
WINDOW_SIZE = 4      # window size handed to the skip-gram pair generation
BATCH_SIZE = 32      # intended number of (input, target) pairs per batch

# words from the bible corpus we want to track during training
tracked_words = [
    'holy',
    'water',
    'wine',
    'love',
    'son',
    'father',
    'devil',
]

In [4]:
# convert to lower case and replace new-line characters with <space>
data = data.lower().replace("\n", " ")
# collapse every run of non-word characters into a single space, so only
# letters, digits and underscores remain. The pattern is a raw string because
# "\W" is not a valid escape sequence inside a normal string literal.
# TODO: decide whether numbers should be deleted as well
data = re.sub(r'\W+', ' ', data)
# split on single spaces (a leading/trailing space yields an empty-string token)
tokenized_data = data.split(' ')

In [5]:
def wordcount(data, vocab_size=None):
    """Split the distinct tokens of ``data`` into frequent and rare words.

    Args:
        data: iterable of string tokens to count.
        vocab_size: number of most frequent words to keep. When omitted, the
            notebook-level ``VOCAB_SIZE`` is used.

    Returns:
        A tuple ``(most_common_words, least_common_words)``.
        ``most_common_words`` holds the ``vocab_size`` most frequent words,
        most frequent first. ``least_common_words`` holds the words that do
        not fit, rarest first, followed by the word at the cut-off. If there
        are fewer distinct words than ``vocab_size``, every word is returned
        as common and the rare list is empty.

    NOTE(review): the word at the cut-off is returned in BOTH lists. This
    looks like an off-by-one, but it is kept on purpose: the notebook assigns
    vocabulary ids starting at 1, so this word receives id ``VOCAB_SIZE``,
    for which a ``VOCAB_SIZE``-row embedding matrix has no row. Because the
    word is also removed from the corpus, that id never reaches training.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE

    # count the tokens that were passed in
    occurrences = defaultdict(int)
    for item in data:
        occurrences[item] += 1

    # ascending by count; sorted() is stable, so ties keep first-seen order
    words_by_count = [
        word for word, _ in sorted(occurrences.items(), key=lambda item: item[1])
    ]

    # number of distinct words that do not fit into the vocabulary
    excess = len(words_by_count) - vocab_size
    if excess < 0:
        # everything fits: nothing has to be removed
        return words_by_count[::-1], []

    most_common_words = words_by_count[excess:][::-1]
    least_common_words = words_by_count[:excess + 1]
    return most_common_words, least_common_words

In [6]:
# m_c_w = most_common_words
# l_c_w = least_common_words
m_c_w, l_c_w = wordcount(tokenized_data)

# drop every occurrence of the least common words so they never show up in
# the training pairs. Filtering the token list against a set needs a single
# pass and also catches occurrences a ' word ' pattern cannot match
# (the first/last token, or the same word twice in a row).
rare_words = set(l_c_w)
tokenized_data = [token for token in tokenized_data if token not in rare_words]

# keep the text form in sync with the filtered tokens
data = ' '.join(tokenized_data)

In [7]:
# vocabulary lookup: word -> integer id. Ids start at 1 and follow the order
# of m_c_w, so the first word in m_c_w gets id 1.
vocab = {word: word_id for word_id, word in enumerate(m_c_w, start=1)}
index = len(m_c_w) + 1  # first id that is not assigned to any word

# quick look: ids of the first ten tokens, then the tokens at positions 20-29
example_sequence = list(map(vocab.__getitem__, tokenized_data[:10]))
print(example_sequence)
print(tokenized_data[20:30])

[1, 253, 449, 3, 161, 193, 43, 43, 6, 1]
['and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void', 'and', 'darkness']


In [8]:
# inverse vocabulary: integer id -> word
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))

In [9]:
# words from the bible corpus we want to track during training
# (same list as the one defined next to the global hyperparameters)
tracked_words = [
    'holy', 'water', 'wine', 'love',
    'son', 'father', 'devil',
]

In [10]:
# Build the input-target pairs (= word-context pairs) with Keras'
# continuous skip-gram helper:
# https://www.tensorflow.org/tutorials/text/word2vec
# negative_samples=0 asks for no negative pairs; the returned label list is discarded.
bible_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    tokenized_data,
    vocabulary_size=VOCAB_SIZE,
    window_size=WINDOW_SIZE,
    negative_samples=0,
)

# translate both words of every pair into their vocabulary ids
bible_skip_grams = [
    [vocab[first], vocab[second]] for first, second in bible_skip_grams
]

# split the pairs column-wise: first elements are inputs, second are targets
inputs, targets = (list(column) for column in zip(*bible_skip_grams))
print(targets[:100])


[143, 38, 13, 112, 125, 5471, 8, 6, 17, 5, 73, 54, 26, 14, 287, 9, 1, 215, 84, 1484, 132, 236, 4, 138, 157, 11, 858, 272, 702, 763, 20, 263, 189, 4034, 173, 117, 8, 31, 1, 17, 172, 6920, 252, 28, 372, 24, 10, 3932, 253, 2, 37, 1, 162, 94, 23, 3130, 2, 726, 181, 38, 959, 210, 108, 211, 2, 114, 1165, 7, 379, 134, 23, 232, 62, 149, 86, 7, 3119, 41, 119, 3, 12, 8571, 264, 167, 110, 16, 9, 32, 757, 193, 4834, 1594, 2, 79, 4, 1542, 763, 30, 136, 150]


In [11]:
# shuffle, batch, and cap the number of batches per pass over the data
# (no cache or prefetch step is applied here)
bible_dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, targets))
    .shuffle(1000)       # shuffle buffer of 1000 pairs
    .batch(BATCH_SIZE)   # notebook-wide batch size instead of a literal
    .take(10000)         # keep at most 10000 batches
)

# show what one batch looks like
iterator = iter(bible_dataset)
iterator.get_next()

(<tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([ 167,   40,   30, 1883,   27,  323,   28,  123,   60,   76, 1167,
         214, 1514,  141,   27,  223,  700,  150,   69, 1550,   81,  908,
          42,  868,   62,  116,  135,    1,  332,  580,    1,  833])>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([ 701,    2,  142,    3,  513,    1,   55, 5483,   96,    1,  129,
          64,   18,  493,    2,   16,   11,   29,    7,   16,  236,   94,
          13,   10,  327,   19,   33,    2,   13,   38,    2,   91])>)

### 2.3 The Model

In [12]:
class SkipGramModel(tf.keras.layers.Layer):
    """Skip-gram word-embedding layer trained with ``tf.nn.nce_loss``.

    The layer owns an input embedding matrix plus the output weights and
    biases handed to the NCE loss. Calling it on word ids returns the
    corresponding rows of the embedding matrix.
    """

    def __init__(self, optimizer, embedding_size):
        """Store the configuration; the weights are created in ``build``.

        Args:
            optimizer: optimizer used by ``train`` to apply the gradients.
            embedding_size: length of each embedding vector.
        """
        super().__init__()

        self.vocabulary_size = VOCAB_SIZE
        self.embedding_size = embedding_size
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        self.optimizer = optimizer

    def build(self, string):
        """Create the trainable weights.

        ``string`` is not used: every shape is derived from the vocabulary
        size and the embedding size.
        """
        # one embedding vector per vocabulary id
        self.embedding_mat = self.add_weight(
            shape=(self.vocabulary_size, self.embedding_size),
            initializer="uniform",
            trainable=True)
        # weights and biases passed to tf.nn.nce_loss in train()
        self.output_mat = self.add_weight(
            shape=(self.vocabulary_size, self.embedding_size),
            initializer="random_normal",
            trainable=True)
        self.output_bias = self.add_weight(
            shape=(self.vocabulary_size,),
            initializer="zero",
            trainable=True)

    def call(self, inputs):
        """Return the embedding vectors for the word ids in ``inputs``."""
        return tf.nn.embedding_lookup(params=self.embedding_mat, ids=inputs)

    def reset_metrics(self):
        """Reset the state of every metric tracked by this layer."""
        for metric in self.metrics:
            metric.reset_states()

    def train(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: tuple ``(inputs, targets)`` of word-id batches.

        Returns:
            Dict mapping each metric name to its current result.
        """
        inputs, targets = data
        with tf.GradientTape() as tape:
            predictions = self(inputs)
            # nce_loss yields one value per example; average them over the batch
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=self.output_mat,
                               biases=self.output_bias,
                               labels=tf.expand_dims(targets, axis=1),
                               inputs=predictions,
                               num_sampled=1,
                               num_classes=self.vocabulary_size))
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        # address the loss tracker directly instead of by its position in
        # self.metrics, so adding another metric cannot redirect the update
        self.loss_metric.update_state(loss)

        return {m.name: m.result() for m in self.metrics}

In [13]:
def cosine_similarity(A, B):
    """Return the cosine similarity of ``A`` and ``B``.

    Computed as the dot product divided by the product of the two norms;
    written for 1-D vectors of equal length.
    """
    dot_product = np.dot(A, B)
    magnitude = norm(A) * norm(B)
    return dot_product / magnitude

In [14]:
def print_nearest_neighbours(tracked_words, n_neighbours=5):
    """Print the words whose embeddings are closest to each tracked word.

    Closeness is the cosine similarity between embedding vectors obtained
    from the notebook-level ``model``. The tracked word itself is one of the
    candidates, so it normally shows up first in its own list.

    Args:
        tracked_words: iterable of words; each must be a key of ``vocab``.
        n_neighbours: how many neighbours to print per word (default 5).
    """
    # Candidates are the ids in range(VOCAB_SIZE) that map back to a word.
    # Ids without an entry in inverse_vocab (such as 0, because ids start
    # at 1) are skipped so the final lookup cannot raise a KeyError.
    candidate_ids = np.array(
        [word_id for word_id in range(VOCAB_SIZE) if word_id in inverse_vocab],
        dtype=np.int64,
    )
    # one embedding lookup for all candidates instead of one per comparison
    candidate_vecs = np.asarray(model.call(candidate_ids))
    candidate_norms = norm(candidate_vecs, axis=1)

    for word in tracked_words:
        query_vec = np.asarray(model.call(vocab[word]))
        # cosine similarity of the query against every candidate at once
        cos_sims = candidate_vecs @ query_vec / (candidate_norms * norm(query_vec))
        # stable sort on the negated scores: highest similarity first,
        # ties resolved in favour of the lower id
        best = np.argsort(-cos_sims, kind="stable")[:n_neighbours]
        nearest_neigh = [inverse_vocab[int(candidate_ids[pos])] for pos in best]

        print(word, "- nearest neighbours: ", nearest_neigh)
        

In [19]:
def training_loop(model, train_ds, epochs, summary_writer):
    """Train ``model`` for ``epochs`` passes over ``train_ds``.

    After each epoch the model's metrics are written to ``summary_writer``,
    the values returned by the last training step are printed, the nearest
    neighbours of the notebook-level ``tracked_words`` are printed, and the
    model's metrics are reset.

    Args:
        model: object providing ``train(batch)``, ``metrics`` and
            ``reset_metrics()``.
        train_ds: iterable of training batches.
        epochs: number of passes over ``train_ds``.
        summary_writer: writer whose ``as_default()`` context receives the
            scalar summaries.
    """
    # iterate over epochs
    for epoch in range(epochs):
        print('Epoch: ', epoch)

        # stays empty when train_ds yields no batches, so the print below
        # still works instead of hitting an unbound name
        metrics = {}
        for data in train_ds:
            metrics = model.train(data)

        # log train loss
        with summary_writer.as_default():
            # for scalar metrics:
            for metric in model.metrics:
                tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])

        # NOTE: print_nearest_neighbours reads the notebook-level `model`,
        # not the `model` argument of this function
        print_nearest_neighbours(tracked_words)

        model.reset_metrics()

In [16]:
def create_summary_writer(config_name):
    """Create a summary file writer for the training metrics of one run.

    Logs are written to ``logs/<config_name>/<timestamp>/train``, where the
    timestamp is the current local time. Consider saving a config file (or a
    copy of the code) under the same name so the hyperparameters of the run
    can be looked up later.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    return tf.summary.create_file_writer(f"logs/{config_name}/{timestamp}/train")

In [20]:
# set up the run: Adam optimizer, 64-dimensional embeddings, logs under "model1"
EPOCHS = 3
optimizer = tf.keras.optimizers.Adam()
summary_writer = create_summary_writer("model1")
model = SkipGramModel(optimizer, 64)

# train on the skip-gram batches
training_loop(model, bible_dataset, EPOCHS, summary_writer)

Epoch:  0
['loss: 5.883515357971191']
holy - nearest neighbours:  ['holy', 'hearts', 'brought', 'been', 'jacob']
water - nearest neighbours:  ['water', 'twelve', 'wisdom', 'obey', 'six']
wine - nearest neighbours:  ['wine', 'hearts', 'more', 'far', 'three']
love - nearest neighbours:  ['love', 'six', '29', 'power', 'tabernacle']
son - nearest neighbours:  ['son', 'israel', 'out', 'house', 'children']
father - nearest neighbours:  ['father', 's', 'over', 'eyes', 'jacob']
devil - nearest neighbours:  ['devil', 'master', 'hour', 'lie', 'present']
Epoch:  1
['loss: 4.939197063446045']
holy - nearest neighbours:  ['holy', 'who', 'way', 'truth', 'days']
water - nearest neighbours:  ['water', 'twelve', 'jeroboam', 'pitched', 'wall']
wine - nearest neighbours:  ['wine', 'houses', 'thirty', 'honey', 'pitched']
love - nearest neighbours:  ['love', '24', 'art', '9', 'found']
son - nearest neighbours:  ['son', 'house', 'children', 'tabernacle', 'israel']
father - nearest neighbours:  ['father', 'h