# Homework 10

## Assignment 2: NLP 

### 2.1 Dataset


In [1]:
# open the txt file
bible = open("bible.txt", "r")  
# read file
data = bible.read()  

In [2]:
# test 
data[0:10]

'The First '

### 2.2 Word Embeddings

In [4]:
# useful imports 
import tensorflow as tf
from tensorflow.keras.layers import Embedding
import numpy as np
import matplotlib.pyplot as plt
import re

In [4]:
# global hyperparameters
VOCAB_SIZE = 10000
WINDOW_SIZE = 4
BATCH_SIZE = 32

In [5]:
# convert to lower case and replace new-line characters with <space>
data = data.lower().replace("\n", " ")
# delete special characters, only alphanumeric values remain
# do we care for numbers or should we also delete them?
data = re.sub('\W+',' ', data)

In [6]:
# test 
data[0:100]

'the first book of moses called genesis 1 1 in the beginning god created the heaven and the earth 1 2'

In [7]:
# split on space
tokenized_data = data.split(' ')

In [8]:
# test
tokenized_data[0:10]

['the', 'first', 'book', 'of', 'moses', 'called', 'genesis', '1', '1', 'in']

In [9]:
# get the unique words from the bible text
unique_words = set(tokenized_data)

In [10]:
# count how many times a unique word occurs in the bible text
occurrence = {item: tokenized_data.count(item) for item in unique_words}

In [11]:
# test
occurrence

{'': 1,
 'sickness': 20,
 'shout': 36,
 'thin': 9,
 'wake': 4,
 'treacherously': 23,
 'barrel': 3,
 'bishlam': 1,
 'church': 77,
 'board': 17,
 'nearer': 2,
 'nuts': 2,
 'tow': 3,
 'stability': 1,
 'celestial': 2,
 'jokdeam': 1,
 'biteth': 2,
 'saviours': 2,
 'lowly': 6,
 'abia': 4,
 'mail': 2,
 'clap': 6,
 'gibeah': 48,
 'bathsheba': 10,
 'compellest': 1,
 'pitchers': 5,
 'hairs': 15,
 'increase': 88,
 'backbone': 1,
 'slumberings': 1,
 'sabbath': 136,
 'twoedged': 2,
 'reading': 6,
 'inn': 5,
 'witchcraft': 3,
 'bunches': 3,
 'vintage': 10,
 'slideth': 1,
 'shobek': 1,
 'loseth': 1,
 'rapha': 2,
 'neck': 62,
 'damsel': 48,
 'hachaliah': 2,
 'age': 42,
 'shishak': 7,
 'blindness': 7,
 'lydians': 1,
 'less': 27,
 'jorai': 1,
 'clouds': 49,
 'ai': 36,
 'lords': 42,
 'ornan': 12,
 'flourishing': 2,
 'weigh': 6,
 'eshban': 2,
 'riddle': 9,
 'rebuker': 1,
 'lamech': 12,
 'widow': 55,
 'band': 19,
 'jaazaniah': 4,
 'unfaithfully': 1,
 'tetrarch': 7,
 'vein': 1,
 'jehoiakim': 37,
 'food': 55

In [12]:
# sort occurences such that we can get the words with the highest counts
# sorted from small to high counts
sorted_occurences = {k: v for k, v in sorted(occurrence.items(), key=lambda item: item[1])}

In [13]:
# how many unique words with counts do we have 
len(sorted_occurences)

12744

In [14]:
# we thus want the last 12 744 - VOCAB_SIZE words of the dictionary keys (= words) as most common words
most_common_words = list(sorted_occurences.keys())[(12744-VOCAB_SIZE):]

# test
# if it worked, the very last item on the list should be the most common word aka 'the'
most_common_words[-1]

'the'

In [15]:
# create a vocabulary for later purposes
vocab, index = {}, 1  # start indexing from 1

for word in most_common_words:
    vocab[word] = index
    index += 1


In [16]:
# inverse vocabulary
inverse_vocab = {index: token for token, index in vocab.items()}

In [17]:
# to create input-target pairs (= word-contexts pairs) we use 
# tf's continuous skipgram model 
# https://www.tensorflow.org/tutorials/text/word2vec


bible_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      tokenized_data,
      vocabulary_size=VOCAB_SIZE,
      window_size=WINDOW_SIZE,
      negative_samples=0)



In [18]:
# test how word-pairs look like
for target, context in bible_skip_grams[:5]:
    print(target, context) 

of were
said yet
servant for
way there
hide tabernacle


In [19]:
# convert the produced word-pairs into a tf dataset
bible_dataset = tf.data.Dataset.from_tensors(bible_skip_grams)

In [20]:
# test how the dataset looks like
list(bible_dataset.as_numpy_iterator())

[array([[b'of', b'were'],
        [b'said', b'yet'],
        [b'servant', b'for'],
        ...,
        [b'fall', b'once'],
        [b'even', b'for'],
        [b'thistle', b'thou']], dtype=object)]

In [21]:
# shape of one data tensor
bible_dataset.batch(1)
# should this be reshaped? 

<BatchDataset element_spec=TensorSpec(shape=(None, 6832244, 2), dtype=tf.string, name=None)>

In [22]:
# the usual cache, shuffle, batch, prefetch 
bible_dataset = bible_dataset.cache().shuffle(1000).batch(BATCH_SIZE).prefetch(20)

### 2.3 The Model

In [None]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    word_emb = self.target_embedding(target)
    context_emb = self.context_embedding(context)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    return dots

In [None]:
# https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630 
#finding cosine similarity 


In [None]:
#This is coded provided by Mathis via email (Not sure if it fits exactly)
def train_step(self, data):
    inputs, targets = data # inputs should have shape (batch, 1) and targets should have shape (batch, 1) or     (batch, n_context_words)
    with tf.GradientTape() as tape:
        looked_up_embeddings = self.embedding_layer(inputs)
        loss = tf.nn.nce_loss(weights = tf.transpose(self.dense_out.weights[0]), biases =  self.dense_out.weights[1], labels = targets, inputs = looked_up_embeddings,  num_sampled=self.n_negative_samples, num_classes=self.vocab_size)

In [None]:
embedding_dim = 64
epochs = 15
word2vec = Word2Vec(VOCAB_SIZE, embedding_dim)
word2vec.compile(optimizer= 'adam',
                 loss=tf.nn.nce_loss(from_logits=True),
                 metrics=['accuracy'])
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
word2vec.fit(bible_dataset, epochs=20, callbacks=[tensorboard_callback])

In [None]:
%tensorboard --logdir logs