In [1]:
pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
"""
@authors: faurand, chardes, ehagensieker
"""

# all our imports 
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from google.colab import drive
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import datetime
import tqdm
import re


# in a notebook, load the tensorboard extension, not needed for scripts
%load_ext tensorboard

In [3]:
# bash code to mount the drive
drive.mount("/content/drive")
os.chdir("drive/MyDrive")

Mounted at /content/drive


In [4]:
# 2.1 the dataset
with open("/content/bible.txt", "r") as f:
  bible = f.read()

# print a short example
print(bible[:500])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Day, and the darkness he called Night.
And the evening and the mornin


In [5]:
# exersize 2.2
import tensorflow_text as tf_text
# convert to lower case, remove \n and special characters
bible = bible.replace("\n", " ").lower()
bible = re.sub('[^A-Za-z]+', ' ', bible)
# bible = re.sub(r'[0-9]+', lambda m: num2words(m.group(0)), bible) 
print("converted: ", bible[:500])

# tokenize the text
bible = tf_text.WhitespaceTokenizer().split(bible)
print("tokenized: ", bible[:500])

converted:  the first book of moses called genesis in the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god said let there be light and there was light and god saw the light that it was good and god divided the light from the darkness and god called the light day and the darkness he called night and the evening and the morning were the first day and god said let the
tokenized:  tf.Tensor(
[b'the' b'first' b'book' b'of' b'moses' b'called' b'genesis' b'in' b'the'
 b'beginning' b'god' b'created' b'the' b'heaven' b'and' b'the' b'earth'
 b'and' b'the' b'earth' b'was' b'without' b'form' b'and' b'void' b'and'
 b'darkness' b'was' b'upon' b'the' b'face' b'of' b'the' b'deep' b'and'
 b'the' b'spirit' b'of' b'god' b'moved' b'upon' b'the' b'face' b'of'
 b'the' b'waters' b'and' b'god' b'said' b'let' b'there' b'be' b'light'
 b'and' b'there' b'was' b'light' b'

In [29]:
# counting all words from the corpus and get 10000 most frequent words
words = list(bible.numpy())
counts = Counter(words)
print(counts)

most_common = counts.most_common(10000)
words = [x[0] for x in most_common]
print("most common words: ", words)




In [40]:
# apply subsampling to discard words that appear very often
s = 0.001
word_counts = Counter(words)
total_count = 1000

freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - (np.sqrt(freqs[word]/s) + 1) for word in word_counts}
train_words = [word for word in words if random.random() < (1 - p_drop[word])]
print(len(train_words), train_words)



In [41]:
dict_1 = {x:i for i, x in enumerate(list(train_words))}
print(dict_1)

# input-target pairs
tuples = []
for word in range(1000):
  for neighbor in range(word-2,word+3):
    if word!=neighbor and neighbor>=0 and neighbor<1000:
      tuples.append((dict_1[train_words[word]], dict_1[train_words[neighbor]]))
print(tuples)

[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (2, 4), (3, 1), (3, 2), (3, 4), (3, 5), (4, 2), (4, 3), (4, 5), (4, 6), (5, 3), (5, 4), (5, 6), (5, 7), (6, 4), (6, 5), (6, 7), (6, 8), (7, 5), (7, 6), (7, 8), (7, 9), (8, 6), (8, 7), (8, 9), (8, 10), (9, 7), (9, 8), (9, 10), (9, 11), (10, 8), (10, 9), (10, 11), (10, 12), (11, 9), (11, 10), (11, 12), (11, 13), (12, 10), (12, 11), (12, 13), (12, 14), (13, 11), (13, 12), (13, 14), (13, 15), (14, 12), (14, 13), (14, 15), (14, 16), (15, 13), (15, 14), (15, 16), (15, 17), (16, 14), (16, 15), (16, 17), (16, 18), (17, 15), (17, 16), (17, 18), (17, 19), (18, 16), (18, 17), (18, 19), (18, 20), (19, 17), (19, 18), (19, 20), (19, 21), (20, 18), (20, 19), (20, 21), (20, 22), (21, 19), (21, 20), (21, 22), (21, 23), (22, 20), (22, 21), (22, 23), (22, 24), (23, 21), (23, 22), (23, 24), (23, 25), (24, 22), (24, 23), (24, 25), (24, 26), (25, 23), (25, 24), (25, 26), (25, 27), (26, 24), (26, 25), (26, 27), (26, 28), (27, 25), (27, 26), (27

In [42]:
# char_tokens = [dict_1[char] for char in tuples]
# dataset = tf.data.Dataset.from_tensor_slices(char_tokens)
dataset = tf.data.Dataset.from_tensor_slices(tuples)

def prepare_data(data, batch_size = 128, sequence_length = 4): 
  data = data.map(lambda x, target: (tf.cast(x, tf.int64), tf.cast(target, tf.int64)))
  #shuffle, batch, prefetch
  data = data.shuffle(10000)
  data = data.batch(batch_size)
  data = data.prefetch(tf.data.AUTOTUNE)
  
  return data

train_ds = prepare_data(dataset)
for seq in train_ds.take(1):
  tf.print(seq)

TypeError: ignored

In [43]:
class SkipGram(tf.keras.layers.Layer): 

  def __init__(self, num_words, embedding): 

    super(SkipGram, self).__init__()
    self.num_words = num_words
    self.embedding = embedding

  def build(self, input_shape): 
    self.weight_matrix = self.add_weight(shape=(self.num_words, self.embedding), trainable = True)

  @tf.function
  def call(self, inputs, labels): 
    embeddings = tf.nn.embedding_lookup(self.weight_matrix, inputs)
    return embeddings
    

In [44]:
def train_step(model, input, target, optimizer = tf.keras.optimizers.SGD(), num_words = 10000): 

  with tf.GradientTape() as tape: 
    embedding = model(input)
    biases = tf.Variable(tf.zeros([num_words]))
    target = tf.reshape(target, (target.shape[0], 1))
    loss = tf.nn.nce_loss(weights = model.get_weights()[0], biases = biases, labels = target, inputs = embedding, num_classes = num_words)

    loss = tf.reduce_mean(loss)

  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return loss

def train_step(self, test_ds): 
  losses = []

  for input, target in test_ds: 
    target = tf.expand_dims(target, -1)
    loss = self(input, target)
    losses.append(loss)
    
  return tf.reduce_mean(losses, axis = 0)
  

In [45]:
def training_loop(model, train_ds, test_ds, optimizer, num_words, epochs = 10): 
  
  for epoch in range(epochs): 

    losses = []

    for input, target in train_ds: 
      train_loss = train_step(model, input, target, optimizer, num_words = num_words)
      losses.append(losses)

    value = model.embedding.numpy()
    cosine = tf.keras.losses.CosineSimilarity()

    for data in test_ds: 
      embedding = model(data)
      cosine = np.ones(value.shape[0])
      


In [46]:
text_words = ["holy, father, wine, poison, love, strong, day"]
num_words = 10000
embedding = 64
num_epochs = 15

model = SkipGram(num_words = num_words, embedding = embedding)

