In [1]:
import tensorflow as tf
#import tensorflow_text as tf_text
#import tensorflow_dataset as tf_ds
import datetime
import pprint
import tqdm
import re
import numpy as np

In [2]:
# read the text
text = open("bible.txt", "r").read().lower()

In [3]:
# counts how often a word appears in the text 
def Count_word_frequency(text): 
    text = text.numpy().tolist()
    text.sort()
    counts = { text[0] : 1 }
    current_word = text[0]
    for i in text[1:]: 
        if i == current_word:
            counts[current_word] += 1
        else:
            current_word = i
            counts.update({current_word:1})
    counts = {key: val for key, val in sorted(counts.items(), key = lambda ele: ele[1], reverse = True)}
    return counts

In [4]:
# creates a list of the most common words in a text
def Vocabulary(text, size):
    vocabulary = tf.convert_to_tensor(list(text.keys())[:size])
    return vocabulary

In [5]:
def words_to_number(text, voc):
    text = text.numpy().tolist()
    voc = voc.numpy().tolist()
    for item in range(len(text)):
        if text[item] in voc:
            text[item] = voc.index(text[item])
        else: 
            # UNK = index 10000
            text[item] = 10000
    return text

In [6]:
# returns a tensor of tokens of the num_words (int) of most common words
def Tokenization(text, num_words):
    
    #convert everything to lower case
    text.lower()

    # replace every new-line characters with a white space
    text = re.sub('[\n]', ' ', text)
    # and remove every special character
    text = re.sub('[^a-zA-Z_ ]', '', text)
    
    text = tf.strings.split(text)
        
    # get word frequency to figure out the n most common words
    word_frequency = Count_word_frequency(text)
    
    # create a vocabulary with all common words
    vocabulary = Vocabulary(word_frequency, num_words)
    
    # convert every word into a token
    #tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='oov')
    #tokens = tokenizer.fit_on_texts([text])
    
    # replace words not in vocabulary with UNK
    # this takes verylong, may need alternatives
    length_vocab = len(vocabulary)
    for item in list(word_frequency.keys())[length_vocab + 1:]:
        text = tf.strings.regex_replace(text, f'^{item}$', 'UNK', replace_global = True)
        
    # transform the str into int so we can create one-hot encodings
    text = words_to_number(text, vocabulary)
        
    return text

In [7]:
# it should create all (input,target) pairs inside the context_window and put them into a dataset
# not sure if it does that currently, might have to check that again
def Target_pairs(text, num_words):

    dataset = tf.data.Dataset.from_tensor_slices(text)

    iterator = iter(dataset) 
    iterator.get_next()
    shift1 = dataset.map(lambda x: iterator.get_next())

    iterator2 = iter(dataset) 
    iterator2.get_next()
    iterator2.get_next()
    shift2 = dataset.map(lambda x: iterator2.get_next())


    #Reihenfolge ist egal???? 
    shift1up = tf.data.Dataset.zip((dataset, shift1))
    shift2up = tf.data.Dataset.zip((dataset, shift2))
    shift1down = tf.data.Dataset.zip((shift1, dataset))
    shift2down = tf.data.Dataset.zip((shift2, dataset))

    dataset = shift2down.concatenate(shift1up).concatenate(shift2up).concatenate(shift1down)

    # also do we need to turn it into a one-hot encoding?
    dataset = dataset.map(lambda inp, target: (tf.one_hot(inp, depth=num_words, dtype=tf.int32),tf.one_hot(target, depth=num_words, dtype=tf.int32)))
    
    dataset = dataset.shuffle(10000).batch(32).prefetch(tf.data.AUTOTUNE)
    
    return dataset

In [8]:
# create dataset
num_words = 10000
dataset = Target_pairs(Tokenization(text,num_words),num_words)

2023-02-03 15:20:33.438907: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 15:20:33.442222: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 15:20:33.442332: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 15:20:33.442905: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [27]:
# the actual model
class SkipGram(tf.keras.layers.Layer):
    
    def __init__(self, vocabulary_size, embedding_size):
        super(SkipGram,self).__init__()
        
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.loss_metric = tf.keras.metrics.Mean(name = "loss")
        
    def build(self,string):
        # also don't know if this is in any way correct
        self.embedding = self.add_weight(shape = (self.vocabulary_size,self.embedding_size), trainable=True)
        self.score_matrix = self.add_weight(shape = (self.vocabulary_size, self.embedding_size), trainable=True)
        
    def call(self, input_word):
        # I don't know if this is how you actually use that function
        target_predicted = tf.nn.embedding_lookup(params = self.embedding, ids = input_word)
        return target_predicted
    
    def train(self, data):
        
        input_word, target_word = data
        predictions = self(input_word)
        # using this gives you dimensional problems
        loss_function = tf.keras.losses.CategoricalCrossentropy()
        loss = loss_function(target_word,predictions)
        # using this gives you other problems
        '''loss = tf.reduce_mean(
            tf.nn.nce_loss(weights = self.score_matrix,
                           biases = None,
                          labels = target_word,
                          inputs = predictions,
                          num_sampled = 2,
                          num_classes = self.vocabulary_size))
                          '''
        
        # appearantly this is an outdated thing from tf1, so we probably can't use that
        tf.train.AdamOptimizer().minimize(loss)

In [10]:
# svery shorted version
def training_loop(model, train_ds, epochs):
    
    config_name= "config_name"
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    train_log_path = f"logs/{config_name}/{current_time}/train/"

    # log writer for training metrics
    train_summary_writer = tf.summary.create_file_writer(train_log_path) 
    
    # 1. iterate over epochs
    for epoch in range(epochs):

        # 2. train step over all batches in training data
        for data in tqdm.tqdm(train_ds):
            metrics = model.train(data)
        
            # 3. log and print training metrics 
        with train_summary_writer.as_default():
            tf.summary.scalar(name = "loss", data = metrics["loss"], step = epoch)

        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])

        # 4. rest metrics
        model.reset_metrics()

In [28]:
epochs = 10
model = SkipGram(10000,64)
train_ds = dataset
training_loop(model, train_ds, epochs)

  0%|                                                 | 0/98753 [00:00<?, ?it/s]


ValueError: Shapes (32, 10000) and (32, 10000, 64) are incompatible

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/