# Project 4
## Students:
 > [Eli Carter]
 > [Gabriel Stowe]
 
 

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re

2023-04-29 13:56:40.213071: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print(tf.__version__)# you may want to upgrade to 2.10.0 

2.12.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [3]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=64, rate=0.1):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        #Use the rate variable for the dropout layers
        mha = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim)(inputs, inputs, use_causal_mask=True)
        d1 = layers.Dropout(self.rate)(mha)
        n1 = layers.LayerNormalization(epsilon=1e-6)(d1 + inputs)
        fc1 = layers.Dense(self.ff_dim, activation='relu')(n1)
        fc2 = layers.Dense(self.ff_dim, activation='relu')(fc1)
        d2 = layers.Dropout(self.rate)(fc2)
        n2 = layers.LayerNormalization(epsilon=1e-6)(d2 + n1)
        return n2


    
    def EmbeddingLayer(self, inputs):
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer
        toke_embedding = layers.Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim)(inputs)
        pos_embedding = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)(tf.range(self.maxlen))
        return toke_embedding + pos_embedding
        #return layers.TokenAndPositionEmbedding(self.maxlen, self.vocab_size, self.embed_dim)(inputs)
    
    def create_model(self):
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)

        inputs = layers.Input(shape=(self.maxlen,))
        embedding = self.EmbeddingLayer(inputs)
        tmp = embedding
        for i in range(self.num_blocks):
            tmp = self.TransformerBlock(tmp)

        outputs = layers.Dense(self.vocab_size, activation='softmax')(tmp)
        model = keras.Model(inputs=inputs, outputs=outputs)
        print(model.summary())
        keras.utils.plot_model(model, "model.png", show_shapes=True)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model


## Task 2

In [11]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        self.text = open(filename, 'r').read()
        self.len = len
        

    def prep_text(self):
        #remove all punctuation
        self.text = re.sub(r'[^\w\s]', '', self.text)
        #remove all special characters
        self.text = re.sub(r'[^a-zA-Z0-9\s]', '', self.text)
        #replace all whitespaces except for \n with a space
        self.text = re.sub(r'[^\S\n]+', ' ', self.text)
        #replace all \n with a space, newline, then space
        self.text = re.sub(r'\n', ' \n ', self.text)
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number
        self.text = self.text.split(' ')
        #remove all the empty strings ??????
        c = self.text.count('')
        for i in range(c):
            self.text.remove('')
        self.vocab = np.unique(self.text)
        self.text = np.array([np.where(self.vocab == word)[0][0] for word in self.text])

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        self.prep_text()
        self.tokenize_text()
        x = []
        y = []
        for i in range(len(self.text) - self.len - 1):
            x.append(self.text[i:i+self.len])
            y.append(self.text[i+1:i+self.len+1])
        return np.array(x), np.array(y), self.vocab

In [12]:
test = DataSet('beatles.txt', 64)
x, y, vocab = test.create_dataset()
print(x[0])
print(y[0])
print(vocab)
print(len(vocab))


[  13  511 1072 2226 1232    0 1053 1756 2226 1499 2280 1552  261    0
   18   13 1300 1330 2487 1310 2226  897    0   64 2252 2226 1499 2435
 1754 1848    0 2467 1053 1136  925 2279 1197    0 1053 1869 2226 1640
    0  959  229 1007 1397 1582 1072   13  320    0  959  548 1530 2224
 2226 1239  925  356    0   13  466 1549]
[ 511 1072 2226 1232    0 1053 1756 2226 1499 2280 1552  261    0   18
   13 1300 1330 2487 1310 2226  897    0   64 2252 2226 1499 2435 1754
 1848    0 2467 1053 1136  925 2279 1197    0 1053 1869 2226 1640    0
  959  229 1007 1397 1582 1072   13  320    0  959  548 1530 2224 2226
 1239  925  356    0   13  466 1549 1628]
['\n' '0' '1' ... 'zapped' 'zoo' 'zu']
2595


## Task 3

In [13]:
class GenerateText():
    def __init__(self, model, vocab):
        self.vocab = vocab
        self.model = model

    
    def generate_text(self, start_string, num_generate=100):
        #generate text using the model and vocab, start with the start_string and generate num_generate words
        #use the model to predict the next word, then add it to the input and predict the next word, repeat until num_generate words have been generated

        #convert the start_string to a list of numbers using the vocab
        start_tokens = [np.where(self.vocab == word)[0][0] for word in start_string.split(' ')]
        
        for i in range(num_generate):
            #use the model to predict the next word
            prediction = self.model.predict(start_tokens)
            #add the predicted word to the input
            start_tokens.append(np.argmax(prediction))
        #convert the list of numbers back to a string using the vocab
        return ' '.join([self.vocab[i] for i in start_tokens])
    
    def generate_random_text(self, num_generate=100):
        #generate text using the model and vocab, start with a random word and generate num_generate words

        #choose a random word from the vocab as the start_string
        start_string = np.random.choice(self.vocab)
        return self.generate_text(start_string, num_generate)


## Task 4: Model Traning and Testing

In [14]:
#Train the model while periodically generating text to show progress
def train_model(model, vocab, x, y, epochs=50, verbose=1):
    gen_text = GenerateText(model, vocab)
    for i in range(epochs):
        #train the model
        model = model.fit(x, y)
        
        if i % 10 == 0 and verbose == 1:
            #generate text using the model
            print(f'Epoch {i}')
            print(gen_text.generate_random_text())
            print('\n\n')
    return model

## Running the models

In [15]:
# Create the dataset
dataset = DataSet('beatles.txt', 64)
x, y, vocab = dataset.create_dataset()
print(type(x))
print(type(x[0]))
print(type(x[0][0]))
print(type(y))
print(type(y[0]))
print(type(y[0][0]))
print(np.shape(x))
print(np.shape(y))
# Create the model
my_model = TransformerModel(len(vocab))
compiled_model = my_model.create_model()
# Train the model
trained_model = train_model(compiled_model, vocab, x, y)


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.int64'>
(38847, 64)
(38847, 64)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, 64, 256)      664320      ['input_4[0][0]']                
                                                                                                  
 tf.__operators__.add_9 (TFOpLa  (None, 64, 256)     0           ['embedding_6[0][0]']            
 mbda)                                                                                            
                                                                                                 

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.int64'>"}), <class 'NoneType'>


# Report

## Introduction

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.