# Problem 1

## Transformer code setup from Prof. Uzair Ahmad's Provided Code

### Transformer Setup

In [7]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

''' Look at all previous tokens to generate next
    @Author: Uzair Ahmad
    2022
    +TransformerBlock 
'''


class TransformerBlockLM(nn.Module):
    """Character-level, decoder-only Transformer language model.

    Usage is two-phase: construct the object with the hyper-parameters, then
    call :meth:`prep` with the training corpus. ``prep`` builds the
    vocabulary, the train/validation tensors and every learnable layer, so
    the model cannot be used before ``prep`` has run.

    Because each block adds its input to the attention output, ``embed_size``
    and ``sa_head_size`` must be equal for the residual sums to line up.
    """

    class TransformerBlock(nn.Module):
        """Multi-head self-attention followed by an MLP, with residual sums."""

        def __init__(self, head_count, in_size, out_size):
            super().__init__()
            self.comm = TransformerBlockLM.MultiHeadAttention(head_count=head_count,
                                                              in_size=in_size,
                                                              out_size=out_size)
            self.think = TransformerBlockLM.MLP(embed_size=out_size)

        def forward(self, x):
            # Inner residual feeds the MLP; outer residual adds the raw input.
            return x + self.think(x + self.comm(x))

    class MLP(nn.Module):
        """Feed-forward network (embed_size -> 4 * embed_size -> embed_size)."""

        def __init__(self, embed_size):
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(embed_size, embed_size * 4),
                                     nn.ReLU(),
                                     nn.Linear(embed_size * 4, embed_size))
            self.layerNorm = nn.LayerNorm(embed_size)

        def forward(self, x):  # think
            return self.layerNorm(self.mlp(x))  # paper - normalise after
            # return self.mlp(self.layerNorm(x)) # alternate - normalise before

    class MultiHeadAttention(nn.Module):
        """
        multiple parallel SA heads (communication among words)

        Each head produces ``out_size // head_count`` channels; the heads are
        concatenated on the last dimension and layer-normalised.
        """

        def __init__(self, head_count, in_size, out_size):
            super().__init__()
            self.heads = nn.ModuleList(
                TransformerBlockLM.SelfAttentionHead(in_size, out_size // head_count)
                for _ in range(head_count)
            )
            self.layerNorm = nn.LayerNorm(out_size)
            # self.proj = nn.Linear(out_size, out_size)

        def forward(self, x):
            # concat over channel/embeddings_size dimension
            return self.layerNorm(torch.cat([head(x) for head in self.heads], dim=-1))  # paper - after
            # return torch.cat([head(self.layerNorm(x)) for head in self.heads], dim=-1) # alternate - before
            # return self.proj(torch.cat([head(x) for head in self.heads], dim=-1))

    class SelfAttentionHead(nn.Module):
        """Single causal (masked) scaled dot-product self-attention head."""

        def __init__(self, in_size, out_size):
            """
            in_size is embed_size
            out_size is head_size
            """
            super().__init__()
            self.head_size = out_size
            self.K = nn.Linear(in_size, self.head_size, bias=False)
            self.Q = nn.Linear(in_size, self.head_size, bias=False)
            self.V = nn.Linear(in_size, self.head_size, bias=False)

        def forward(self, x):
            """Attend every position to itself and to earlier positions only.

            ``x`` is (batch_size x input_length x in_size); the result is
            (batch_size x input_length x head_size).
            """
            keys = self.K(x)
            queries = self.Q(x)
            # affinities: all the queries dot-product with all the keys.
            # transpose (swap) second dimension (input_length) with third (head_size)
            # (batch x length x head) @ (batch x head x length) -> (batch x length x length)
            scores = (queries @ keys.transpose(1, 2)) * (self.head_size ** -0.5)

            # Explicit lower-triangular mask. Masking on `scores == 0` after
            # `tril` would also hide genuine zero scores below the diagonal and
            # could turn a whole row into -inf (NaN after softmax).
            seq_len = x.shape[1]
            causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device))
            scores = scores.masked_fill(~causal, float('-inf'))

            weights = torch.softmax(scores, dim=-1)
            values = self.V(x)  # (batch_size x input_length x head_size)
            return weights @ values

    def __init__(self, batch_size=4,
                 input_length=8,
                 embed_size=16,
                 sa_head_size=8,
                 sa_multihead_count=4,
                 pos_embed=False,
                 include_mlp=False):
        """Store the hyper-parameters; layers are created later by ``prep``.

        batch_size = how many inputs are processed in parallel
        input_length = how many consecutive tokens/chars in one input
        embed_size = embedding size
        sa_head_size = total output width of each multi-head attention layer
        sa_multihead_count = number of attention heads per block
        pos_embed = add learned position embeddings to the token embeddings
        include_mlp = stored on the instance; not read anywhere in this class
        """
        super().__init__()
        self.blocks = None
        self.ffn = None
        self.sa_heads = None
        # sa_head_size head_size of self-attention module
        self.sa_head_size = sa_head_size
        self.sa_multihead_count = sa_multihead_count

        self.val_data = None
        self.train_data = None
        self.val_text = None
        self.train_text = None
        self.K = None
        self.linear_sahead_to_vocab = None
        self.vocab = None
        self.token_embeddings_table = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.is_pos_emb = pos_embed
        self.include_mlp = include_mlp
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # self.device = 'cpu'

        print(f"Device: {self.device}") 
        self.input_length = input_length
        self.batch_size = batch_size
        self.embed_size = embed_size

        self.lm_head = None
        self.position_embeddings_table = None

    def forward(self, in_ids, target=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Only the last ``input_length`` tokens of ``in_ids`` are used. ``loss``
        is the cross-entropy against ``target`` or ``None`` when no target is
        given.
        """
        window = in_ids[:, -self.input_length:].to(self.device)
        in_ids_emb = self.token_embeddings_table(window)
        if self.is_pos_emb:
            positions = torch.arange(window.shape[1], device=self.device)
            in_ids_emb = in_ids_emb + self.position_embeddings_table(positions)

        block_outputs = self.blocks(in_ids_emb)
        logits = self.linear_sahead_to_vocab(block_outputs)  # compute

        if target is None:
            return logits, None

        batch_size, input_length, vocab_size = logits.shape
        # cross_entropy expects (N, classes) logits and (N,) targets.
        logits_ = logits.view(batch_size * input_length, vocab_size)
        targets = target.to(self.device).view(batch_size * input_length)
        ce_loss = F.cross_entropy(logits_, targets)
        return logits, ce_loss

    def fit(self, train_iters=100, eval_iters=10, lr=0.0001):
        """
        train_iters = how many training iterations
        eval_iters = how many batches to evaluate to get average performance
                     (also the reporting period, in iterations)
        lr = Adam learning rate
        """
        # Imported here because the cell defining the class does not import
        # `time`; the timer is local so `fit` no longer needs a global `start`.
        import time
        start = time.time()

        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for iteration in range(train_iters):
            if iteration % eval_iters == 0:
                avg_loss = self.eval_loss(eval_iters)
                print(f"iter {iteration}:, time = {time.time()-start}, train {avg_loss['train']}, val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            if inputs is None or targets is None:
                continue
            _, ce_loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            ce_loss.backward()  # propagate loss back to each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss
        # torch.save(self, 'sa_pos_')

    @torch.no_grad()  # sampling never back-propagates, so skip graph building
    def generate(self, context_token_ids, max_new_tokens):
        """Sample ``max_new_tokens`` tokens after ``context_token_ids``.

        Returns the decoded text of the first sequence in the batch, including
        the supplied context.
        """
        for _ in range(max_new_tokens):
            token_rep, _ = self(context_token_ids)
            last_token_rep = token_rep[:, -1, :]
            probs = F.softmax(last_token_rep, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context_token_ids = torch.cat((context_token_ids, next_token), dim=1)
        return self.decoder(context_token_ids[0].tolist())

    @torch.no_grad()  # tell torch not to prepare for back-propagation (context manager)
    def eval_loss(self, eval_iters):
        """Estimate the mean loss on the train and validation splits.

        Prints the perplexity of each split and returns
        ``{'train': tensor, 'eval': tensor}``. Batches that could not be drawn
        are excluded from the mean; a split with no usable batch yields NaN.
        """
        perf = {}
        self.eval()
        for split in ['train', 'eval']:
            total_loss = 0.0
            total_tokens = 0
            batch_losses = []
            for _ in range(eval_iters):
                tokens, targets = self.get_batch(split)
                if tokens is None or targets is None:
                    print(f"Insufficient data for evaluation, skipping this iteration for {split} split...")
                    continue
                _, ce_loss = self(tokens, targets)  # forward pass
                loss_value = ce_loss.item()  # loss tensor as a standard Python number
                batch_losses.append(loss_value)

                total_loss += loss_value * tokens.size(0)
                total_tokens += tokens.size(0)

            # Average only over evaluated batches: a pre-zeroed buffer would
            # let skipped iterations pull the mean towards zero.
            if batch_losses:
                perf[split] = torch.tensor(batch_losses).mean()
            else:
                perf[split] = torch.tensor(float('nan'))

            # calculate and print perplexity
            perplexity = torch.exp(torch.tensor(total_loss / total_tokens).to(self.device)) if total_tokens > 0 else None
            print(f"Perplexity for {split}: {perplexity}")

        self.train()  # turn-on training mode-
        return perf

    def prep(self, corpus):
        """Build vocabulary, data splits and all layers from ``corpus``.

        The first 90% of ``corpus`` becomes the training text and the rest the
        validation text.
        """
        self.vocab = sorted(set(corpus))
        self.vocab_size = len(self.vocab)
        c2i = {c: i for i, c in enumerate(self.vocab)}  # char c to integer i map
        i2c = {i: c for c, i in c2i.items()}  # integer i to char c map

        self.encoder = lambda doc: [c2i[c] for c in doc]
        self.decoder = lambda nums: ''.join([i2c[i] for i in nums])

        # Split the argument itself, not a global named `text`.
        split_at = int(len(corpus) * 0.9)
        self.train_text = corpus[:split_at]
        self.val_text = corpus[split_at:]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long).to(self.device)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long).to(self.device)

        # look-up table for embeddings (vocab_size x embed_size)
        # it maps each token id to a vector of embed_size
        self.token_embeddings_table = \
            nn.Embedding(self.vocab_size, self.embed_size).to(self.device)

        if self.is_pos_emb:
            self.position_embeddings_table = nn.Embedding(self.input_length, self.embed_size).to(self.device)

        # Stack of identical blocks, built in a loop instead of six copies.
        block_count = 6
        self.blocks = nn.Sequential(*[
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size)
            for _ in range(block_count)
        ]).to(self.device)
        # linear projection of sa_head output to vocabulary
        self.linear_sahead_to_vocab = nn.Linear(self.sa_head_size, self.vocab_size).to(self.device)

    def get_batch(self, split='train'):
        """Draw ``batch_size`` random windows of ``input_length`` tokens.

        Returns ``(inputs, targets)`` where targets are the inputs shifted by
        one token, or ``(None, None)`` when the split is too short to hold one
        window plus its shifted target.
        """
        data = self.train_data if split == 'train' else self.val_data
        # Number of valid start offsets. It must be at least 1: the target
        # window ends one token past the input window, and torch.randint
        # rejects an empty range.
        max_start = len(data) - self.input_length
        if max_start <= 0:
            print("Insufficient data for this iteration, skipping...")
            print(f"Len(data): {len(data)}, input length: {self.input_length}, batch size: {self.batch_size}")
            return None, None
        ix = torch.randint(max_start, (self.batch_size,))
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        return inputs_batch.to(self.device), targets_batch.to(self.device)

Above is code implementing a Transformer model with MLP-based positional encoding; the code was provided by Dr. Uzair Ahmad.

1. The primary difference between the code provided in the module by Prof. Ahmad and the architecture in the Google patent is that the code above uses a Transformer architecture as its neural network, while Google proposes RNNs. The code above also uses positional encoding from an MLP layer, whereas Google's patent proposes global attention.

2. Corrections were made above so that the code can run on CUDA cores: several tensors and modules needed ".to(self.device)" to be moved to the GPU properly. Prior to these changes, the error below was raised:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

This fixes a bug that prevented the program from running on the GPU, while also increasing the speed at which the model can be trained.

In addition, when training on smaller datasets I frequently ran into an issue during validation where len(data) was less than the input length, so I added some checks to avoid this, though the best solution was simply to lower the input length.

### Running The Code

Below is a main block to run the transformer on a sample of Emily Dickinson's poems 

In [None]:
import time

# Wall-clock reference for the whole cell; the elapsed time is printed last.
start = time.time()
print(f"Start: {time.ctime()}")

# Read the complete training corpus into memory.
with open('./Transformer/emily_dickonson.txt', 'r') as f:
    text = f.read()

# Tiny alternative corpus for quick smoke runs:
# text = 'a quick brown fox jumps over the lazy dog.\n ' \
#        'lazy dog and a quick brown fox.\n' \
#        'the dog is lazy and the fox jumps quickly.\n' \
#        'a fox jumps over the dog because he is lazy.\n' \
#        'dog is lazy and fox is brown. she quickly jumps over the lazy dog.'

# Hyper-parameters for this run, gathered in one place.
hyperparams = dict(
    batch_size=64,
    input_length=32,
    embed_size=128,
    sa_multihead_count=8,
    sa_head_size=128,
    pos_embed=True,
    include_mlp=True,
)
model = TransformerBlockLM(**hyperparams)
model = model.to(model.device)
model.prep(text)  # builds the vocabulary, data splits and layers

# Report the number of trainable parameters.
model_parameters = (p for p in model.parameters() if p.requires_grad)
print(f'params {sum(np.prod(p.size()) for p in model_parameters)}')

# One forward pass on a training batch before the training loop starts.
input_batch, output_batch = model.get_batch(split='train')
_, _ = model(input_batch, output_batch)

model.fit(train_iters=4000, eval_iters=1000, lr=1e-3)

# Sample 1000 new tokens, starting from a single token with id 0.
seed_context = torch.zeros((1, 1), dtype=torch.long, device=model.device)
outputs = model.generate(context_token_ids=seed_context, max_new_tokens=1000)
print(outputs)

end = time.time()

print(f"Elapsed Time: {end-start}")

## Results:

466s on cuda 881s on cpu #sample text

481s on cuda 1653s on cpu # emily dickonson

**Device:** cpu

**params** 1113169

**iter 0:** time = 152.3404836654663, train 5.497964382171631, val 5.49129581451416

**iter 1000:** time = 562.986720085144, train 1.6596962213516235, val 1.7237656116485596

**iter 2000:** time = 971.5997943878174, train 1.3862591981887817, val 1.7082321643829346

**iter 3000:** time = 1379.8686068058014, train 1.1676121950149536, val 1.8512464761734009


**Device:** cuda

**params** 1113169

**iter 0:** time = 46.85462760925293, train 5.598533630371094, val 5.547347545623779

**iter 1000:** time = 160.57424426078796, train 1.6535813808441162, val 1.7112394571304321

**iter 2000:** time = 274.0162696838379, train 1.3842737674713135, val 1.6874533891677856

**iter 3000:** time = 387.8321087360382, train 1.1877456903457642, val 1.8289140462875366


With the cuda cores being utilized, we can see significant reduction in the time spent training. This can improve our model significantly by limiting the time spent to train, or allowing us to train a more complicated model, or train for longer in the same amount of time.

# Problem 2

### Training on Warren Buffett's style

Below is the modified eval_loss method that calculates and prints the perplexity on each step of training.

In [None]:
@torch.no_grad()  # evaluation only: do not build autograd graphs
def eval_loss(self, eval_iters):
    """Estimate the mean loss and print the perplexity for both data splits.

    ``self`` is the language model. It must provide ``eval()``, ``train()``,
    ``get_batch(split)``, a ``device`` attribute, and be callable as
    ``self(tokens, targets)`` returning ``(logits, loss)``.

    eval_iters = how many batches to draw per split

    Returns ``{'train': tensor, 'eval': tensor}`` holding the mean loss over
    the batches that were actually evaluated. Batches that ``get_batch``
    could not supply are excluded, and a split with no usable batch yields
    NaN.
    """
    perf = {}
    self.eval()
    for split in ['train', 'eval']:
        total_loss = 0.0
        total_tokens = 0
        batch_losses = []
        for _ in range(eval_iters):
            tokens, targets = self.get_batch(split)
            if tokens is None or targets is None:
                print(f"Insufficient data for evaluation, skipping this iteration for {split} split...")
                continue
            _, ce_loss = self(tokens, targets)  # forward pass
            loss_value = ce_loss.item()  # loss tensor as a standard Python number
            batch_losses.append(loss_value)

            total_loss += loss_value * tokens.size(0)
            total_tokens += tokens.size(0)

        # Average only over evaluated batches: a pre-zeroed buffer would let
        # skipped iterations pull the reported mean towards zero.
        if batch_losses:
            perf[split] = torch.tensor(batch_losses).mean()
        else:
            perf[split] = torch.tensor(float('nan'))

        # calculate and print perplexity (exp of the mean cross-entropy)
        perplexity = torch.exp(torch.tensor(total_loss / total_tokens).to(self.device)) if total_tokens > 0 else None
        print(f"Perplexity for {split}: {perplexity}")

    self.train()  # turn-on training mode-
    return perf

In [8]:
import time

# Wall-clock reference for the whole cell; the elapsed time is printed last.
start = time.time()
print(f"Start: {time.ctime()}")

# Read the complete training corpus into memory.
with open('./Transformer/WarrenBuffet.txt', 'r') as f:
    text = f.read()

# Tiny alternative corpus for quick smoke runs:
# text = 'a quick brown fox jumps over the lazy dog.\n ' \
#        'lazy dog and a quick brown fox.\n' \
#        'the dog is lazy and the fox jumps quickly.\n' \
#        'a fox jumps over the dog because he is lazy.\n' \
#        'dog is lazy and fox is brown. she quickly jumps over the lazy dog.'

# Hyper-parameters for this run, gathered in one place.
hyperparams = dict(
    batch_size=64,
    input_length=32,
    embed_size=128,
    sa_multihead_count=8,
    sa_head_size=128,
    pos_embed=True,
    include_mlp=True,
)
model = TransformerBlockLM(**hyperparams)
model = model.to(model.device)
model.prep(text)  # builds the vocabulary, data splits and layers

# Report the number of trainable parameters.
model_parameters = (p for p in model.parameters() if p.requires_grad)
print(f'params {sum(np.prod(p.size()) for p in model_parameters)}')

# One forward pass on a training batch before the training loop starts.
input_batch, output_batch = model.get_batch(split='train')
_, _ = model(input_batch, output_batch)

model.fit(train_iters=4000, eval_iters=1000, lr=1e-3)

# Sample 1000 new tokens, starting from a single token with id 0.
seed_context = torch.zeros((1, 1), dtype=torch.long, device=model.device)
outputs = model.generate(context_token_ids=seed_context, max_new_tokens=1000)
print(outputs)

end = time.time()

print(f"Elapsed Time: {end-start}")

Start: Sat Jul 22 23:23:46 2023
Device: cuda
params 1115739
Perplexity for train: 451.3076477050781
Perplexity for eval: 451.9237365722656
iter 0:, time = 41.77569794654846, train 6.112149238586426, val 6.113512992858887
Perplexity for train: 4.6118597984313965
Perplexity for eval: 5.1837005615234375
iter 1000:, time = 145.29064178466797, train 1.5286312103271484, val 1.6455191373825073
Perplexity for train: 3.7546401023864746
Perplexity for eval: 4.574649810791016
iter 2000:, time = 251.60797309875488, train 1.3229925632476807, val 1.520530104637146
Perplexity for train: 3.329393148422241
Perplexity for eval: 4.434844970703125
iter 3000:, time = 356.81307220458984, train 1.2027900218963623, val 1.489492654800415

Manufactured-housing story about 
one my great errossends will likel changes their come a record 
on time - migarges are the naurance business, but never by 45%,000 our lawn best selections disting future acquisitions that viewed his stock a long of many others, and investmen

1. We can see that in the final iterations of training, the validation perplexity gets down to 4.43. Compared to the best model from the previous assignment (the bigram model), which scored about 12, this is a significant improvement. Compared to the relative frequency model, which had a perplexity of ~7000 (probably because it was a character model), this is an enormous improvement. It is also worth noting the significant decrease in perplexity from the first iterations to the last in the training cycle.

2. Below we can see some additional output from generating 1000 more tokens. While the text does not initially seem impressive when read by a human, when we compare to our previous character based model, which had entirely illegible text with "words" that didn't even resemble real words, this is a drastic difference. At a glance, the text generated by this model looks like English, only upon further scrutiny does one notice that there are many spelling, grammar, and logic issues. While this does not make coherent sentences, there are coherent phrases, particularly if you can mentally correct some of the spelling issues or consider them typos.

In [9]:
# Draw a further 1000-token sample from the model built in the previous cell,
# again starting from a single token with id 0.
start_ids = torch.zeros((1, 1), dtype=torch.long, device=model.device)
outputs = model.generate(context_token_ids=start_ids, max_new_tokens=1000)
print(outputs)




5 


Deferred tax, decision enjoy the cisell of importance and includentially hundred the outstence weeken and a serve though our float were bought that it the everyolder of descoures, it's olders poursation of blinally run by the SEICRs signed resulable. We would notically has been bailed on by oving each ban. Just a few case for the and our cheeptentees.) 

Underwriting Profit Yearend Nebraska Furnings, but that it substantial. Sice knew here never hapment related itsition. So 
months) call, 12/30; state with them. (All purchased each of convinced continue to proxy a fully-value. United, resid the stribute of sthat we asked able prices. 

Next a good many obinable, a renter will be since 10 confill, both or 
Wall, Stanorial sums for a Berkshire-ze-pre-tax. ZShill are Stain. One met than quarters. In aggregate, it's only a distributed. When, 
I records have learned in things. Walter buy hope and se of our operating the company's problem would be the eistable, many thousands of star