In [None]:
import os
import urllib.request

import torch
import torch.nn as nn
from torch.nn import functional as F
# Can import from attention module in week4

In [None]:
### Hyperparameters

# --- batching ---
batch_size = 16  # number of independent sequences drawn per batch
block_size = 32  # maximum context length (in tokens) the model attends over

# --- model architecture ---
n_embd = 64    # width of the token / position embeddings
n_head = 4     # attention heads running in parallel inside each block
n_layer = 4    # number of Transformer blocks stacked in the model
dropout = 0.0  # probability handed to nn.Dropout (0.0 disables dropout)

# --- optimisation and evaluation schedule ---
learning_rate = 1e-3  # step size given to the optimizer
max_iters = 5000      # total number of training iterations
eval_interval = 100   # run a loss evaluation every this many iterations
eval_iters = 200      # batches averaged per split for one loss estimate

# --- hardware ---
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [None]:
### Preparing Data
torch.manual_seed(1337)

# Tiny Shakespeare corpus. It is downloaded once and then read from the local
# copy, so re-running this cell does not hit the network again and does not
# depend on an external `wget` binary being installed.
data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
data_path = 'input.txt'
if not os.path.exists(data_path):
    urllib.request.urlretrieve(data_url, data_path)
with open(data_path, 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(set(text))
vocab_size = len(chars)
# create a mapping from characters to integers (and back)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, output a string."""
    return ''.join([itos[i] for i in l])


# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias.

    Args:
        features: size of the last dimension of the inputs.
        eps: small constant added to the standard deviation so the division
            stays finite.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # Learnable per-feature gain (starts at 1) and bias (starts at 0).
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gain and bias."""
        centered = x - torch.mean(x, dim=-1, keepdim=True)
        # torch.std defaults to the unbiased (n - 1) estimator.
        spread = torch.std(x, dim=-1, keepdim=True) + self.eps
        scaled = self.a_2 * centered
        return scaled / spread + self.b_2

In [None]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Reads the notebook-level hyperparameters ``n_embd`` (input feature size),
    ``block_size`` (side length of the causal mask) and ``dropout``.

    Args:
        head_size: output size of the key / query / value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask. It is a
        # buffer, not a parameter: it is saved and moved with the module but
        # never trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size).

        ``T`` must not exceed ``block_size``, because the mask is sliced to
        ``[:T, :T]``.
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Compute attention scores ("affinities"). The scale is 1/sqrt(d_k)
        # where d_k is the key dimension (head_size), not the embedding width.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Block attention to future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads applied to the same input in parallel.

    Args:
        num_heads: how many ``Head`` modules to create.
        head_size: size passed to every ``Head``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # ModuleList registers each head so its parameters are tracked.
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))

    def forward(self, x):
        """Run every head on ``x`` and join the results along the last dimension."""
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        return torch.cat(per_head, dim=-1)

In [None]:
class Block(nn.Module):
    """Attention-only Transformer block: self-attention, LayerNorm, residual add.

    Args:
        n_embd: embedding dimension of the inputs and outputs.
        n_head: number of attention heads; each gets ``n_embd // n_head``
            features.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head``. The
            concatenated head outputs would then be narrower than ``n_embd``
            and the LayerNorm / residual add would fail at forward time.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ln1 = nn.LayerNorm(n_embd)
        # No feed-forward sub-layer: this block is attention-only. Nothing but
        # nn.Module instances and tensors should be stored on the block, so it
        # stays copyable and picklable.

    def forward(self, x):
        """Return ``ln1(sa(x)) + x``; output has the same shape as ``x``."""
        # NOTE(review): the norm is applied to the attention output before the
        # residual add. The more common pre-norm form is x + sa(ln1(x)) --
        # confirm this ordering is intended.
        x = self.ln1(self.sa(x)) + x
        return x

In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level language model: embeddings -> attention blocks -> linear head.

    Built from the notebook-level hyperparameters ``vocab_size``, ``n_embd``,
    ``block_size``, ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        # token and position embedding tables
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # stack of attention blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # final layer norm and projection to per-token vocabulary scores
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids. ``T`` must not exceed
                the size of the position embedding table.
            targets: optional (B, T) integer tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` is
            (B, T, vocab_size) and ``loss`` is ``None``. With ``targets``,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy between the flattened logits and targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # One embedding per position 0..T-1, created on the same device as idx.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C); pos_emb broadcasts over the batch
        x = self.blocks(x)     # (B, T, C)
        x = self.ln_f(x)       # (B, T, C)
        # Logits are unnormalised scores; softmax turns them into probabilities.
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits against (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  # (B*T, C)
            targets = targets.view(B*T)   # (B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Runs without gradient tracking and in eval mode (so dropout is
        inactive), then restores the mode the model was in before the call.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        # The position table fixes how many tokens the model can see at once.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last context_len tokens
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :]  # (B, C)
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
def get_batch(split):
    """Sample a random batch of input / target sequences from one data split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        ``(x, y)``, both of shape (batch_size, block_size) and moved to
        ``device``. ``y`` is ``x`` shifted one position to the right in the
        underlying data.
    """
    source = val_data if split != 'train' else train_data
    # Random start offsets; each must leave room for block_size + 1 tokens.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts.tolist():
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the average loss of ``model`` on the train and val splits.

    Puts ``model`` in eval mode, averages the loss over ``eval_iters`` random
    batches per split, then switches ``model`` back to train mode.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to the mean loss (0-d tensor).
    """
    def _mean_loss(split):
        # Average the scalar loss over eval_iters freshly sampled batches.
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            per_batch[step] = batch_loss.item()
        return per_batch.mean()

    model.eval()
    averages = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return averages

### Model Training and Initialization
# initialize model and move it to the selected device
model = BigramLanguageModel()
model = model.to(device)
# print the number of parameters in the model (in millions)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` rather than `iter` so the builtin `iter`
# is not shadowed in the notebook's global namespace.
for step in range(max_iters):

    # every eval_interval steps, and on the last step, report the estimated
    # loss on the train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: feed input and target into the model to get the loss
    logits, loss = model(xb, yb)
    # clear stale gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with
# id 0, and print the first (only) sequence decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
generated_ids = generated[0].tolist()
print(decode(generated_ids))