In [50]:
import torch
import torch.nn as nn
from torch.nn import functional as F

import mmap
import random
import pickle
import time


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# --- batch geometry ---
batch_size = 32        # sequences sampled per batch
block_size = 64        # context length (tokens per sequence)

# --- optimisation ---
max_iters = 5000       # number of training steps
learning_rate = 1e-3   # AdamW step size
eval_iters = 200       # batches averaged per loss estimate; also the logging interval

# --- model size ---
n_embd = 128           # embedding width
n_head = 4             # attention heads per block
n_layer = 4            # number of transformer blocks
dropout = 0.2          # dropout probability used throughout the model

cuda


In [51]:
# Build the character vocabulary: every distinct character that appears in
# vocab.txt, in sorted order so the character -> id mapping is stable across runs.
with open("vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()

# sorted() accepts any iterable, so the set does not need copying into a list first.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"vocabulary_size: {vocab_size}")

vocabulary_size: 32172


In [52]:
# Character-level tokenizer: two lookup tables built from the sorted vocabulary.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: idx for idx, ch in int_to_string.items()}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(int_to_string[i] for i in l)


In [53]:
def get_random_chunk(split):
    """Read a random slice of the train/val text file and encode it as token ids.

    The file is memory-mapped, so small snippets can be pulled from a single
    file of any size without loading the whole file.

    Args:
        split: 'train' selects "output_train.txt"; any other value selects
            "output_val.txt".

    Returns:
        A 1-D ``torch.long`` tensor holding the encoded characters of the chunk.
        It can contain fewer than ``block_size * batch_size - 1`` ids, because
        invalid/partial UTF-8 sequences are dropped, multi-byte characters
        collapse to one id, and '\\r' characters are removed.

    Raises:
        KeyError: via ``encode`` if the chunk contains a character that is not
            in the vocabulary.
    """
    filename = "output_train.txt" if split == 'train' else "output_val.txt"
    chunk_bytes = block_size * batch_size
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Clamp the upper bound at 0: for a file shorter than one chunk the
            # unclamped bound is negative and random.randint raises ValueError.
            start_pos = random.randint(0, max(0, file_size - chunk_bytes))

            # Seek to the random position and copy the bytes out of the map.
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)

    # A random byte offset can land inside a multi-byte character, so invalid
    # sequences at the edges are ignored rather than raising.
    decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(decoded_block), dtype=torch.long)

In [54]:
def get_batch(split):
    """Sample one batch of (input, target) sequences for the given split.

    Returns two (batch_size, block_size) tensors on ``device``; each target row
    is its input row shifted one position to the right in the source chunk.
    """
    chunk = get_random_chunk(split)
    # Random start offsets that leave room for block_size inputs plus one target.
    starts = torch.randint(len(chunk) - block_size, (batch_size,))
    inputs = torch.stack([chunk[s:s + block_size] for s in starts])
    targets = torch.stack([chunk[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)



In [55]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones; zeros mark positions a token may not attend to.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over x of shape (batch, time, channels); returns (batch, time, head_size)."""
        _, T, _ = x.shape
        keys = self.key(x)        # (B,T,hs)
        queries = self.query(x)   # (B,T,hs)

        # Scaled dot-product scores: (B,T,hs) @ (B,hs,T) -> (B,T,T)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Hide future positions, then normalise each row into attention weights.
        hidden = self.tril[:T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (B,T,T) @ (B,T,hs) -> (B,T,hs)
        values = self.value(x)
        return weights @ values


class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then mixed by a linear projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Each head yields (B,T,head_size); concatenating on the last axis gives
        # (B,T,num_heads*head_size) with the heads laid out one after another.
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        projected = self.proj(concatenated)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

    
class Block(nn.Module):
    """Transformer block: self-attention (communication) followed by an MLP (computation).

    Each sub-layer is wrapped in a residual connection, and the layer norm is
    applied to the residual sum.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Split the embedding width evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = self.ln1(x + self.sa(x))
        x = self.ln2(x + self.ffwd(x))
        return x


class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final layer norm, then projected to
    vocabulary logits.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned embedding per position in the context window.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Decoder blocks applied sequentially.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are supplied, the cross-entropy loss.

        Args:
            index: (B,T) tensor of token ids.
            targets: optional (B,T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B,T,vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the cross-entropy against the
            flattened targets.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        # Build the position ids on the input's device rather than the global
        # `device`, so the model works wherever its inputs and weights live.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C), position embedding broadcast over the batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively, starting from ``index``.

        Runs without gradient tracking and with the model temporarily in eval
        mode so dropout does not perturb the predicted distribution; the
        previous train/eval mode is restored before returning.

        Args:
            index: (B,T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop to the last block_size tokens; the position table has no more entries.
                index_cond = index[:, -block_size:]
                # Call the module (not .forward) so registered hooks still run.
                logits, _ = self(index_cond)
                # Only the last time step predicts the next token.
                logits = logits[:, -1, :]  # (B,C)
                probs = F.softmax(logits, dim=-1)  # (B,C)
                index_next = torch.multinomial(probs, num_samples=1)  # (B,1)
                index = torch.cat((index, index_next), dim=1)  # (B,T+1)
        finally:
            self.train(was_training)
        return index

In [56]:
@torch.no_grad()
def estimate_loss():
    """Average the loss over ``eval_iters`` random batches for each split.

    Switches the global ``model`` to eval mode for the measurement and back to
    train mode afterwards. Returns a dict with keys "train" and "val" whose
    values are scalar tensors holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ("train", "val"):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [57]:
# Start from freshly initialised weights; a saved checkpoint replaces them if it loads.
model = GPTLanguageModel()
try:
    # NOTE(security): pickle.load can execute arbitrary code while unpickling.
    # Only load checkpoint files that you created yourself.
    with open("model-172619723.pkl", "rb") as f:
        model = pickle.load(f)
    print("Model loaded")
except FileNotFoundError:
    print("No model found, initializing random weights")
except Exception as err:
    # Still best-effort, but report why the checkpoint was rejected instead of
    # presenting every failure as a missing file. KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    print(f"Could not load model ({err!r}), initializing random weights")
m = model.to(device)



No model found, initializing random weights


In [58]:
# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# training loop
# The counter is named `step` rather than `iter` so the builtin iter() is not
# shadowed at notebook scope for every later cell.
for step in range(max_iters):
    # Every eval_iters steps, report the averaged train/val loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"iter: {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss; calling the module (not .forward) keeps hooks working
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Save the whole model object under a timestamped name. The loading cell opens
# a fixed filename, so it must be pointed at this file to resume from it.
with open(f"model-{int(time.time())}.pkl", "wb") as f:
    pickle.dump(model, f)

iter: 0, train loss: 10.3933, val loss: 10.4012
iter: 200, train loss: 2.3363, val loss: 2.4187
iter: 400, train loss: 2.1721, val loss: 2.2065
iter: 600, train loss: 2.0859, val loss: 2.0737
iter: 800, train loss: 1.9941, val loss: 1.9889
iter: 1000, train loss: 1.9175, val loss: 1.9968
iter: 1200, train loss: 1.9544, val loss: 1.8640
iter: 1400, train loss: 1.8932, val loss: 1.8678
iter: 1600, train loss: 1.8095, val loss: 1.8384
iter: 1800, train loss: 1.8454, val loss: 1.7665
iter: 2000, train loss: 1.7794, val loss: 1.8474
iter: 2200, train loss: 1.7786, val loss: 1.7869
iter: 2400, train loss: 1.7311, val loss: 1.7628
iter: 2600, train loss: 1.7144, val loss: 1.7614
iter: 2800, train loss: 1.6747, val loss: 1.7568
iter: 3000, train loss: 1.7334, val loss: 1.7528
iter: 3200, train loss: 1.6994, val loss: 1.6723
iter: 3400, train loss: 1.7050, val loss: 1.6782
iter: 3600, train loss: 1.6423, val loss: 1.6684
iter: 3800, train loss: 1.6576, val loss: 1.6491
iter: 4000, train loss: 1

In [59]:
# Sample text from the model, seeding it with a single token of id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=2000)
first_sequence = sampled_ids[0].tolist()
print(decode(first_sequence))

                                                         been on the Exlipels courcelle domicinuees intextres. FTAs" Elliconition to namioration.

Octs, all in considlerable than scan of the can't his tany to during comprhabic shemerce with occortune fall_ine a in the Berrinance, you Bovans to Neww "purms remocrimity

Kulkic Indar Surroficen, cansis of logor that carce dewlere astant rewly we're ring arealy all ond stave a turns he pallies, virous subjections.) For In the enry mail professiorsed to and that means an was and that openting on him them solucing als as his contailed by't host unters confugues a mainting normages to a not, who cortil prominity, bet cade book recorouilors. We that is countions a secreation out we aware acceptive use, lequesting.

But of can it. [that miLy, nor spoce two reank want I chave to dose the like through interristial they Jedng Eurt, 0                                                                                                                    