In [1]:
import torch

In [46]:
with open('data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [47]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [68]:
# n_embd = 32
# n_head = 4
# n_layers = 4
# dropout = 0.2
# batch_size = 32
# block_size = 8
# learning_rate = 3e-4
n_embd = 384
n_head = 6
n_layers = 6
dropout = 0.2
batch_size = 64
block_size = 256
learning_rate = 3e-4

In [69]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  68196771


In [70]:
text = text[:1115394] # Andrej's shakespeare dataset was of this length

In [71]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [72]:
# let's look at the first 1000 characters
print(text[:1000])

"Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?

"
"Take it easy with me, please  
Touch me gently like a summer evening breeze  
Take your time, make it slow  
Andante, Andante  
Just let the feeling grow  
  
Make your fingers soft and light  
Let your body be the velvet of the night 

In [73]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"'(),-.0123456789:?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz
76


In [74]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i.item()] for i in l]) # decoder: take a list of integers, output a string

# print(encode("hii there"))
# print(decode(encode("hii there")))

In [75]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([1115394]) torch.int64
tensor([ 3, 33, 64, 64, 60,  1, 50, 69,  1, 57, 54, 67,  1, 55, 50, 52, 54,  7,
         1, 58, 69,  4, 68,  1, 50,  1, 72, 64, 63, 53, 54, 67, 55, 70, 61,  1,
        55, 50, 52, 54,  1,  1,  0, 22, 63, 53,  1, 58, 69,  1, 62, 54, 50, 63,
        68,  1, 68, 64, 62, 54, 69, 57, 58, 63, 56,  1, 68, 65, 54, 52, 58, 50,
        61,  1, 69, 64,  1, 62, 54,  1,  1,  0, 33, 64, 64, 60,  1, 50, 69,  1,
        69, 57, 54,  1, 72, 50, 74,  1, 69, 57, 50, 69,  1, 68, 57, 54,  1, 68,
        62, 58, 61, 54, 68,  1, 72, 57, 54, 63,  1, 68, 57, 54,  1, 68, 54, 54,
        68,  1, 62, 54,  1,  1,  0, 29, 64, 72,  1, 61, 70, 52, 60, 74,  1, 52,
        50, 63,  1, 64, 63, 54,  1, 55, 54, 61, 61, 64, 72,  1, 51, 54, 21,  1,
         1,  0,  1,  1,  0, 40, 57, 54,  4, 68,  1, 59, 70, 68, 69,  1, 62, 74,
         1, 60, 58, 63, 53,  1, 64, 55,  1, 56, 58, 67, 61,  7,  1, 68, 57, 54,
         1, 62, 50, 60, 54, 68,  1, 62, 54,  1, 55, 54, 54, 61,  1, 55, 58, 63,
      

In [76]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [77]:

# x = train_data[:block_size]
# y = train_data[1:block_size+1]
# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print(f"when input is {context} the target: {target}")

In [78]:
torch.manual_seed(1337)
# # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train')
# print('inputs:')
# print(xb.shape)
# print(xb)
# print('targets:')
# print(yb.shape)
# print(yb)

# print('----')

# for b in range(batch_size): # batch dimension
#     for t in range(block_size): # time dimension
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"when input is {context.tolist()} the target: {target}")

In [79]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(20834)

<torch._C.Generator at 0x18f6590a350>

In [80]:
class Head(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) #BTC
        q = self.query(x) #BTC
        #compute attention scores
        wei = q @ k.transpose(-2,-1) * C**-0.5 # BTC BCT BTT // C**-0.5 is multiplied to preserve the variance before softmax aka scaling the attention
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf')) #BTT
        wei = F.softmax(wei, dim=-1) #BTT
        #to avoid overfitting dropout
        wei = self.dropout(wei)
        #weighted aggregation of the values
        v = self.value(x)
        out = wei @ v
        return out

In [81]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

In [82]:
class FeedForward(nn.Module):
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd,4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd,n_embd),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

In [83]:
class Block(nn.Module):
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [84]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self):
        
        super().__init__()
        #embedding for the respective word
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        #embedding for the position of that respective word
        self.position_embedding_table = nn.Embedding(vocab_size, n_embd)
        #using attention blocks
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head = n_head) for _ in range(n_layers)]  
        )
        self.lnorm = nn.LayerNorm(n_embd)
        
        #self attention
#         self.sa_heads = MultiHeadAttention(4, n_embd//4)
        #feed-forward
#         self.ffwd = FeedForward(n_embd)

        #linear layer to get our logits
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets = None):
        B, T = idx.shape
        #idx and targets are both of B,T
        tok_embd = self.token_embedding_table(idx)
        pos_embd = self.position_embedding_table(torch.arange(T, device=device)) # T,C
        X = tok_embd + pos_embd # B,T,C
        X = self.blocks(X)
        X = self.lnorm(X)
#         X = self.sa_heads(X)
#         X = self.ffwd(X)
        logits = self.lm_head(X) # B,T,vocab_size
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
        #         print(logits.shape)
            logits = logits.view(B*T, C)
        #         print(targets.shape)
            targets = targets.view(B*T)
        #         print(logits.shape)
        #         print(targets.shape)

            loss = F.cross_entropy(logits, targets)
        
        return logits,loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            cond_idx = idx[:, -block_size:]
            #get the predictions
            logits, loss = self(cond_idx)
            #focus on the last time step
            logits = logits[:,-1,:]
            #softmax
            probs = F.softmax(logits, dim=-1)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            #append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
#             print(idx)
        return idx
            

In [85]:
model = BigramLanguageModel()
model = model.to(device)
logits, loss = model(xb, yb)
loss

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [86]:
yb.shape

torch.Size([64, 256])

In [66]:
# pytorch optimizer Adamw
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [67]:

for steps in range(10000):
    
    #get a batch
    xb, yb = get_batch('train')
    
    #calculate the loss and logits
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    
    #backprop
    loss.backward()
    
    #optimize
    optimizer.step()
    if steps%500 == 0:
        print({f'Epoch : {steps+1} Loss : {loss.item}'})
    
print(loss.item())

{'Epoch : 1 Loss : <built-in method item of Tensor object at 0x0000018FC52FEE70>'}


KeyboardInterrupt: 

In [None]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype = torch.long, device=device), max_new_tokens = 100)[0]))