In [1]:
import torch
import torch.nn as nn
from torch.nn import  functional as F
from sklearn.model_selection import train_test_split

In [2]:
oliver_twist = open('/kaggle/input/dickens/dickens/pg730.txt', 'r').read()

In [3]:
#getting all the characters, like in the n-gram model 
chars = sorted(list(set(oliver_twist)))
vocab_size = len(chars)

In [4]:
#a simple tokenizer
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}

encode = lambda s: [stoi[ch] for ch in s] #tokenize some characters
decode = lambda i: ' '.join([itos[num] for num in i]) #detokenize some integers

In [5]:
#tokenizing the entire data set
enc = torch.tensor(encode(oliver_twist), dtype=torch.long)
enc[:20]

tensor([84, 48, 65, 62,  1, 44, 75, 72, 67, 62, 60, 77,  1, 35, 78, 77, 62, 71,
        59, 62])

In [6]:
#splitting the text into train, test portions
train, test = train_test_split(enc, test_size = 0.2)

When the model is trained on a data set, it essentially samples random chunks of the set and processes them one by one, instead of taking the entire set all at once. That would be computationally expensive and unreasonable. Thus, the procedure is defined below.

In [7]:
#the length of the sampled block
block_size = 8

#an example sample from the training data
x = train[:block_size]
y = train[1:block_size+1] #the targets for x (x offset by 1)

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    
    print(f'for: {context} the target is: {target}')

for: tensor([63]) the target is: 7
for: tensor([63,  7]) the target is: 0
for: tensor([63,  7,  0]) the target is: 1
for: tensor([63,  7,  0,  1]) the target is: 73
for: tensor([63,  7,  0,  1, 73]) the target is: 63
for: tensor([63,  7,  0,  1, 73, 63]) the target is: 72
for: tensor([63,  7,  0,  1, 73, 63, 72]) the target is: 0
for: tensor([63,  7,  0,  1, 73, 63, 72,  0]) the target is: 65


However, there is another dimension to care about -- the batch size, the number of sequences (samples) to be processed simultaneously. 

In [8]:
batch_size = 4 #the number of sequences to be processed at the same time 

#a function which returns the 4 random sequences of context length 8
def get_batch(split):
    data = train if split == 'train' else test
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix]) #stacking the 1D tensors as rows into a matrix
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) #same thing for ys
    
    return x, y

xt, yt = get_batch('train')

In [9]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        #creating a lookup table, just like in makemore #1
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        #idx, targets -- (B,T) tensors of integers                                 #Time = block_size
        logits = self.token_embedding_table(idx) # (a tensor of dimensions (Batch x Time x Channel))
        #logits -- the logs of probabilities of the characters being the next ones after some other characters
        
        if targets is None:
            loss = None
        else:
        
            #reshaping the tensor becaus of torch specifics 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)

            targets = targets.view(B*T)

            #cross entropy = negative log likelihood
            loss = F.cross_entropy(logits, targets)

        return logits, loss
                      #idx - the current context of some characters in the current batch 
    def generate(self, idx, max_new_tokens):
        #idx - a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #getting the predictions 
            logits, loss = self(idx)
            #focus only on the last time step (pluck out the last value in the Time dimension, pytorch notation)
            logits = logits[:, -1, :] #transforms into (B, C)
            #apply the softmaax activation to probabilities
            probs = F.softmax(logits, dim=-1) #(B,C)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B,1)
            #append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)#
            
        return idx
    
bi = BigramModel(vocab_size)
logits, loss = bi(xt, yt)

#idx = a 1x1 tensor, meaning that batch=1, time=1; is a zero-tensor, since we start with the new line character,
#which is encoded as 0, a reasonable char to start with 
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(bi.generate(idx, max_new_tokens=100)[0].tolist()))


 q I r G b s O O - p $ % i r l l 4 ] V M V ( m $ [ ] U G 3 F i i I % i s # G s b 6 8 r - g V [ f ( V S g J 9 ] _ , m i 1 l $   
 / m s # . e ] @ q j P ] z n 2 C ' a 6 $ C h A [ v G x o y i ( Q h 5 r i


In [10]:
#creating a pytorch optimizer
optimizer = torch.optim.Adam(bi.parameters(), lr=0.01) #basically updates the parameters (weights, biases whatever) based on the gradients 

In [11]:
batch_size = 32
#training the model
for steps in range(10000):
    #sampling new data
    xt, yt = get_batch('train')
    #calculating the loss
    logits, loss = bi(xt, yt)
    #zeroing all the gradients from the previous step
    optimizer.zero_grad(set_to_none=True)
    #getting the gradients of all the parameters
    loss.backward()
    #using the gradients to update the parameters
    optimizer.step()
    
print(loss.item())

3.131145477294922


In [12]:
print(decode(bi.generate(idx, max_new_tokens=300)[0].tolist()))


 t e a f n w a   r   l , L t t t e y e 
 g g w l u t n t   o a t u t   ' C k y f f e e 
 y d i I l i r n a c r 
   d d a 
 p o k r   h n ,   e i h a   a t n d   t o v r w     i     a g H w p e   f g h i r     e s a   a y e 
 ! 
       s o y s g e h   v i r i , o v ; h f c , h a e y I u     e g v h m l s u r r h s   i i h   h s , s N t   ' n s - s e p e   e I h o c   i c e t r e s   I   g e h e h l   e   r   t   g g t   f , - d u   a       s e h f u u ' u   S a h o i   o l d n g d s a e     r     r t   a     g k t i   n e i r e o   n m i d N i n v , m   , s e   n e M t ' d o A   o v e t e i s r


There is progress, though, the text is still quite unreasonable. 