In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

with open('input.txt','r') as f:
    input=f.read()

In [3]:
chars=sorted(list(set(input)))
vocab_size=len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda s: ''.join([itos[i] for i in s])

print(encode("Hi there"))
print(decode(encode("Heyyy!")))

[20, 47, 1, 58, 46, 43, 56, 43]
Heyyy!


In [17]:
data=torch.tensor(encode(input), dtype=torch.long)
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

In [16]:
block_size=8
x=train_data[:block_size]
y=train_data[1:block_size+1]
for t in range(block_size):
    context=x[:t+1]
    target=y[t]
    print(context,target)

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [18]:
torch.manual_seed(42)
batch_size=4
block_size=8

def get_batch(split):
    data=train_data if split=="train" else val_data
    ix=torch.randint(len(data)-block_size,(batch_size,))
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+1+block_size] for i in ix])

    return x,y

xb,yb=get_batch('train')

In [19]:
xb,yb

(tensor([[57,  1, 46, 47, 57,  1, 50, 53],
         [ 1, 58, 46, 43, 56, 43,  1, 41],
         [17, 26, 15, 17, 10,  0, 32, 53],
         [57, 58,  6,  1, 61, 47, 58, 46]]),
 tensor([[ 1, 46, 47, 57,  1, 50, 53, 60],
         [58, 46, 43, 56, 43,  1, 41, 39],
         [26, 15, 17, 10,  0, 32, 53,  1],
         [58,  6,  1, 61, 47, 58, 46,  0]]))

In [48]:
embedding_dim=65
# Embedding dim has to be = vocab size as each alphabet is embedded in 1 dim

In [59]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table=nn.Embedding(vocab_size,embedding_dim)
    def forward(self,idx,targets=None):
        logits=self.token_embedding_table(idx)
        if targets is None:
            loss=None
        else:
            B,T,C=logits.shape # Batch size, timestamps, embedding size(channel size)
            logits=logits.view(B*T,C)
            targets=targets.view(B*T)
            loss=F.cross_entropy(logits,targets)
        return logits, loss
    
    def generate(self, idx,max_new_tokens):

        for _ in range(max_new_tokens):
            logits,_=self(idx)
            # Focus only on the last token.
            logits=logits[:,-1,:]
            # Apply softmax to get probs on the embedding dim
            probs=F.softmax(logits,dim=-1)
            # Sample from distribution of embeddings to generate the next token.
            # This is what creates the randomness
            idx_next=torch.multinomial(probs,num_samples=1)
            # Append sampled index to the running sequence
            idx=torch.cat((idx,idx_next),dim=1)
        return idx
    
m=BigramLanguageModel(vocab_size)
out,loss=m(xb,yb)

In [60]:
loss

tensor(4.4131, grad_fn=<NllLossBackward0>)

In [69]:
decode(list(m.generate(torch.tensor(torch.zeros(1,1), dtype=torch.long),100).numpy()[0]))

  decode(list(m.generate(torch.tensor(torch.zeros(1,1), dtype=torch.long),100).numpy()[0]))


"\nh?',w;3cnImBqmJW'IcnsM,oRp:wXm;UsNz;jCtpIcEGpKyCAL-3?Y -EJleDjH;Kmzo$QTenDehAQy-GXJOJJj3wsvl&qCsLf3s"