In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

In [98]:
# Read the whole training corpus into memory as a single string.
# The context manager closes the file handle as soon as the read finishes, and the
# explicit encoding keeps the resulting character set independent of the platform's
# locale default.
with open('input.txt', encoding='utf-8') as f:
    data = f.read()

In [99]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = list(set(data))
chars.sort()

In [100]:
# Let's create a simple character-level tokenizer: every character in `chars`
# gets the integer id equal to its position in that list.

atoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itoa = dict(enumerate(chars))                 # id -> character


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises:
        KeyError: if `s` contains a character that is not a key of `atoi`.
    """
    return [atoi[ch] for ch in s]


def decode(lst):
    """Convert an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id is not a key of `itoa`.
    """
    return ''.join(itoa[i] for i in lst)

In [101]:
# Sanity check: turn a short string into its list of token ids.
encode('hello world')

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]

In [102]:
# Round-trip check: decoding these ids gives back the text they were encoded from.
decode([46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42])

'hello world'

In [103]:
# Let's create the train and validation data sets: encode the full corpus,
# keep the first 90% for training and hold out the rest for validation.
data_encoded = encode(data)
split_at = int(len(data)*.9)  # index of the first validation token
data_train = torch.tensor(data_encoded[:split_at])
data_valid = torch.tensor(data_encoded[split_at:])

# total, train size, valid size, train + valid (the last matches the total)
n_train, n_valid = len(data_train), len(data_valid)
print(len(data_encoded), n_train, n_valid, n_train + n_valid)


1115394 1003854 111540 1115394


In [104]:
# Vocabulary size: the number of distinct characters found in the corpus.
# The bigram model uses it for both dimensions of its embedding table.

vocab_size = len(chars)
vocab_size

65

In [105]:
# Pluck one sample from the start of the training data.
# context_size is the length of each training context; the slice takes one extra
# token so that every position of the context has a next-token target.
context_size = 32
sample = data_train[:context_size+1]

In [106]:
# Show the raw sample, then every (context, next token) pair it contains:
# the first `end` tokens are the context and the token at position `end` is the target.
print(sample)
for end in range(1, context_size + 1):
    print(sample[:end], sample[end])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1])
tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) tensor(64)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]) tensor(43)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]) tensor(52)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]) tensor(10)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10]) tensor(0)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0]) tensor(14)
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14]) 

In [107]:
# So each pluck of context_size+1 tokens gives context_size (context, target) samples.

# Let's create a batch dimension: batch_size is the number of plucks per batch.

batch_size = 12

def get_sample():
    """Build one batch of explicit (context, target) training examples.

    For each of the batch_size plucks, a random window of context_size + 1
    tokens is cut out of data_train and expanded into context_size examples
    with growing contexts.

    Returns:
        A flat list of entries ``[batch_index, [context, target]]`` where
        ``context`` is the first k tokens of the window and ``target`` is the
        token right after them, for k = 1 .. context_size.
    """
    examples = []
    # random.randint includes both end points, so this is the last offset whose
    # window still fits inside data_train.
    last_start = len(data_train) - context_size - 1
    for batch_idx in range(batch_size):
        offset = random.randint(0, last_start)
        window = data_train[offset:offset + context_size + 1]
        examples.extend(
            [batch_idx, [window[:k], window[k]]]
            for k in range(1, context_size + 1)
        )
    return examples

# Preview the generated [batch index, [context, target]] entries.
get_sample()

[[0, [tensor([46]), tensor(47)]],
 [0, [tensor([46, 47]), tensor(57)]],
 [0, [tensor([46, 47, 57]), tensor(1)]],
 [0, [tensor([46, 47, 57,  1]), tensor(54)]],
 [0, [tensor([46, 47, 57,  1, 54]), tensor(53)]],
 [0, [tensor([46, 47, 57,  1, 54, 53]), tensor(61)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61]), tensor(43)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61, 43]), tensor(56)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56]), tensor(0)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0]), tensor(32)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32]), tensor(53)]],
 [0, [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32, 53]), tensor(1)]],
 [0,
  [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32, 53,  1]), tensor(55)]],
 [0,
  [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32, 53,  1, 55]),
   tensor(59)]],
 [0,
  [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32, 53,  1, 55, 59]),
   tensor(39)]],
 [0,
  [tensor([46, 47, 57,  1, 54, 53, 61, 43, 56,  0, 32, 5

In [108]:
# Quick check of torch.stack: two length-3 vectors become one 2x3 tensor.
torch.stack([torch.tensor([1,2,3]),torch.tensor([1,2,3])])

tensor([[1, 2, 3],
        [1, 2, 3]])

In [109]:

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects data_train; any other value selects data_valid.

    Returns:
        A pair ``(inputs, targets)`` of tensors, each of shape
        (batch_size, context_size). ``targets`` is ``inputs`` shifted by one
        position, so ``targets[b, t]`` is the token following ``inputs[b, t]``.

    Raises:
        ValueError: if the selected split has fewer than context_size + 1 tokens.
    """
    data = data_train if split == 'train' else data_valid

    # Bound the start offsets by the split actually being sampled. Sizing them by
    # data_train would send most validation offsets past the end of data_valid.
    # torch.randint excludes its upper bound, so this is one past the last
    # offset whose context_size + 1 window still fits.
    n_starts = len(data) - context_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} tokens; "
            f"need at least {context_size + 1}"
        )

    starts = torch.randint(0, n_starts, (batch_size,)).tolist()
    inputs = torch.stack([data[s:s + context_size] for s in starts])
    # Targets are the same windows moved one token to the right.
    targets = torch.stack([data[s + 1:s + context_size + 1] for s in starts])
    return inputs, targets

# Draw one training batch and print every input window next to its target window.
source, targets = get_batch('train')

for row in range(source.shape[0]):
    s, t = source[row], targets[row]
    print(f"{decode(s.tolist())}->{decode(t.tolist())}")

him. If it be honest you have sp->im. If it be honest you have spo
e mad:
Day, night, hour, tide, t-> mad:
Day, night, hour, tide, ti
se, so long
I daily vow to use i->e, so long
I daily vow to use it
e bid me give you, sir:
Hie you,-> bid me give you, sir:
Hie you, 
alour puts well forth: pray, fol->lour puts well forth: pray, foll
 art most ignorant by age,
Or th->art most ignorant by age,
Or tho
'd!
How fain, like Pilate, would->d!
How fain, like Pilate, would 
 be gone.

ROMEO:
Give me a torc->be gone.

ROMEO:
Give me a torch
 music at the close,
As the last->music at the close,
As the last 
am, if't please the queen to sen->m, if't please the queen to send
pinion, they will to't then.
If ->inion, they will to't then.
If y
rivately twice or thrice a day, ->ivately twice or thrice a day, e


In [110]:
# Stacking a length-3 vector with itself adds a leading dimension of size 2.
a = torch.tensor([1,2,3])
a = torch.stack([a,a])
a

tensor([[1, 2, 3],
        [1, 2, 3]])

In [111]:
#now lets create a model

class BigramModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; looking up
    token id i returns the logits over the token that follows i.
    """

    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            x: integer tensor of token ids; the training loop passes shape (B, T).
            targets: optional integer tensor of token ids with the same shape as x.

        Returns:
            ``(logits, loss)``. ``logits`` has the shape of ``x`` plus a trailing
            class dimension. ``loss`` is the cross-entropy over all positions,
            or None when ``targets`` is None.
        """
        logits = self.embedding_table(x)
        loss = None
        if targets is not None:
            # cross_entropy wants (N, C) logits and (N,) targets, so fold every
            # leading dimension into N. reshape (unlike view) also accepts
            # non-contiguous tensors, e.g. transposed targets.
            n_classes = logits.size(-1)
            loss = F.cross_entropy(logits.reshape(-1, n_classes), targets.reshape(-1))

        return logits, loss

    # Called with parentheses: the bare `@torch.no_grad` form is only accepted by
    # newer PyTorch releases.
    @torch.no_grad()
    def generate(self, input, max_tokens=2):
        """Extend each sequence in ``input`` by ``max_tokens`` sampled tokens.

        Args:
            input: integer tensor of token ids with shape (B, T).
            max_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_tokens): the original tokens
            followed by the sampled continuation.
        """
        for _ in range(max_tokens):
            logits, _loss = self(input)
            logits = logits[:, -1, :]  # only the last position predicts the next token
            probs = F.softmax(logits, dim=-1)  # logits -> sampling distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # shape (B, 1)
            input = torch.cat((input, idx_next), dim=1)
        return input

In [112]:
# Instantiate the model and sample from it before any training step has run.
x = BigramModel()

# Start generation from a single token with id 0: one sequence of length one.
# NOTE(review): the name `input` shadows Python's builtin input(); later cells reuse it.
input = torch.zeros(1,1, dtype = torch.long)

# Append 200 sampled tokens and decode the single sequence back to text.
generated = x.generate(input,max_tokens=200)
decode(generated[0].tolist())

"\ntElnLvgd!?$pLJC;TOxNul ccIgsxDv!mcOsu3?NBl:PVh qMXFPNE:Vj&lc?heMjRUIE:Tx r$E3Qar-;:Qf,zKv!LWoXeKLcoW,BajTsgH3?Ps\nY-CPNIFxZ\nX'OjphNJ &oFdU3?liHfDcoWvS:pMPmkywwOM\nxPNpthMeGOUxigko, !FyJHSNFZeUltxTuDYfeN"

In [115]:
# Train the bigram model with AdamW on random training batches.
optimizer = torch.optim.AdamW(params=x.parameters(), lr=0.001)

for steps in range(10_000):
    # Clear the gradients left over from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    xb, yb = get_batch('train')
    logits, loss = x(xb, yb)

    loss.backward()
    optimizer.step()

# Loss of the final minibatch only, not an average over the run.
print(loss.item())


2.5600171089172363


In [116]:
# Sample 200 tokens again from the same start token, now with the trained weights.
generated = x.generate(input,max_tokens=200)
print(decode(generated[0].tolist()))


NUS:
Thecongh GENI:
RD ve t'Bushiu sinceer inoror
Th ashe, l grivught Dund!
ERYo lsofa'LABore hld d:
ugorfer, hacaisow, e ll wonfovea as
Tharodinecere'shrobiee qus, tand:
CHat be.

fo h an.
Ayo ayo me
