In [1]:
# reading the training data
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [2]:
# length of the training data i.e. how many characters in dataset
print(len(text))

1115394


In [3]:
# a small part of data
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [4]:
# creatig the character list
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)
print(chars)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# tokenizing the characters
itos = {i:char for i,char in enumerate(chars)}
stoi = {char:i for i,char in enumerate(chars)}

# encoder and decoder
encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join([itos[j] for j in i])

# a simple sample
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
# encoding all the training set
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:250])

  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [7]:
# train/val/test split
n1 = int(0.8*len(data))
n2 = int(0.9*len(data))
train_set = data[:n1]
val_set = data[n1:n2]
test_set = data[n2:]

In [8]:
# max block size or context length for an individual input
block_size = 8
train_set[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [9]:
x = train_set[:block_size]
y = train_set[1:block_size+1]

for i in range(block_size):
    context = x[:i+1]
    output = y[i]
    print(f'When context is {context}, output is {output}')

When context is tensor([18]), output is 47
When context is tensor([18, 47]), output is 56
When context is tensor([18, 47, 56]), output is 57
When context is tensor([18, 47, 56, 57]), output is 58
When context is tensor([18, 47, 56, 57, 58]), output is 1
When context is tensor([18, 47, 56, 57, 58,  1]), output is 15
When context is tensor([18, 47, 56, 57, 58,  1, 15]), output is 47
When context is tensor([18, 47, 56, 57, 58,  1, 15, 47]), output is 58


In [10]:
torch.manual_seed(31)
batch_size = 4
block_size = 8
def get_batch(data, batch_size=4, block_size=8): # 4 different chunks with context size of 8
    # specifying the dataset
    if data == 'train':
        data = train_set
    elif data =='val':
        data = val_set
    else:
        data = test_set
    
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print('--------------------------')

for b in range(batch_size):
    for i in range(block_size):
        context = xb[b, :i+1]
        output = yb[b, i]
        print(f'context{context.tolist()}, output {output}')

inputs:
torch.Size([4, 8])
tensor([[43,  1, 54, 39, 50, 43,  1, 51],
        [57, 39, 50, 47, 52, 43,  6,  1],
        [ 0, 20, 53, 61,  1, 61, 43, 50],
        [53,  1, 39, 57,  1, 58, 46, 53]])
targets:
torch.Size([4, 8])
tensor([[ 1, 54, 39, 50, 43,  1, 51, 53],
        [39, 50, 47, 52, 43,  6,  1, 51],
        [20, 53, 61,  1, 61, 43, 50, 50],
        [ 1, 39, 57,  1, 58, 46, 53, 59]])
--------------------------
context[43], output 1
context[43, 1], output 54
context[43, 1, 54], output 39
context[43, 1, 54, 39], output 50
context[43, 1, 54, 39, 50], output 43
context[43, 1, 54, 39, 50, 43], output 1
context[43, 1, 54, 39, 50, 43, 1], output 51
context[43, 1, 54, 39, 50, 43, 1, 51], output 53
context[57], output 39
context[57, 39], output 50
context[57, 39, 50], output 47
context[57, 39, 50, 47], output 52
context[57, 39, 50, 47, 52], output 43
context[57, 39, 50, 47, 52, 43], output 6
context[57, 39, 50, 47, 52, 43, 6], output 1
context[57, 39, 50, 47, 52, 43, 6, 1], output 51
cont

In [11]:
import torch.nn as nn
from torch.nn import functional as functional
torch.manual_seed(31)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token reads the logit for the next token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None: # to successfully run the generate func
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = functional.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_tokens):
        # idx is (B,T) array of indices
        for _ in range(max_tokens):
            # getting the predictions
            logits, loss = self(idx)
            # focusing on the last indices
            logits = logits[:, -1, :] # becomes (B,C)
            # calculating the probabilities
            probs = functional.softmax(logits, dim=1) # (B,C)
            # sampling from prob distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append the sampled index to the end of the context
            idx = torch.cat((idx, idx_next), dim=1) # becomes (B,T+1)
        return idx
            
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(logits)

print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor([[-0.7645,  1.1787, -0.6077,  ...,  0.5836, -0.3440, -1.0587],
        [ 0.3502, -0.6820,  0.1160,  ..., -1.9782, -1.5274,  2.5114],
        [-1.5073,  0.4108, -1.6051,  ..., -0.3714,  0.1212, -1.2650],
        ...,
        [-0.8370,  0.5484,  0.0139,  ..., -1.3409,  0.0160, -0.4040],
        [-1.2525,  0.9259,  0.0539,  ..., -0.2007,  1.1198,  0.4097],
        [-0.5397, -1.2057, -0.0312,  ..., -0.5900, -0.2815,  0.8179]],
       grad_fn=<ViewBackward0>)

ldvqup;leglxvhOj,tOncBNr'iXXSSStTMHdt?NlIG'fOTKZEFFTWont-Bj.IO,cVquR!RVk?N
lTm..vTqPP3!'QqiJfbb!'rpE


In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # setting the adam optimizer

In [13]:
batch_size = 32

for steps in range(10000):
    # getting the batch
    xb, yb = get_batch('train')
    
    # loss and backprop
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.604684352874756


In [14]:
print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_tokens=100)[0].tolist()))


Thyourthiu l I way ss.
Bmofouth:
qO: bD$MEORBGLE:NETHTDDWarab oughay backWhavirsu y ulor th ssH'd'FK


In [16]:
torch.manual_seed(31232)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
print(x.shape)

torch.Size([4, 8, 2])


In [17]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b,t] = torch.mean(xprev, 0)

xbow2 = torch.zeros((B, T, C))
a = torch.tril(torch.ones(T,T))
a = a / torch.tril(torch.ones(T,T)).sum(1, keepdim=True)
xbow2 = a @ x

torch.allclose(xbow,xbow2)

True

In [18]:
xbow2 = torch.zeros((B, T, C))
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = functional.softmax(wei, dim=1)
xbow3 = wei @ x

In [20]:
a = torch.randint(0,10,(4,8,32))
b = torch.randint(0,10,(8,32))
(a+b).shape

torch.Size([4, 8, 32])

In [21]:
b

tensor([[8, 5, 5, 2, 5, 5, 6, 4, 9, 2, 8, 0, 6, 3, 2, 7, 2, 6, 9, 8, 8, 9, 5, 9,
         7, 3, 6, 5, 6, 8, 8, 4],
        [2, 0, 3, 1, 1, 4, 9, 5, 7, 5, 1, 6, 5, 6, 8, 2, 1, 3, 3, 0, 8, 5, 0, 3,
         4, 3, 6, 3, 2, 7, 8, 1],
        [2, 5, 5, 4, 2, 6, 4, 9, 0, 6, 9, 2, 8, 4, 9, 6, 4, 7, 6, 6, 9, 4, 1, 6,
         4, 7, 5, 7, 4, 5, 1, 3],
        [3, 5, 0, 2, 6, 7, 8, 7, 9, 3, 9, 2, 9, 0, 9, 5, 8, 9, 1, 3, 6, 0, 8, 6,
         9, 1, 6, 2, 0, 9, 5, 6],
        [8, 9, 1, 2, 0, 6, 1, 0, 7, 0, 0, 0, 0, 5, 7, 5, 1, 1, 2, 2, 6, 3, 5, 9,
         8, 6, 7, 8, 7, 1, 7, 9],
        [4, 0, 9, 2, 8, 3, 6, 0, 2, 8, 1, 7, 9, 2, 9, 2, 9, 6, 1, 8, 6, 2, 0, 8,
         1, 3, 8, 2, 3, 1, 9, 1],
        [2, 0, 0, 1, 1, 5, 2, 6, 5, 0, 6, 8, 6, 0, 1, 6, 9, 2, 2, 2, 5, 7, 7, 5,
         1, 6, 0, 5, 0, 2, 0, 9],
        [6, 4, 2, 7, 2, 0, 8, 9, 5, 4, 1, 5, 4, 2, 0, 0, 6, 3, 5, 1, 8, 9, 9, 7,
         3, 5, 1, 8, 6, 9, 3, 0]])

In [80]:
print(xbow[0])
print(xbow2[0])

tensor([[-0.2340,  1.3857],
        [-0.5039, -0.1627],
        [-0.3504, -0.5292],
        [-0.3257, -0.4636],
        [-0.1988, -0.3157],
        [-0.3342, -0.4475],
        [-0.2120, -0.4250],
        [-0.3504, -0.2801]])
tensor([[-0.2340,  1.3857],
        [-0.5039, -0.1627],
        [-0.3504, -0.5292],
        [-0.3257, -0.4636],
        [-0.1988, -0.3157],
        [-0.3342, -0.4475],
        [-0.2120, -0.4250],
        [-0.3504, -0.2801]])


In [70]:
torch.manual_seed(31)
a = torch.tril(torch.ones(3,3))
a = a / torch.tril(torch.ones(3,3)).sum(1, keepdim=True)
b = torch.randint(0 , 10, (3,2)).float()
c = a @ b

print(f"{a}")
print(f"{b}")
print(f"{c}")

#torch.tril(torch.ones(3,3))

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[6., 5.],
        [0., 2.],
        [4., 1.]])
tensor([[6.0000, 5.0000],
        [3.0000, 3.5000],
        [3.3333, 2.6667]])


In [68]:
a = torch.tril(torch.ones(3,3))
a / torch.tril(torch.ones(3,3)).sum(1, keepdim=True)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])