In [1]:
# Load the whole training corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [2]:
# Report the corpus size in characters (length of the string read from input.txt).
print("Length of input in characters:", len(text))

Length of input in characters: 805579


In [3]:
# Peek at the first 1000 characters to sanity-check what was loaded.
print(text[:1000])

Sing, O goddess, the anger of Achilles son of Peleus, that brought
countless ills upon the Achaeans. Many a brave soul did it send hurrying
down to Hades, and many a hero did it yield a prey to dogs and vultures,
for so were the counsels of Jove fulfilled from the day on which the
son of Atreus, king of men, and great Achilles, first fell out with
one another. 

And which of the gods was it that set them on to quarrel? It was the
son of Jove and Leto; for he was angry with the king and sent a pestilence
upon the host to plague the people, because the son of Atreus had
dishonoured Chryses his priest. Now Chryses had come to the ships
of the Achaeans to free his daughter, and had brought with him a great
ransom: moreover he bore in his hand the sceptre of Apollo wreathed
with a suppliant's wreath and he besought the Achaeans, but most of
all the two sons of Atreus, who were their chiefs. 

"Sons of Atreus," he cried, "and all other Achaeans, may the gods
who dwell in Olympus grant you to

In [4]:
# Character-level vocabulary: every distinct character that occurs in the
# corpus, sorted so each character has a stable position in the list.
chars = sorted(set(text))
vocabulary_size = len(chars)
print("No. of unique characters:", vocabulary_size)
print("".join(chars))

No. of unique characters: 66

 !"&'(),-.:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [7]:
# Tokenization
# Single-character tokenization: 1 character is 1 token.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encoder: turn a string into the list of integer ids of its characters.

    Args:
        s: text made only of characters present in `chars`.

    Returns:
        A list with one integer id per character of `s`, in order.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn an iterable of integer ids back into a string.

    Args:
        l: iterable of ids, each of which must be a key of `itos`.

    Returns:
        The string formed by joining the character of every id, in order.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding the encoded text must give the text back.
print(encode("My name is Shreyas Deo"))
print(decode(encode("My name is Shreyas Deo")))

[26, 64, 1, 53, 40, 52, 44, 1, 48, 58, 1, 32, 47, 57, 44, 64, 40, 58, 1, 17, 44, 54]
My name is Shreyas Deo


In [8]:
import torch 
# Encode the entire corpus and keep it as a tensor of token ids.
# torch.long is the 64-bit integer dtype, which is why torch.int64 is printed.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Show only the first 1000 ids rather than the whole tensor.
print(data[:1000])

torch.Size([805579]) torch.int64
tensor([32, 48, 53, 46,  8,  1, 28,  1, 46, 54, 43, 43, 44, 58, 58,  8,  1, 59,
        47, 44,  1, 40, 53, 46, 44, 57,  1, 54, 45,  1, 14, 42, 47, 48, 51, 51,
        44, 58,  1, 58, 54, 53,  1, 54, 45,  1, 29, 44, 51, 44, 60, 58,  8,  1,
        59, 47, 40, 59,  1, 41, 57, 54, 60, 46, 47, 59,  0, 42, 54, 60, 53, 59,
        51, 44, 58, 58,  1, 48, 51, 51, 58,  1, 60, 55, 54, 53,  1, 59, 47, 44,
         1, 14, 42, 47, 40, 44, 40, 53, 58, 10,  1, 26, 40, 53, 64,  1, 40,  1,
        41, 57, 40, 61, 44,  1, 58, 54, 60, 51,  1, 43, 48, 43,  1, 48, 59,  1,
        58, 44, 53, 43,  1, 47, 60, 57, 57, 64, 48, 53, 46,  0, 43, 54, 62, 53,
         1, 59, 54,  1, 21, 40, 43, 44, 58,  8,  1, 40, 53, 43,  1, 52, 40, 53,
        64,  1, 40,  1, 47, 44, 57, 54,  1, 43, 48, 43,  1, 48, 59,  1, 64, 48,
        44, 51, 43,  1, 40,  1, 55, 57, 44, 64,  1, 59, 54,  1, 43, 54, 46, 58,
         1, 40, 53, 43,  1, 61, 60, 51, 59, 60, 57, 44, 58,  8,  0, 45, 54, 57,
       

In [9]:
# Separate dataset to train and validation splits 
# The leading 90% of the token stream is the training set; the remaining
# tail is held out for validation.
n = int(0.9 * len(data))  # index of the first validation token
training_data, validation_data = data[:n], data[n:]

In [12]:
# Show chunk_size + 1 consecutive training tokens: one more than the context
# length, since batch targets are the input window shifted by one position.
chunk_size = 8 
training_data[:chunk_size+1]

tensor([32, 48, 53, 46,  8,  1, 28,  1, 46])

In [15]:
# Seed PyTorch's global RNG so the randomly sampled batch below is repeatable.
torch.manual_seed(1234)
batch_size = 4 # how many chunks to process together
chunk_size = 8 # maximum context length for predictions

def get_batch(split):
    """Sample a random batch of input chunks and their next-token targets.

    Reads the module-level `training_data`, `validation_data`, `batch_size`
    and `chunk_size`.

    Args:
        split: 'train' selects `training_data`; any other value selects
            `validation_data`.

    Returns:
        A tuple (x, y) of stacked tensors, each with `batch_size` rows of
        `chunk_size` tokens. Row k of y is row k of x shifted one position
        to the right in the source sequence.
    """
    source = training_data if split == 'train' else validation_data
    # One random start offset per row. randint's upper bound is exclusive,
    # so start + chunk_size + 1 never runs past the end of `source`.
    starts = torch.randint(len(source) - chunk_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + chunk_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and display its inputs and targets.
xb, yb = get_batch('train')
for label, batch in (("Inputs: ", xb), ("Targets: ", yb)):
    print(label)
    print(batch.shape)
    print(batch)
print("---------------------------")

# Each row yields chunk_size (context, target) pairs: the context is the row's
# first t+1 input tokens and the target is the token at position t of the
# matching target row.
for b in range(batch_size):
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(chunk_size):
        context = inputs_row[:t+1]
        target = targets_row[t]
        print(f"Input is {context.tolist()} then target is {target}")

Inputs: 
torch.Size([4, 8])
tensor([[40, 59, 44, 58, 59, 12,  1, 48],
        [62, 40, 48, 59, 48, 53, 46,  1],
        [54, 53, 59, 44, 57, 64,  0, 53],
        [52, 44, 43,  1, 59, 54,  1, 41]])
Targets: 
torch.Size([4, 8])
tensor([[59, 44, 58, 59, 12,  1, 48, 45],
        [40, 48, 59, 48, 53, 46,  1, 45],
        [53, 59, 44, 57, 64,  0, 53, 44],
        [44, 43,  1, 59, 54,  1, 41, 60]])
---------------------------
Input is [40] then target is 59
Input is [40, 59] then target is 44
Input is [40, 59, 44] then target is 58
Input is [40, 59, 44, 58] then target is 59
Input is [40, 59, 44, 58, 59] then target is 12
Input is [40, 59, 44, 58, 59, 12] then target is 1
Input is [40, 59, 44, 58, 59, 12, 1] then target is 48
Input is [40, 59, 44, 58, 59, 12, 1, 48] then target is 45
Input is [62] then target is 40
Input is [62, 40] then target is 48
Input is [62, 40, 48] then target is 59
Input is [62, 40, 48, 59] then target is 48
Input is [62, 40, 48, 59, 48] then target is 53
Input is [62

In [16]:
# xb is the batch of input token ids returned by get_batch('train') above.
print(xb) # input to the transformer

tensor([[40, 59, 44, 58, 59, 12,  1, 48],
        [62, 40, 48, 59, 48, 53, 46,  1],
        [54, 53, 59, 44, 57, 64,  0, 53],
        [52, 44, 43,  1, 59, 54,  1, 41]])


In [None]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
# Re-seed PyTorch's global RNG with the same seed used by the batching cell.
torch.manual_seed(1234)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocabulary_size):
        super().__init__()
        self.token_embedding_table = nn.embedding(vocabulary_size, vocabulary_size)