In [95]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [65]:
# Read the entire corpus into memory as a single string. The path is relative
# to the notebook's working directory.
with open("./mahabharata.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Sanity check: total number of characters and a preview of the opening text.
print("Length of dataset: ", len(text))
print("First 1000 characters: \n\n"+text[:1000])

Length of dataset:  3639580
First 1000 characters: 

THE MAHABHARATA

BOOK ONE

SECTION I
Om! Having bowed down to Narayana and Nara, the most exalted male being, and also to the goddess Saraswati, must the word Jaya be uttered.

Ugrasrava, the son of Lomaharshana, surnamed Sauti, well-versed in the Puranas, bending with humility, one day approached the great sages of rigid vows, sitting at their ease, who had attended the twelve years’ sacrifice of Saunaka, surnamed Kulapati, in the forest of Naimisha. Those ascetics, wishing to hear his wonderful narrations, presently began to address him who had thus arrived at that recluse abode of the inhabitants of the forest of Naimisha. Having been entertained with due respect by those holy men, he saluted those Munis (sages) with joined palms, even all of them, and inquired about the progress of their asceticism. Then all the ascetics being again seated, the son of Lomaharshana humbly occupied the seat that was assigned to him. Seeing that he 

In [67]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(text))
print("".join(chars))
print("Length:", len(chars))


 !&(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz—‘’“”
Length: 81


In [69]:
# Character-level tokenizer: map every vocabulary character to an integer id and back.
# (Loop variables are named so they do not reuse the builtin name `int`.)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(word):
    """Return the list of integer ids for the characters of ``word``.

    Raises KeyError if ``word`` contains a character that is not in ``stoi``.
    """
    return [stoi[ch] for ch in word]


def decode(arr):
    """Return the string spelled by the iterable of integer ids ``arr``.

    Raises KeyError if ``arr`` contains an id that is not in ``itos``.
    """
    return "".join(itos[i] for i in arr)


# Round-trip check: encoding then decoding must give back the original string.
print(encode("Rama"))
print(decode(encode("Rama")))

[39, 50, 62, 50]
Rama


In [75]:
# Encode the whole corpus as one tensor of token ids (torch.long, i.e. int64)
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Preview the first 1000 token ids
print(data[:1000])

torch.Size([3639580]) torch.int64
tensor([41, 29, 26,  1, 34, 22, 29, 22, 23, 29, 22, 39, 22, 41, 22,  0,  0, 23,
        36, 36, 32,  1, 36, 35, 26,  0,  0, 40, 26, 24, 41, 30, 36, 35,  1, 30,
         0, 36, 62,  2,  1, 29, 50, 71, 58, 63, 56,  1, 51, 64, 72, 54, 53,  1,
        53, 64, 72, 63,  1, 69, 64,  1, 35, 50, 67, 50, 74, 50, 63, 50,  1, 50,
        63, 53,  1, 35, 50, 67, 50,  6,  1, 69, 57, 54,  1, 62, 64, 68, 69,  1,
        54, 73, 50, 61, 69, 54, 53,  1, 62, 50, 61, 54,  1, 51, 54, 58, 63, 56,
         6,  1, 50, 63, 53,  1, 50, 61, 68, 64,  1, 69, 64,  1, 69, 57, 54,  1,
        56, 64, 53, 53, 54, 68, 68,  1, 40, 50, 67, 50, 68, 72, 50, 69, 58,  6,
         1, 62, 70, 68, 69,  1, 69, 57, 54,  1, 72, 64, 67, 53,  1, 31, 50, 74,
        50,  1, 51, 54,  1, 70, 69, 69, 54, 67, 54, 53,  8,  0,  0, 42, 56, 67,
        50, 68, 67, 50, 71, 50,  6,  1, 69, 57, 54,  1, 68, 64, 63,  1, 64, 55,
         1, 33, 64, 62, 50, 57, 50, 67, 68, 57, 50, 63, 50,  6,  1, 68, 70, 67,
      

In [79]:
# Train on the first 90% of the token stream; hold out the remaining 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [81]:
# Maximum length (in tokens) of a 'snip' of data used for training
block_size = 8
# Display one snip of block_size + 1 tokens: block_size inputs plus the token
# that follows the last of them.
train_data[:block_size+1]

tensor([41, 29, 26,  1, 34, 22, 29, 22, 23])

In [85]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # Every prefix of the window is its own training example: the prefix
    # x[0..t] is the context and y[t] is the token that follows it.
    context, target = x[:t + 1], y[t]
    print(f"When the input is {context} the target is: {target}")

When the input is tensor([41]) the target is: 29
When the input is tensor([41, 29]) the target is: 26
When the input is tensor([41, 29, 26]) the target is: 1
When the input is tensor([41, 29, 26,  1]) the target is: 34
When the input is tensor([41, 29, 26,  1, 34]) the target is: 22
When the input is tensor([41, 29, 26,  1, 34, 22]) the target is: 29
When the input is tensor([41, 29, 26,  1, 34, 22, 29]) the target is: 22
When the input is tensor([41, 29, 26,  1, 34, 22, 29, 22]) the target is: 23


In [91]:
# Seed PyTorch's global RNG so that batch sampling and text generation give the
# same results on every Restart & Run All.
torch.manual_seed(1337)

batch_size = 4 # The number of independent sequences processed in parallel
block_size = 8 # The maximum context length for predictions
 
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    ``split == "train"`` samples from ``train_data``; any other value samples
    from ``val_data``. Returns ``(x, y)``: each is a stack of ``batch_size``
    windows of ``block_size`` tokens, where ``y`` is the window that starts one
    position after the corresponding window in ``x``.
    """
    source = val_data if split != "train" else train_data
    # Random window starts. The exclusive upper bound keeps the shifted target
    # window (which ends one token later) inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

# Draw one training batch and show the input and target tensors side by side.
xb, yb = get_batch("train")
print("Inputs: ", xb.shape, xb, sep="\n")
print("Targets: ", yb.shape, yb, sep="\n")

print("----")

# Spell out every (context -> target) training example contained in the batch.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

Inputs: 
torch.Size([4, 8])
tensor([[50, 58, 68, 50, 62, 65, 50, 74],
        [58, 69, 68,  6,  1, 50, 68,  1],
        [57, 50,  6,  1, 69, 57, 64, 70],
        [ 1, 58, 63, 68, 69, 50, 61, 61]])
Targets: 
torch.Size([4, 8])
tensor([[58, 68, 50, 62, 65, 50, 74, 50],
        [69, 68,  6,  1, 50, 68,  1, 50],
        [50,  6,  1, 69, 57, 64, 70,  1],
        [58, 63, 68, 69, 50, 61, 61, 54]])
----
when input is [50] the target: 58
when input is [50, 58] the target: 68
when input is [50, 58, 68] the target: 50
when input is [50, 58, 68, 50] the target: 62
when input is [50, 58, 68, 50, 62] the target: 65
when input is [50, 58, 68, 50, 62, 65] the target: 50
when input is [50, 58, 68, 50, 62, 65, 50] the target: 74
when input is [50, 58, 68, 50, 62, 65, 50, 74] the target: 50
when input is [58] the target: 69
when input is [58, 69] the target: 68
when input is [58, 69, 68] the target: 6
when input is [58, 69, 68, 6] the target: 1
when input is [58, 69, 68, 6, 1] the target: 50
when input 

In [99]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the logits over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is None. With targets, ``logits`` is returned flattened to
            (B*T, C) and ``loss`` is the mean cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy takes (N, C) logits and (N,) class indices, so merge
        # the batch and time dimensions into one.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # reshape (not view) so non-contiguous targets, e.g. slices, are accepted
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids, the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: ``idx`` followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions (the loss is None because no targets are passed)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build the model over the character vocabulary and push one batch through it.
m = BigramLanguageModel(len(chars))
logits, loss = m(xb, yb)
print(logits.shape, loss, sep="\n")

# Sample 100 tokens from the untrained model, starting from a single token
# with id 0, and decode them back to text.
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0].tolist()
    )
)

torch.Size([32, 81])
tensor(4.5380, grad_fn=<NllLossBackward0>)

!KMf?&&Bo—p)]l SWqc3kFZ—ixdKvsRJs’—F)—‘“xbae5Q;;Zj?rZ;ripf1H0k!—6Ky9—L“baOlqB
L:]4WP iuB63or]!s[IhRr
