In [2]:
# Read the whole training corpus into memory as one string.
with open("data/input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [3]:
# Report how many characters the corpus contains.
print("Text length", len(text))

Text length 1115394


In [4]:
# Preview the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Tokenizer

In [6]:
# Character -> integer id (the id is the character's position in `chars`).
ctoi = {ch: i for i, ch in enumerate(chars)}
# Integer id -> character (inverse of `ctoi`).
itoc = {i: ch for i, ch in enumerate(chars)}


def encode(string):
    """Return the list of integer ids for the characters of `string`.

    Raises:
        KeyError: if `string` contains a character that is not in `ctoi`.
    """
    return [ctoi[c] for c in string]


def decode(ids):
    """Return the string obtained by mapping each id in `ids` back to its character.

    Raises:
        KeyError: if `ids` contains a value that is not in `itoc`.
    """
    return ''.join(itoc[i] for i in ids)


print(encode("Hello World"))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]


In [8]:
import torch
# Encode the full corpus as a single 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(f"{data.shape} {data.dtype}")
print(data[0:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


# Train/Test Split

In [10]:
# Train on the first 90% of the tokens; hold out the remaining 10% for evaluation.
n = int(len(data) * 0.9)
train_data, test_data = data[:n], data[n:]

In [11]:
# Show the vocabulary size (number of distinct characters found in the corpus).
vocab_size

65

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
block_size = 8        # length of each sampled token window
batch_size = 32       # number of windows per batch
max_iters = 3000      # number of optimisation steps
eval_interval = 300   # report losses every this many steps
learning_rate = 0.01  # step size handed to the optimizer
eval_iters = 200      # batches averaged per split when estimating the loss

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reproducibility: fix the global torch RNG seed.
torch.manual_seed(1337)


def get_bath(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: "train" selects `train_data`; any other value selects `test_data`.

    Returns:
        A pair `(x, y)` of tensors on `device`, each stacking `batch_size`
        windows of `block_size` tokens. `y` is `x` shifted one position to
        the right, i.e. the next token at every position of `x`.
    """
    source = train_data if split == "train" else test_data
    # Random window start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, labels = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        labels.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(labels).to(device)


# Draw one training batch; get_bath samples with torch.randint, so this call also
# advances the global torch RNG seeded above.
xb, yb = get_bath("train")


@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the train and test splits.

    For each split, `eval_iters` random batches are drawn with `get_bath`
    and their losses averaged. The model is put in eval mode for the
    duration and switched back to train mode before returning.

    Returns:
        dict with keys "train" and "test", each a 0-dim tensor holding the mean loss.
    """
    model.eval()
    averages = {}

    for split in ("train", "test"):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, labels = get_bath(split)
            _, batch_loss = model(inputs, labels)
            batch_losses.append(batch_loss.item())
        averages[split] = torch.tensor(batch_losses).mean()

    model.train()
    return averages


class BigramModel(nn.Module):
    """Bigram language model.

    Each token id indexes a row of a (vocab_size, vocab_size) embedding table;
    that row is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the unnormalised next-token scores for token i: (V, V).
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Shape legend: B = batch, T = time (sequence position), C = channels
        (equal to the vocabulary size).

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer next-token ids.

        Returns:
            `(logits, loss)`. Without `targets`, `logits` is (B, T, C) and
            `loss` is None. With `targets`, `logits` is flattened to
            (B*T, C) and `loss` is the scalar cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy wants (N, C) logits and (N,) class indices.
            logits = logits.view(B * T, C)
            # reshape (not view) so non-contiguous targets are accepted too.
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)  # (B, T, C)
            logits = logits[:, -1, :]  # keep only the last position: (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # sampled ids: (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)
        return idx
    

# Build the model and move it to the selected device.
# `m` is a second name bound to the same module object.
model = BigramModel(vocab_size).to(device)
m = model

# AdamW over all of the model's parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


# `step` rather than `iter`: the loop variable lives at notebook scope, and
# naming it `iter` would rebind the builtin for every later cell.
for step in range(max_iters):

    # Every `eval_interval` steps, report the averaged train/test losses.
    if step % eval_interval == 0:
        losses = estimate_loss()
        ltr = losses["train"]
        ltst = losses["test"]
        print(f"Step {step} train loss {ltr:.4f}, test loss {ltst:.4f}")

    # Sample a fresh training batch.
    xb, yb = get_bath("train")

    # Forward pass, then clear old gradients, backpropagate and update.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


Step 0 train loss 4.7676, test loss 4.7677
Step 300 train loss 2.8381, test loss 2.8538
Step 600 train loss 2.5531, test loss 2.5811
Step 900 train loss 2.4967, test loss 2.5151
Step 1200 train loss 2.4878, test loss 2.5086
Step 1500 train loss 2.4677, test loss 2.4946
Step 1800 train loss 2.4695, test loss 2.4961
Step 2100 train loss 2.4707, test loss 2.4871
Step 2400 train loss 2.4645, test loss 2.4892
Step 2700 train loss 2.4736, test loss 2.4920


In [16]:
# Start from a single sequence holding only token id 0, sample 500 further
# tokens and print the decoded text.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))



Ber I t:
Prins,
Is

BURCEd he whindssurt w!
RGo t wand y.
Fioumeatto! as,
HAUSuse heyeshean iatenn po iowe, mbengnd s, te t hisploushount, hintall?
ANouthof
ts ban ts omy RCILLave malldece f t.
ORLI'tt g;ZASe stowl' t is he s as n my RWALI, uren, k de ishathesharou ore yome

Whouthareloreavery d thee.
NG t erethest m k tindorende s ceke
OUThery w'd looriscater a t t s m VEdw HELO:
O:
He vere beer angon
PONT:

KINatooBechen n t st witimbout gl othof JutyomiShe lides Ivimusthin t sw rs tofigh hob
