<a href="https://colab.research.google.com/github/christopherjneelankavil/bigram/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install numpy matplotlib pylzma



In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
block_size = 8
batch_size = 4
learing_rate = 3e-4
max_iters = 10000

In [13]:
with open("/content/wizard_of_oz.txt","r",encoding="utf-8") as f:
  text = f.read()

chars = sorted(set(text))
print(chars)
vocabulary_size = len(chars)
print(len(chars))

['\n', ' ', '!', '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']
69


In [14]:
string_to_int = {ch : i for i,ch in enumerate(chars)}
int_to_string = {i : ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:200])

tensor([68,  1,  1, 30, 44, 41,  1, 33, 51, 50, 40, 41, 54, 42, 57, 48,  1, 33,
        45, 62, 37, 54, 40,  1, 51, 42,  1, 25, 62,  0,  0,  0,  0,  0, 13, 44,
        37, 52, 56, 41, 54,  1, 19,  0, 30, 44, 41,  1, 13, 61, 39, 48, 51, 50,
        41,  0,  0,  0, 14, 51, 54, 51, 56, 44, 61,  1, 48, 45, 58, 41, 40,  1,
        45, 50,  1, 56, 44, 41,  1, 49, 45, 40, 55, 56,  1, 51, 42,  1, 56, 44,
        41,  1, 43, 54, 41, 37, 56,  1, 21, 37, 50, 55, 37, 55,  1, 52, 54, 37,
        45, 54, 45, 41, 55,  5,  1, 59, 45, 56, 44,  1, 31, 50, 39, 48, 41,  0,
        18, 41, 50, 54, 61,  5,  1, 59, 44, 51,  1, 59, 37, 55,  1, 37,  1, 42,
        37, 54, 49, 41, 54,  5,  1, 37, 50, 40,  1, 11, 57, 50, 56,  1, 15, 49,
         5,  1, 59, 44, 51,  1, 59, 37, 55,  1, 56, 44, 41,  1, 42, 37, 54, 49,
        41, 54, 65, 55,  1, 59, 45, 42, 41,  7,  1, 30, 44, 41, 45, 54,  0, 44,
        51, 57])


In [15]:
## Train and val split

n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  print(ix)

  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x,y

x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

tensor([ 59920, 126365, 113551, 120353])
inputs:
tensor([[67,  0,  0, 66, 33, 37, 45, 56],
        [51, 57, 48, 40,  1, 38, 41,  1],
        [45, 55,  1, 39, 51, 49, 52, 37],
        [56,  1, 56, 44, 41,  1, 43, 45]])
targets:
tensor([[ 0,  0, 66, 33, 37, 45, 56,  1],
        [57, 48, 40,  1, 38, 41,  1, 56],
        [55,  1, 39, 51, 49, 52, 37, 50],
        [ 1, 56, 44, 41,  1, 43, 45, 54]])


In [16]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print('when input is', context, 'target is', target)

when input is tensor([68]) target is tensor(1)
when input is tensor([68,  1]) target is tensor(1)
when input is tensor([68,  1,  1]) target is tensor(30)
when input is tensor([68,  1,  1, 30]) target is tensor(44)
when input is tensor([68,  1,  1, 30, 44]) target is tensor(41)
when input is tensor([68,  1,  1, 30, 44, 41]) target is tensor(1)
when input is tensor([68,  1,  1, 30, 44, 41,  1]) target is tensor(33)
when input is tensor([68,  1,  1, 30, 44, 41,  1, 33]) target is tensor(51)


In [17]:
from math import log
class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, index, targets=None):
    logits = self.token_embedding_table(index)

    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, index, max_new_tokens):
      for _ in range(max_new_tokens):
          logits, loss = self.forward(index)
          logits = logits[:, -1, :]
          probs = F.softmax(logits, dim=-1)
          index_next = torch.multinomial(probs, num_samples=1)
          index = torch.cat((index, index_next), dim=1)
      return index

model = BigramLanguageModel(vocab_size=vocabulary_size)
context = torch.zeros((1,1), dtype = torch.long)
generated_chars = decode(model.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


IyRfasbu(wDg!x ?XAIyEMT”yrg—h
AtGd:HZh’dZKN F‘cS?)P:Qar;z)?‘!)e)H;
Y:phQqAtI; dpvoC﻿T;PP; cKkDCz
hDuS?XaRYMJksv“”N’BQ -sv;NftR”onQDC‘!n’QfMMQZ-qs NgP“xmtacc;o﻿DSZlTS-MWKPJ﻿kuvtEt‘PyRWdVx—
xp﻿kXDN pT:pXk;;P﻿-KdTzYuj;Q“RU,z—cyRy﻿-!c; 
:HPnuw!)t’?,dlrrtDBs“rDA! ;iXe—tUhtI..(,g?)X
 igEPiR
DnK?ZB”:Iw!Y,dGH“
bdonrQa?”ViRKm”xCh
xZsbrDnwNj pI’Dg—iRzyR“uQ clzRwR’HDn)Nx—LSEuREOIKgEKk-!(MA!,dR-Mo﻿fG?X,dJrGOI‘F‘KWByWY:MCCDYnujWY)Bv-,zSoeDAh)hZFt;uSX”Lkg﻿.,—﻿e)pHeKDrtEFL.MQJsv“gEYsvt iRVcV
AkL;PATHPAGLEvw﻿P-


In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr = learing_rate)

for iter in range(max_iters):

  xb, yb = get_batch('train')

  logits, loss = model.forward(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor([144165, 102032, 103383, 115392])
tensor([159324,   3306,  47615,  94180])
tensor([  1219, 105432,  53327, 139112])
tensor([ 26568, 119062, 137191,  59686])
tensor([ 10217, 119468, 115371,  23486])
tensor([97428, 21325, 47420, 51151])
tensor([93656, 50808, 13321, 47960])
tensor([ 89037,  53412, 137902,   9700])
tensor([ 83503, 129821,  24884, 129670])
tensor([ 14481,  79716,  37669, 135760])
tensor([ 48737,  71985, 153998,   5617])
tensor([132529,  78359,  87488,   8716])
tensor([100840, 133534, 128971,  34212])
tensor([156962, 164342, 123239,  63331])
tensor([ 70490, 120920,    230,  35634])
tensor([ 63947, 136070, 154204, 141171])
tensor([131289,  29533, 118854,  79403])
tensor([149304,   9729,  16794,  92293])
tensor([116524, 147708,  62759, 133943])
tensor([ 57805, 153622,  73303, 164410])
tensor([113701,   5078,  86281,   8885])
tensor([15244, 72014, 80176, 72341])
tensor([ 94055,  71935, 155035,  94300])
tens