In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader

In [None]:
%%capture
!wget https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt

In [None]:
# Read one name per line; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('names.txt') as names_file:
  names = names_file.read().splitlines()

# Character vocabulary: every character that occurs in the names plus '.',
# which the dataset cell appends to each name as the end marker.
vocab = sorted(set(''.join(names)+'.'))
vocab_size = len(vocab)
# stoi: character -> integer index, itos: integer index -> character.
stoi = {ch: ix for ix, ch in enumerate(vocab)}
itos = {ix: ch for ch, ix in stoi.items()}

def decode(seq: list[int])->str:
  """Map each index in `seq` through the module-level `itos` table and join the characters."""
  chars = (itos[ix] for ix in seq)
  return ''.join(chars)

def encode(name:str)->list[int]:
  """Map each character of `name` through the module-level `stoi` table (KeyError on unknown characters)."""
  indices = []
  for ch in name:
    indices.append(stoi[ch])
  return indices


In [None]:
import joblib
# Read the names with a context manager so the file handle is closed right away.
with open('names.txt') as names_file:
    names = names_file.read().splitlines()
class MLPTokenizer: # stateful class
    """Character-level tokenizer whose vocabulary is the sorted set of items in `text`.

    Attributes:
        text: the iterable the tokenizer was built from (kept as given).
        vocab: sorted list of the unique items of `text`.
        vocab_size: number of entries in `vocab`.
        stoi: item -> integer index.
        itos: integer index -> item.
    """

    def __init__(self, text):
        """Build the vocabulary and both lookup tables from `text` (a string or any iterable of characters)."""
        self.text = text
        self.vocab = sorted(set(text))
        self.vocab_size = len(self.vocab)
        self.stoi = {ch: ix for ix, ch in enumerate(self.vocab)}
        self.itos = {ix: ch for ch, ix in self.stoi.items()}

    def decode(self, seq: list[int]) -> str:
        """Turn a sequence of indices back into a string (KeyError on an unknown index)."""
        return ''.join([self.itos[i] for i in seq])

    def encode(self, name: str) -> list[int]:
        """Turn a string into a list of indices (KeyError on a character outside the vocabulary)."""
        return [self.stoi[s] for s in name]

    @classmethod
    def from_pretrained(cls, file_path):
        """Load a tokenizer previously written by `save`.

        Raises:
            TypeError: if the file does not contain an instance of this class.
        """
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load tokenizer files that come from a trusted source.
        tokenizer = joblib.load(filename=file_path)
        if not isinstance(tokenizer, cls):
            raise TypeError(
                f"{file_path!r} holds a {type(tokenizer).__name__}, expected {cls.__name__}"
            )
        return tokenizer

    def save(self, file_name):
        """Serialize this tokenizer object to `file_name` with joblib."""
        joblib.dump(self, filename=file_name)

In [None]:
# Build the tokenizer from the `vocab` list; MLPTokenizer de-duplicates and sorts its input again.
tokenizer = MLPTokenizer(vocab)

In [None]:
# Write the tokenizer object to disk (joblib serialization).
tokenizer.save("mlp_tokenizer.joblib")

In [None]:
# Load a tokenizer back from the joblib file.
new_tokenizer = MLPTokenizer.from_pretrained("mlp_tokenizer.joblib")

In [None]:
# Encode a sample name into its list of vocabulary indices.
tokenizer.encode("emma")

[5, 13, 13, 1]

In [None]:
# Decode a list of vocabulary indices back into a string.
tokenizer.decode([5, 13, 13, 1])

'emma'

In [None]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

context_length=8
TRAIN_FRACTION = 0.8   # share of the names used for training
SPLIT_SEED = 42        # fixed seed so the split is identical on every run

X=[]
Y=[]
name_ids=[]            # for every example, the index of the name it was cut from

# Slide a fixed-size window over each name (terminated by '.'): the window is
# the input and the character that follows it is the target.
for name_ix, name in enumerate(names):
  context=[0]*context_length   # index 0 is '.', used here as left padding
  for ch in name + '.':
    ix=stoi[ch]
    X.append(context)
    Y.append(ix)
    name_ids.append(name_ix)
    context = context[1:]+[ix]

X = torch.tensor(X)
Y = torch.tensor(Y)
name_ids = torch.tensor(name_ids, dtype=torch.long)

# Split by *name*, after a seeded shuffle. The previous split sliced the
# example list in file order, so the test set was just the tail of the file
# (the file is not randomly ordered - it starts emma, olivia, ava, ...) and
# the cut could land in the middle of a name.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_names = torch.randperm(len(names), generator=split_rng)
n_train_names = int(TRAIN_FRACTION*len(names))
is_train_name = torch.zeros(len(names), dtype=torch.bool)
is_train_name[shuffled_names[:n_train_names]] = True
train_mask = is_train_name[name_ids]   # per-example flag: does its name belong to train?

train_size=int(train_mask.sum())
Xtr, Xts = X[train_mask], X[~train_mask]
Ytr, Yts = Y[train_mask], Y[~train_mask]

Dtr = TensorDataset(Xtr, Ytr)
Dts = TensorDataset(Xts, Yts)

# Training batches are reshuffled every epoch and the ragged last batch is
# dropped; evaluation keeps a fixed order and every example.
DLtr = DataLoader(Dtr, batch_size=32, shuffle=True, drop_last=True)
DLts = DataLoader(Dts, batch_size=32, shuffle=False, drop_last=False)

In [None]:
# Peek at the first five names.
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [None]:
# First seven context windows of the dataset.
X[:7]

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  5],
        [ 0,  0,  0,  0,  0,  0,  5, 13],
        [ 0,  0,  0,  0,  0,  5, 13, 13],
        [ 0,  0,  0,  0,  5, 13, 13,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0, 15]])

In [None]:
# A single context window (row 4 of X).
X[4]

tensor([ 0,  0,  0,  0,  5, 13, 13,  1])

In [None]:
# Model hyper-parameters. The vocabulary size and context length are taken from
# the values computed in the data cells, so the model cannot silently drift out
# of sync with the dataset.
config = {
    'vocab_size': vocab_size,
    'context_length': context_length,
    'n_embd':32,
    'hidden_size':200
}

class MLP(nn.Module):
  """Fixed-context character model: embed -> flatten -> linear -> tanh -> linear.

  `config` must provide the integer keys 'vocab_size', 'context_length',
  'n_embd' and 'hidden_size'.
  """

  def __init__(self, config):
    super().__init__()
    self.vocab_size = config['vocab_size']
    self.context_length = config['context_length']
    self.n_embd = config['n_embd']
    self.hidden_size = config['hidden_size']
    self.E = nn.Embedding(self.vocab_size, self.n_embd)
    self.flat = nn.Flatten()
    # After flattening, every example is context_length * n_embd features wide.
    self.fc = nn.Linear(self.context_length*self.n_embd, self.hidden_size)
    self.tanh = nn.Tanh()
    self.out = nn.Linear(self.hidden_size, self.vocab_size)

  def forward(self, x):
    """Return next-token logits of shape (batch, vocab_size) for index input of shape (batch, context_length)."""
    x = self.E(x)
    x = self.flat(x)
    x = self.fc(x)
    x = self.tanh(x)
    x = self.out(x)
    return x

  @torch.no_grad()
  def generate(self, max_length=100, decode_fn=None):
    """Sample one sequence token by token and return it decoded.

    Args:
      max_length: upper bound on the number of sampled tokens.
      decode_fn: callable turning the list of sampled indices into the
        return value. Defaults to the notebook-level `decode` function.

    Sampling starts from a context of all zeros and stops early when index 0
    is drawn; index 0 is not included in the output.
    """
    if decode_fn is None:
      decode_fn = decode
    was_training = self.training
    self.eval()
    device = self.E.weight.device
    # Use the model's own context length; a literal 8 breaks any other config.
    context = [0]*self.context_length
    output = []
    try:
      for _ in range(max_length):
        logits = self(torch.tensor([context], device=device)).squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1).item()
        if ix == 0:
          break
        output.append(ix)
        context = context[1:]+[ix]
    finally:
      # Put the module back into whatever mode the caller had it in.
      self.train(was_training)

    return decode_fn(output)

  @classmethod
  def from_pretrained(cls, file_path):
    """Rebuild a model from a checkpoint dict holding 'model_config' and 'model_state_dict'."""
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
    sd = torch.load(file_path, map_location='cpu')
    model = cls(sd["model_config"])
    model.load_state_dict(sd["model_state_dict"])
    return model
# Resume from the checkpoint when it exists; on a fresh run (the checkpoint is
# only written further down the notebook) start from a newly initialized model.
try:
  model = MLP.from_pretrained('mlp_32_embed.pth')
except FileNotFoundError:
  model = MLP(config)
optimizer=optim.AdamW(model.parameters())

In [None]:
# Show the module structure.
model

MLP(
  (E): Embedding(27, 32)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=256, out_features=200, bias=True)
  (tanh): Tanh()
  (out): Linear(in_features=200, out_features=27, bias=True)
)

In [None]:
print("Initial generation")
# `generate` is a method of MLP; a free function of that name is not defined at this point.
for _ in range(5):
  print(model.generate())

Initial generation


NameError: name 'generate' is not defined

In [None]:
NUM_EPOCHS = 9
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    epoch_loss = 0.0
    # DLtr is the training DataLoader built in the dataset cell.
    for x, y in DLtr:
        logits = model(x)
        loss = F.cross_entropy(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    # Mean per-batch loss over the epoch.
    epoch_loss = epoch_loss / len(DLtr)
    # `epoch` already counts from 1, so it is reported as-is.
    print(f"Epoch {epoch} loss: {epoch_loss}")

Epoch 2 loss: 0.0003959389709494061
Epoch 3 loss: 0.00035546600620056396


KeyboardInterrupt: 

In [None]:
# Bundle the weights, the model config and the optimizer state into one checkpoint file.
checkpoint = {
    "model_state_dict": model.state_dict(),
    "model_config": config,
    "optimizer": optimizer.state_dict(),
}
torch.save(checkpoint, 'mlp_32_embed.pth')

In [None]:
# Read the checkpoint dict back from disk.
loaded = torch.load('mlp_32_embed.pth')

In [None]:
# Display the loaded checkpoint dict (long output).
loaded

{'model_state_dict': OrderedDict([('E.weight',
               tensor([[ 6.0813e-01,  1.0039e+00,  1.8090e+00,  4.6172e-01,  3.5113e-01,
                         5.3192e-01,  1.4519e-01,  5.3676e-01,  4.0246e-01, -2.5357e+00,
                         6.6420e-01, -5.3281e-01, -2.0600e-01,  1.8836e+00, -1.6059e+00,
                         1.2998e+00, -2.3554e-01,  1.3245e+00, -1.5759e+00,  1.6181e-01,
                         9.3331e-01,  1.1262e+00, -2.8327e-01,  1.1098e+00,  6.5904e-03,
                        -2.4550e-01,  2.9421e-01,  7.4770e-01, -1.0649e+00, -1.4750e+00,
                        -6.1441e-01, -1.4215e+00],
                       [-3.1657e-01,  8.4251e-01, -9.0124e-01,  1.4207e-02, -5.1405e-01,
                         8.7679e-01, -1.5746e+00,  2.0927e+00, -5.1526e-02, -1.3560e+00,
                        -4.9427e-01, -8.9369e-03,  5.1868e-01,  9.6176e-01,  2.4340e-01,
                         2.8047e-01, -2.4407e-01, -2.1591e+00,  3.5335e-01,  1.0386e+00,
            

In [None]:
# Indexing with a list keeps the leading dimension: one row, shaped (1, context_length).
X[[0]]

tensor([[0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Step-by-step sampling with the context printed at every step.
context = [0]*model.context_length
output = []
model.eval()
with torch.no_grad():
    for i in range(100):
        print(f"Context {i}:", context, end = '\t\t')
        # Feed the *current* context (not the fixed first dataset row) as a batch of one.
        logits = model(torch.tensor([context])).squeeze(0)
        # Normalize over the vocabulary axis; softmax over the batch axis of a
        # (1, vocab) tensor yields all ones, i.e. uniform sampling.
        probs = torch.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1).item()
        if ix==0:
          break
        output.append(ix)
        print(f"Output {i}:", ix)

        context = context[1:] + [ix]


Context 0: [0, 0, 0, 0, 0, 0, 0, 0]		Output 0: 9
Context 1: [0, 0, 0, 0, 0, 0, 0, 9]		Output 1: 15
Context 2: [0, 0, 0, 0, 0, 0, 9, 15]		Output 2: 3
Context 3: [0, 0, 0, 0, 0, 9, 15, 3]		Output 3: 18
Context 4: [0, 0, 0, 0, 9, 15, 3, 18]		Output 4: 15
Context 5: [0, 0, 0, 9, 15, 3, 18, 15]		Output 5: 18
Context 6: [0, 0, 9, 15, 3, 18, 15, 18]		Output 6: 16
Context 7: [0, 9, 15, 3, 18, 15, 18, 16]		Output 7: 10
Context 8: [9, 15, 3, 18, 15, 18, 16, 10]		Output 8: 17
Context 9: [15, 3, 18, 15, 18, 16, 10, 17]		Output 9: 21
Context 10: [3, 18, 15, 18, 16, 10, 17, 21]		Output 10: 12
Context 11: [18, 15, 18, 16, 10, 17, 21, 12]		Output 11: 16
Context 12: [15, 18, 16, 10, 17, 21, 12, 16]		Output 12: 22
Context 13: [18, 16, 10, 17, 21, 12, 16, 22]		Output 13: 26
Context 14: [16, 10, 17, 21, 12, 16, 22, 26]		Output 14: 26
Context 15: [10, 17, 21, 12, 16, 22, 26, 26]		Output 15: 4
Context 16: [17, 21, 12, 16, 22, 26, 26, 4]		Output 16: 19
Context 17: [21, 12, 16, 22, 26, 26, 4, 19]		Output 17: 

In [None]:
# Flattened index of the largest entry of `probs` left over from the sampling cell.
probs.argmax()

tensor(0)

In [None]:
from transformers import GPT2LMHeadModel

In [None]:
# Fetch the pretrained 'gpt2' checkpoint through the transformers library.
gpt2 =GPT2LMHeadModel.from_pretrained('gpt2')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
@torch.no_grad()
def generate(model, max_length=100):
  model.eval()
