In [1]:
!python -m pip install --upgrade pip && pip install numpy && pip install tinygrad 

Collecting pip
  Downloading pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.2
[0mCollecting tinygrad
  Downloading tinygrad-0.9.2-py3-none-any.whl.metadata (10 kB)
Downloading tinygrad-0.9.2-py3-none-any.whl (751 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m752.0/752.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tinygrad
Successfully installed tinygrad-0.9.2
[0m

In [2]:
import numpy as np
from tinygrad.helpers import Timing
from tinygrad import dtypes, nn, Tensor
from tinygrad.nn.optim import SGD
import random
from tinygrad.nn.optim import AdamW

In [3]:
def sparse_categorical_crossentropy(self, Y, ignore_index=-1) -> Tensor:
    """Mean negative log-likelihood of integer targets under the logits ``self``.

    Args:
        self: logits tensor; its last axis is treated as the class axis
            (``self.shape[-1]`` classes).
        Y: tensor of integer class ids. Entries equal to ``ignore_index`` are
            masked out of both the numerator and the denominator.
        ignore_index: target value that marks an entry to be skipped.

    Returns:
        Scalar tensor: the sum of ``-log_softmax(self)`` at the target classes of
        the kept entries, divided by the number of kept entries. If every entry
        is ignored the denominator is zero.
    """
    num_classes = self.shape[-1]
    keep = Y != ignore_index

    # One row of class ids [0, num_classes) per target, so each row can be compared with its target.
    class_ids = Tensor.arange(num_classes, dtype=dtypes.int32, requires_grad=False, device=self.device)
    class_grid = class_ids.unsqueeze(0).expand(Y.numel(), num_classes)
    target_column = Y.flatten().reshape(-1, 1)

    # -1 at the target class and 0 elsewhere; rows belonging to ignored targets are zeroed.
    negative_one_hot = (class_grid == target_column).where(-1.0, 0)
    weights = (negative_one_hot * keep.reshape(-1, 1)).reshape(*Y.shape, num_classes)

    return self.log_softmax().mul(weights).sum() / keep.sum()

In [4]:
# Read one name per line, then shuffle with a fixed seed so the order is repeatable
with open('./sandbox/names.txt') as names_file:
    words = names_file.read().splitlines()
random.seed(42)
random.shuffle(words)

# Vocabulary: each distinct character gets an id starting at 1; id 0 is reserved for '.',
# which build_dataset uses both as the end-of-word marker and as context padding
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

# Number of preceding characters supplied as context for each prediction
block_size = 3

def build_dataset(words, block_size, stoi):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.' and scanned left to right. For every
    character (terminator included) one example is emitted: the ids of the
    ``block_size`` preceding characters as the input and the id of the
    character itself as the target.

    Args:
        words: iterable of strings; every character must be a key of ``stoi``.
        block_size: number of context positions per example.
        stoi: mapping from character to integer id. It must contain '.'; id 0
            is used as the padding value for positions before a word starts.

    Returns:
        ``(X, Y)``: ``X`` wraps the list of ``block_size``-long context id lists,
        ``Y`` wraps the flat list of target ids, both as ``Tensor``.

    Raises:
        KeyError: if a character (or '.') is missing from ``stoi``.
    """
    X, Y = [], []
    for word in words:
        # Reset the window for every word. Initialising it once outside the loop
        # leaks the tail of the previous word into the first contexts of the next,
        # so the all-padding start window used when sampling is almost never trained on.
        context = [0] * block_size
        for ch in word + '.':
            idx = stoi[ch]
            X.append(context)
            Y.append(idx)
            # Slide the window; this builds a new list, so the one just stored is never mutated.
            context = context[1:] + [idx]
    return Tensor(X), Tensor(Y)

# Cut points for an 80% / 10% / 10% partition of the shuffled word list
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]

# One (contexts, targets) tensor pair per partition
Xtrain, Ytrain = build_dataset(train_words, block_size, stoi)
Xdev, Ydev = build_dataset(dev_words, block_size, stoi)
Xtest, Ytest = build_dataset(test_words, block_size, stoi)

In [5]:
class Model:
    """Fully connected network: seven bias-free linear layers, each followed by
    batch normalisation, with tanh after every layer except the last.

    Normalisation statistics are always computed from the current input along
    axis 0; no running averages are stored.
    NOTE(review): a forward pass on a single example, or on a 1-D input as in the
    sampling cell, is therefore normalised differently from a training batch.
    Confirm whether that is intended.
    """

    def __init__(self, in_features=30, hidden_size=200, n_classes=27):
        """Create the layers.

        Args:
            in_features: width of the input vectors (default 30).
            hidden_size: width of every hidden layer (default 200).
            n_classes: number of output logits (default 27).
        """
        # bias=False: a per-feature bias would be cancelled by the mean
        # subtraction that follows every linear layer.
        self.input_layer = nn.Linear(in_features, hidden_size, bias=False)
        self.hidden1 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.hidden2 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.hidden3 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.hidden4 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.hidden5 = nn.Linear(hidden_size, hidden_size, bias=False)
        self.output_layer = nn.Linear(hidden_size, n_classes, bias=False)

    @staticmethod
    def _batchnorm(x, eps=1e-5):
        """Normalise ``x`` with mean and variance taken over axis 0 (no scale, no shift)."""
        mean = x.mean(axis=0, keepdim=True)
        invstd = x.var(axis=0, keepdim=True).add(eps).rsqrt()
        return x.batchnorm(None, None, mean, invstd)

    def __call__(self, x):
        """Run the forward pass and return the normalised output logits."""
        tanh_layers = (
            self.input_layer,
            self.hidden1,
            self.hidden2,
            self.hidden3,
            self.hidden4,
            self.hidden5,
        )
        for layer in tanh_layers:
            x = self._batchnorm(layer(x)).tanh()
        # The output layer is normalised too, but has no tanh.
        return self._batchnorm(self.output_layer(x))

# Embedding table created with Tensor.randn: 27 rows of 10 values each. The training cell
# indexes it with character ids and flattens block_size (3) rows into the 30 inputs of Model.
# NOTE(review): 27 presumably equals len(stoi) (26 letters + '.') — confirm against names.txt.
character_embeddings = Tensor.randn(27, 10)

In [7]:
model = Model()
# NOTE(review): 0.1 is a step size more commonly paired with plain SGD; the losses logged
# below stay around 2.7-3.0 for the whole run. Consider a smaller value for AdamW.
learning_rate = 0.1
batch_size = 32
num_steps = 10000

# Every trained tensor is listed explicitly: the seven layer weights plus the embedding table.
opt = AdamW(
    [
        model.input_layer.weight,
        model.hidden1.weight,
        model.hidden2.weight,
        model.hidden3.weight,
        model.hidden4.weight,
        model.hidden5.weight,
        model.output_layer.weight,
        character_embeddings,
    ],
    lr=learning_rate,
)

with Tensor.train():
    for i in range(num_steps):
        # Random minibatch of row indices into the training set
        ix = Tensor.randint(batch_size, low=0, high=Xtrain.shape[0])
        Xb, Yb = Xtrain[ix], Ytrain[ix]

        # Look up one embedding row per context id, then flatten each example to one vector
        embeddings = character_embeddings[Xb]
        logits = model(embeddings.view(embeddings.shape[0], -1))

        # Reuse the targets already gathered above instead of indexing Ytrain a second time
        loss = sparse_categorical_crossentropy(logits, Yb)

        opt.zero_grad()
        loss.backward()
        opt.step()
        if i % 100 == 0: print(loss.numpy())

3.5775607
2.9161918
3.008948
2.8131108
2.7558143
2.905391
2.6558833
2.872062
2.5891078
2.9595323
2.7778232
2.8275373
2.8901703
2.8577313
2.805689
2.775821
2.7272813
2.657825
3.1732593
2.843945
3.1515384
2.8996062
2.8071826
2.9663632
2.9408212
2.8184867
2.962437
2.8157332
2.8696675
2.8251088
2.9844685
3.1011186
2.857622
3.0347276
2.7551575
3.1547635
2.903566
2.9013696
2.9489655
2.9080474
2.672242
2.6935434
2.8935328
2.978457
2.9486709
2.7383237
2.681944
2.7717202
2.634066
2.7534766
2.670509
2.6601596
2.873941
2.8930004
2.673729
2.7392917
2.6705372
2.9604363
2.5784538
2.6966646
2.7001734
2.6778982
2.8520308
2.547723
2.798663
2.982172
2.9186797
2.7363188
2.9238236
2.986453
3.0698
3.0837898
2.8224816
3.0793884
2.7983735
2.6560056
3.0373123
2.7987404
2.8656976
2.6527245
2.8034105
2.9812958
2.7411366
2.6224039
2.7680812
2.8199403
2.7177129
2.783424
2.7430747
2.6833227
2.857601
2.8868356
2.7898672
2.798888
2.8557913
2.7524636
2.959264
2.8351257
2.8978102
2.7352762


In [30]:
# Generate and print one name
# Start from an all-padding context ('.' has id 0). Its length comes from block_size so it
# always matches the window length produced by build_dataset.
context = [0] * block_size
generated_name = ''

while True:
    # Look up the context embeddings and flatten them into a single input vector
    context_embeddings = character_embeddings[Tensor(context)].reshape(-1)

    # NOTE(review): Model normalises over axis 0 of whatever it receives. For this 1-D input
    # that is the feature axis, not a batch axis as during training. Confirm this is intended.
    output = model(context_embeddings)

    # Sample the next character from the predicted distribution
    probs = output.softmax().numpy()
    # Cast to a plain int so the context stays a list of Python ints when it is fed back
    # into Tensor(context) on the next iteration.
    next_char_index = int(np.random.choice(len(probs), p=probs))
    next_char = itos[next_char_index]

    if next_char == '.':
        break  # End of name

    generated_name += next_char

    # Slide the context window forward by one character
    context = context[1:] + [next_char_index]

print(f"\nGenerated name: {generated_name}")

Generating name: 
Generated name: gbso
