In [1]:
# Install into the *kernel's* environment (%pip) rather than whatever `pip` is on PATH.
# tinygrad is pinned to the release this notebook was last run against.
%pip install -q --upgrade pip
%pip install -q numpy tinygrad==0.9.2

Collecting pip
  Downloading pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.2
[0mCollecting tinygrad
  Downloading tinygrad-0.9.2-py3-none-any.whl.metadata (10 kB)
Downloading tinygrad-0.9.2-py3-none-any.whl (751 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m752.0/752.0 kB[0m [31m115.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tinygrad
Successfully installed tinygrad-0.9.2
[0m

In [2]:
import numpy as np
from tinygrad.helpers import Timing
from tinygrad import Tensor
from tinygrad import dtypes
from tinygrad.nn.optim import SGD
from tinygrad import Tinyjit

#-------------------------------------------------------------------------------------------------

def sparse_categorical_crossentropy(self, Y, ignore_index=-1) -> Tensor:
    """Average negative log-likelihood of integer class targets.

    Args:
        self: Tensor of unnormalised scores; its last axis indexes the classes.
        Y: Tensor of target class indices. It is reshaped to
            ``(*Y.shape, self.shape[-1])`` for the product with the
            log-probabilities, so it is presumably shaped like ``self`` without
            the class axis.
        ignore_index: Target value that is masked out of both the summed loss
            and the divisor.

    Returns:
        Tensor: the summed negative log-probability of the kept targets divided
        by the number of kept targets (no guard for the case where every
        target equals ``ignore_index``).
    """
    num_classes = self.shape[-1]
    keep = Y != ignore_index

    # One row of class ids [0, num_classes) per target.
    class_ids = Tensor.arange(num_classes, dtype=dtypes.int32, requires_grad=False, device=self.device)
    class_grid = class_ids.unsqueeze(0).expand(Y.numel(), num_classes)

    # -1 at the target class, 0 elsewhere; rows of ignored targets are zeroed.
    target_column = Y.flatten().reshape(-1, 1)
    neg_one_hot = (class_grid == target_column).where(-1.0, 0) * keep.reshape(-1, 1)
    neg_one_hot = neg_one_hot.reshape(*Y.shape, num_classes)

    return self.log_softmax().mul(neg_one_hot).sum() / keep.sum()

#-------------------------------------------------------------------------------------------------

class Linear:
    """Affine layer whose weight is stored as ``(in_features, out_features)``.

    Args:
        in_features: Size of the input's last axis.
        out_features: Size of the output's last axis.
        bias: When true, a zero-initialised bias of length ``out_features`` is
            created; otherwise ``self.bias`` is ``None``.
        initialization: Name of the ``Tensor`` factory method called as
            ``Tensor.<initialization>(in_features, out_features)`` to create
            the weight.
    """

    def __init__(self, in_features, out_features, bias=True, initialization: str='kaiming_uniform'):
        self.weight = getattr(Tensor, initialization)(in_features, out_features)
        self.bias = Tensor.zeros(out_features) if bias else None

    def __call__(self, x):
        """Return ``x @ weight (+ bias)``.

        The weight is already laid out as (in, out), so it is handed to
        ``x.linear`` untransposed; transposing it would make the matmul fail
        whenever ``in_features != out_features``.
        """
        return x.linear(self.weight, self.bias)

class TinyNet:
    """Character-level MLP: embedding lookup -> tanh hidden layer -> logits.

    Attributes:
        C: 27 x 10 table indexed by character id (used as an embedding table).
        W1: hidden layer, 30 -> 200 with bias.
        W2: output layer, 200 -> 27 with bias.
    """

    def __init__(self):
        self.C = Linear(27, 10, bias=False)
        self.W1 = Linear(30, 200, bias=True)
        self.W2 = Linear(200, 27, bias=True)

    def logits(self, x):
        """Run the forward pass and return the unnormalised class scores.

        Args:
            x: Integer Tensor of character ids used to index ``self.C.weight``.

        Returns:
            Tensor of logits with one row per flattened context.
        """
        emb = self.C.weight[x]
        # Flatten each context's embeddings to the width W1 expects
        # (derived from the weight instead of a hard-coded 30).
        flat = emb.reshape(-1, self.W1.weight.shape[0])
        h = Tensor.tanh(flat @ self.W1.weight + self.W1.bias)
        return h @ self.W2.weight + self.W2.bias

    def __call__(self, x, y=None):
        """Return the cross-entropy loss of the model on ``x``.

        Args:
            x: Integer Tensor of character-id contexts.
            y: Optional Tensor of target character ids for ``x``. When omitted,
                the targets are taken from the module-level ``Ytr[ix]`` so
                existing ``net(Xtr[ix])`` calls keep working; prefer passing
                ``y`` explicitly to avoid depending on global state.

        Returns:
            Scalar loss Tensor.
        """
        if y is None:
            # Legacy behaviour: targets come from globals set by the caller.
            y = Ytr[ix]
        return sparse_categorical_crossentropy(self.logits(x), y)

    def sample(self, itos, block_size=3, num_samples=5):
        """
        Generate samples from the model and print each one.

        Args:
            itos (dict): Dictionary mapping indices to characters.
            block_size (int): Number of previous characters to use as context.
            num_samples (int): Number of samples to generate.
        """
        for _ in range(num_samples):
            output = []
            context = [0] * block_size  # Initialize with the start token index
            while True:
                # Convert context to a (1, block_size) index Tensor
                context_tensor = Tensor(np.array([context]), dtype=dtypes.int32, requires_grad=False, device=self.C.weight.device)

                # Forward pass shared with training, then softmax to probabilities
                probs = Tensor.softmax(self.logits(context_tensor)).numpy().flatten()

                # Handle potential numerical issues by normalizing
                probs = probs / probs.sum()

                # Sample the next index; cast to a plain int so the context list
                # and the itos lookup never hold numpy scalar types
                next_ix = int(np.random.choice(len(probs), p=probs))

                # Slide the context window and record the sampled index
                context = context[1:] + [next_ix]
                output.append(next_ix)

                # Index 0 terminates the sample
                if next_ix == 0:
                    break

            # Convert indices to characters and print the generated word
            generated_word = ''.join(itos[i] for i in output)
            print(generated_word)

# Module-level model instance; the optimizer and training loop later in this cell operate on it.
net = TinyNet()

#-------------------------------------------------------------------------------------------------

# Read the corpus, one word per line; `with` closes the file handle deterministically.
with open('./sandbox/names.txt', 'r') as f:
    words = f.read().splitlines()

# Character vocabulary: ids 1..N for the sorted characters, id 0 for the '.' boundary token.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

block_size = 3  # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn words into (context, next-character) training pairs.

    Each word is followed by '.', and every character is paired with the
    ``block_size`` character ids that precede it (padded with 0 at the start).

    Args:
        words: Iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        (X, Y): Tensor of contexts and Tensor of target character ids.
    """
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the window by one character
    return Tensor(X), Tensor(Y)

# Full dataset, built from the words in file order (before the shuffle below).
X, Y = build_dataset(words)

# Reproducible 80% / 10% / 10% split by word.
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

#-------------------------------------------------------------------------------------------------

learning_rate = 0.1
opt = SGD(
    [net.C.weight, net.W1.weight, net.W1.bias, net.W2.weight, net.W2.bias],
    lr=learning_rate,
)

with Tensor.train():
    for step in range(5000):
        # Minibatch: 32 random row indices into the training split.
        # NOTE: the name `ix` must stay a module-level variable, because
        # TinyNet.__call__ reads it to pick the matching targets from Ytr.
        ix = Tensor.randint(32, low=0, high=Xtr.shape[0])

        # Clear old gradients, then forward -> backward -> parameter update.
        opt.zero_grad()
        loss = net(Xtr[ix])
        loss.backward()
        opt.step()

        # Progress report every 100 steps.
        if step % 100 == 0:
            print(f"Step {step+1} | Loss: {loss.numpy()} | Learning Rate: {learning_rate}")

        # Every 1000th step (after step 100) shrink the learning rate tenfold
        # and hand the new value to the optimizer.
        if step % 1000 == 0 and step > 100:
            learning_rate = learning_rate * 0.1
            opt.lr = learning_rate

#-------------------------------------------------------------------------------------------------

Step 1 | Loss: 3.8469650745391846 | Learning Rate: 0.1
Step 101 | Loss: 3.0333406925201416 | Learning Rate: 0.1
Step 201 | Loss: 2.523958683013916 | Learning Rate: 0.1
Step 301 | Loss: 2.5368735790252686 | Learning Rate: 0.1
Step 401 | Loss: 2.4694879055023193 | Learning Rate: 0.1
Step 501 | Loss: 2.2382071018218994 | Learning Rate: 0.1
Step 601 | Loss: 2.846656322479248 | Learning Rate: 0.1
Step 701 | Loss: 2.722745418548584 | Learning Rate: 0.1
Step 801 | Loss: 2.0899734497070312 | Learning Rate: 0.1
Step 901 | Loss: 1.9871314764022827 | Learning Rate: 0.1
Step 1001 | Loss: 2.1950466632843018 | Learning Rate: 0.1
Step 1101 | Loss: 2.528294324874878 | Learning Rate: 0.010000000000000002
Step 1201 | Loss: 2.721104621887207 | Learning Rate: 0.010000000000000002
Step 1301 | Loss: 2.4947049617767334 | Learning Rate: 0.010000000000000002
Step 1401 | Loss: 2.3223023414611816 | Learning Rate: 0.010000000000000002
Step 1501 | Loss: 2.5442099571228027 | Learning Rate: 0.010000000000000002
Step

In [4]:
# Print five words sampled from `net`, one per line, using the same context length as the dataset.
net.sample(itos, block_size=block_size, num_samples=5)

neyuhranos.
ter.
kaah.
iva.
gulangker.
