In [1]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot as plt 

words = open('data/names.txt', 'r').read().splitlines()
print(words[:8])
print(len(words))

list_words = sorted(list(set(''.join(words))))

stoi =  {s:i+1 for i,s in enumerate(list_words)}
stoi['.'] = 0 
itos = {i:s for i , s in stoi.items()}


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
32033


In [2]:
block_size = 3

def build_dataset(words):
    X , Y = [], []
    for w in words:
        context = [0] *block_size
        for ch in w + '.':
            xi = stoi[ch]
            X.append(context)
            Y.append(xi)
            context = context[1:] + [xi]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y 

In [3]:
import random 
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))


In [4]:

print(f"train dataset - {n1}, validataion dataset - {n2-n1}")
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

train dataset - 25626, validataion dataset - 3203
torch.Size([182590, 3]) torch.Size([182590])
torch.Size([22775, 3]) torch.Size([22775])
torch.Size([22781, 3]) torch.Size([22781])


In [5]:

#model architecture 

block_size = 3
embedding_dim = 10
w1_neurons = 200

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,embedding_dim  ),generator=g)
W1 = torch.randn((embedding_dim * block_size,w1_neurons),generator=g)
b1 = torch.randn(w1_neurons, generator=g)

W2 = torch.randn((w1_neurons,27), generator = g )
b2 = torch.randn(27,generator =g)

parameters = [C,W1, b1,W2,b2]

for p in parameters:
    p.requires_grad = True


#number of paramerters 
number_of_paramerters = sum(p.nelement() for p in parameters)
print(f"Number of paramerters: {number_of_paramerters}")


Number of paramerters: 11897


# 1. Random Mini Batch

In [6]:
for i in range(200000):
    ix = torch.randint(0,Xtr.shape[0],(32,))

    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1,30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits,Ytr[ix])

    for p in parameters:
        p.grad = None 
    loss.backward()

    p.data += -0.1 * p.grad

print(loss)
    


tensor(20.0099, grad_fn=<NllLossBackward0>)


# 2. Sequential Sampling

In [7]:
batch_size = 32
for _ in range(10):
    for i in range(0, Xtr.shape[0], batch_size):
        ix = torch.arange(i, min(i + batch_size, Xtr.shape[0]))

        emb = C[Xtr[ix]]
        h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        for p in parameters:
            p.grad = None
        loss.backward()

        p.data += -0.1 * p.grad

    print(loss)


tensor(21.5919, grad_fn=<NllLossBackward0>)
tensor(21.5911, grad_fn=<NllLossBackward0>)
tensor(21.5910, grad_fn=<NllLossBackward0>)
tensor(21.5909, grad_fn=<NllLossBackward0>)
tensor(21.5909, grad_fn=<NllLossBackward0>)
tensor(21.5909, grad_fn=<NllLossBackward0>)
tensor(21.5908, grad_fn=<NllLossBackward0>)
tensor(21.5908, grad_fn=<NllLossBackward0>)
tensor(21.5908, grad_fn=<NllLossBackward0>)
tensor(21.5908, grad_fn=<NllLossBackward0>)


# 3. Curriculum Learning

In [8]:
# Example: Sort indices by a predefined difficulty (e.g., loss value)
for _ in range(10):
    difficulty = torch.rand(Xtr.shape[0])  # Replace with actual metric
    sorted_indices = torch.argsort(difficulty)

    batch_size = 32
    for i in range(0, Xtr.shape[0], batch_size):
        ix = sorted_indices[i:i + batch_size]

        emb = C[Xtr[ix]]
        h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        for p in parameters:
            p.grad = None
        loss.backward()

        p.data += -0.1 * p.grad

    print(loss)


tensor(18.8380, grad_fn=<NllLossBackward0>)
tensor(22.8129, grad_fn=<NllLossBackward0>)
tensor(19.7566, grad_fn=<NllLossBackward0>)
tensor(17.9392, grad_fn=<NllLossBackward0>)
tensor(24.6213, grad_fn=<NllLossBackward0>)
tensor(22.0555, grad_fn=<NllLossBackward0>)
tensor(17.3176, grad_fn=<NllLossBackward0>)
tensor(18.2183, grad_fn=<NllLossBackward0>)
tensor(18.2400, grad_fn=<NllLossBackward0>)
tensor(17.1507, grad_fn=<NllLossBackward0>)


# 4. Balanced Sampling 

In [9]:
from torch.utils.data import DataLoader, WeightedRandomSampler

for _ in range(10):
    class_counts = torch.bincount(Ytr)
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[Ytr]

    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=batch_size, replacement=True)
    dataloader = DataLoader(torch.arange(Xtr.shape[0]), batch_size=batch_size, sampler=sampler)

    for ix in dataloader:
        emb = C[Xtr[ix]]
        h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        for p in parameters:
            p.grad = None
        loss.backward()

        p.data += -0.1 * p.grad

    print(loss)


tensor(19.2795, grad_fn=<NllLossBackward0>)
tensor(25.0091, grad_fn=<NllLossBackward0>)
tensor(25.2921, grad_fn=<NllLossBackward0>)
tensor(24.2747, grad_fn=<NllLossBackward0>)
tensor(18.7885, grad_fn=<NllLossBackward0>)
tensor(24.0624, grad_fn=<NllLossBackward0>)
tensor(25.0910, grad_fn=<NllLossBackward0>)
tensor(25.0412, grad_fn=<NllLossBackward0>)
tensor(27.1798, grad_fn=<NllLossBackward0>)
tensor(22.0716, grad_fn=<NllLossBackward0>)


# 5. Clustered Mini-Batching


In [18]:
from scikit-learn.cluster import KMeans

# Cluster data into `k` groups
k = 10
kmeans = KMeans(n_clusters=k).fit(Xtr)
clusters = kmeans.labels_

batch_size = 32
for cluster_id in range(k):
    cluster_indices = torch.where(torch.tensor(clusters) == cluster_id)[0]
    for i in range(0, len(cluster_indices), batch_size):
        ix = cluster_indices[i:i + batch_size]

        emb = C[Xtr[ix]]
        h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        for p in parameters:
            p.grad = None
        loss.backward()

        p.data += -0.1 * p.grad

print(loss)


SyntaxError: invalid syntax (2420053499.py, line 1)

In [19]:
from tabulate import tabulate

data = [
    ["Random Sampling", "Randomly selects samples from the training data to form mini-batches", "High", "General, large datasets", "Improves generalization, avoids bias", "Gradient fluctuations"],
    ["Sequential Sampling", "Selects samples in order from the training data", "None", "Time-series, ordered data", "Simpler implementation", "Overfitting, less generalization"],
    ["Stratified Sampling", "Ensures each mini-batch contains samples from all classes, maintaining class distribution", "Moderate", "Imbalanced datasets", "Maintains class balance", "Higher computational cost"],
    ["Adaptive Mini-Batching", "Dynamically adjusts mini-batch composition based on model performance", "Dynamic", "Focus on harder examples", "Faster convergence", "More computational overhead"],
    ["Clustered Mini-Batching", "Groups similar samples together in mini-batches", "None", "Naturally clustered data", "Improves clustering tasks", "Requires clustering pre-process"],
    ["Curriculum Learning", "Progressively increases the difficulty of samples in mini-batches", "Progressive", "Complex tasks", "Faster, stable learning", "Defining difficulty is task-specific"],
    ["Balanced Mini-Batching", "Ensures equal representation of all classes in each mini-batch", "Moderate", "Tasks needing balance in features or labels", "Reduces bias, faster convergence", "Higher computation for balance"],
    ["Streaming Mini-Batching", "Processes data in real-time as it becomes available", "Dynamic", "Real-time or large datasets", "Handles large data", "Sensitive to data drift"]
]

headers = ["Type", "Description", "Randomness", "Use Case", "Pros", "Cons"]

print(tabulate(data, headers=headers, tablefmt="grid"))

+-------------------------+-------------------------------------------------------------------------------------------+--------------+---------------------------------------------+--------------------------------------+--------------------------------------+
| Type                    | Description                                                                               | Randomness   | Use Case                                    | Pros                                 | Cons                                 |
| Random Sampling         | Randomly selects samples from the training data to form mini-batches                      | High         | General, large datasets                     | Improves generalization, avoids bias | Gradient fluctuations                |
+-------------------------+-------------------------------------------------------------------------------------------+--------------+---------------------------------------------+--------------------------------------+----