<a href="https://colab.research.google.com/github/dominiksakic/zero_to_hero/blob/main/adv_01_dialated_conv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal
- Implement dilated convolutions
- Paper (WaveNet): https://arxiv.org/pdf/1609.03499

In [1]:
#BOILERPLATE CONVNET
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2025-07-07 23:16:42--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2025-07-07 23:16:42 (41.2 MB/s) - ‘names.txt’ saved [228145/228145]



In [42]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
%matplotlib inline

In [43]:
# read one name per line; the context manager closes the file handle right away
with open('names.txt', 'r', encoding='utf-8') as f:
  words = f.read().splitlines()

# build vocab and mappings to/from integers
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}  # characters are numbered from 1 ...
stoi['.'] = 0                                   # ... index 0 is reserved for the '.' token
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)


# build the dataset
block_size = 8  # context length: number of previous characters used to predict the next one

def build_dataset(words):
  """
  Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and each of its characters becomes one
  example whose input is the `block_size` preceding character indices
  (left-padded with 0). Prints and returns the two tensors X and Y.
  """
  inputs, targets = [], []
  for word in words:
    window = [0] * block_size  # start from an all-padding context
    for target in (stoi[ch] for ch in word + '.'):
      inputs.append(window)
      targets.append(target)
      window = [*window[1:], target]  # slide the context one step to the right

  X, Y = torch.tensor(inputs), torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

# Deterministic shuffle of the words, followed by an 80% / 10% / 10% split.
import random
random.seed(42)
random.shuffle(words)

# split points, expressed as word counts
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])       # training split
Xdev, Ydev = build_dataset(words[n1:n2])   # dev / validation split
Xte, Yte = build_dataset(words[n2:])       # test split

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [48]:
class EmbeddingWithReshape(nn.Module):
  """
  Token embedding that emits channels-first activations for nn.Conv1d.

  The (B, T, C) embedding output is transposed to (B, C, T). A plain view
  would not do: it only reinterprets the existing memory layout without
  moving any data, so it would not actually swap the two axes.
  Useful link: https://blog.ezyang.com/2019/05/pytorch-internals/
  """
  def __init__(self, vocab_size, n_embd):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, n_embd)

  def forward(self, x):
    # x: (B, T) integer token indices
    emb = self.embedding(x)     # (B, T, C)
    return emb.transpose(1, 2)  # (B, C, T)

In [49]:
n_embd = 10     # embedding dimension per character
n_hidden = 200  # channels of every convolutional layer
# block_size is defined once, next to build_dataset, and reused here so the two cannot drift apart

# seed torch so weight initialisation (and the random draws that follow) are reproducible
torch.manual_seed(42)

# Three kernel_size=2 convolutions shrink the time axis by one step each (8 -> 7 -> 6 -> 5);
# the remaining steps are average-pooled into a single feature vector per example.
model = nn.Sequential(
    EmbeddingWithReshape(vocab_size, n_embd),
    nn.Conv1d(n_embd, n_hidden, kernel_size=2),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),

    nn.Conv1d(n_hidden, n_hidden, kernel_size=2),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),

    nn.Conv1d(n_hidden, n_hidden, kernel_size=2),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),

    nn.AdaptiveAvgPool1d(1),
    nn.Flatten(),
    nn.Linear(n_hidden, vocab_size),
)

# Shape walk-through on a dummy batch of random tokens.
x = torch.randint(0, vocab_size, (32, block_size))
print(f"Input shape: {x.shape}")

# Probe in eval mode and without autograd: in train mode this forward pass would fold
# the random batch into the BatchNorm running statistics before training has started.
model.eval()
with torch.no_grad():
    for layer in model:
        x = layer(x)
        print(f"{layer.__class__.__name__}: {x.shape}")
model.train()  # back to the default mode expected by the training loop

Input shape: torch.Size([32, 8])
EmbeddingWithReshape: torch.Size([32, 10, 8])
Conv1d: torch.Size([32, 200, 7])
BatchNorm1d: torch.Size([32, 200, 7])
Tanh: torch.Size([32, 200, 7])
Conv1d: torch.Size([32, 200, 6])
BatchNorm1d: torch.Size([32, 200, 6])
Tanh: torch.Size([32, 200, 6])
Conv1d: torch.Size([32, 200, 5])
BatchNorm1d: torch.Size([32, 200, 5])
Tanh: torch.Size([32, 200, 5])
AdaptiveAvgPool1d: torch.Size([32, 200, 1])
Flatten: torch.Size([32, 200])
Linear: torch.Size([32, 27])


In [50]:
# Materialise the parameter list once; the training loop iterates over it on every step.
parameters = [p for p in model.parameters()]
print(f"Parameters: {sum(p.numel() for p in parameters)}")

Parameters: 171497


In [None]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per step
ud = []     # not populated by this loop

# split_loss() leaves the model in eval mode. Force train mode here so that re-running
# this cell still normalises with batch statistics and keeps updating the running ones.
model.train()

for i in range(max_steps):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,))
  Xb, Yb = Xtr[ix], Ytr[ix]

  # forward pass
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb)

  # backward pass
  for p in parameters:
    p.grad = None

  loss.backward()

  # update: plain SGD with a single step decay of the learning rate
  lr = 0.1 if i < 150000 else 0.01
  with torch.no_grad():  # in-place update outside autograd instead of going through .data
    for p in parameters:
      p -= lr * p.grad

  # track stats
  if i % 10000 == 0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

      0/ 200000: 3.3550


In [None]:
# Smooth the noisy per-step log10 loss by averaging consecutive windows of 1000 steps.
# The ragged tail is dropped first, so the reshape also works when training was
# interrupted and len(lossi) is not a multiple of the window size.
loss_window = 1000
n_windows = len(lossi) // loss_window
plt.plot(torch.tensor(lossi[:n_windows * loss_window]).view(n_windows, loss_window).mean(1))
plt.xlabel(f'training step (x{loss_window})')
plt.ylabel('mean log10 loss');

In [None]:
@torch.no_grad()
def split_loss(split):
  """
  Print the cross-entropy of `model` over one complete dataset split.

  split: 'train', 'val' or 'test'; any other key raises KeyError.
  Side effect: switches `model` to eval mode and leaves it there.
  """
  model.eval()
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]

  # the whole split goes through the model in a single forward pass
  print(split, F.cross_entropy(model(inputs), targets).item())

# report the loss on the training and validation splits (the test split stays untouched)
for split_name in ('train', 'val'):
  split_loss(split_name)

In [None]:
# sample from the model
# Sampling feeds batches of size 1, so BatchNorm has to normalise with its running
# statistics. Switch to eval mode explicitly instead of relying on split_loss() having
# been run earlier, and skip autograd bookkeeping since nothing is trained here.
model.eval()
with torch.no_grad():
  for _ in range(20):

    out = []
    context = [0] * block_size  # start from an all-'.' context
    while True:
      logits = model(torch.tensor([context]))
      probs = F.softmax(logits, dim=1)
      # sample from the distribution
      ix = torch.multinomial(probs, num_samples=1).item()
      # shift the context window and track the samples
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:  # index 0 is the '.' end-of-word token
        break

    print(''.join(itos[i] for i in out))

In [None]:
# BOILERPLATE END