In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

In [2]:
vocab_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Generator reserved for the train/test split performed later with random_split.
# It is deliberately not consumed here so the split does not depend on how many
# toy samples this cell draws.
g = torch.Generator().manual_seed(99)

# Toy input sequence (5 token ids) for the hand-written GRU cells below.
# Drawn from its own seeded generator so the sequence is the same after every
# Restart & Run All; previously it came from the unseeded global RNG.
X = torch.randint(0, vocab_size, (5,), generator=torch.Generator().manual_seed(99))
X_onehot = F.one_hot(X, num_classes=vocab_size).float()
device

'cuda'

In [3]:

# X -> Vec -> Y
# We encode the sequence X into a vector representation (Encoder),
# and later transform that vector representation into a new sequence Y (Decoder).


# Encoder

#for t in range(T):
	#hx[t] = f(hx[t-1], x[t])
#C = hx[T-1]

#Decoder
# hy[t] = f(hy[t-1], y[t-1], C)


# Encoder

#for t in range(T):
	#hx[t] = f(hx[t-1], x[t])
#C = hx[T-1]

In [90]:
hidden_size = 32

# Toy GRU parameters. Each pair is (input projection, recurrent projection);
# tuples are evaluated left to right, so the random draws happen in the order
# Wr, Ur, Wz, Uz, Wxh, Whh, V.

# reset gate
Wr, Ur = torch.randn(vocab_size, hidden_size), torch.randn(hidden_size, hidden_size)

# update gate
Wz, Uz = torch.randn(vocab_size, hidden_size), torch.randn(hidden_size, hidden_size)

# candidate state: x-to-h and h-to-h
Wxh, Whh = torch.randn(vocab_size, hidden_size), torch.randn(hidden_size, hidden_size)

# projection of the final hidden state into the context vector
V = torch.randn(hidden_size, hidden_size)


In [91]:
# One GRU step written out by hand for the first token of the toy sequence.
h = torch.zeros((1, hidden_size))
t = 0
x_t = X_onehot[t]

# Reset gate: decides how much of h[t-1] feeds the candidate state.
r = torch.sigmoid((x_t @ Wr) + (h @ Ur))
# Update gate: must use its own weights (Wz/Uz). Reusing the reset-gate weights
# would make z identical to r.
z = torch.sigmoid((x_t @ Wz) + (h @ Uz))

# Candidate state. If r is roughly zero the previous hidden state is ignored and
# only the projection of x[t] contributes. The tanh matches the encoder loop cell
# below and keeps the candidate bounded in [-1, 1].
candidate = torch.tanh((x_t @ Wxh) + (r * h) @ Whh)

# The update gate is a soft (not binary) decision: it interpolates between
# keeping the old state (z -> 1) and adopting the candidate (z -> 0).
h1 = z * h + (1 - z) * candidate


In [5]:
# Encoder: fold the whole one-hot sequence into a single hidden state, then
# project that state into the context vector C.
T, _ = X_onehot.shape
h = torch.zeros((1, hidden_size))

for t in range(T):
	x_t = X_onehot[t]
	# reset gate
	r = F.sigmoid((x_t @ Wr) + (h @ Ur))
	# update gate
	z = F.sigmoid((x_t @ Wz) + (h @ Uz))
	# candidate state, built from the reset-scaled previous state
	update = F.tanh((x_t @ Wxh) + (r * h) @ Whh)
	# interpolate between the old state and the candidate
	h = z * h + (1 - z) * update

C = F.tanh(h @ V)

In [3]:
import kagglehub
import pandas as pd

In [4]:
# Download the English/Spanish translation dataset through kagglehub; the returned
# value is used below as the directory that contains data.csv.
path = kagglehub.dataset_download("lonnieqin/englishspanish-translation-dataset")

Downloading from https://www.kaggle.com/api/v1/datasets/download/lonnieqin/englishspanish-translation-dataset?dataset_version_number=1...


100%|██████████| 2.72M/2.72M [00:01<00:00, 2.42MB/s]

Extracting files...





In [5]:
import os
from pathlib import Path
# Load the sentence pairs; the tokenizers and the dataset below read the
# "english" and "spanish" columns of this frame.
file =  Path(path) / "data.csv"
df = pd.read_csv(file)

In [6]:
df.head()

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [108]:
import re

class Tokenizer:
	"""Whitespace tokenizer whose vocabulary is built from one column of the global ``df``.

	The vocabulary is the four special tokens (PAD, EOS, SOS, UNK, in that order,
	so PAD has index 0) followed by every distinct token of the chosen column in
	sorted order.
	"""

	def __init__(self, target: str):
		"""Build the vocabulary for ``target``.

		Args:
			target: column of ``df`` to read, either "english" or "spanish".

		Raises:
			ValueError: if ``target`` is not one of the two supported columns.
		"""
		if target not in ["english", "spanish"]:
			raise ValueError("Target must be one of : ['english', 'spanish']")

		self.target = target

		# special tokens
		self.eos_token = "<EOS>"
		self.sos_token = "<SOS>"
		self.unk_token = "<UNK>"
		self.pad_token = "<PAD>"

		self.special_tokens = [
			self.pad_token,
			self.eos_token,
			self.sos_token,
			self.unk_token,
		]

		# vocab
		self.vocab = self.build_vocab(target, special_tokens=self.special_tokens)
		self.vocab_size = len(self.vocab)

		# mappings
		self.token_to_idx = {token: idx for idx, token in enumerate(self.vocab)}
		self.idx_to_token = {idx: token for idx, token in enumerate(self.vocab)}

	def tokenize(self, s: str) -> list[str]:
		"""Split ``s`` on whitespace, with '.', '!' and '?' detached as their own tokens."""
		return re.sub(r"([.!?])", r" \1", s).split()

	def build_vocab(self, target: str, special_tokens: list[str]):
		"""Return ``special_tokens`` followed by the sorted distinct tokens of ``df[target]``."""
		vocab = set()
		for s in df[target]:
			vocab.update(self.tokenize(s))
		return special_tokens + sorted(vocab)

	def encode(self, s: str) -> torch.Tensor:
		"""Return the token ids of ``s`` wrapped in SOS ... EOS.

		Tokens that are not in the vocabulary are mapped to the UNK index.
		"""
		unk_idx = self.token_to_idx[self.unk_token]
		ids = [self.token_to_idx[self.sos_token]]
		ids.extend(self.token_to_idx.get(token, unk_idx) for token in self.tokenize(s))
		ids.append(self.token_to_idx[self.eos_token])
		return torch.tensor(ids)

	def decode(self, indices: list[int]) -> str:
		"""Return the space-joined tokens for ``indices``.

		Accepts a list of ints or a 1-D tensor: each element goes through ``int()``,
		because tensor elements do not hash by value and would miss the dict lookup.
		Ids outside the vocabulary decode to the UNK token instead of raising KeyError.
		"""
		return " ".join(self.idx_to_token.get(int(idx), self.unk_token) for idx in indices)

In [109]:
# One tokenizer per language; each builds its own vocabulary from the matching df column.
en_tokenizer = Tokenizer(target="english")
es_tokenizer = Tokenizer(target="spanish")

In [110]:
# Round-trip smoke test inputs. "xdxd" is not in the Spanish vocabulary, so it
# exercises the <UNK> path (see the printed output of the next cell).
tokens_en = en_tokenizer.encode("Hello how are you today my friend?")
tokens_es = es_tokenizer.encode("hola como estas hoy amigo? xdxd")

In [111]:
# Show the ids next to their decoded form to check that encode/decode round-trip.
print(tokens_en, en_tokenizer.decode(tokens_en.tolist()))
print(tokens_es, es_tokenizer.decode(tokens_es.tolist()))

tensor([    2,  1741,  9602,  4231, 17364, 16057, 11459,  8643,   504,     1]) <SOS> Hello how are you today my friend ? <EOS>
tensor([    2, 18632, 10983, 16342, 18713,  7166,   536,     3,     1]) <SOS> hola como estas hoy amigo ? <UNK> <EOS>


In [233]:
class Encoder(nn.Module):
	"""GRU-style encoder that compresses a batch of token sequences into a context vector.

	All parameters are created on the default device; move the module with
	``.to(device)``. The recurrent state is allocated on whatever device the
	embeddings live on, so the module no longer depends on a global ``device``.
	"""

	def __init__(self, hidden_size, emb_dim, vocab_size):
		"""
		Args:
			hidden_size: size of the recurrent state and of the returned context vector.
			emb_dim: size of the token embeddings.
			vocab_size: number of rows in the embedding table.
		"""
		super().__init__()

		self.hidden_size = hidden_size
		self.vocab_size = vocab_size

		self.embeddings = nn.Embedding(vocab_size, emb_dim)

		# reset gate weights
		self.Wr = nn.Linear(emb_dim, hidden_size)
		self.Ur = nn.Linear(hidden_size, hidden_size)

		# update gate weights
		self.Wz = nn.Linear(emb_dim, hidden_size)
		self.Uz = nn.Linear(hidden_size, hidden_size)

		# candidate state weights (x-to-h and h-to-h)
		self.Wxh = nn.Linear(emb_dim, hidden_size)
		self.Whh = nn.Linear(hidden_size, hidden_size)

		# projection of the final hidden state into the context vector
		self.V = nn.Linear(hidden_size, hidden_size)

	def forward(self, X: torch.Tensor):
		"""Encode ``X`` of shape (B, T) token ids into a (B, hidden_size) context vector.

		NOTE(review): every position is consumed, including padding, so for a padded
		batch the state of a short sentence keeps being updated by PAD embeddings.
		"""
		Xemb = self.embeddings(X)

		B, T, _ = Xemb.shape
		# Same device and dtype as the embeddings, whatever device the module is on.
		h = Xemb.new_zeros((B, self.hidden_size))

		for t in range(T):
			x_t = Xemb[:, t]
			r = torch.sigmoid(self.Wr(x_t) + self.Ur(h))  # reset gate
			z = torch.sigmoid(self.Wz(x_t) + self.Uz(h))  # update gate
			update = torch.tanh(self.Wxh(x_t) + self.Whh(r * h))  # candidate state
			h = z * h + (1 - z) * update

		return torch.tanh(self.V(h))

In [230]:
def f():
	"""Shape check: one RNN step where a (1, h) state broadcasts against a (B, D) batch."""
	hidden_size, D, B = 5, 16, 4

	# Draw order matters for reproducibility under a fixed seed: Whh, Whx, h, Xt.
	Whh = torch.randn(hidden_size, hidden_size)
	Whx = torch.randn(D, hidden_size)
	h = torch.randn(1, hidden_size)
	Xt = torch.randn(B, D)

	input_term = Xt @ Whx      # (B, D) @ (D, h) -> (B, h)
	recurrent_term = h @ Whh   # (1, h) @ (h, h) -> (1, h), broadcast over the batch
	next_h = input_term + recurrent_term

	print(next_h)

f()

tensor([[  2.2114,   6.1889,   1.7093,  -1.8465,  -1.9907],
        [  7.2244,  -0.2579,  -6.9254,   7.9222, -15.2532],
        [ -2.5257,  -2.2603,  -2.4184,   4.4782,   5.5888],
        [ -1.7843,  -2.4352,   3.0125,   1.7526,  -0.9829]])


In [234]:
# Broadcasting check: the (3,) vector is added to every row of the (4, 3) matrix,
# and the integer arange is promoted to float by the addition (see the output).
torch.arange(12).view(4, 3) + torch.ones(3)

tensor([[ 1.,  2.,  3.],
        [ 4.,  5.,  6.],
        [ 7.,  8.,  9.],
        [10., 11., 12.]])

In [368]:
class Decoder(nn.Module):
	"""GRU-style decoder conditioned on a context vector, with a maxout output layer.

	All parameters are created on the default device; move the module with
	``.to(device)``. Outputs are created on the device of the inputs, so the
	module no longer depends on a global ``device``.
	"""

	def __init__(self, hidden_size, emb_dim, vocab_size, maxout_dim=64):
		"""
		Args:
			hidden_size: size of the recurrent state and of the context vector.
			emb_dim: size of the target token embeddings.
			vocab_size: number of target tokens (size of the output logits).
			maxout_dim: number of maxout units. The pre-activation has
				``2 * maxout_dim`` features that are reduced pairwise with max.
				Defaults to 64, the value that was previously hard-coded.
		"""
		super().__init__()

		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.maxout_dim = maxout_dim

		self.embeddings = nn.Embedding(vocab_size, emb_dim)
		# initial hidden state from the context vector
		self.V = nn.Linear(hidden_size, hidden_size, bias=False)

		# update gate
		self.Cz = nn.Linear(hidden_size, hidden_size, bias=False)
		self.Uz = nn.Linear(hidden_size, hidden_size, bias=False)
		self.Wz = nn.Linear(emb_dim, hidden_size, bias=False)

		# reset gate
		self.Cr = nn.Linear(hidden_size, hidden_size, bias=False)
		self.Ur = nn.Linear(hidden_size, hidden_size, bias=False)
		self.Wr = nn.Linear(emb_dim, hidden_size, bias=False)

		# candidate state
		self.U = nn.Linear(hidden_size, hidden_size, bias=False)
		self.C = nn.Linear(hidden_size, hidden_size, bias=False)
		self.W = nn.Linear(emb_dim, hidden_size, bias=False)

		# Output layer. The maxout pre-activation has 2 * maxout_dim features
		# (not 2 * vocab_size); G maps the maxout_dim units to vocabulary logits.
		# Kept close to what the paper does; might change to something simpler.
		self.Oh = nn.Linear(hidden_size, 2 * maxout_dim, bias=False)
		self.Oy = nn.Linear(emb_dim, 2 * maxout_dim, bias=False)
		self.Oc = nn.Linear(hidden_size, 2 * maxout_dim, bias=False)

		self.G = nn.Linear(maxout_dim, vocab_size, bias=False)

	def forward(self, c: torch.Tensor, Y: torch.Tensor):
		"""Teacher-forced decoding.

		Args:
			c: context vector of shape (B, hidden_size).
			Y: target token ids of shape (B, T); position t-1 is the input used to
				predict position t.

		Returns:
			Logits of shape (B, T-1, vocab_size), or (B, 0, vocab_size) when T <= 1.
		"""
		h = torch.tanh(self.V(c))

		Yemb = self.embeddings(Y)
		B, T, _ = Yemb.shape

		# The context does not change across time steps, so project it once.
		ctx_z, ctx_r, ctx_h, ctx_out = self.Cz(c), self.Cr(c), self.C(c), self.Oc(c)

		step_logits = []
		for t in range(1, T):
			y_prev = Yemb[:, t - 1]

			z = torch.sigmoid(self.Wz(y_prev) + self.Uz(h) + ctx_z)  # update gate
			r = torch.sigmoid(self.Wr(y_prev) + self.Ur(h) + ctx_r)  # reset gate

			# candidate update to h
			update = torch.tanh(self.W(y_prev) + r * (self.U(h) + ctx_h))
			h = z * h + (1 - z) * update

			# Maxout: pair up adjacent features and keep the larger one of each pair.
			s_ = self.Oh(h) + self.Oy(y_prev) + ctx_out
			s = s_.view(B, -1, 2).max(dim=-1).values

			step_logits.append(self.G(s))

		if not step_logits:
			return c.new_zeros((B, 0, self.vocab_size))
		return torch.stack(step_logits, dim=1)


In [370]:
# A single encoded sentence pair. This rebinds X, which earlier in the notebook
# held the toy index sequence.
X = en_tokenizer.encode("Hello how are you today my friend?")
Y = es_tokenizer.encode("hola como estas hoy amigo?")

In [371]:
from dataclasses import dataclass

@dataclass
class Config:
	"""Model and training hyper-parameters.

	Every attribute is annotated so that ``@dataclass`` actually registers it as a
	field; without annotations the decorator generates an ``__init__`` with no
	parameters. The defaults remain readable as class attributes
	(e.g. ``Config.batch_size``).
	"""
	hidden_size: int = 32
	emb_dim: int = 64 # 500 in paper
	# Vocabulary sizes are read from the tokenizers once, when the class is defined.
	encoder_vocab_size: int = len(en_tokenizer.vocab)
	decoder_vocab_size: int = len(es_tokenizer.vocab)
	batch_size: int = 32

In [None]:
class Model(nn.Module):
	"""Encoder-decoder pair: the encoder's context vector conditions the decoder."""

	def __init__(self, config: Config):
		"""Build both halves from ``config`` (hidden size, embedding size, vocab sizes)."""
		super().__init__()

		hidden, emb = config.hidden_size, config.emb_dim
		self.encoder = Encoder(hidden, emb, config.encoder_vocab_size)
		self.decoder = Decoder(hidden, emb, config.decoder_vocab_size)

	def forward(self, X: torch.Tensor, Y: torch.Tensor):
		"""Encode the source ids ``X`` and decode against the target ids ``Y``; returns the decoder output."""
		context = self.encoder(X)
		logits = self.decoder(context, Y)
		return logits

In [373]:
class EnglishSpanishDataset(Dataset):
	"""Sentence pairs from a DataFrame, returned as encoded (source, target) tensors."""

	def __init__(self, en_tokenizer: Tokenizer, es_tokenizer: Tokenizer, df: pd.DataFrame):
		"""
		Args:
			en_tokenizer: tokenizer for the source column (``en_tokenizer.target``).
			es_tokenizer: tokenizer for the target column (``es_tokenizer.target``).
			df: frame holding both columns.

		Raises:
			KeyError: if either column is missing from ``df``.
		"""
		super().__init__()
		self.df = df
		self.en_tokenizer = en_tokenizer
		self.es_tokenizer = es_tokenizer

		missing = {en_tokenizer.target, es_tokenizer.target} - set(df.columns)
		if missing:
			raise KeyError(f"Missing columns in df: {sorted(missing)}")

	def __getitem__(self, index):
		"""Return ``(encoded_source, encoded_target)`` for the row at position ``index``."""
		# Use the instance's own frame and tokenizers (not the notebook globals), and
		# index by position so a frame with a non-default index still works.
		src = self.df[self.en_tokenizer.target].iloc[index]
		target = self.df[self.es_tokenizer.target].iloc[index]
		return self.en_tokenizer.encode(src), self.es_tokenizer.encode(target)

	def __len__(self):
		"""Number of sentence pairs."""
		return len(self.df)


In [374]:
# 80/20 train/test split; the seeded generator g makes the split repeatable.
dataset = EnglishSpanishDataset(en_tokenizer, es_tokenizer, df)
train_dataset, test_dataset = random_split(dataset, [0.80, 0.20], generator=g)

In [375]:
class BatchProcess:
	"""Collate function that right-pads source and target sequences to the batch maximum."""

	def __init__(self, pad_idx_src, pad_idx_dst):
		"""
		Args:
			pad_idx_src: padding id appended to source sequences.
			pad_idx_dst: padding id appended to target sequences.
		"""
		self.pad_idx_src = pad_idx_src
		self.pad_idx_dst = pad_idx_dst

	def __call__(self, batch):
		"""Turn a list of ``(x, y)`` 1-D tensors into two padded 2-D tensors.

		Returns:
			``(X, Y)`` with shapes (B, max_len_x) and (B, max_len_y).
		"""
		sources, targets = zip(*batch)
		# pad_sequence replaces the manual "find max length, then concatenate
		# padding" passes and keeps the dtype of the input tensors.
		X = torch.nn.utils.rnn.pad_sequence(
			list(sources), batch_first=True, padding_value=self.pad_idx_src
		)
		Y = torch.nn.utils.rnn.pad_sequence(
			list(targets), batch_first=True, padding_value=self.pad_idx_dst
		)
		return X, Y


In [376]:
batch_process = BatchProcess(
	# Each language has its own tokenizer and vocabulary, so the PAD index is
	# looked up per language. Both currently put <PAD> first (index 0), but the
	# collate function should not rely on that.
	pad_idx_src=en_tokenizer.token_to_idx[en_tokenizer.pad_token],
	pad_idx_dst=es_tokenizer.token_to_idx[es_tokenizer.pad_token],
)
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True, collate_fn=batch_process)
# Evaluation data is not shuffled: the order does not affect aggregate metrics,
# and a fixed order keeps per-batch inspection repeatable.
test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False, collate_fn=batch_process)

In [377]:
# The Config class itself (not an instance) is passed; Model reads its attributes.
# .to(device) moves every parameter to the selected device.
model = Model(Config).to(device)

In [387]:
# FORWARD PASS TEST
# With random weights the loss should be close to -log(1 / vocab_size).
# The padding index must come from the *target* (Spanish) tokenizer, because it
# masks positions of Y.
pad_idx = es_tokenizer.token_to_idx[es_tokenizer.pad_token]

X, Y = next(iter(train_loader))

print(f"SRC = {X[:1]}")
print(f"TARGET = {Y[:1]}")
print(f"Expected initial LOSS -> {-torch.log(torch.tensor(1/Config.decoder_vocab_size)):.4f}")
X = X.to(device)
Y = Y.to(device)

model.eval()
with torch.no_grad():
	# Call the module rather than .forward so registered hooks run as well.
	y_logits = model(X, Y)
	B, T, Y_VOCAB = y_logits.shape

	# cross_entropy = log_softmax + nll_loss. It expects (N, C) logits and (N,)
	# targets, so batch and time are flattened together. Targets skip <SOS>.
	L = F.cross_entropy(
		y_logits.reshape(B * T, Y_VOCAB),
		Y[:, 1:].reshape(-1),
		ignore_index=pad_idx,
	)
model.train()

print(f"Initial Loss : {L.item():.4f}" )

SRC = tensor([[    2,   525, 16050,  9401,  9304, 10138,  3653,  6443,   175,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])
TARGET = tensor([[    2,  4869, 14903, 32269, 15816, 30071, 10665,   256,     1,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]])
Expected initial LOSS -> 10.3841
Initial Loss : 10.4543


In [395]:
# Overfitting one batch as a sanity check: the loss should go down fast
def overfit_one_batch(steps=100, lr=0.001, log_every=10):
	"""Train the global ``model`` repeatedly on a single batch and print the loss.

	Args:
		steps: number of optimizer steps.
		lr: AdamW learning rate.
		log_every: print the loss every this many steps.
	"""
	# Look the target padding index up here instead of relying on a ``pad_idx``
	# global left behind by another cell.
	target_pad_idx = es_tokenizer.token_to_idx[es_tokenizer.pad_token]

	X, Y = next(iter(train_loader))
	X = X.to(device)
	Y = Y.to(device)

	optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0)
	model.train()

	for i in range(steps):
		optimizer.zero_grad()

		y_logits = model(X, Y)

		# cross_entropy = log_softmax + nll_loss. Batch and time are flattened
		# together; the targets skip <SOS> and padded positions are ignored.
		L = F.cross_entropy(
			y_logits.reshape(-1, y_logits.size(-1)),
			Y[:, 1:].reshape(-1),
			ignore_index=target_pad_idx,
		)

		if (i + 1) % log_every == 0:
			print(f"Loss: {L.item():.4f}")

		L.backward()

		# clip gradients
		torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

		optimizer.step()

In [396]:
overfit_one_batch()

Loss: 9.5172
Loss: 7.9793
Loss: 5.3908
Loss: 3.5308
Loss: 2.9263
Loss: 2.4458
Loss: 1.9899
Loss: 1.5775
Loss: 1.2410
Loss: 1.0029


In [23]:
def train(model: Model, epochs: int, lr: float, max_steps=1000, log_step=100):
	"""Train ``model`` on ``train_loader`` with AdamW and gradient clipping.

	Args:
		model: the encoder-decoder model to optimize.
		epochs: number of passes over ``train_loader`` (previously ignored).
		lr: AdamW learning rate.
		max_steps: stop after this many optimizer steps in total; ``None`` removes
			the limit. Defaults to 1000, the limit that was previously hard-coded.
		log_step: record the batch loss every this many steps.

	Returns:
		List of recorded batch losses (floats), one per ``log_step`` steps.
	"""
	# Padded target positions must not contribute to the loss.
	target_pad_idx = es_tokenizer.token_to_idx[es_tokenizer.pad_token]

	train_losses = []
	optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0)
	step = 0
	model.train()

	for _ in range(epochs):
		for X, Y in train_loader:
			X = X.to(device)
			Y = Y.to(device)

			optimizer.zero_grad()

			y_logits = model(X, Y)

			# Loss over every sequence in the batch. The logits are (B, T-1, V), so
			# batch and time are flattened together and compared with the targets
			# shifted by one (skipping <SOS>). The previous version compared against
			# the first sample's targets only.
			L = F.cross_entropy(
				y_logits.reshape(-1, y_logits.size(-1)),
				Y[:, 1:].reshape(-1),
				ignore_index=target_pad_idx,
			)

			# backprop
			L.backward()

			# clip gradients
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

			# update params
			optimizer.step()
			step += 1

			# record loss
			if step % log_step == 0:
				train_losses.append(L.item())

			if max_steps is not None and step >= max_steps:
				return train_losses

	return train_losses


In [24]:
# Run training; train() returns the list of losses it recorded along the way.
train_losses = train(
	model,
	epochs=1,
	lr=0.001
)