In [10]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
else:
    print("⚠️ No GPU found. Go to Runtime > Change runtime type > Hardware accelerator > GPU")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
CUDA version: 12.6


In [11]:
from google.colab import files
import os

print("Please upload your GNOME dataset files:")
print("1. gnome-en-ne.en (English sentences)")
print("2. gnome-en-ne.ne (Nepali sentences)")
 ?

uploaded = files.upload()

# Verify files
for filename in uploaded.keys():
    print(f"✓ Uploaded: {filename} ({len(uploaded[filename])} bytes)")

Please upload your GNOME dataset files:
1. gnome-en-ne.en (English sentences)
2. gnome-en-ne.ne (Nepali sentences)

Click 'Choose Files' button below:


KeyboardInterrupt: 

In [12]:
def load_parallel_data(en_file, ne_file, max_samples=None):
    """Load English and Nepali parallel sentences"""

    # Read English sentences
    with open(en_file, 'r', encoding='utf-8') as f:
        english_sentences = [line.strip() for line in f if line.strip()]

    # Read Nepali sentences
    with open(ne_file, 'r', encoding='utf-8') as f:
        nepali_sentences = [line.strip() for line in f if line.strip()]

    # Ensure same number of sentences
    min_len = min(len(english_sentences), len(nepali_sentences))
    english_sentences = english_sentences[:min_len]
    nepali_sentences = nepali_sentences[:min_len]

    # Limit samples if specified
    if max_samples:
        english_sentences = english_sentences[:max_samples]
        nepali_sentences = nepali_sentences[:max_samples]

    return english_sentences, nepali_sentences

# Load data
print("Loading data...")
english_sentences, nepali_sentences = load_parallel_data(
    'gnome-en-ne.en',
    'gnome-en-ne.ne',
    max_samples=373800  # Start with 10K for faster training, increase later
)

print(f"\n✓ Loaded {len(english_sentences)} parallel sentences")
print("\nFirst 5 examples:")
print("="*80)
for i in range(5):
    print(f"\nExample {i+1}:")
    print(f"EN: {english_sentences[i]}")
    print(f"NE: {nepali_sentences[i]}")
print("="*80)

Loading data...

✓ Loaded 373800 parallel sentences

First 5 examples:

Example 1:
EN: GNOME
NE: जिनोम

Example 2:
EN: Default GNOME Theme
NE: पूर्वनिर्धारित जिनोम विषयवस्तु

Example 3:
EN: OK
NE: ठीक छ

Example 4:
EN: Art
NE: कला

Example 5:
EN: Camera
NE: क्यामेरा


In [13]:

import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model #256
        self.num_heads = num_heads #8
        self.d_k = d_model // num_heads #32 EACH HEAD

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

        #JUST THE FORMULA

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        # SPLIT THE HEADS FOR THE BATCH ( 2, 100 , 32, 8)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        # COMBINE

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))  # PUT TO THE OUTPUT , TO CHANGE IT WITH SLIGHTLY TRAINED DATA
        return output

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
#JUST FORWARD
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x))) # LINEAR -> RELU ->LINEAR

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super().__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        # encoding formula


        pe[:, 0::2] = torch.sin(position * div_term) # sinfor the even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)# cos for the odd dimensions

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)] #return x+f(x)

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads,
                 num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask.to(tgt.device)
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

print("✓ Transformer model defined!")

✓ All dependencies ready!
✓ Transformer model defined!


In [14]:
from collections import Counter
import json

class SimpleTokenizer:
    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.vocab_size = 4

    def fit(self, sentences, max_vocab_size=10000):
        """Build vocabulary from sentences"""
        word_freq = Counter()
        for sentence in sentences:
            words = sentence.lower().split()
            word_freq.update(words)

        most_common = word_freq.most_common(max_vocab_size - 4)

        for word, _ in most_common:
            if word not in self.word2idx:
                self.word2idx[word] = self.vocab_size
                self.idx2word[self.vocab_size] = word
                self.vocab_size += 1

        print(f"  Built vocabulary: {self.vocab_size} words")

    def encode(self, sentence, max_length=50):
        """Convert sentence to token IDs"""
        words = sentence.lower().split()
        tokens = [self.word2idx.get(word, self.word2idx["<UNK>"]) for word in words]
        tokens = [self.word2idx["<SOS>"]] + tokens + [self.word2idx["<EOS>"]]

        if len(tokens) < max_length:
            tokens += [self.word2idx["<PAD>"]] * (max_length - len(tokens))
        else:
            tokens = tokens[:max_length-1] + [self.word2idx["<EOS>"]]

        return tokens

    def decode(self, tokens):
        """Convert token IDs back to sentence"""
        words = []
        for token in tokens:
            if token == self.word2idx["<EOS>"]:
                break
            if token not in [self.word2idx["<PAD>"], self.word2idx["<SOS>"]]:
                words.append(self.idx2word.get(token, "<UNK>"))
        return " ".join(words)

print("✓ Tokenizer defined!")

✓ Tokenizer defined!


In [15]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, src_tokenizer, tgt_tokenizer, max_length=50):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src = self.src_tokenizer.encode(self.src_sentences[idx], self.max_length)
        tgt = self.tgt_tokenizer.encode(self.tgt_sentences[idx], self.max_length)
        return torch.tensor(src), torch.tensor(tgt)

print("✓ Dataset class defined!")

✓ Dataset class defined!


In [16]:
print("Preparing data for training...")

# Split into train/validation (90/10)
split_idx = int(0.9 * len(english_sentences))

train_en = english_sentences[:split_idx]
train_ne = nepali_sentences[:split_idx]
val_en = english_sentences[split_idx:]
val_ne = nepali_sentences[split_idx:]

print(f"Training samples: {len(train_en)}")
print(f"Validation samples: {len(val_en)}")

# Build tokenizers
print("\nBuilding English tokenizer...")
src_tokenizer = SimpleTokenizer()
src_tokenizer.fit(train_en, max_vocab_size=8000)

print("Building Nepali tokenizer...")
tgt_tokenizer = SimpleTokenizer()
tgt_tokenizer.fit(train_ne, max_vocab_size=8000)

print(f"\n✓ English vocabulary size: {src_tokenizer.vocab_size}")
print(f"✓ Nepali vocabulary size: {tgt_tokenizer.vocab_size}")

# Create datasets
max_length = 50
train_dataset = TranslationDataset(train_en, train_ne, src_tokenizer, tgt_tokenizer, max_length)
val_dataset = TranslationDataset(val_en, val_ne, src_tokenizer, tgt_tokenizer, max_length)

# Create dataloaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"\n✓ Created dataloaders with batch size: {batch_size}")
print(f"✓ Training batches: {len(train_loader)}")
print(f"✓ Validation batches: {len(val_loader)}")

Preparing data for training...
Training samples: 336420
Validation samples: 37380

Building English tokenizer...
  Built vocabulary: 8000 words
Building Nepali tokenizer...
  Built vocabulary: 8000 words

✓ English vocabulary size: 8000
✓ Nepali vocabulary size: 8000

✓ Created dataloaders with batch size: 32
✓ Training batches: 10514
✓ Validation batches: 1169


In [17]:
d_model = 256          # Embedding dimension
num_heads = 8          # Number of attention heads
num_layers = 4         # Number of encoder/decoder layers
d_ff = 1024           # Feed-forward dimension
dropout = 0.1
learning_rate = 0.0001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create model
model = Transformer(
    src_vocab_size=src_tokenizer.vocab_size,
    tgt_vocab_size=tgt_tokenizer.vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=max_length,
    dropout=dropout
).to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n✓ Model created!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print("✓ Optimizer and loss function ready!")


Using device: cuda

✓ Model created!
Total parameters: 13,524,800
Trainable parameters: 13,524,800
✓ Optimizer and loss function ready!


In [18]:
import torch.optim as optim
from tqdm.notebook import tqdm

def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_input)

        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
            total_loss += loss.item()

    return total_loss / len(dataloader)

def translate(model, sentence, src_tokenizer, tgt_tokenizer, device, max_length=50):
    """Translate a single sentence"""
    model.eval()

    src_tokens = src_tokenizer.encode(sentence, max_length)
    src = torch.tensor([src_tokens]).to(device)

    tgt_tokens = [tgt_tokenizer.word2idx["<SOS>"]]

    for _ in range(max_length):
        tgt = torch.tensor([tgt_tokens]).to(device)

        with torch.no_grad():
            output = model(src, tgt)

        next_token = output[0, -1, :].argmax().item()
        tgt_tokens.append(next_token)

        if next_token == tgt_tokenizer.word2idx["<EOS>"]:
            break

    translation = tgt_tokenizer.decode(tgt_tokens)
    return translation

print("✓ Training functions defined!")


✓ Training functions defined!


In [19]:
num_epochs = 20
train_losses = []
val_losses = []

print("Starting training...")
print("="*80)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    train_losses.append(train_loss)

    # Validate
    val_loss = evaluate(model, val_loader, criterion, device)
    val_losses.append(val_loss)

    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Test translation every 5 epochs
    if (epoch + 1) % 5 == 0:
        test_sentences = [
            "Hello, how are you?",
            "Thank you very much.",
            "Good morning."
        ]

        print("\nSample translations:")
        for sent in test_sentences:
            translation = translate(model, sent, src_tokenizer, tgt_tokenizer, device)
            print(f"  EN: {sent}")
            print(f"  NE: {translation}")

print("\n" + "="*80)
print("✓ Training complete!")




Starting training...

Epoch 1/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 3.2058 | Val Loss: 2.5797

Epoch 2/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 1.4103 | Val Loss: 2.3301

Epoch 3/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.8838 | Val Loss: 2.2843

Epoch 4/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.6618 | Val Loss: 2.3163

Epoch 5/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.5427 | Val Loss: 2.3688

Sample translations:
  EN: Hello, how are you?
  NE: <UNK> कसरी <UNK> <UNK> <UNK>
  EN: Thank you very much.
  NE: तपाईँले धेरै धेरै लामो धेरै लामो !
  EN: Good morning.
  NE: असल <UNK>

Epoch 6/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.4691 | Val Loss: 2.4336

Epoch 7/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.4201 | Val Loss: 2.4828

Epoch 8/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.3854 | Val Loss: 2.5407

Epoch 9/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.3589 | Val Loss: 2.5589

Epoch 10/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.3379 | Val Loss: 2.5591

Sample translations:
  EN: Hello, how are you?
  NE: <UNK> <UNK> <UNK> <UNK>
  EN: Thank you very much.
  NE: तपाईँले धेरै कठीन <UNK> <UNK> from <UNK> <UNK> you <UNK>
  EN: Good morning.
  NE: राम्रो <UNK>

Epoch 11/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.3210 | Val Loss: 2.6292

Epoch 12/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.3074 | Val Loss: 2.6417

Epoch 13/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2964 | Val Loss: 2.6938

Epoch 14/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2853 | Val Loss: 2.7023

Epoch 15/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2776 | Val Loss: 2.7137

Sample translations:
  EN: Hello, how are you?
  NE: कसरी <UNK> <UNK>
  EN: Thank you very much.
  NE: तपाईँले <UNK> <UNK> धेरै <UNK>
  EN: Good morning.
  NE: असल <UNK>

Epoch 16/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2701 | Val Loss: 2.7446

Epoch 17/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2642 | Val Loss: 2.7821

Epoch 18/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2574 | Val Loss: 2.8006

Epoch 19/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2529 | Val Loss: 2.8058

Epoch 20/20


Training:   0%|          | 0/10514 [00:00<?, ?it/s]

Train Loss: 0.2475 | Val Loss: 2.8489

Sample translations:
  EN: Hello, how are you?
  NE: कसरी प्रकारको <UNK> <UNK>
  EN: Thank you very much.
  NE: तपाईँले धेरै ठूलो <UNK> !
  EN: Good morning.
  NE: राम्रो <UNK>

✓ Training complete!
