Q2. 2 Seq2Seq Transformer model En-Fr Fr-En

Load your two files
Tokenizers: spaCy (en_core_web_sm, fr_core_news_sm)
Build vocabs
Create a Dataset + collate_fn
Train two models: EN→FR and FR→EN (same architecture, just swap source/target)
Do round-trip: novel EN → FR (model1), then FR → EN (model2).
Read pairs (line-by-line EN–FR). Make word lists (tokenize) and a dictionary (vocab) for EN and FR. Turn words into numbers (tensor sequences). Build the same Transformer twice (same architecture). Teach EN➜FR with (EN as input, FR as target). Teach FR➜EN with (FR as input, EN as target). Try a new English sentence, translate to French, then back to English.

In [45]:
import io, math, random
import torch
import torch.nn as nn
import spacy

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DATA

In [47]:
EN_FILE = "/content/english.txt"
FR_FILE = "/content/french.txt"

In [48]:
MAX_VOCAB = 20000     # cap vocab size (used for both EN & FR)
MAX_LEN   = 64       # truncate long sentences
BATCH_SIZE= 128
EPOCHS    = 5        # used for both EN→FR and FR→EN
LR        = 2e-4

D_MODEL   = 256      # model width
NHEAD     = 4        # attention heads
LAYERS    = 3        # encoder & decoder layers
VALID_RATIO  = 0.02
MAX_VOCAB_EN = MAX_VOCAB_FR = MAX_VOCAB
ENC_LAYERS   = DEC_LAYERS   = LAYERS
DIM_FF       = 4 * D_MODEL
DROPOUT      = 0.1

**# 2) Read paired data from two files, keep alignment**

In [49]:
def read_parallel_two_files(en_path, fr_path):
    with io.open(en_path, "r", encoding="utf-8") as fe:
        en_lines = [l.strip() for l in fe]
    with io.open(fr_path, "r", encoding="utf-8") as ff:
        fr_lines = [l.strip() for l in ff]
    n = min(len(en_lines), len(fr_lines))
    if len(en_lines) != len(fr_lines):
        print(f"Note: length mismatch EN={len(en_lines)} FR={len(fr_lines)} → using first {n} aligned pairs")
    return list(zip(en_lines[:n], fr_lines[:n]))

pairs = read_parallel_two_files(EN_FILE, FR_FILE)
random.shuffle(pairs)
n_valid = max(1, int(len(pairs) * VALID_RATIO))
valid_pairs = pairs[:n_valid]
train_pairs = pairs[n_valid:]
print(f"Train: {len(train_pairs)}  Valid: {len(valid_pairs)}")

Train: 151786  Valid: 3097


In [50]:
!pip -q install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m122.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and insta

**# 3) spaCy tokenizers**

In [51]:
# 3) spaCy tokenizers
nlp_en = spacy.load("en_core_web_sm",
                    disable=["tagger","parser","ner","attribute_ruler","lemmatizer"])
nlp_fr = spacy.load("fr_core_news_sm",
                    disable=["tagger","parser","ner","attribute_ruler","lemmatizer"])

**# 4) Vocab build (separate EN & FR) with special tokens aligned**

In [52]:
PAD, BOS, EOS, UNK = "<pad>", "<bos>", "<eos>", "<unk>"

def tokenize_corpus(nlp, texts, batch_size=1000, n_process=2):
    """Return list[list[str]] of tokens for each text."""
    out = []
    # If n_process causes issues on your Colab (rare), set n_process=1
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=n_process):
        out.append([t.text.lower() for t in doc if not (t.is_space or t.is_punct)])
    return out

# Build vocab from token lists (no spaCy here)
def build_vocab_from_tokens(tokens_list, max_size):
    from collections import Counter
    cnt = Counter(tok for toks in tokens_list for tok in toks)
    # sort by freq desc, then alpha for tie-break
    items = sorted(cnt.items(), key=lambda x: (-x[1], x[0]))
    tokens = [PAD, BOS, EOS, UNK] + [w for w,_ in items]
    if len(tokens) > max_size:
        tokens = tokens[:max_size]
    stoi = {w:i for i,w in enumerate(tokens)}
    itos = tokens
    return stoi, itos


**5) Encode to ID lists**

In [53]:
# 5) Encode to ID lists (truncate to MAX_LEN-2; BOS/EOS added later in batching)
def encode_from_tokens(tokens_list, stoi, max_len):
    unk = stoi[UNK]
    out = []
    for toks in tokens_list:
        ids = [stoi.get(tok, unk) for tok in toks][:max_len-2]  # room for BOS/EOS in batching
        out.append(ids)
    return out

en_train_texts = [en for en,_ in train_pairs]
fr_train_texts = [fr for _,fr in train_pairs]
en_valid_texts = [en for en,_ in valid_pairs]
fr_valid_texts = [fr for _,fr in valid_pairs]

train_en_tokens = tokenize_corpus(nlp_en, en_train_texts, batch_size=1000, n_process=1)
train_fr_tokens = tokenize_corpus(nlp_fr, fr_train_texts, batch_size=1000, n_process=1)
valid_en_tokens = tokenize_corpus(nlp_en, en_valid_texts, batch_size=1000, n_process=1)
valid_fr_tokens = tokenize_corpus(nlp_fr, fr_valid_texts, batch_size=1000, n_process=1)

# Build vocabs from TRAIN tokens only (special tokens aligned)
en_stoi, en_itos = build_vocab_from_tokens(train_en_tokens, MAX_VOCAB_EN)
fr_stoi, fr_itos = build_vocab_from_tokens(train_fr_tokens, MAX_VOCAB_FR)
PAD_ID = en_stoi[PAD]; BOS_ID = en_stoi[BOS]; EOS_ID = en_stoi[EOS]; UNK_ID = en_stoi[UNK]

# Encode to ID lists
train_en_ids = encode_from_tokens(train_en_tokens, en_stoi, MAX_LEN)
train_fr_ids = encode_from_tokens(train_fr_tokens, fr_stoi, MAX_LEN)
valid_en_ids = encode_from_tokens(valid_en_tokens, en_stoi, MAX_LEN)
valid_fr_ids = encode_from_tokens(valid_fr_tokens, fr_stoi, MAX_LEN)

**# 6) Batch builder: pad + masks**

In [54]:
def make_batch(id_lists_src, id_lists_tgt, pad_id=PAD_ID):
    B = len(id_lists_src)
    src_lens = [len(x) for x in id_lists_src]
    tgt_lens = [len(y) for y in id_lists_tgt]
    S = max(1, max(src_lens))
    T = max(1, max(tgt_lens)) + 1  # +1 for BOS/EOS shift

    src    = torch.full((B, S), pad_id, dtype=torch.long, device=device)
    tgt_in = torch.full((B, T), pad_id, dtype=torch.long, device=device)
    tgt_out= torch.full((B, T), pad_id, dtype=torch.long, device=device)

    for i, (s, t) in enumerate(zip(id_lists_src, id_lists_tgt)):
        s = s[:S]; t = t[:T-1]
        src[i, :len(s)] = torch.tensor(s, dtype=torch.long)
        ti = [BOS_ID] + t
        to = t + [EOS_ID]
        tgt_in[i, :len(ti)] = torch.tensor(ti, dtype=torch.long)
        tgt_out[i, :len(to)] = torch.tensor(to, dtype=torch.long)

    src_pad = (src == pad_id)    # (B,S) bool
    tgt_pad = (tgt_in == pad_id) # (B,T) bool
    Tlen = tgt_in.size(1)
    tgt_sub = torch.triu(torch.ones((Tlen, Tlen), dtype=torch.bool, device=device), diagonal=1)  # (T,T) bool
    return src, tgt_in, tgt_out, src_pad, tgt_pad, tgt_sub

In [55]:
# 7) Model: embeddings + sinusoidal PE + nn.Transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe)

    def forward(self, x):  # x: (L,B,E)
        x = x + self.pe[:x.size(0)].unsqueeze(1)
        return self.dropout(x)

class Seq2SeqTransformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, enc_layers, dec_layers, dim_ff, dropout, pad_id):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=pad_id)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=pad_id)
        self.pos = PositionalEncoding(d_model, dropout)
        self.tr = nn.Transformer(d_model=d_model, nhead=nhead,
                                 num_encoder_layers=enc_layers, num_decoder_layers=dec_layers,
                                 dim_feedforward=dim_ff, dropout=dropout)
        self.fc = nn.Linear(d_model, tgt_vocab)
        self.d_model = d_model

    def forward(self, src, tgt_in, src_pad, tgt_pad, tgt_sub):
        s = self.src_emb(src) * math.sqrt(self.d_model)
        t = self.tgt_emb(tgt_in) * math.sqrt(self.d_model)
        s = self.pos(s.transpose(0,1))
        t = self.pos(t.transpose(0,1))
        out = self.tr(s, t,
                      src_key_padding_mask=src_pad,
                      tgt_key_padding_mask=tgt_pad,
                      tgt_mask=tgt_sub,
                      memory_key_padding_mask=src_pad)     # (T,B,E)
        return self.fc(out)  # (T,B,V)

In [56]:
# 8) Mini-batch iterator, train/valid loops
def iterate_minibatches(N, batch_size=BATCH_SIZE, shuffle=True):
    idx = torch.arange(N)
    if shuffle:
        idx = idx[torch.randperm(N)]
    for s in range(0, N, batch_size):
        e = min(s+batch_size, N)
        yield idx[s:e].tolist()

def train_epoch(model, opt, src_ids, tgt_ids):
    model.train()
    crit = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    total, toks = 0.0, 0
    N = len(src_ids)
    for b in iterate_minibatches(N, BATCH_SIZE, shuffle=True):
        src, tgt_in, tgt_out, src_pad, tgt_pad, tgt_sub = make_batch(
            [src_ids[i] for i in b], [tgt_ids[i] for i in b]
        )
        logits = model(src, tgt_in, src_pad, tgt_pad, tgt_sub)     # (T,B,V)
        T,B,V = logits.shape
        loss = crit(logits.view(T*B, V), tgt_out.transpose(0,1).reshape(T*B))

        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        tokens = (tgt_out != PAD_ID).sum().item()
        total += loss.item() * tokens
        toks  += tokens
    return total / max(1, toks)

@torch.no_grad()
def valid_epoch(model, src_ids, tgt_ids):
    model.eval()
    crit = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    total, toks = 0.0, 0
    N = len(src_ids)
    for b in iterate_minibatches(N, BATCH_SIZE, shuffle=False):
        src, tgt_in, tgt_out, src_pad, tgt_pad, tgt_sub = make_batch(
            [src_ids[i] for i in b], [tgt_ids[i] for i in b]
        )
        logits = model(src, tgt_in, src_pad, tgt_pad, tgt_sub)
        T,B,V = logits.shape
        loss = crit(logits.view(T*B, V), tgt_out.transpose(0,1).reshape(T*B))
        tokens = (tgt_out != PAD_ID).sum().item()
        total += loss.item() * tokens
        toks  += tokens
    return total / max(1, toks)

**# 9) Greedy decoding + translate helpers**

In [57]:
# 9) Greedy decoding + translate helpers
@torch.no_grad()
def beam_search_decode(model, src_ids, beam_size=4, alpha=0.6, max_len=MAX_LEN):
    def lp(l): return ((5 + l)**alpha) / ((5 + 1)**alpha)  # length penalty
    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)  # (1,S)
    src_pad = (src == PAD_ID)

    beams = [([BOS_ID], 0.0, False)]  # (ids, sum_logprob, finished)
    completed = []

    for _ in range(max_len - 1):
        new_beams = []
        for ids, logp, done in beams:
            if done:
                new_beams.append((ids, logp, True))
                continue
            ys = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)  # (1,t)
            T = ys.size(1)
            tgt_pad = (ys == PAD_ID)
            tgt_sub = torch.triu(torch.ones((T, T), dtype=torch.bool, device=device), diagonal=1)
            logits = model(src, ys, src_pad, tgt_pad, tgt_sub)          # (T,1,V)
            log_probs = torch.log_softmax(logits[-1, 0], dim=-1)        # (V,)
            topk = torch.topk(log_probs, beam_size)
            for next_id, lp_token in zip(topk.indices.tolist(), topk.values.tolist()):
                ids2 = ids + [next_id]
                done2 = (next_id == EOS_ID)
                logp2 = logp + lp_token
                new_beams.append((ids2, logp2, done2))

        new_beams.sort(key=lambda x: (x[1] / lp(len(x[0]))), reverse=True)
        beams = new_beams[:beam_size]

        still_alive = []
        for b in beams:
            if b[2]: completed.append(b)
            else:    still_alive.append(b)
        beams = still_alive or beams
        if len(completed) >= beam_size:
            break

    pool = completed if completed else beams
    best = max(pool, key=lambda x: (x[1] / lp(len(x[0]))))
    seq = best[0][1:]  # drop BOS
    if EOS_ID in seq:
        seq = seq[:seq.index(EOS_ID)]
    return seq

def decode(ids, itos):
    toks = []
    for i in ids:
        if 0 <= i < len(itos):
            tok = itos[i]
            if tok not in (PAD, BOS, EOS):
                toks.append(tok)
    return " ".join(toks)

# Tokenize a single sentence (for demo translation)
def tok_en_line(s): return [t.text.lower() for t in nlp_en(s) if not (t.is_space or t.is_punct)]
def tok_fr_line(s): return [t.text.lower() for t in nlp_fr(s) if not (t.is_space or t.is_punct)]

def translate(text, model, src_stoi, single_tok, tgt_itos):
    src_tokens = single_tok(text)
    src_ids = [src_stoi.get(tok, src_stoi[UNK]) for tok in src_tokens][:MAX_LEN-2]
    hyp_ids = beam_search_decode(model, src_ids, beam_size=4, alpha=0.6, max_len=MAX_LEN)
    return decode(hyp_ids, tgt_itos)


**# 10) Build + train EN→FR**

In [58]:
# 10) Build + train EN→FR
model_en2fr = Seq2SeqTransformer(
    src_vocab=len(en_itos), tgt_vocab=len(fr_itos),
    d_model=D_MODEL, nhead=NHEAD, enc_layers=ENC_LAYERS, dec_layers=DEC_LAYERS,
    dim_ff=DIM_FF, dropout=DROPOUT, pad_id=PAD_ID
).to(device)
# weight tying
model_en2fr.fc.weight = model_en2fr.tgt_emb.weight
opt_en2fr = torch.optim.Adam(model_en2fr.parameters(), lr=LR)

for ep in range(1, EPOCHS+1):
    tr = train_epoch(model_en2fr, opt_en2fr, train_en_ids, train_fr_ids)
    va = valid_epoch(model_en2fr, valid_en_ids, valid_fr_ids) if len(valid_en_ids)>0 else float("nan")
    print(f"[EN→FR] epoch {ep}  train_loss/token {tr:.4f}  valid {va:.4f}")


[EN→FR] epoch 1  train_loss/token 11.6192  valid 5.3265
[EN→FR] epoch 2  train_loss/token 5.2600  valid 4.0034
[EN→FR] epoch 3  train_loss/token 4.1844  valid 3.4154
[EN→FR] epoch 4  train_loss/token 3.5604  valid 2.9681
[EN→FR] epoch 5  train_loss/token 3.1319  valid 2.6699


**# 11) Build + train FR→EN**

In [59]:
# 11) Build + train FR→EN
model_fr2en = Seq2SeqTransformer(
    src_vocab=len(fr_itos), tgt_vocab=len(en_itos),
    d_model=D_MODEL, nhead=NHEAD, enc_layers=ENC_LAYERS, dec_layers=DEC_LAYERS,
    dim_ff=DIM_FF, dropout=DROPOUT, pad_id=PAD_ID
).to(device)
model_fr2en.fc.weight = model_fr2en.tgt_emb.weight
opt_fr2en = torch.optim.Adam(model_fr2en.parameters(), lr=LR)

for ep in range(1, EPOCHS+1):
    tr = train_epoch(model_fr2en, opt_fr2en, train_fr_ids, train_en_ids)
    va = valid_epoch(model_fr2en, valid_en_ids, valid_en_ids) if len(valid_en_ids)>0 else float("nan")
    print(f"[FR→EN] epoch {ep}  train_loss/token {tr:.4f}  valid {va:.4f}")



[FR→EN] epoch 1  train_loss/token 10.5338  valid 6.8422
[FR→EN] epoch 2  train_loss/token 4.7973  valid 6.4979
[FR→EN] epoch 3  train_loss/token 3.7762  valid 6.3417
[FR→EN] epoch 4  train_loss/token 3.1658  valid 6.3552
[FR→EN] epoch 5  train_loss/token 2.7618  valid 6.5700


**# 12) Examples: EN→FR then feed into FR→EN**

In [61]:
examples = [
    "hello how are you",
    "this is a great day today",
    "we test round trip translation",
    "I will help you whenever needed"
]
for en in examples:
    fr_hat = translate(en, model_en2fr, en_stoi, tok_en_line, fr_itos)
    en_back = translate(fr_hat, model_fr2en, fr_stoi, tok_fr_line, en_itos)
    print(f"\nEN: {en}")
    print(f"FR(hat): {fr_hat}")
    print(f"EN(back): {en_back}")


EN: hello how are you
FR(hat): comment êtes -vous
EN(back): how did you get

EN: this is a great day today
FR(hat): c' est une journée aujourd'hui
EN(back): today is a day today

EN: we test round trip translation
FR(hat): nous nous avons passer le voyage
EN(back): we have a trip trip

EN: I will help you whenever needed
FR(hat): je vais t' aider
EN(back): i 'm going to help
