<a href="https://colab.research.google.com/github/dixy52-beep/translation-CycleTRANS-and-Adversal-Network/blob/main/CycleTrans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unpaired Language Translation with Adversarial and Cycle Consistency

# **CycleTrans Model Architecture Overview**
The CycleTrans model is a **novel approach to multilingual translation** leveraging adversarial training and cycle consistency. It consists of two main components: **Generative Models (Translators) and Discriminators**, working together to improve translation quality and alignment between different languages.

**The model is trained without having direct pairs of translated sentences between English and Italian**. In traditional machine translation models, you'd typically have a dataset where each sentence in one language (e.g., English) has a corresponding translation in another language (e.g., Italian). This is called parallel data.

However, **in the unpaired setup used by CycleTrans, the model doesn’t rely on having exact translations for each sentence. Instead, it uses two separate, unaligned datasets:**

*  **English sentences** (e.g., from one source like an **English corpus**).
*  **Italian sentences** (e.g., from another source, like an **Italian corpus**).

# How it works:

The model translates from English to Italian.
Then, it translates the Italian translation back to English.
**The key idea is that the English sentence should be roughly the same as the original one, even though the model has never seen a direct English-Italian pair during training.**
This unpaired nature is useful in cases where you don't have access to large datasets of parallel translations, which are often difficult to obtain. By leveraging adversarial training and cycle consistency, the model can still learn to translate effectively using only unpaired data.

In [None]:
!pip install datasets
import torch
from torch import nn, optim
from torch.cuda.amp import GradScaler, autocast
from datasets import load_dataset
from transformers import AutoTokenizer

# Device configuration: use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load datasets
# Slice the rows first and select the column second: `dataset["text"][:2000]`
# materialises the entire column before the slice is taken, which is wasteful
# when only the first 2000 rows are needed.
english_dataset = load_dataset("agentlans/high-quality-english-sentences", split="train")
english_sentences = english_dataset[:2000]["text"]

italian_dataset = load_dataset("scribis/italian-literature-corpus-mini", split="train")
italian_sentences = italian_dataset[:2000]["review"]

# Initialize tokenizer (one multilingual vocabulary shared by both languages)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize sentences
def tokenize_sentences(sentences):
    """Encode `sentences` with the module-level tokenizer and move the result to `device`.

    Padding and truncation are enabled and sequences are capped at 50 tokens;
    the encoding is returned as PyTorch tensors.
    """
    encoded = tokenizer(
        sentences,
        max_length=50,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    return encoded.to(device)

# Encode each corpus once up front (English first, then Italian).
english_tokens, italian_tokens = (
    tokenize_sentences(corpus) for corpus in (english_sentences, italian_sentences)
)

# Only the token ids are used further down; the rest of each encoding is not read.
english_input_ids, italian_input_ids = (
    encoding["input_ids"] for encoding in (english_tokens, italian_tokens)
)

# Translator model with improved architecture
class Translator(nn.Module):
    """Two-layer LSTM encoder/decoder with dot-product self-attention over encoder states.

    Args:
        shared_embedding: Embedding module used as-is (not copied) for the input ids.
        hidden_size: Width of both LSTMs.
        vocab_size: Size of the output logit dimension.
    """

    def __init__(self, shared_embedding, hidden_size, vocab_size):
        super().__init__()
        recurrent_options = dict(num_layers=2, batch_first=True, dropout=0.3)
        half_hidden = hidden_size // 2

        self.embedding = shared_embedding
        self.encoder = nn.LSTM(shared_embedding.embedding_dim, hidden_size, **recurrent_options)
        self.decoder = nn.LSTM(hidden_size, hidden_size, **recurrent_options)
        # Projection head: hidden -> hidden/2 -> vocabulary logits.
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(half_hidden, vocab_size),
        )

    def forward(self, x):
        """Return one vocabulary-logit vector per input position."""
        embedded = self.embedding(x)
        encoder_outputs, _ = self.encoder(embedded)

        # Every position attends over all encoder positions: raw dot products,
        # normalised with a softmax along the last axis.
        scores = encoder_outputs.bmm(encoder_outputs.transpose(1, 2))
        attention_weights = scores.softmax(dim=-1)
        context_vector = attention_weights.bmm(encoder_outputs)

        # The attended context is the decoder's whole input sequence.
        decoded, _ = self.decoder(context_vector)
        return self.fc(decoded)

# Discriminator model with improved regularization
class Discriminator(nn.Module):
    """MLP critic that scores an embedded sentence as real or generated.

    The network returns raw logits. The training code pairs it with
    `nn.BCEWithLogitsLoss`, which applies the sigmoid itself; a trailing
    `nn.Sigmoid()` here would squash the score twice and flatten the gradients.

    Args:
        embedding_size: Width of the embedding vectors fed to the first layer.
    """

    def __init__(self, embedding_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embedding_size, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        """Return one logit per sample; inputs with more than 2 dims are averaged over dim 1 first."""
        if x.dim() > 2:
            x = x.mean(dim=1)
        return self.layers(x)

# Shared embedding layer (one table used by both translators and for embedding real batches)
embedding_size = 256
shared_embedding = nn.Embedding(tokenizer.vocab_size, embedding_size).to(device)

# Initialize models and move to device
hidden_size = 512
G_E2I = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
G_I2E = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
D_I = Discriminator(embedding_size).to(device)
D_E = Discriminator(embedding_size).to(device)

# Optimizers with weight decay
# Both translators hold the same `shared_embedding` module, so concatenating
# their parameter lists would hand the embedding weight to the optimizer twice
# (PyTorch warns about duplicate parameters in a group). De-duplicate by
# identity; dict insertion order keeps the original parameter order.
generator_params = list(
    {id(p): p for p in (*G_E2I.parameters(), *G_I2E.parameters())}.values()
)
optim_G = optim.AdamW(generator_params, lr=0.0001, weight_decay=1e-5)
optim_D_I = optim.AdamW(D_I.parameters(), lr=0.0001, weight_decay=1e-5)
optim_D_E = optim.AdamW(D_E.parameters(), lr=0.0001, weight_decay=1e-5)

# Loss functions
adversarial_loss = nn.BCEWithLogitsLoss()
cycle_loss = nn.CrossEntropyLoss()
contrastive_loss = nn.MSELoss()

# Training parameters
batch_size = 32
epochs = 100

# Mixed precision scaler
scaler = GradScaler()

# Progressive training: Sort sentences by length
# The order is derived from the English sentences' character length and the
# same index order is then applied to the Italian ids as well.
sorted_indices = sorted(range(len(english_sentences)), key=lambda x: len(english_sentences[x]))
english_input_ids = english_input_ids[sorted_indices]
italian_input_ids = italian_input_ids[sorted_indices]

# Training loop with improved regularization and logging
for epoch in range(epochs):
    total_loss_G, total_loss_D_I, total_loss_D_E = 0, 0, 0
    # Ceiling division: the inner loop also processes the trailing partial
    # batch, so it has to be counted or the per-epoch averages come out too high.
    num_batches = max(1, (len(english_input_ids) + batch_size - 1) // batch_size)

    # Shuffle batches each epoch (same permutation for both corpora)
    shuffle_indices = torch.randperm(len(english_input_ids))
    english_input_ids = english_input_ids[shuffle_indices]
    italian_input_ids = italian_input_ids[shuffle_indices]

    for i in range(0, len(english_input_ids), batch_size):
        # Prepare batches
        english_batch = english_input_ids[i:i + batch_size]
        italian_batch = italian_input_ids[i:i + batch_size]

        # Discriminator targets: 1 for "real", 0 for "generated"
        valid = torch.ones((english_batch.size(0), 1), device=device)
        fake = torch.zeros((english_batch.size(0), 1), device=device)

        # === Train Generators ===
        with autocast():
            # Generate translations
            # NOTE(review): argmax yields integer token ids, so no gradient flows
            # back into the generator that produced them; the losses below reach
            # the shared embedding and the back-translating generator only.
            fake_italian = G_E2I(english_batch)
            fake_italian_ids = fake_italian.argmax(dim=-1)
            fake_italian_embedded = shared_embedding(fake_italian_ids)

            fake_english = G_I2E(italian_batch)
            fake_english_ids = fake_english.argmax(dim=-1)
            fake_english_embedded = shared_embedding(fake_english_ids)

            # Cycle reconstruction
            reconstructed_english = G_I2E(fake_italian_ids)
            reconstructed_italian = G_E2I(fake_english_ids)

            # Cycle Loss with masking (padding positions are excluded)
            mask_english = english_batch != tokenizer.pad_token_id
            mask_italian = italian_batch != tokenizer.pad_token_id

            loss_cycle = (
                cycle_loss(
                    reconstructed_english[mask_english].view(-1, tokenizer.vocab_size),
                    english_batch[mask_english].view(-1)
                ) +
                cycle_loss(
                    reconstructed_italian[mask_italian].view(-1, tokenizer.vocab_size),
                    italian_batch[mask_italian].view(-1)
                )
            )

            # Adversarial Loss: generators want their outputs labelled "valid"
            loss_adv = (
                adversarial_loss(D_I(fake_italian_embedded), valid) +
                adversarial_loss(D_E(fake_english_embedded), valid)
            )

            # Contrastive Loss to encourage semantic similarity
            loss_contrastive = contrastive_loss(
                fake_italian_embedded.mean(dim=1),
                fake_english_embedded.mean(dim=1)
            )

            # Combined Generator Loss
            loss_G = loss_cycle + 0.5 * loss_adv + 0.1 * loss_contrastive

        # Backward pass for Generators
        optim_G.zero_grad()
        scaler.scale(loss_G).backward()
        scaler.step(optim_G)
        scaler.update()
        total_loss_G += loss_G.item()

        # === Train Discriminators ===
        with autocast():
            # Generated embeddings are detached so only the discriminators learn here.
            loss_D_I = (
                adversarial_loss(D_I(fake_italian_embedded.detach()), fake) +
                adversarial_loss(D_I(shared_embedding(italian_batch)), valid)
            )
            loss_D_E = (
                adversarial_loss(D_E(fake_english_embedded.detach()), fake) +
                adversarial_loss(D_E(shared_embedding(english_batch)), valid)
            )

        # Backward pass for Discriminators
        # zero_grad() also clears gradients left on the discriminator by the generator step.
        optim_D_I.zero_grad()
        scaler.scale(loss_D_I).backward()
        scaler.step(optim_D_I)
        scaler.update()
        total_loss_D_I += loss_D_I.item()

        optim_D_E.zero_grad()
        scaler.scale(loss_D_E).backward()
        scaler.step(optim_D_E)
        scaler.update()
        total_loss_D_E += loss_D_E.item()

    print(f"Epoch {epoch}/{epochs} - Avg Losses: "
          f"[G: {total_loss_G / num_batches:.4f}] "
          f"[D_I: {total_loss_D_I / num_batches:.4f}] "
          f"[D_E: {total_loss_D_E / num_batches:.4f}]")

    if (epoch + 1) % 5 == 0:
        # Sample with dropout disabled so the printed translation reflects the
        # model's deterministic output, then switch back to training mode.
        G_E2I.eval()
        with torch.no_grad():
            example_english = english_input_ids[0:1]
            example_translation = G_E2I(example_english)
            translated_sentence = tokenizer.decode(example_translation.argmax(dim=-1).squeeze().tolist(), skip_special_tokens=True)
            print(f"Example Translation at Epoch {epoch + 1}: {translated_sentence}")
        G_E2I.train()

    # Save models in a single checkpoint (overwritten every epoch)
    checkpoint = {
        'G_E2I': G_E2I.state_dict(),
        'G_I2E': G_I2E.state_dict(),
        'D_I': D_I.state_dict(),
        'D_E': D_E.state_dict()
    }
    torch.save(checkpoint, "model_checkpoint.pth")

    # Free memory
    del english_batch, italian_batch, fake_italian, fake_english, loss_G, loss_D_I, loss_D_E
    torch.cuda.empty_cache()

# Try the Model on Sample Sentences
In this section, **you can test the model using sample sentences** to see how it performs.



This will allow you to test the model on sample sentences and get a quick idea of how it handles translation and diversity in its outputs.
Let’s proceed with the experiment!

In [None]:
# Define a function to translate sentences using the trained models
def translate_sentence(sentence, direction="E2I"):
    """Translate `sentence` with one of the trained generators.

    Args:
        sentence: Text passed to the module-level tokenizer.
        direction: "E2I" selects `G_E2I`; any other value selects `G_I2E`.

    Returns:
        The decoded text of the first sequence in the batch, with special
        tokens skipped.
    """
    # Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=50).to(device)
    input_ids = tokens["input_ids"]

    # Choose the appropriate generator based on the direction
    generator = G_E2I if direction == "E2I" else G_I2E

    # Run inference in eval mode so dropout does not randomise the output,
    # then restore whichever mode the generator was in before the call.
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            # Highest-scoring token at each position
            output_ids = generator(input_ids).argmax(dim=-1)
    finally:
        generator.train(was_training)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Smoke-test both translation directions on one hand-written sentence each.
english_test_sentence = "I am your"
italian_test_sentence = "Ciao, come stai oggi? ??"

translated_to_italian = translate_sentence(english_test_sentence, direction="E2I")
translated_to_english = translate_sentence(italian_test_sentence, direction="I2E")

# One print call, one line per entry.
print(
    f"Original English: {english_test_sentence}",
    f"Translated to Italian: {translated_to_italian}",
    f"Original Italian: {italian_test_sentence}",
    f"Translated to English: {translated_to_english}",
    sep="\n",
)


Original English: I am your
Translated to Italian: ##cacacacaca
Original Italian: Ciao, come stai oggi? ??
Translated to English: ,,,,,,,,,,,,


# EXPERIMENTAL CODE

# Experimental: Similarity loss and Entropy


This cell contains an **experimental setup** for training a model **with entropy loss and similarity loss.**

**Important Notes:**

*   This experiment is still in the experimental phase and is not fully optimized.

*   The training process may take a long time depending on the dataset size and model complexity.
*   The use of entropy loss is aimed at increasing the diversity of predictions, and similarity loss is included to penalise the model for outputting similar translations for different texts.


Results may not be ideal, and further tuning is required for better performance.
We recommend running this on smaller datasets for now, and not scaling up without further adjustments. Proceed with caution, as this is an ongoing experimental setup.

In [None]:
!pip install datasets
import torch
from torch import nn, optim
from torch.cuda.amp import GradScaler, autocast
from datasets import load_dataset
from transformers import AutoTokenizer

# Device configuration: use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load datasets
# Take the first 2000 rows before selecting the column; indexing the column
# first materialises every row of it only to discard all but 2000.
english_dataset = load_dataset("agentlans/high-quality-english-sentences", split="train")
english_sentences = english_dataset[:2000]["text"]

italian_dataset = load_dataset("scribis/italian-literature-corpus-mini", split="train")
italian_sentences = italian_dataset[:2000]["review"]

# Initialize tokenizer (one multilingual vocabulary shared by both languages)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize sentences
def tokenize_sentences(sentences):
    """Run the module-level tokenizer over `sentences` and place the tensors on `device`.

    Sequences are padded, truncated, and limited to 50 tokens.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
        "max_length": 50,
    }
    batch = tokenizer(sentences, **tokenizer_options)
    return batch.to(device)

# Encode the English corpus, then the Italian one.
english_tokens, italian_tokens = (
    tokenize_sentences(corpus) for corpus in (english_sentences, italian_sentences)
)

# Keep the id tensors under their own names for the training code below.
english_input_ids, italian_input_ids = (
    encoding["input_ids"] for encoding in (english_tokens, italian_tokens)
)

# Translator model with shared embeddings
class Translator(nn.Module):
    """Single-layer LSTM encoder/decoder that emits one logit vector per input position.

    Args:
        shared_embedding: Embedding module used as-is for the input ids.
        hidden_size: Width of both LSTMs.
        vocab_size: Size of the output logit dimension.
    """

    def __init__(self, shared_embedding, hidden_size, vocab_size):
        super().__init__()
        input_dim = shared_embedding.embedding_dim
        self.embedding = shared_embedding
        self.encoder = nn.LSTM(input_dim, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Encode `x`, then decode from the encoder's final hidden state repeated per position."""
        embedded = self.embedding(x)
        seq_len = embedded.size(1)

        # Only the final hidden state of the encoder is kept.
        _, (final_hidden, _) = self.encoder(embedded)

        # Swap the first two axes, then tile along dim 1 so the decoder sees the
        # same vector once per input position.
        summary = final_hidden.permute(1, 0, 2)
        decoder_input = summary.repeat(1, seq_len, 1)

        decoded, _ = self.decoder(decoder_input)
        return self.fc(decoded)

# Discriminator model
class Discriminator(nn.Module):
    """Small MLP critic that scores an embedded sentence as real or generated.

    The output is a raw logit. The training code uses `nn.BCEWithLogitsLoss`,
    which applies the sigmoid internally, so the network must not end in
    `nn.Sigmoid()` (that would apply the squashing twice).

    Args:
        embedding_size: Width of the embedding vectors fed to the first layer.
    """

    def __init__(self, embedding_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embedding_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        """Return one logit per sample; inputs with more than 2 dims are averaged over dim 1 first."""
        if x.dim() > 2:
            x = x.mean(dim=1)
        return self.layers(x)

# Shared embedding layer (one table used by both translators and for embedding real batches)
embedding_size = 128
shared_embedding = nn.Embedding(tokenizer.vocab_size, embedding_size).to(device)

# Initialize models and move to device
hidden_size = 256
G_E2I = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
G_I2E = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
D_I = Discriminator(embedding_size).to(device)
D_E = Discriminator(embedding_size).to(device)

# Optimizers
# The two translators share `shared_embedding`, so its weight appears in both
# parameter lists. De-duplicate by identity (order preserved) so the optimizer
# does not receive the same tensor twice.
generator_params = list(
    {id(p): p for p in (*G_E2I.parameters(), *G_I2E.parameters())}.values()
)
optim_G = optim.Adam(generator_params, lr=0.0002)
optim_D_I = optim.Adam(D_I.parameters(), lr=0.0002)
optim_D_E = optim.Adam(D_E.parameters(), lr=0.0002)

# Loss functions
adversarial_loss = nn.BCEWithLogitsLoss()

# Custom weighted cycle loss
def weighted_cycle_loss(predictions, targets, token_frequencies):
    """Cross-entropy in which each token is weighted by the inverse of its frequency.

    Args:
        predictions: Logits whose last dimension is the vocabulary.
        targets: Integer token ids, one per logit vector in `predictions`.
        token_frequencies: 1-D tensor of counts indexed by token id; it must
            cover every id that occurs in `targets`.

    Returns:
        Scalar tensor: the mean of the per-token losses after weighting.
    """
    # The vocabulary size and device are taken from the inputs rather than from
    # module-level globals, so the function works for any logit width.
    flat_logits = predictions.reshape(-1, predictions.size(-1))
    flat_targets = targets.reshape(-1)

    # The small epsilon keeps the weight finite for a zero count.
    weights = (1 / (token_frequencies + 1e-5)).to(flat_logits.device)

    per_token = nn.functional.cross_entropy(flat_logits, flat_targets, reduction="none")
    return (per_token * weights[flat_targets]).mean()

# Entropy loss to encourage diversity
def entropy_loss(predictions):
    """Return the negative mean entropy of the softmax over the last dimension.

    The result is meant to be added to a loss that is minimised: a more
    spread-out (higher-entropy) prediction gives a lower value, so minimising
    it encourages diversity. Returning the positive entropy would reward
    peaked, repetitive predictions instead.

    Args:
        predictions: Logits; the softmax is taken over the last dimension.

    Returns:
        Scalar tensor equal to `-mean(H(softmax(predictions)))`.
    """
    # log_softmax is numerically stabler than log(softmax(x) + eps).
    log_probs = torch.log_softmax(predictions, dim=-1)
    entropy = -(log_probs.exp() * log_probs).sum(dim=-1)
    return -entropy.mean()

# Repetition penalty
def repetition_penalty_loss(output_tokens):
    """Sum the occurrence counts of every token id that appears more than once.

    Args:
        output_tokens: Tensor of non-negative integer token ids (any shape).

    Returns:
        Float scalar tensor; 0 when no id is repeated.
    """
    occurrences = output_tokens.view(-1).bincount()
    repeated = occurrences > 1
    return occurrences.masked_select(repeated).sum().float()

# Training parameters
epochs = 50
batch_size = 32

# Mixed precision scaler
scaler = GradScaler()

# Progressive training: order rows by the character length of the English
# sentence (ties keep their original order); the Italian ids are reordered
# with the same index list.
sorted_indices = [
    position
    for position, _ in sorted(enumerate(english_sentences), key=lambda item: len(item[1]))
]
english_input_ids = english_input_ids[sorted_indices]
italian_input_ids = italian_input_ids[sorted_indices]

# Training loop
# Token counts over both corpora do not change during training, so compute them
# once instead of on every batch. Flattening each tensor before concatenating
# also works when the two corpora were padded to different widths.
token_frequencies = torch.bincount(
    torch.cat((english_input_ids.reshape(-1), italian_input_ids.reshape(-1)))
)

for epoch in range(epochs):
    total_loss_G, total_loss_D_I, total_loss_D_E = 0, 0, 0
    # Ceiling division: the trailing partial batch is processed too, so count it.
    num_batches = max(1, (len(english_input_ids) + batch_size - 1) // batch_size)

    for i in range(0, len(english_input_ids), batch_size):
        # Prepare batches
        english_batch = english_input_ids[i:i + batch_size]
        italian_batch = italian_input_ids[i:i + batch_size]

        # Discriminator targets: 1 for "real", 0 for "generated"
        valid = torch.ones((english_batch.size(0), 1), device=device)
        fake = torch.zeros((english_batch.size(0), 1), device=device)

        # === Train Generators ===
        with autocast():
            # NOTE(review): argmax yields integer token ids, so no gradient flows
            # back into the generator that produced them.
            fake_italian = G_E2I(english_batch)
            fake_italian_ids = fake_italian.argmax(dim=-1)
            fake_italian_embedded = shared_embedding(fake_italian_ids)

            fake_english = G_I2E(italian_batch)
            fake_english_ids = fake_english.argmax(dim=-1)
            fake_english_embedded = shared_embedding(fake_english_ids)

            reconstructed_english = G_I2E(fake_italian_ids)
            reconstructed_italian = G_E2I(fake_english_ids)

            # Weighted Cycle Loss
            loss_cycle = (
                weighted_cycle_loss(reconstructed_english, english_batch, token_frequencies) +
                weighted_cycle_loss(reconstructed_italian, italian_batch, token_frequencies)
            )

            # Adversarial Loss: generators want their outputs labelled "valid"
            loss_adv = (
                adversarial_loss(D_I(fake_italian_embedded), valid) +
                adversarial_loss(D_E(fake_english_embedded), valid)
            )

            # Diversity Loss
            loss_diversity = entropy_loss(fake_italian) + entropy_loss(fake_english)

            # Repetition Penalty
            # NOTE(review): computed from argmax ids, so this term changes the
            # reported loss value but contributes no gradient.
            loss_repetition = repetition_penalty_loss(fake_italian_ids) + repetition_penalty_loss(fake_english_ids)

            loss_G = loss_cycle + 0.5 * loss_adv + 0.1 * loss_diversity + 0.1 * loss_repetition

        # Backward pass for Generators
        optim_G.zero_grad()
        scaler.scale(loss_G).backward()
        scaler.step(optim_G)
        scaler.update()
        total_loss_G += loss_G.item()

        # === Train Discriminators ===
        with autocast():
            # Generated embeddings are detached so only the discriminators learn here.
            loss_D_I = (
                adversarial_loss(D_I(fake_italian_embedded.detach()), fake) +
                adversarial_loss(D_I(shared_embedding(italian_batch)), valid)
            )
            loss_D_E = (
                adversarial_loss(D_E(fake_english_embedded.detach()), fake) +
                adversarial_loss(D_E(shared_embedding(english_batch)), valid)
            )

        # Backward pass for Discriminators
        optim_D_I.zero_grad()
        scaler.scale(loss_D_I).backward()
        scaler.step(optim_D_I)
        scaler.update()
        total_loss_D_I += loss_D_I.item()

        optim_D_E.zero_grad()
        scaler.scale(loss_D_E).backward()
        scaler.step(optim_D_E)
        scaler.update()
        total_loss_D_E += loss_D_E.item()

    print(f"Epoch {epoch}/{epochs} - Avg Losses: "
          f"[G: {total_loss_G / num_batches:.4f}] "
          f"[D_I: {total_loss_D_I / num_batches:.4f}] "
          f"[D_E: {total_loss_D_E / num_batches:.4f}]")

    # Save models after each epoch (one file per model per epoch)
    torch.save(G_E2I.state_dict(), f"G_E2I_epoch{epoch}.pth")
    torch.save(G_I2E.state_dict(), f"G_I2E_epoch{epoch}.pth")
    torch.save(D_I.state_dict(), f"D_I_epoch{epoch}.pth")
    torch.save(D_E.state_dict(), f"D_E_epoch{epoch}.pth")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.30k [00:00<?, ?B/s]

train.txt.gz:   0%|          | 0.00/85.5M [00:00<?, ?B/s]

test.txt.gz:   0%|          | 0.00/9.49M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1534699 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/170522 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/533 [00:00<?, ?B/s]

(…)-00000-of-00001-aaf72b9960b78228.parquet:   0%|          | 0.00/129M [00:00<?, ?B/s]

(…)-00000-of-00001-e066ed56853f4a85.parquet:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/872594 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/96955 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

  return disable_fn(*args, **kwargs)
  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 0/50 - Avg Losses: [G: 327.1273] [D_I: 1.3403] [D_E: 1.3191]
Epoch 1/50 - Avg Losses: [G: 326.4829] [D_I: 1.2111] [D_E: 1.1959]
Epoch 2/50 - Avg Losses: [G: 325.3918] [D_I: 1.1413] [D_E: 1.0845]
Epoch 3/50 - Avg Losses: [G: 324.7608] [D_I: 1.1059] [D_E: 1.0666]
Epoch 4/50 - Avg Losses: [G: 324.7054] [D_I: 1.0765] [D_E: 1.0575]
Epoch 5/50 - Avg Losses: [G: 324.6921] [D_I: 1.0570] [D_E: 1.0512]
Epoch 6/50 - Avg Losses: [G: 324.6842] [D_I: 1.0453] [D_E: 1.0463]
Epoch 7/50 - Avg Losses: [G: 324.6792] [D_I: 1.0383] [D_E: 1.0423]
Epoch 8/50 - Avg Losses: [G: 324.6759] [D_I: 1.0341] [D_E: 1.0390]
Epoch 9/50 - Avg Losses: [G: 324.6735] [D_I: 1.0313] [D_E: 1.0363]
Epoch 10/50 - Avg Losses: [G: 324.6717] [D_I: 1.0294] [D_E: 1.0340]
Epoch 11/50 - Avg Losses: [G: 324.6703] [D_I: 1.0280] [D_E: 1.0322]
Epoch 12/50 - Avg Losses: [G: 324.6690] [D_I: 1.0271] [D_E: 1.0307]
Epoch 13/50 - Avg Losses: [G: 324.6678] [D_I: 1.0263] [D_E: 1.0295]
Epoch 14/50 - Avg Losses: [G: 324.6667] [D_I: 1.0258] [D_E

# Experimental: Token Loss


Instead of relying only on the mean embeddings, **we added a token-level loss**. This means **the model will now also consider the loss for each token (word) in the sentence**, not just the average representation. By doing this, we can maintain more detailed information, such as the order and individual meaning of the words. **This makes the model better at handling sentence structure and word-specific translations.**

**The adversarial loss is now applied to both the mean embeddings and token-level embeddings.** This makes the discriminators more effective because they are now using more detailed word-level information, rather than just looking at an average representation of the whole sentence. This improves their ability to distinguish between real and fake translations.

In [None]:
!pip install datasets
import torch
from torch import nn, optim
from torch.cuda.amp import GradScaler, autocast
from datasets import load_dataset
from transformers import AutoTokenizer

# Device configuration: use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load datasets
# Row slice first, column second: selecting the column first would build the
# full column in memory before the first 2000 entries are taken.
english_dataset = load_dataset("agentlans/high-quality-english-sentences", split="train")
english_sentences = english_dataset[:2000]["text"]

italian_dataset = load_dataset("scribis/italian-literature-corpus-mini", split="train")
italian_sentences = italian_dataset[:2000]["review"]

# Initialize tokenizer (one multilingual vocabulary shared by both languages)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize sentences
def tokenize_sentences(sentences):
    """Tokenize `sentences` (padded, truncated, at most 50 tokens) and move the result to `device`."""
    encoded = tokenizer(
        sentences,
        return_tensors="pt",
        max_length=50,
        truncation=True,
        padding=True,
    )
    on_device = encoded.to(device)
    return on_device

# Tokenize English first, then Italian.
english_tokens, italian_tokens = (
    tokenize_sentences(corpus) for corpus in (english_sentences, italian_sentences)
)

# Pull out the id tensors consumed by the training loop.
english_input_ids, italian_input_ids = (
    encoding["input_ids"] for encoding in (english_tokens, italian_tokens)
)

# Translator model with shared embeddings
class Translator(nn.Module):
    """LSTM encoder/decoder over a shared embedding; outputs logits per input position.

    Args:
        shared_embedding: Embedding module used as-is for the input ids.
        hidden_size: Width of both LSTMs.
        vocab_size: Size of the output logit dimension.
    """

    def __init__(self, shared_embedding, hidden_size, vocab_size):
        super().__init__()
        self.embedding = shared_embedding
        self.encoder = nn.LSTM(
            shared_embedding.embedding_dim, hidden_size, batch_first=True
        )
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Decode from the encoder's final hidden state, tiled once per input position."""
        token_vectors = self.embedding(x)
        steps = token_vectors.size(1)

        # Discard the per-step outputs and the cell state; keep the final hidden state.
        _, (last_hidden, _) = self.encoder(token_vectors)

        # Swap the first two axes and repeat along dim 1 to build the decoder input.
        tiled = last_hidden.permute(1, 0, 2).repeat(1, steps, 1)

        decoder_states, _ = self.decoder(tiled)
        logits = self.fc(decoder_states)
        return logits

# Discriminator model
class Discriminator(nn.Module):
    """Small MLP critic that scores an embedded sentence as real or generated.

    Returns raw logits: the training code uses `nn.BCEWithLogitsLoss`, which
    already applies a sigmoid, so ending the network in `nn.Sigmoid()` would
    squash the score twice.

    Args:
        embedding_size: Width of the embedding vectors fed to the first layer.
    """

    def __init__(self, embedding_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embedding_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        """Return one logit per sample; inputs with more than 2 dims are averaged over dim 1 first."""
        if x.dim() > 2:
            x = x.mean(dim=1)
        return self.layers(x)

# Shared embedding layer (one table used by both translators and for embedding real batches)
embedding_size = 128
shared_embedding = nn.Embedding(tokenizer.vocab_size, embedding_size).to(device)

# Initialize models and move to device
hidden_size = 256
G_E2I = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
G_I2E = Translator(shared_embedding, hidden_size, tokenizer.vocab_size).to(device)
D_I = Discriminator(embedding_size).to(device)
D_E = Discriminator(embedding_size).to(device)

# Optimizers
# `shared_embedding` belongs to both translators, so a plain concatenation of
# their parameter lists contains its weight twice. De-duplicate by identity
# (order preserved) before building the generator optimizer.
generator_params = list(
    {id(p): p for p in (*G_E2I.parameters(), *G_I2E.parameters())}.values()
)
optim_G = optim.Adam(generator_params, lr=0.0002)
optim_D_I = optim.Adam(D_I.parameters(), lr=0.0002)
optim_D_E = optim.Adam(D_E.parameters(), lr=0.0002)

# Loss functions
adversarial_loss = nn.BCEWithLogitsLoss()
cycle_loss = nn.CrossEntropyLoss()
contrastive_loss = nn.MSELoss()
per_token_loss = nn.CrossEntropyLoss()

# Training parameters
batch_size = 32
epochs = 100

# Mixed precision scaler
scaler = GradScaler()

# Progressive training: Sort sentences by length
# The order is derived from the English sentences' character length and the
# same index order is then applied to the Italian ids as well.
sorted_indices = sorted(range(len(english_sentences)), key=lambda x: len(english_sentences[x]))
english_input_ids = english_input_ids[sorted_indices]
italian_input_ids = italian_input_ids[sorted_indices]

# Training loop
for epoch in range(epochs):
    total_loss_G, total_loss_D_I, total_loss_D_E = 0, 0, 0
    # Ceiling division: the trailing partial batch is processed too, so count it.
    num_batches = max(1, (len(english_input_ids) + batch_size - 1) // batch_size)

    for i in range(0, len(english_input_ids), batch_size):
        # Prepare batches
        english_batch = english_input_ids[i:i + batch_size]
        italian_batch = italian_input_ids[i:i + batch_size]

        # Discriminator targets: 1 for "real", 0 for "generated"
        valid = torch.ones((english_batch.size(0), 1), device=device)
        fake = torch.zeros((english_batch.size(0), 1), device=device)

        # === Train Generators ===
        with autocast():
            # NOTE(review): argmax yields integer token ids, so no gradient flows
            # back into the generator that produced them.
            fake_italian = G_E2I(english_batch)
            fake_italian_ids = fake_italian.argmax(dim=-1)
            fake_italian_embedded = shared_embedding(fake_italian_ids)

            fake_english = G_I2E(italian_batch)
            fake_english_ids = fake_english.argmax(dim=-1)
            fake_english_embedded = shared_embedding(fake_english_ids)

            reconstructed_english = G_I2E(fake_italian_ids)
            reconstructed_italian = G_E2I(fake_english_ids)

            # Cycle Loss (padding positions are excluded)
            mask_english = english_batch != tokenizer.pad_token_id
            mask_italian = italian_batch != tokenizer.pad_token_id

            loss_cycle = (
                cycle_loss(
                    reconstructed_english[mask_english].view(-1, tokenizer.vocab_size),
                    english_batch[mask_english].view(-1)
                ) +
                cycle_loss(
                    reconstructed_italian[mask_italian].view(-1, tokenizer.vocab_size),
                    italian_batch[mask_italian].view(-1)
                )
            )

            # Adversarial Loss: generators want their outputs labelled "valid"
            loss_adv = (
                adversarial_loss(D_I(fake_italian_embedded), valid) +
                adversarial_loss(D_E(fake_english_embedded), valid)
            )

            # Contrastive Loss
            loss_contrastive = contrastive_loss(
                fake_italian_embedded.mean(dim=1),
                fake_english_embedded.mean(dim=1)
            )

            # Per-token loss (detailed sequence)
            # NOTE(review): this scores the generated logits position by position
            # against the other corpus' batch at the same index; it requires both
            # batches to have the same padded length, and in an unpaired setup
            # those rows are not translations of each other. Confirm intent.
            loss_token = (
                per_token_loss(fake_italian.view(-1, tokenizer.vocab_size), italian_batch.view(-1)) +
                per_token_loss(fake_english.view(-1, tokenizer.vocab_size), english_batch.view(-1))
            )

            loss_G = loss_cycle + 0.5 * loss_adv + 0.1 * loss_contrastive + 0.2 * loss_token

        # Backward pass for Generators
        optim_G.zero_grad()
        scaler.scale(loss_G).backward()
        scaler.step(optim_G)
        scaler.update()
        total_loss_G += loss_G.item()

        # === Train Discriminators ===
        with autocast():
            # Generated embeddings are detached so only the discriminators learn here.
            loss_D_I = (
                adversarial_loss(D_I(fake_italian_embedded.detach()), fake) +
                adversarial_loss(D_I(shared_embedding(italian_batch)), valid)
            )
            loss_D_E = (
                adversarial_loss(D_E(fake_english_embedded.detach()), fake) +
                adversarial_loss(D_E(shared_embedding(english_batch)), valid)
            )

        # Backward pass for Discriminators
        optim_D_I.zero_grad()
        scaler.scale(loss_D_I).backward()
        scaler.step(optim_D_I)
        scaler.update()
        total_loss_D_I += loss_D_I.item()

        optim_D_E.zero_grad()
        scaler.scale(loss_D_E).backward()
        scaler.step(optim_D_E)
        scaler.update()
        total_loss_D_E += loss_D_E.item()

        # Progress line every 5 batches. `i` is a sample offset, so convert it
        # to a batch index before printing it next to the batch count.
        if i % (batch_size * 5) == 0:
            print(f"[Epoch {epoch}/{epochs}] [Batch {i // batch_size}/{num_batches}] "
                  f"[G loss: {loss_G.item()}] [D_I loss: {loss_D_I.item()}] [D_E loss: {loss_D_E.item()}]")

    print(f"Epoch {epoch}/{epochs} - Avg Losses: "
          f"[G: {total_loss_G / num_batches:.4f}] "
          f"[D_I: {total_loss_D_I / num_batches:.4f}] "
          f"[D_E: {total_loss_D_E / num_batches:.4f}]")

    # Save models in a single checkpoint (overwritten every epoch)
    checkpoint = {
        'G_E2I': G_E2I.state_dict(),
        'G_I2E': G_I2E.state_dict(),
        'D_I': D_I.state_dict(),
        'D_E': D_E.state_dict()
    }
    torch.save(checkpoint, "model_checkpoint.pth")

    # Free memory
    del english_batch, italian_batch, fake_italian, fake_english, loss_G, loss_D_I, loss_D_E
    torch.cuda.empty_cache()
