# Question 9

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Encoder(nn.Module):
    """One-hidden-layer encoder with two linear output heads.

    Takes inputs whose last dimension is 28 * 28 and returns the pair
    ``(mu, logvar)``, each with last dimension ``z_dim``.
    """

    def __init__(self, hidden_dim, z_dim):
        super().__init__()
        # 28 * 28 is the size of a flattened MNIST image
        self.fc1 = nn.Linear(28 * 28, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, z_dim)
        self.fc_logvar = nn.Linear(hidden_dim, z_dim)

    def forward(self, x):
        """Return ``(mu, logvar)`` computed from the shared ReLU hidden layer."""
        hidden = torch.relu(self.fc1(x))
        return self.fc_mu(hidden), self.fc_logvar(hidden)

class Decoder(nn.Module):
    """One-hidden-layer decoder from a latent vector to 28 * 28 values in [0, 1]."""

    def __init__(self, z_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        # 28 * 28 is the size of a flattened MNIST image
        self.fc2 = nn.Linear(hidden_dim, 28 * 28)

    def forward(self, z):
        """Return the sigmoid-activated reconstruction for latent batch ``z``."""
        hidden = torch.relu(self.fc1(z))
        logits = self.fc2(hidden)
        return torch.sigmoid(logits)

# Define the VAE
class VAE(nn.Module):
    """Variational auto-encoder assembled from ``Encoder`` and ``Decoder``."""

    def __init__(self, hidden_dim, z_dim):
        super().__init__()
        self.encoder = Encoder(hidden_dim, z_dim)
        self.decoder = Decoder(z_dim, hidden_dim)

    def parameterization_trick(self, mu, logvar):
        """Return ``mu + eps * std`` where ``std = exp(logvar / 2)`` and ``eps`` is standard normal noise."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x):
        """Encode ``x``, sample a latent, decode it; return ``(x_recon, mu, logvar)``."""
        mu, logvar = self.encoder(x)
        latent = self.parameterization_trick(mu, logvar)
        return self.decoder(latent), mu, logvar

def loss_function(x, x_recon, mu, logvar):
    """Return the VAE training loss: reconstruction term plus KL term.

    Args:
        x: target tensor with values in [0, 1].
        x_recon: reconstruction with the same shape as ``x`` and values in [0, 1].
        mu: posterior means.
        logvar: posterior log-variances, same shape as ``mu``.

    Returns:
        Scalar tensor: binary cross-entropy summed over all elements plus the
        KL divergence summed over all elements.
    """
    recon_loss = nn.functional.binary_cross_entropy(x_recon, x, reduction='sum')
    # Closed-form KL(N(mu, sigma^2) || N(0, I)) = -0.5 * sum(1 + logvar - mu^2 - exp(logvar)).
    # The constant must be +1: with -1 the term is offset by one per latent
    # element, so a perfect posterior (mu = 0, logvar = 0) would not score 0.
    kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - torch.exp(logvar))
    return recon_loss + kl_div

# Hyperparameters
hidden_dim = 400  # hidden-layer width handed to the VAE
z_dim = 20  # latent size handed to the VAE
batch_size = 128
num_epochs = 80

# Data loading: ToTensor followed by a flatten, so every sample is a 1-D tensor
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.view(-1)),
    ]
)
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Model and optimizer
vae = VAE(hidden_dim, z_dim).to(device)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

# Question 10

In [None]:
import matplotlib.pyplot as plt

def plot_samples(x_recon):
    """Show the first 64 rows of ``x_recon`` as an 8 x 8 grid of 28 x 28 grayscale images.

    ``x_recon`` must be indexable by row and live on the CPU, because each
    row is converted with ``.detach().numpy()``.
    """
    _, axs = plt.subplots(8, 8, figsize=(8, 8))
    # axs.flat walks the grid row by row, so cell (i, j) shows sample i * 8 + j.
    for index, axis in enumerate(axs.flat):
        picture = x_recon[index].detach().numpy().reshape(28, 28)
        axis.imshow(picture, cmap='gray')
        axis.axis('off')
    plt.show()

In [None]:
# Train the VAE; after epochs 1, 10 and 80 report the mean per-sample loss and
# show the reconstructions of the first 64 samples of that epoch's first batch.
vae.train()
for epoch in range(num_epochs):
    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)

        optimizer.zero_grad()
        x_recon, mu, logvar = vae(data)
        if not batch_idx:
            # keep the first batch's reconstructions for display
            x_disp = x_recon[:64]

        loss = loss_function(data, x_recon, mu, logvar)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    if epoch + 1 in (1, 10, 80):
        print(f'Epoch {epoch + 1}, Loss: {train_loss / len(train_loader.dataset)}')
        plot_samples(x_disp.cpu())

# Question 17 :

In [None]:
import torch

prompt = ["a sitting cat  "]
batch_size = len(prompt)

height = width = 512  # default height and width of Stable Diffusion
guidance_scale = 7.5  # scale for classifier-free guidance
num_inference_steps = 25  # number of denoising steps for a single run
list_inference_step = [5, 10, 25, 50, 100]  # step counts compared across schedulers
generator = torch.manual_seed(0)  # seeded generator used to create the initial latent noise

In [None]:
from PIL import Image
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler

# Load the four Stable Diffusion v1-4 components from their sub-folders.
vae = AutoencoderKL.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="vae",
)
tokenizer = CLIPTokenizer.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="tokenizer",
)
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="text_encoder",
)
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="unet",
)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on CPU-only machines (same check as `device` in the VAE part).
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)

In [None]:
# Tokenize the prompt, padded/truncated to the tokenizer's maximum length.
text_input = tokenizer(
    prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
)

# The unconditional (empty-prompt) input is padded to the same length so both
# embeddings can be concatenated along the batch dimension.
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")

# Both encoder passes are inference only: keep them under no_grad so no
# autograd graph is attached to the embeddings.
with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

# Latents are 1/8 of the image resolution:
# 2 ** (len(vae.config.block_out_channels) - 1) == 8 for this VAE.
latents = torch.randn(
    (batch_size, unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)
latents = latents.to(torch_device)
# Keep an untouched copy so every run can start from the same noise.
input_latents = latents.clone()


In [None]:
from tqdm.auto import tqdm
def run_pipeline(input_latents, scheduler, text_embeddings, guidance_scale, num_inference_steps):
    """Denoise ``input_latents`` with classifier-free guidance and decode the result.

    Relies on the module-level ``unet`` and ``vae``.

    Args:
        input_latents: initial noise; it is cloned, never modified in place.
        scheduler: diffusers scheduler providing ``set_timesteps``,
            ``init_noise_sigma``, ``scale_model_input`` and ``step``.
        text_embeddings: unconditional and conditional embeddings concatenated
            along the batch dimension, in that order.
        guidance_scale: weight of the (conditional - unconditional) direction.
        num_inference_steps: number of denoising steps.

    Returns:
        The tensor returned by ``vae.decode(...).sample``.
    """
    # Configure the schedule before reading ``init_noise_sigma``: the value is
    # taken from the scheduler's current state, which would otherwise be
    # whatever a previous run (possibly with another step count) left behind.
    scheduler.set_timesteps(num_inference_steps)
    latents = input_latents.clone() * scheduler.init_noise_sigma

    for t in tqdm(scheduler.timesteps):
        # Duplicate the latents so the unconditional and conditional
        # predictions come out of a single forward pass.
        latent_model_input = torch.cat([latents] * 2)
        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

        # predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # scale and decode the image latents with the VAE
    latents = 1 / 0.18215 * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    return image
from diffusers import EulerDiscreteScheduler

# Single reference run with the Euler scheduler and the default step count.
scheduler = EulerDiscreteScheduler.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="scheduler",
)

image = run_pipeline(
    input_latents=input_latents,
    scheduler=scheduler,
    text_embeddings=text_embeddings,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
)

In [None]:
import matplotlib.pyplot as plt
def show_img(image):
    """Convert a decoded image batch to PIL images and return the first one.

    Values are mapped with ``x / 2 + 0.5`` and clamped to [0, 1], the channel
    axis is moved last, and the result is scaled to 8-bit integers.
    """
    rescaled = (image / 2 + 0.5).clamp(0, 1)
    channels_last = rescaled.detach().cpu().permute(0, 2, 3, 1).numpy()
    as_uint8 = (channels_last * 255).round().astype("uint8")
    pil_images = [Image.fromarray(frame) for frame in as_uint8]
    return pil_images[0]


show_img(image)

In [None]:
# Run every scheduler with each step count in `list_inference_step`, always
# starting from the same `input_latents`. The comprehensions use their own
# loop variable so the global `num_inference_steps` setting is not overwritten.

# euler scheduler
scheduler = EulerDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
euler_scheduler = [
    run_pipeline(input_latents, scheduler, text_embeddings, guidance_scale, n_steps)
    for n_steps in list_inference_step
]

# euler ancestral scheduler
from diffusers import EulerAncestralDiscreteScheduler
scheduler = EulerAncestralDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
euler_ancestral_scheduler = [
    run_pipeline(input_latents, scheduler, text_embeddings, guidance_scale, n_steps)
    for n_steps in list_inference_step
]

# DPM scheduler
from diffusers import DPMSolverMultistepScheduler
scheduler = DPMSolverMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
DPM_scheduler = [
    run_pipeline(input_latents, scheduler, text_embeddings, guidance_scale, n_steps)
    for n_steps in list_inference_step
]


In [None]:
# One (title, results) pair per grid column. Keeping them together guarantees
# that a column title always matches the images plotted under it.
scheduler_columns = [
    ('DPM Solver', DPM_scheduler),
    ('Euler', euler_scheduler),
    ('Euler Ancestral', euler_ancestral_scheduler),
]
scheduler_names = [name for name, _ in scheduler_columns]
all_results = [results for _, results in scheduler_columns]

# Row labels come from the list that produced the results, so they cannot drift apart.
timesteps = list(list_inference_step)

# Create a figure with one row per step count and one column per scheduler
fig, axes = plt.subplots(len(timesteps), len(all_results), figsize=(15, 10))

# Adjust layout for tighter spacing
fig.suptitle("Generated Images by Scheduler and Timesteps", fontsize=16)
plt.subplots_adjust(hspace=0.2, wspace=0.2)

# Plot each image in the corresponding subplot
for row_idx, timestep in enumerate(timesteps):
    for col_idx, (scheduler_name, scheduler_results) in enumerate(zip(scheduler_names, all_results)):
        ax = axes[row_idx, col_idx]
        ax.imshow(show_img(scheduler_results[row_idx]))

        # Remove ticks and spines but keep the y label
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)

        if row_idx == 0:
            ax.set_title(scheduler_name, fontsize=12)  # scheduler name at the top
        if col_idx == 0:
            # step count on the y-axis of the first column
            ax.set_ylabel(f"{timestep} steps", fontsize=12, rotation=90, labelpad=10)

# Display the plot
plt.show()

# Question 18: Negative Prompting

In [None]:
from PIL import Image
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
import matplotlib.pyplot as plt
# Load the Stable Diffusion v1-4 components used for negative prompting.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder")
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
from diffusers import UniPCMultistepScheduler,EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on CPU-only machines.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)


In [None]:
prompt = ["a photograph of an astronaut riding a horse on earth "]
batch_size = len(prompt)

height = width = 512  # default height and width of Stable Diffusion
num_inference_steps = 25  # number of denoising steps
guidance_scale = 7.5  # scale for classifier-free guidance
generator = torch.manual_seed(0)  # seeded generator used to create the initial latent noise

In [None]:
def question_18(negative_prompt, prompt):
    """Generate and draw one image for ``prompt`` while steering away from ``negative_prompt``.

    Relies on the module-level ``tokenizer``, ``text_encoder``, ``unet``,
    ``vae``, ``scheduler``, ``torch_device``, ``height``, ``width``,
    ``guidance_scale``, ``num_inference_steps`` and ``generator``. The initial
    noise is drawn from ``generator``, so consecutive calls start from
    different latents unless the generator is re-seeded in between.

    Args:
        negative_prompt: string or list of strings to move away from.
        prompt: string or list of strings to move towards.

    Raises:
        ValueError: if the two prompt lists have different lengths.

    The first decoded image is drawn with ``plt.imshow``; nothing is returned.
    """
    from tqdm.auto import tqdm

    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    negatives = [negative_prompt] if isinstance(negative_prompt, str) else list(negative_prompt)
    if len(prompts) != len(negatives):
        raise ValueError("prompt and negative_prompt must contain the same number of entries")
    # Size every tensor from the prompts actually passed in rather than from a
    # global that may belong to another cell.
    batch_size = len(prompts)

    text_input = tokenizer(
        prompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
    )
    text_negative_input = tokenizer(
        negatives, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
    )
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")

    # All three encoder passes are inference only.
    with torch.no_grad():
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
        text_negative_embeddings = text_encoder(text_negative_input.input_ids.to(torch_device))[0]
    # Batch order: unconditional, positive prompt, negative prompt.
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings, text_negative_embeddings])

    # Latents are 1/8 of the image resolution:
    # 2 ** (len(vae.config.block_out_channels) - 1) == 8 for this VAE.
    latents = torch.randn(
        (batch_size, unet.config.in_channels, height // 8, width // 8),
        generator=generator,
    )
    latents = latents.to(torch_device)

    # Configure the schedule before reading init_noise_sigma so the scaling
    # matches the schedule that is actually run.
    scheduler.set_timesteps(num_inference_steps)
    latents = latents * scheduler.init_noise_sigma

    for t in tqdm(scheduler.timesteps):
        # Triplicate the latents so all three predictions come out of one forward pass.
        latent_model_input = torch.cat([latents] * 3)
        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

        # predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # perform guidance: towards the prompt, away from the negative prompt
        noise_pred_uncond, noise_pred_text, noise_pred_negative = noise_pred.chunk(3, dim=0)
        noise_pred = (
            noise_pred_uncond
            + guidance_scale * (noise_pred_text - noise_pred_uncond)
            - guidance_scale * (noise_pred_negative - noise_pred_uncond)
        )

        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # scale and decode the image latents with the VAE
    latents = 1 / 0.18215 * latents
    with torch.no_grad():
        image = vae.decode(latents).sample

    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(frame) for frame in images]
    plt.imshow(pil_images[0])



In [None]:
negative_prompt=[" sky"]
question_18(negative_prompt,prompt)
plt.title(f"negative prompt: {negative_prompt[0]}")
# No legend: the figure holds a single image and no labelled artists, so
# plt.legend() would only emit a warning.
plt.show()

In [None]:
negative_prompt=["sky in the upper left "]
question_18(negative_prompt,prompt)
plt.title(f"negative prompt: {negative_prompt[0]}")
# No legend: the figure holds a single image and no labelled artists, so
# plt.legend() would only emit a warning.
plt.show()

# Question 22


In [None]:
path_to_utils = 'pyfiles'
import sys
import os
sys.path.append(path_to_utils)
import nmt_dataset
import nnet_models
import numpy as np
import torch

import time
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from subword_nmt.apply_bpe import BPE
%matplotlib inline

In [None]:
# Locations and language pair used by the NMT cells below.
data_dir = 'data'
source_lang = 'en'
target_lang = 'fr'
model_dir = f'models/{source_lang}-{target_lang}'
! bash download-data.sh

In [None]:
def reset_seed(seed=1234):
    """Seed NumPy's and PyTorch's global random number generators with ``seed``."""
    for seed_fn in (np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [None]:
bpe_path = os.path.join(data_dir, 'bpecodes.de-en-fr')

# Read the BPE codes as UTF-8 explicitly: the file covers German and French,
# and the platform default encoding is not UTF-8 everywhere.
with open(bpe_path, encoding='utf-8') as bpe_codes:
    bpe_model = BPE(bpe_codes)

def preprocess(line, is_source=True, source_lang=None, target_lang=None):
    """Lower-case ``line`` and split it into BPE units with ``bpe_model``.

    ``is_source``, ``source_lang`` and ``target_lang`` are accepted but not
    used by this version.
    """
    lowered = line.lower()
    return bpe_model.segment(lowered)

def postprocess(line):
    """Undo BPE segmentation by joining units marked with a trailing ``@@``.

    Besides the ``'@@ '`` separators inside the line, a dangling ``@@`` on the
    very last unit (for example when a prediction was cut off mid-word) is
    removed as well, so no marker leaks into the detokenized text.
    """
    merged = line.replace('@@ ', '')
    if merged.endswith('@@'):
        merged = merged[:-2]
    return merged

def load_data(source_lang, target_lang, split='train', max_size=None):
    """Load one split of a language pair through ``nmt_dataset.load_dataset``.

    The file prefix is ``<split>.<a>-<b>`` inside ``data_dir``, where ``a`` and
    ``b`` are the two language codes in sorted order, so both translation
    directions read the same files.

    max_size: max number of sentence pairs in the training corpus (None = all);
    set it to 10000 for fast debugging.
    """
    first, second = sorted([source_lang, target_lang])
    path = os.path.join(data_dir, '{}.{}-{}'.format(split, first, second))
    return nmt_dataset.load_dataset(
        path,
        source_lang,
        target_lang,
        preprocess=preprocess,
        max_size=max_size,
    )

In [None]:
def save_model(model, checkpoint_path):
    """Serialize ``model`` to ``checkpoint_path`` with ``torch.save``.

    The parent directory is created first when the path has one.
    """
    parent = os.path.split(checkpoint_path)[0]
    if parent:
        os.makedirs(parent, exist_ok=True)
    torch.save(model, checkpoint_path)

def train_model(
        train_iterator,
        valid_iterators,
        model,
        checkpoint_path,
        epochs=1,
        validation_frequency=1
    ):
    """
    Train `model`, validate it periodically and checkpoint the best state.

    train_iterator: instance of nmt_dataset.BatchIterator or nmt_dataset.MultilingualBatchIterator
    valid_iterators: list of nmt_dataset.BatchIterator
    model: instance of nnet_models.EncoderDecoder
    checkpoint_path: path of the model checkpoint
    epochs: iterate this many times over train_iterator
    validation_frequency: validate the model every N epochs

    Validation translations are written to
    `<model_dir>/valid.<src>-<tgt>.<epoch>.out`, using the module-level `model_dir`.
    """

    reset_seed()

    best_bleu = -1
    for epoch in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        print('Epoch: [{}/{}]'.format(epoch, epochs))

        # Iterate over training batches for one epoch
        for batch in tqdm(train_iterator, total=len(train_iterator)):
            running_loss += model.train_step(batch)

        # Average training loss for this epoch
        epoch_loss = running_loss / len(train_iterator)

        print("loss={:.3f}, time={:.2f}".format(epoch_loss, time.time() - start))
        sys.stdout.flush()

        # Evaluate and save the model
        if epoch % validation_frequency == 0:
            bleu_scores = []

            # The output directory is not necessarily the checkpoint directory,
            # so make sure it exists before writing into it.
            os.makedirs(model_dir, exist_ok=True)

            # Compute BLEU over all validation sets
            for valid_iterator in valid_iterators:
                src, tgt = valid_iterator.source_lang, valid_iterator.target_lang
                translation_output = model.translate(valid_iterator, postprocess)
                bleu_score = translation_output.score
                output = translation_output.output

                out_path = os.path.join(model_dir, 'valid.{}-{}.{}.out'.format(src, tgt, epoch))
                # Explicit UTF-8: the translations contain non-ASCII characters.
                with open(out_path, 'w', encoding='utf-8') as f:
                    f.writelines(line + '\n' for line in output)

                print('{}-{}: BLEU={}'.format(src, tgt, bleu_score))
                sys.stdout.flush()
                bleu_scores.append(bleu_score)

            # Average the validation BLEU scores
            bleu_score = round(sum(bleu_scores) / len(bleu_scores), 2)
            if len(bleu_scores) > 1:
                print('BLEU={}'.format(bleu_score))

            # Update the model's learning rate based on current performance.
            # This scheduler divides the learning rate by 10 if BLEU does not improve.
            model.scheduler_step(bleu_score)

            # Save a model checkpoint if it has the best validation BLEU so far
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                save_model(model, checkpoint_path)

        print('=' * 50)

    print("Training completed. Best BLEU is {}".format(best_bleu))

In [None]:
def get_binned_bleu_scores(model, valid_iterator):
    """Compute and plot BLEU scores of `model` per source-length bin.

    The bins are the half-open intervals (lengths[i-1], lengths[i]] over
    `np.arange(4, 20, 3)`. Each bin is translated once, from the data and the
    language pair of `valid_iterator` itself. The batch size is the
    module-level `batch_size`.

    Returns `(lengths, bleu_scores)`, where `lengths` holds the upper bound of
    every bin and `bleu_scores` the matching score.
    """
    lengths = np.arange(4, 20, 3)
    bleu_scores = np.zeros(len(lengths))

    # Filter the iterator's own data: the mask is built from this frame, so
    # applying it to any other dataset would select the wrong rows.
    data = valid_iterator.data
    src, tgt = valid_iterator.source_lang, valid_iterator.target_lang

    for i in tqdm(range(1, len(lengths)), total=len(lengths) - 1):
        min_len = lengths[i - 1]
        max_len = lengths[i]

        in_bin = (data['source_len'] > min_len) & (data['source_len'] <= max_len)
        tmp_iterator = nmt_dataset.BatchIterator(data[in_bin], src, tgt, batch_size, max_len=max_len)

        # Translate once and reuse the result for both the score and the printout.
        translation_output = model.translate(tmp_iterator, postprocess)
        bleu_scores[i] = translation_output.score
        print(translation_output)

    # Index 0 is only the lower bound of the first bin and has no score.
    lengths = lengths[1:]
    bleu_scores = bleu_scores[1:]

    plt.plot(lengths, bleu_scores, 'x-')
    plt.ylim(0, np.max(bleu_scores) + 1)
    plt.xlabel('Source length')
    plt.ylabel('BLEU score')

    return lengths, bleu_scores


def show_attention(input_sentence, output_words, attentions):
    """Plot an encoder-decoder attention matrix as a heat map.

    Columns are labelled with the space-separated tokens of `input_sentence`
    and rows with those of `output_words`, each followed by the EOS token.
    """
    figure = plt.figure()
    axis = figure.add_subplot(111)
    heatmap = axis.matshow(attentions, cmap='bone', aspect='auto')
    figure.colorbar(heatmap)

    # The leading empty string is the first tick label on each axis.
    column_labels = [''] + input_sentence.split(' ') + [nmt_dataset.EOS_TOKEN]
    row_labels = [''] + output_words.split(' ') + [nmt_dataset.EOS_TOKEN]
    axis.set_xticklabels(column_labels, rotation=90)
    axis.set_yticklabels(row_labels)

    # One tick per matrix row/column
    for axis_obj in (axis.xaxis, axis.yaxis):
        axis_obj.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def encode_as_batch(sentence, dictionary, source_lang, target_lang):
    """Wrap one tokenized sentence into a batch dictionary of size one.

    The EOS token is appended to `sentence`, the text is converted with
    `dictionary.txt2vec`, and a leading batch dimension is added.
    """
    with_eos = sentence + ' ' + nmt_dataset.EOS_TOKEN
    tensor = dictionary.txt2vec(with_eos).unsqueeze(0)
    length = torch.from_numpy(np.array([tensor.shape[-1]]))

    batch = {
        'source': tensor,
        'source_len': length,
        'source_lang': source_lang,
        'target_lang': target_lang
    }
    return batch


def get_translation(model, sentence, dictionary, source_lang, target_lang, return_output=False):
    """Translate `sentence` with `model`, print the result and optionally return it.

    When a module-level `translator` object exists, its translation of the
    source and its back-translation of the prediction are printed for
    comparison; otherwise that comparison is skipped instead of raising.

    Returns a dict with the keys 'source', 'source_tokens', 'prediction_detok'
    and 'prediction_tokens' (plus 'attention_matrix' and
    'encoder_self_attention_list' when the model provides them) if
    `return_output` is true, else None.
    """
    print('Source:', sentence)
    sentence_tok = preprocess(sentence, is_source=True, source_lang=source_lang, target_lang=target_lang)
    print('Tokenized source:', sentence_tok)
    batch = encode_as_batch(sentence_tok, dictionary, source_lang, target_lang)
    prediction, attn_matrix, enc_self_attn = model.eval_step(batch)
    prediction = prediction[0]
    prediction_detok = postprocess(prediction)
    print('Prediction:', prediction)
    print('Detokenized prediction:', prediction_detok)

    # NOTE(review): `translator` is not defined in the visible part of this
    # file; it is presumably a googletrans-style client created elsewhere.
    try:
        google = translator
    except NameError:
        google = None

    if google is None:
        print('Google Translate comparison skipped: no `translator` object is defined.')
    else:
        print('Google Translate ({}->{}): {}'.format(
            source_lang,
            target_lang,
            google.translate(sentence, src=source_lang, dest=target_lang).text
        ))

        print('Google Translate on prediction ({}->{}): {}'.format(
            target_lang,
            source_lang,
            google.translate(prediction_detok, src=target_lang, dest=source_lang).text
        ))

    results = {
        'source': sentence,
        'source_tokens': sentence_tok.split(' ') + ['<eos>'],
        'prediction_detok': prediction_detok,
        'prediction_tokens': prediction.split(' '),
    }

    if attn_matrix is not None:
        attn_matrix = attn_matrix[0].detach().cpu().numpy()
        results['attention_matrix'] = attn_matrix
        show_attention(sentence_tok, prediction, attn_matrix)

    if enc_self_attn is not None:
        results['encoder_self_attention_list'] = enc_self_attn

    if return_output:
        return results

In [None]:
multi_model_dir = os.path.join("models", "de-en-fr")
import pandas as pd

# Load the training data of every translation direction that is trained later
# on. Building the dictionaries from de->en, fr->en and de->fr alone would
# leave English out of the source vocabulary and German out of the target one.
train_data_de_en = load_data("de", "en", "train", max_size=10000)
train_data_fr_en = load_data("fr", "en", "train", max_size=10000)
train_data_de_fr = load_data("de", "fr", "train", max_size=10000)
train_data_en_de = load_data("en", "de", "train", max_size=10000)
train_data_en_fr = load_data("en", "fr", "train", max_size=10000)
train_data_fr_de = load_data("fr", "de", "train", max_size=10000)

# NOTE(review): the "<lang:xx>" source prefix is added by a `preprocess`
# defined in a later cell. If that tag has to be part of the source
# dictionary, that definition must be in effect before this cell runs.

# Combine the tokenized data from all datasets
combined_source_tokenized = pd.concat(
    [
        frame["source_tokenized"]
        for frame in (
            train_data_de_en,
            train_data_fr_en,
            train_data_de_fr,
            train_data_en_de,
            train_data_en_fr,
            train_data_fr_de,
        )
    ]
)
combined_target_tokenized = pd.concat(
    [
        frame["target_tokenized"]
        for frame in (
            train_data_de_en,
            train_data_fr_en,
            train_data_de_fr,
            train_data_en_de,
            train_data_en_fr,
            train_data_fr_de,
        )
    ]
)

# Source and target vocabularies shared by all models below
source_dict = nmt_dataset.load_or_create_dictionary(
    multi_model_dir + "/src_dict.txt",
    combined_source_tokenized,
    minimum_count=10,
    reset=True
)

target_dict = nmt_dataset.load_or_create_dictionary(
    multi_model_dir + "/tgt_dict.txt",
    combined_target_tokenized,
    minimum_count=10,
    reset=True
)



In [None]:
def _build_transformer_model(encoder_layers, **norm_kwargs):
    """Build a fresh encoder, decoder and EncoderDecoder wrapper.

    `encoder_layers` is the number of encoder layers; `norm_kwargs` is passed
    unchanged to both the encoder and the decoder (used for
    `normalize_before=True`). Every call creates its own decoder, so no two
    models share trainable weights.
    """
    encoder = nnet_models.TransformerEncoder(
        input_size=len(source_dict), hidden_size=512, num_layers=encoder_layers, dropout=0.1, heads=8,
        **norm_kwargs
    )
    decoder = nnet_models.TransformerDecoder(
        output_size=len(target_dict), hidden_size=512, num_layers=1, heads=8, dropout=0.1,
        **norm_kwargs
    )
    model = nnet_models.EncoderDecoder(
        encoder,
        decoder,
        lr=0.001,
        # Only request CUDA when a GPU is actually present.
        use_cuda=torch.cuda.is_available(),
        target_dict=target_dict,
    )
    return encoder, decoder, model


# 3-layer encoder
(
    multi_transformer_encoder,
    multi_transformer_decoder,
    multi_transformer_model,
) = _build_transformer_model(3)

# 3-layer encoder, normalize_before=True
(
    normed_transformer_encoder,
    normed_transformer_decoder,
    normed_transformer_model,
) = _build_transformer_model(3, normalize_before=True)

# 1-layer ("shallow") encoder
(
    sh_multi_transformer_encoder,
    sh_multi_transformer_decoder,
    sh_multi_transformer_model,
) = _build_transformer_model(1)

# 1-layer ("shallow") encoder, normalize_before=True
(
    sh_normed_transformer_encoder,
    sh_normed_transformer_decoder,
    sh_normed_transformer_model,
) = _build_transformer_model(1, normalize_before=True)

## Multilingual evaluation

In [None]:
# Batching limits shared by every iterator created below.
batch_size = 512  # at most 512 tokens per batch (lower it on OOM errors, raise it to speed up training)
max_len = 30  # at most 30 tokens per sentence (longer sequences are truncated)

In [None]:
def preprocess(line, is_source=True, source_lang=None, target_lang=None):
    """Lower-case and BPE-segment ``line``; tag source lines with the target language.

    Source lines are prefixed with ``<lang:TARGET>`` followed by a space;
    target lines are returned as segmented. ``source_lang`` is not used.
    """
    segmented = bpe_model.segment(line.lower())
    if not is_source:
        return segmented
    return "<lang:{}> {}".format(target_lang, segmented)


# One BatchIterator per translation direction and per split; the three lists
# are filled in the order of the language pairs below.
test_iterators = []
valid_iterators = []
train_iterators = []

# All six directions between English, French and German.
for src, tgt in (
    ("en", "fr"),
    ("fr", "en"),
    ("en", "de"),
    ("de", "en"),
    ("de", "fr"),
    ("fr", "de"),
):
    # Test split: full data, converted with the shared dictionaries.
    dataset = load_data(src, tgt, "test")
    nmt_dataset.binarize(
        dataset, source_dict=source_dict, target_dict=target_dict, sort=False
    )
    test_iterators.append(
        nmt_dataset.BatchIterator(
            dataset, src, tgt, batch_size=batch_size, max_len=max_len, shuffle=True
        )
    )
    # Training split: capped at 10000 sentence pairs per direction.
    train_data = load_data(src, tgt, "train", max_size=10000)
    nmt_dataset.binarize(
        train_data, source_dict=source_dict, target_dict=target_dict, sort=False
    )
    train_iterators.append(
        nmt_dataset.BatchIterator(
            train_data, src, tgt, batch_size=batch_size, max_len=max_len, shuffle=True
        )
    )

    # Validation split. NOTE: `valid_data` stays bound to the last pair
    # (fr -> de) once the loop has finished.
    valid_data = load_data(src, tgt, "valid")
    nmt_dataset.binarize(
        valid_data, source_dict=source_dict, target_dict=target_dict, sort=False
    )
    valid_iterators.append(
        nmt_dataset.BatchIterator(
            valid_data, src, tgt, batch_size=batch_size, max_len=max_len, shuffle=True
        )
    )

# Replace the list of per-direction training iterators with a single multilingual iterator.
train_iterators = nmt_dataset.MultilingualBatchIterator(train_iterators)

In [None]:
from collections import defaultdict


def save_model(model, checkpoint_path):
    """Write ``model`` to ``checkpoint_path`` via ``torch.save``.

    Missing parent directories are created; a bare file name (no directory
    part) is written as is.
    """
    target_dir = os.path.dirname(checkpoint_path)
    has_directory = len(target_dir) > 0
    if has_directory:
        os.makedirs(target_dir, exist_ok=True)
    torch.save(model, checkpoint_path)

def train_model(
        train_iterator,
        valid_iterators,
        model,
        checkpoint_path,
        epochs=1,
        validation_frequency=1
    ):
    """
    Train `model`, validate it periodically, checkpoint the best state and
    return the per-direction BLEU history.

    train_iterator: instance of nmt_dataset.BatchIterator or nmt_dataset.MultilingualBatchIterator
    valid_iterators: list of nmt_dataset.BatchIterator
    model: instance of nnet_models.EncoderDecoder
    checkpoint_path: path of the model checkpoint
    epochs: iterate this many times over train_iterator
    validation_frequency: validate the model every N epochs

    Validation translations are written to
    `<model_dir>/valid.<src>-<tgt>.<epoch>.out`, using the module-level `model_dir`.

    Returns a dict mapping (source_lang, target_lang) to the list of BLEU
    scores, one entry per validation pass.
    """

    reset_seed()

    best_bleu = -1
    bleus = defaultdict(list)
    for epoch in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        print('Epoch: [{}/{}]'.format(epoch, epochs))

        # Iterate over training batches for one epoch
        for batch in tqdm(train_iterator, total=len(train_iterator)):
            running_loss += model.train_step(batch)

        # Average training loss for this epoch
        epoch_loss = running_loss / len(train_iterator)

        print("loss={:.3f}, time={:.2f}".format(epoch_loss, time.time() - start))
        sys.stdout.flush()

        # Evaluate and save the model
        if epoch % validation_frequency == 0:
            bleu_scores = []

            # The output directory is not necessarily the checkpoint directory,
            # so make sure it exists before writing into it.
            os.makedirs(model_dir, exist_ok=True)

            # Compute BLEU over all validation sets
            for valid_iterator in valid_iterators:
                src, tgt = valid_iterator.source_lang, valid_iterator.target_lang
                translation_output = model.translate(valid_iterator, postprocess)
                bleu_score = translation_output.score
                output = translation_output.output
                print(output)

                out_path = os.path.join(model_dir, 'valid.{}-{}.{}.out'.format(src, tgt, epoch))
                # Explicit UTF-8: the translations contain non-ASCII characters.
                with open(out_path, 'w', encoding='utf-8') as f:
                    f.writelines(line + '\n' for line in output)

                print('{}-{}: BLEU={}'.format(src, tgt, bleu_score))
                bleus[(src, tgt)].append(bleu_score)
                sys.stdout.flush()
                bleu_scores.append(bleu_score)

            # Average the validation BLEU scores
            bleu_score = round(sum(bleu_scores) / len(bleu_scores), 2)
            if len(bleu_scores) > 1:
                print('BLEU={}'.format(bleu_score))

            # Update the model's learning rate based on current performance.
            # This scheduler divides the learning rate by 10 if BLEU does not improve.
            model.scheduler_step(bleu_score)

            # Save a model checkpoint if it has the best validation BLEU so far
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                save_model(model, checkpoint_path)

        print('=' * 50)

    print("Training completed. Best BLEU is {}".format(best_bleu))
    return bleus

In [None]:
# Averaged BLEU history per model name, filled by the training loop.
model_bleus = dict()

# Models compared in the experiments, keyed by the name used in plots and checkpoint files.
models = {
    "transformer": multi_transformer_model,
    "norm_transformer": normed_transformer_model,
    "shallow_norm": sh_normed_transformer_model,
    "shallow_transformer": sh_multi_transformer_model,
}
def average_coefficients(data_dict):
    """Average several score lists position by position.

    Args:
        data_dict: mapping from any key to a list of numbers. The lists are
            expected to have the same length; if they do not, the result is
            truncated to the shortest one.

    Returns:
        A list whose i-th entry is the mean of the i-th entries of all lists.
        An empty mapping yields an empty list.
    """
    if not data_dict:
        return []

    series = list(data_dict.values())
    return [sum(column) / len(series) for column in zip(*series)]

# Train every model in `models` and keep its BLEU history averaged over all
# translation directions. The loop's own `model` is passed to train_model so
# each entry is trained and checkpointed under its own name.
for name, model in models.items():
    checkpoint_path = os.path.join(multi_model_dir, f"{name}.pt")
    bleus = train_model(
        train_iterators,
        valid_iterators,
        model,
        epochs=10,
        checkpoint_path=checkpoint_path,
    )
    model_bleus[name] = average_coefficients(bleus)

## Results of the training w.r.t. the epoch

In [None]:
# One curve per model. Scores are recorded once per validated epoch starting
# with epoch 1, so the x axis is numbered from 1 rather than from 0.
for model_name, average_bleu_scores in model_bleus.items():
    epochs_axis = range(1, len(average_bleu_scores) + 1)
    plt.plot(epochs_axis, average_bleu_scores, '--x', label=model_name)
plt.xlabel('Epochs')
plt.ylabel('BLEU score')
plt.legend()


## Focus on the best models

In [None]:
# Same plot without the "transformer" model. Scores are recorded once per
# validated epoch starting with epoch 1, so the x axis is numbered from 1.
for model_name, average_bleu_scores in model_bleus.items():
    if model_name == "transformer":
        continue
    epochs_axis = range(1, len(average_bleu_scores) + 1)
    plt.plot(epochs_axis, average_bleu_scores, '--x', label=model_name)
plt.xlabel('Epochs')
plt.ylabel('BLEU score')
plt.legend()

## Plot w.r.t. source length

In [None]:
# BLEU per source-length bin for each model, averaged over all validation iterators.
averages = {}

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    bleu_scores = []
    for iterator in valid_iterators:
        # `transformer_lengths` (the bin bounds) is reused by the plotting cells.
        transformer_lengths, bleu_score = get_binned_bleu_scores(model, iterator)
        bleu_scores.append(bleu_score)

    # mean over iterators, one value per bin
    average_bleu_scores = np.mean(bleu_scores, axis=0)
    averages[model_name] = average_bleu_scores
    print(f"Average BLEU scores for {model_name}: {average_bleu_scores}")

In [None]:
# Averaged BLEU against source length, one curve per model.
for model_name, average_bleu_scores in averages.items():
    plt.plot(
        transformer_lengths,
        average_bleu_scores,
        '--x',
        label=model_name,
    )
plt.ylabel('BLEU score')
plt.xlabel('Source length')
plt.legend()


In [None]:
# Averaged BLEU against source length, leaving out the two shallow models.
for model_name, average_bleu_scores in averages.items():
    if model_name not in ("shallow_norm", "shallow_transformer"):
        plt.plot(transformer_lengths, average_bleu_scores, '--x', label=model_name)
plt.xlabel('Source length')
plt.ylabel('BLEU score')
plt.legend()
plt.show()
