# Task 2 Demo

In [1]:
# Importing libraries
import pandas as pd
import math
import torch
from torch import nn
from transformers import BertTokenizer
from datasets import load_dataset
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pickle
import io

In [2]:
# Getting data
INPUT_PATH = 'input.csv'
OUTPUT1_PATH = 'output1.csv'
OUTPUT2_PATH = 'output2.csv'
OUTPUT3_PATH = 'output3.csv'

input_data = pd.read_csv(INPUT_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'input.csv'

In [5]:
# Device to mps
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [3]:
# Device to gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Model 2a

In [16]:
# Getting model 1

# Load datasets
train_dataset = load_dataset('wmt16', 'de-en', split='train[:50000]')
valid_dataset = load_dataset('wmt16', 'de-en', split='validation')
test_dataset = load_dataset('wmt16', 'de-en', split='test')

# special symbols
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

de_tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
en_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


def tokenize_and_numericalize(example):
    src_tokens = de_tokenizer.tokenize(example['translation']['de'])
    tgt_tokens = en_tokenizer.tokenize(example['translation']['en'])
    src_indices = de_tokenizer.convert_tokens_to_ids(src_tokens)
    tgt_indices = en_tokenizer.convert_tokens_to_ids(tgt_tokens)
    return src_indices, tgt_indices


train_data = [(tokenize_and_numericalize(example)) for example in train_dataset]
valid_data = [(tokenize_and_numericalize(example)) for example in valid_dataset]
test_data = [(tokenize_and_numericalize(example)) for example in test_dataset]


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute positional encodings
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model) -> (max_len, d_model, 1)

        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, d_model]
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


# Hyperparameters
SRC_VOCAB_SIZE = de_tokenizer.vocab_size
TGT_VOCAB_SIZE = en_tokenizer.vocab_size
D_MODEL = 512
NHEAD = 8
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DIM_FEEDFORWARD = 2048
DROPOUT = 0.1
LEARNING_RATE = 0.0001
BATCH_SIZE = 8
NUM_EPOCHS = 5


class Seq2SeqTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout=0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        self.generator = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        output = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                  src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask)
        output = self.generator(output)
        return output


model2a = Seq2SeqTransformer(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, D_MODEL, NHEAD,
                            NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, DIM_FEEDFORWARD, DROPOUT)
model2a.load_state_dict(torch.load("model2A.pth", map_location='cpu'))

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


def generate_translations_2a(german_sentence, model_2a):
    model_2a.eval()  # Ensure the model is in evaluation mode.

    # Tokenize German sentence
    src_tokens = de_tokenizer.tokenize(german_sentence)
    src_indices = de_tokenizer.convert_tokens_to_ids(src_tokens)

    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(1).to(device)
    tgt_tensor = torch.tensor([[BOS_IDX]], dtype=torch.long).to(device)

    # Accumulate tokens of the translation here
    generated_tokens = []

    with torch.no_grad():
        # We don't know the sentence length, so we assume some max_length to stop at.
        for _ in range(100):  # Max length
            output = model_2a(src_tensor, tgt_tensor)
            next_token_id = output.argmax(dim=-1)[-1, :].item()  # Take the most likely next token ID
            if next_token_id == EOS_IDX:
                break  # Stop if EOS token is generated
            generated_tokens.append(next_token_id)
            tgt_tensor = torch.cat((tgt_tensor, torch.tensor([[next_token_id]], dtype=torch.long).to(device)), dim=0)

    # Decode the generated tokens into a string, skipping special tokens like EOS or BOS
    translation_2a = en_tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return translation_2a

# generate_translations_2a("Wiederaufnahme der Sitzungsperiode", model2a)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [22]:
# Predicting
predictions = [generate_translations_2a(sentence, model2a) for sentence in input_data['de']]
output_data1 = input_data.copy()
output_data1['en'] = predictions
output_data1.to_csv(OUTPUT1_PATH, index=False)

## Model 2b

In [11]:
# Getting model 2
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(device)

def generate_translations(data):
    translations = []
    for example in tqdm(data, desc="Translating"):
        input_text = "translate English to German: " + example
        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
        outputs = model.generate(inputs.input_ids,max_length=500, attention_mask=inputs.attention_mask)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translated_text)
    return translations

test_translations = generate_translations(input_data["en"].values)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Translating: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


In [13]:
# Saving the output
output_data2 = input_data.copy()
output_data2["de"] = test_translations
output_data2.to_csv(OUTPUT2_PATH, index=False)

## Model 2c

In [19]:
# Getting model 3
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

class CPU_Unpickler(pickle.Unpickler):
    """
    Unpickler that overrides the default find_class method to load torch tensors on the CPU
    Saved the model using pickle while running on GPU on kaggel, so need to load it on CPU

    Reference: https://github.com/pytorch/pytorch/issues/16797
    """
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)

with open("translation_model.pkl", "rb") as f:
    # model = pickle.load(f)
    model = CPU_Unpickler(f).load()
    model.to(device)

def generate_translations(data):
    translations = []
    for example in tqdm(data, desc="Translating"):
        input_text = "translate German to English: " + example
        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
        outputs = model.generate(inputs.input_ids,max_length=500, attention_mask=inputs.attention_mask)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translated_text)
    return translations

test_translations = generate_translations(input_data["de"].values)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Translating: 100%|██████████| 1/1 [00:00<00:00,  3.23it/s]


In [22]:
# Saving the output
output_data3 = input_data.copy()
output_data3["en"] = test_translations
output_data3.to_csv(OUTPUT3_PATH, index=False)