## Imports

In [1]:
import numpy as np
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

## Data loading

In [2]:
italian_file_path = 'europarl-v7.it-en.it'  # Adjust the path if your file is in a different directory
english_file_path = 'europarl-v7.it-en.en'

def load_sentences(file_path):
    with open(file_path, encoding='utf-8') as file:
        sentences = file.read().split('\n')
    return sentences

italian_sentences = load_sentences(italian_file_path)
english_sentences = load_sentences(english_file_path)

# Test data loading
print(f"Total Italian sentences: {len(italian_sentences)}")
print(f"Total English sentences: {len(english_sentences)}")

# Print the first 5 sentences in both languages to check
for i in range(5):
    print(f"Italian sentence {i+1}: {italian_sentences[i]}")
    print(f"English sentence {i+1}: {english_sentences[i]}\n")


Total Italian sentences: 1909116
Total English sentences: 1909116
Italian sentence 1: Ripresa della sessione
English sentence 1: Resumption of the session

Italian sentence 2: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
English sentence 2: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Italian sentence 3: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
English sentence 3: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disa

## Dataset class

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, src_file, trg_file, src_tokenizer, trg_tokenizer):
        self.src_sentences = open(src_file, encoding='utf-8').read().split('\n')
        self.trg_sentences = open(trg_file, encoding='utf-8').read().split('\n')
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src_sample = self.src_tokenizer(self.src_sentences[idx])
        trg_sample = self.trg_tokenizer(self.trg_sentences[idx])
        return torch.tensor(src_sample), torch.tensor(trg_sample)

## Tokenization and vocabulary building

In [None]:
def yield_tokens(data_iter, tokenizer):
    for sentence in data_iter:
        yield tokenizer(sentence)

src_tokenizer = get_tokenizer('spacy', language='it_core_news_sm')
trg_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Load sentences (for building vocab)
src_sentences = open('europarl-v7.it-en.it', encoding='utf-8').read().split('\n')
trg_sentences = open('europarl-v7.it-en.en', encoding='utf-8').read().split('\n')

# Build vocabularies
src_vocab = build_vocab_from_iterator(yield_tokens(src_sentences, src_tokenizer), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
trg_vocab = build_vocab_from_iterator(yield_tokens(trg_sentences, trg_tokenizer), specials=["<unk>", "<pad>", "<bos>", "<eos>"])

src_vocab.set_default_index(src_vocab["<unk>"])
trg_vocab.set_default_index(trg_vocab["<unk>"])

## Collate function

In [None]:
def collate_fn(batch):
    src_batch, trg_batch = [], []
    for src_item, trg_item in batch:
        src_batch.append(torch.cat([torch.tensor([src_vocab["<bos>"]]), src_item, torch.tensor([src_vocab["<eos>"]])], dim=0))
        trg_batch.append(torch.cat([torch.tensor([trg_vocab["<bos>"]]), trg_item, torch.tensor([trg_vocab["<eos>"]])], dim=0))
    
    src_batch = pad_sequence(src_batch, padding_value=src_vocab["<pad>"])
    trg_batch = pad_sequence(trg_batch, padding_value=trg_vocab["<pad>"])
    return src_batch, trg_batch


## Data loader

In [None]:
BATCH_SIZE = 128

# Assuming you have defined src_file and trg_file paths
dataset = TranslationDataset('europarl-v7.it-en.it', 'europarl-v7.it-en.en', src_tokenizer, trg_tokenizer)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
