## Data

In [1]:
import nltk
import numpy as np
from collections import defaultdict, Counter
import pickle
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ZBOOK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
italian_file_path = 'europarl-v7.it-en.it'
english_file_path = 'europarl-v7.it-en.en'

def load_sentences(file_path):
    with open(file_path, encoding='utf-8') as file:
        sentences = file.read().split('\n')
    return sentences

italian_sentences = load_sentences(italian_file_path)
english_sentences = load_sentences(english_file_path)

# Test data loading
print(f"Total Italian sentences: {len(italian_sentences)}")
print(f"Total English sentences: {len(english_sentences)}")

# Print the first 5 sentences in both languages to check
for i in range(5):
    print(f"Italian sentence {i+1}: {italian_sentences[i]}")
    print(f"English sentence {i+1}: {english_sentences[i]}\n")

Total Italian sentences: 1909116
Total English sentences: 1909116
Italian sentence 1: Ripresa della sessione
English sentence 1: Resumption of the session

Italian sentence 2: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
English sentence 2: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Italian sentence 3: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
English sentence 3: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disa

## Preprocessing

In [4]:
def tokenize_sentences(sentences):
    """Tokenizes every sentence."""
    return [nltk.word_tokenize(sentence.lower()) for sentence in sentences]


def build_vocab(tokenized_sentences, min_frequency=256):
    """Builds a vocabulary with tokens appearing at least min_frequency times, including special tokens."""
    # Count the frequency of each token across all sentences
    token_freqs = Counter(token for sentence in tokenized_sentences for token in sentence)
    
    # Filter tokens by frequency threshold
    tokens = [token for token, freq in token_freqs.items() if freq >= min_frequency]
    
    # Create a vocabulary with special tokens
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    
    # Assign indices to filtered tokens
    for index, token in enumerate(tokens, start=len(vocab)):
        vocab[token] = index
    
    return vocab


def tokens_to_indices(tokenized_sentences, vocab):
    """Converts tokens to indices based on a vocabulary, using <unk> for unknown tokens."""
    return [[vocab.get(token, vocab["<unk>"]) for token in sentence] for sentence in tokenized_sentences]

def pad_sequences(sequences, vocab, max_len=32, padding_value=0, ):
    """Pads sequences to a specified maximum length."""
    # Truncate longer sequences and pad shorter ones
    return np.array([[vocab["<sos>"]] + seq[:max_len - 2] + [vocab["<eos>"]] + [padding_value] * max(0, max_len - (len(seq) + 2)) for seq in sequences])

def save_processed_data(filename, data):
    """Saves processed data to a file using pickle."""
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

def load_processed_data(filename):
    """Loads processed data from a pickle file."""
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [5]:
try:
    print("Trying to load data...")

    # Attempt to load pre-processed data if available
    italian_tokenized = load_processed_data('italian_tokenized.pkl')
    english_tokenized = load_processed_data('english_tokenized.pkl')
    italian_vocab = load_processed_data('italian_vocab.pkl')
    english_vocab = load_processed_data('english_vocab.pkl')
    italian_padded = load_processed_data('italian_padded.pkl')
    english_padded = load_processed_data('english_padded.pkl')

    print("Succesfully loaded data!")

except (FileNotFoundError, IOError):

    print("Failed to load data.")

    # Pre-process data
    print("Tokenizing sentences...")
    italian_tokenized = tokenize_sentences(italian_sentences)
    english_tokenized = tokenize_sentences(english_sentences)
    print("Sentences tokenized.")
    
    save_processed_data('italian_tokenized.pkl', italian_tokenized)
    save_processed_data('english_tokenized.pkl', english_tokenized)

    print("Building vocab...")
    italian_vocab = build_vocab(italian_tokenized)
    english_vocab = build_vocab(english_tokenized)
    print("Vocab built.")
    
    save_processed_data('italian_vocab.pkl', italian_vocab)
    save_processed_data('english_vocab.pkl', english_vocab)

    print("Tokens to indices...")
    italian_indices = tokens_to_indices(italian_tokenized, italian_vocab)
    english_indices = tokens_to_indices(english_tokenized, english_vocab)
    print("Done.")
    
    print("Padding sequences...")
    italian_padded = pad_sequences(italian_indices, italian_vocab)
    english_padded = pad_sequences(english_indices, english_vocab)
    print("Sentences padded.")
    
    save_processed_data('italian_padded.pkl', italian_padded)
    save_processed_data('english_padded.pkl', english_padded)

    print("Saved data for future use.")

Trying to load data...
Failed to load data.
Tokenizing sentences...
Sentences tokenized.
Building vocab...
Vocab built.
Tokens to indices...
Done.
Padding sequences...
Sentences padded.
Saved data for future use.


In [7]:
# TranslationDataset class for use with PyTorch
class TranslationDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.inputs[idx], dtype=torch.long),
            'target': torch.tensor(self.targets[idx], dtype=torch.long)
        }

# Splitting the dataset
train_inputs, test_inputs, train_targets, test_targets = train_test_split(italian_padded, english_padded, test_size=0.1)
train_inputs, val_inputs, train_targets, val_targets = train_test_split(train_inputs, train_targets, test_size=0.1)

# Create Dataset and DataLoader instances for PyTorch
train_dataset = TranslationDataset(train_inputs, train_targets)
val_dataset = TranslationDataset(val_inputs, val_targets)
test_dataset = TranslationDataset(test_inputs, test_targets)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
maximum = 0
for item in italian_vocab.items():
    if maximum < int(item[1]):
        maximum = int(item[1])

print(maximum)

10004


In [9]:
len(italian_vocab)

10005

In [8]:
lists = [[45, 8, 45, 7, 7, 12, 66], [12]  + [78] * 200]

pad_sequences(lists, english_vocab)



array([[ 1, 45,  8, 45,  7,  7, 12, 66,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
        78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,  2]])

## Model

In [10]:
import torch
import torch.nn as nn
import random

# Implementation:
# https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/more_advanced/Seq2Seq/seq2seq.py


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        return hidden, cell


class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        



        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english_vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

## Training

In [11]:
import torch.optim as optim

# Initialize model
encoder = Encoder(len(italian_vocab), embedding_size=64, hidden_size=128, num_layers=1, p=0.5).to(device)
decoder = Decoder(len(english_vocab), embedding_size=64, hidden_size=128, output_size=len(english_vocab), num_layers=1, p=0.5).to(device)

# Initialize Seq2Seq model
model = Seq2Seq(encoder, decoder).to(device)

# Loss function
pad_idx = 0
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
def train(model, iterator, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        if i % 100 == 0:
            print(f"batch {i} started...")
        src = batch['input'].to(device)
        trg = batch['target'].to(device)

        optimizer.zero_grad()

        output = model(src, trg)
        
        # Output shape: (trg_len, batch_size, output_dim)
        # Target shape: (trg_len, batch_size)

        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Evaluation
def evaluate(model, iterator, criterion):
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch['input'].to(device)
            trg = batch['target'].to(device)

            output = model(src, trg, 0)  # Turn off teacher forcing

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)



# Training loop
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_loader, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq.pt')

    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')


  from .autonotebook import tqdm as notebook_tqdm


batch 0 started...
batch 100 started...
batch 200 started...
batch 300 started...
batch 400 started...
batch 500 started...
batch 600 started...
batch 700 started...
batch 800 started...
batch 900 started...
batch 1000 started...
batch 1100 started...
batch 1200 started...
batch 1300 started...
batch 1400 started...
batch 1500 started...
batch 1600 started...
batch 1700 started...
batch 1800 started...
batch 1900 started...
batch 2000 started...
batch 2100 started...
batch 2200 started...
batch 2300 started...
batch 2400 started...
batch 2500 started...
batch 2600 started...
batch 2700 started...
batch 2800 started...
batch 2900 started...
batch 3000 started...
batch 3100 started...
batch 3200 started...
batch 3300 started...
batch 3400 started...
batch 3500 started...
batch 3600 started...
batch 3700 started...
batch 3800 started...
batch 3900 started...
batch 4000 started...
batch 4100 started...
batch 4200 started...
batch 4300 started...
batch 4400 started...
batch 4500 started...


KeyboardInterrupt: 

In [1]:
len(english_vocab)

NameError: name 'english_vocab' is not defined