## Imports

In [1]:
import nltk
import numpy as np
from collections import defaultdict

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data loading

In [2]:
italian_file_path = 'europarl-v7.it-en.it'  # Adjust the path if your file is in a different directory
english_file_path = 'europarl-v7.it-en.en'

def load_sentences(file_path):
    with open(file_path, encoding='utf-8') as file:
        sentences = file.read().split('\n')
    return sentences

italian_sentences = load_sentences(italian_file_path)
english_sentences = load_sentences(english_file_path)

# Test data loading
print(f"Total Italian sentences: {len(italian_sentences)}")
print(f"Total English sentences: {len(english_sentences)}")

# Print the first 5 sentences in both languages to check
for i in range(5):
    print(f"Italian sentence {i+1}: {italian_sentences[i]}")
    print(f"English sentence {i+1}: {english_sentences[i]}\n")

Total Italian sentences: 1909116
Total English sentences: 1909116
Italian sentence 1: Ripresa della sessione
English sentence 1: Resumption of the session

Italian sentence 2: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
English sentence 2: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Italian sentence 3: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
English sentence 3: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disa

## Data preprocessing

In [3]:
import nltk
import numpy as np
from collections import defaultdict, Counter
import pickle

nltk.download('punkt')  # Make sure NLTK datasets are downloaded

def tokenize_sentences(sentences, max_sentence_length=50):
    """Tokenizes sentences up to a maximum length, discarding longer ones."""
    return [nltk.word_tokenize(sentence.lower()) for sentence in sentences if len(sentence.split()) <= max_sentence_length]

def build_vocab(tokenized_sentences, min_frequency=1):
    """Builds a vocabulary with tokens appearing at least min_frequency times."""
    # Flatten the list of token lists and count token frequencies
    token_freqs = Counter(token for sentence in tokenized_sentences for token in sentence)
    # Include only tokens that appear at least min_frequency times
    vocab = {token: index for index, (token, freq) in enumerate(token_freqs.items(), start=1) if freq >= min_frequency}
    vocab["<pad>"] = 0  # Padding token
    return vocab

def tokens_to_indices(tokenized_sentences, vocab):
    """Converts tokens to indices based on a vocabulary."""
    return [[vocab.get(token, vocab["<pad>"]) for token in sentence] for sentence in tokenized_sentences]

def pad_sequences(sequences, max_len=None, padding_value=0):
    """Pads sequences to a specified maximum length."""
    if not max_len:
        max_len = max(len(seq) for seq in sequences)
    # Truncate longer sequences and pad shorter ones
    return np.array([seq[:max_len] + [padding_value] * max(0, max_len - len(seq)) for seq in sequences])

def save_processed_data(filename, data):
    """Saves processed data to a file using pickle."""
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

def load_processed_data(filename):
    """Loads processed data from a pickle file."""
    with open(filename, 'rb') as file:
        return pickle.load(file)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
try:
    print("Trying to load data...")

    # Attempt to load pre-processed data if available
    italian_tokenized = load_processed_data('italian_tokenized.pkl')
    english_tokenized = load_processed_data('english_tokenized.pkl')
    italian_vocab = load_processed_data('italian_vocab.pkl')
    english_vocab = load_processed_data('english_vocab.pkl')
    italian_padded = load_processed_data('italian_padded.pkl')
    english_padded = load_processed_data('english_padded.pkl')

    print("Succesfully loaded data!")

except (FileNotFoundError, IOError):

    print("Failed to load data.")

    # Pre-process data
    print("Tokenizing sentences...")
    italian_tokenized = tokenize_sentences(italian_sentences)
    english_tokenized = tokenize_sentences(english_sentences)
    print("Sentences tokenized.")
    
    print("Building vocab...")
    italian_vocab = build_vocab(italian_tokenized)
    english_vocab = build_vocab(english_tokenized)
    print("Vocab built.")
    
    print("Tokens to indices...")
    italian_indices = tokens_to_indices(italian_tokenized, italian_vocab)
    english_indices = tokens_to_indices(english_tokenized, english_vocab)
    print("Done.")
    
    print("Padding sequences...")
    italian_padded = pad_sequences(italian_indices)
    english_padded = pad_sequences(english_indices)
    print("Sentences padded.")
    
    # Save processed data for future use
    save_processed_data('italian_tokenized.pkl', italian_tokenized)
    save_processed_data('english_tokenized.pkl', english_tokenized)
    save_processed_data('italian_vocab.pkl', italian_vocab)
    save_processed_data('english_vocab.pkl', english_vocab)
    save_processed_data('italian_padded.pkl', italian_padded)
    save_processed_data('english_padded.pkl', english_padded)

    print("Saved data for future use.")


Trying to load data...
Succesfully loaded data!


## RNN Model Definition

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

#### Encoder

In [27]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

    def initHidden(self, batch_size=1):
        return torch.zeros(1, batch_size, self.hidden_size)


#### Decoder

In [10]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = torch.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden