In [2]:
from csv import QUOTE_NONE
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import bigrams, trigrams
import spacy
import re
import torch.nn as nn
from nltk.tokenize import word_tokenize
import nltk
import random
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
import numpy as np
from typing import Dict
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data
# and the stop-word lists. The captured output below shows that an already
# installed package is reported as up-to-date rather than downloaded again.
nltk.download('punkt')
nltk.download('stopwords')
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download fr_core_news_sm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m340.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:

In [3]:
# Paths to the English and French sides of the Europarl v7 fr-en corpus,
# relative to the notebook's working directory.
# NOTE(review): the rest of the notebook treats row i of one file as the
# translation of row i of the other — presumably the files are line-aligned;
# confirm against the corpus documentation.
english_file_path = "fr-en/europarl-v7.fr-en.en"
french_file_path = "fr-en/europarl-v7.fr-en.fr"

In [4]:
########### TASK 1###############

In [5]:
########LOAD DATA###############

In [7]:
""" Function to load data into a pandas DataFrame without treating any character as quotes"""

def load_data_to_dataframe(file_path):
    """Load a one-sentence-per-line UTF-8 text file into a single-column DataFrame.

    Every physical line of the file becomes exactly one row of the ``text``
    column (row ``i`` == line ``i``), which is what a parallel corpus needs:
    the English and French files can only be paired by position if neither
    loader drops or merges lines.

    Parameters:
        file_path (str): Path to the text file.

    Returns:
        pandas.DataFrame: One column named ``text`` holding the lines as
        strings, without their trailing newline. Blank lines are kept as ''.
    """
    # The file is read line by line instead of through pd.read_csv because the
    # CSV parser skips blank lines by default, converts lines such as "NA" or
    # "null" to NaN, and splits lines containing a tab into several fields.
    # Each of those silently shifts or corrupts rows and breaks the alignment
    # between the two language files.
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = [line.rstrip('\n') for line in handle]
    return pd.DataFrame({'text': lines}, columns=['text'])

# Load the data
# Both corpus files go through the same loader, so each ends up as a DataFrame
# with a single 'text' column.
english_data = load_data_to_dataframe(english_file_path)
french_data = load_data_to_dataframe(french_file_path)

English DataFrame sample:
                                                text
0                          Resumption of the session
1  I declare resumed the session of the European ...
2  Although, as you will have seen, the dreaded '...
3  You have requested a debate on this subject in...
4  In the meantime, I should like to observe a mi...
French DataFrame sample:
                                                text
0                              Reprise de la session
1  Je déclare reprise la session du Parlement eur...
2  Comme vous avez pu le constater, le grand "bog...
3  Vous avez souhaité un débat à ce sujet dans le...
4  En attendant, je souhaiterais, comme un certai...


In [8]:
##############DATA ANALYSIS##################

In [9]:
## Take 10% fraction of data randomly sampled
# The two frames are sampled separately with the same seed.
# NOTE(review): presumably this draws the same row positions from both frames,
# which keeps sentence pairs aligned only if both frames have identical length
# and index — verify (e.g. compare len() and .index) before relying on it.
# NOTE(review): the cell overwrites its own inputs, so running it a second time
# samples 10% of the previous sample rather than of the full corpus.
english_data = english_data.sample(frac=0.1, random_state=42)
french_data = french_data.sample(frac=0.1, random_state=42)


In [15]:
########### TASK 2###############

In [16]:
##################DATA PREPROCESSING#################################

In [17]:
def preprocess_data(data_en, data_fr):
    """Clean both sides of a parallel corpus and drop pairs with an empty side.

    Steps, applied to the ``text`` column of each frame:

    1. Missing values (NaN) are replaced by '' so they are treated as empty
       lines instead of raising on ``.strip()``.
    2. Text is lower-cased, so that e.g. "The" and "the" map to one token.
    3. Lines whose first non-blank character is '<' (XML/markup lines, i.e.
       corpus metadata rather than sentences) are replaced by ''.
    4. Every pair in which either side is empty or whitespace-only is removed
       from BOTH frames, so the two results stay aligned row for row.

    Parameters:
        data_en (pandas.DataFrame): English sentences in a ``text`` column.
        data_fr (pandas.DataFrame): French sentences in a ``text`` column,
            sharing its index with ``data_en``.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: The filtered English and
        French frames.

    Side effects:
        As before, the ``text`` column of the two input frames is overwritten
        with the result of steps 1-3 (rows are not removed from the inputs).
        This is kept because other cells of the notebook read the input frames
        again after calling this function.
    """
    for frame in (data_en, data_fr):
        lowered = frame['text'].fillna('').astype(str).str.lower()
        is_markup = lowered.str.strip().str.startswith('<')
        frame['text'] = lowered.mask(is_markup, '')

    # A pair survives only when both sides still contain visible characters.
    has_en = data_en['text'].str.strip().ne('')
    has_fr = data_fr['text'].str.strip().ne('')
    keep = has_en & has_fr

    return data_en[keep], data_fr[keep]



"""
Steps Not Chosen and Why:

- Removing Numbers or Special Characters: Not chosen because numbers and certain punctuation can carry semantic weight in sentences, which can be important for translations, such as dates, quantities, or formatted text.
- Stemming/Lemmatization: Not typically used in machine translation preprocessing because retaining the full form of words is important for accurate translation, especially between languages with different linguistic structures.
- Removing Stopwords: Not recommended for translation tasks because stopwords (common words like “and”, “the”, etc.) are crucial for maintaining the grammatical structure of the sentence in both source and target languages.
"""


# Run the cleaning step on the sampled corpora; the two returned frames hold
# only the pairs for which both sides are non-empty.
preprocessed_en, preprocessed_fr  = preprocess_data(english_data,french_data)


In [18]:
"""
Remove all characters that are defined as noise from both translations
"""
def remove_noisy_characters(data):
    """Delete every "noise" symbol from the ``text`` column of ``data``.

    The symbols removed are: @ # $ % ^ & * ~ < > | \\ { } [ ] + = _ /

    The column is rewritten on the frame that was passed in, and that same
    frame object is returned.
    """
    # Character class matching exactly one of the noise symbols.
    noise_class = '[' + re.escape('@#$%^&*~<>|\\{}[]+=_/') + ']'

    def _strip_noise(sentence):
        return re.sub(noise_class, '', sentence)

    data['text'] = data['text'].apply(_strip_noise)
    return data


# NOTE(review): the sampled frames english_data / french_data are passed here,
# not the filtered preprocessed_en / preprocessed_fr produced by
# preprocess_data, and the results overwrite those two names.
# remove_noisy_characters returns the frame it was given, so after this cell
# preprocessed_en is the same object as english_data — confirm this is intended.
preprocessed_en = remove_noisy_characters(english_data)
preprocessed_fr = remove_noisy_characters(french_data)


In [19]:
def synchronize_special_characters(data_en, data_fr):
    """Remove special characters that occur in only one side of a sentence pair.

    For each pair (row i of ``data_en``, row i of ``data_fr``) the special
    characters ``@ # $ % ^ & * ~ < > | \\ { } [ ] + = _ /`` found in each text
    are compared. A character that appears in one translation but not in the
    other is deleted from both texts; characters present in both are kept.

    Rows are paired by POSITION, not by index label. The previous version
    looked rows up with ``.loc[0..n-1]``, which raises ``KeyError`` as soon as
    the frames have been sampled or filtered and no longer carry a 0..n-1
    index. If the frames differ in length, only the first ``min(len)`` rows
    are processed.

    Parameters:
        data_en (pandas.DataFrame): English sentences in a ``text`` column.
        data_fr (pandas.DataFrame): French sentences in a ``text`` column.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: The same two frame objects,
        with their ``text`` columns updated in place.

    Side effects:
        Prints the number of pairs that were modified.
    """
    special_pattern = re.compile('[' + re.escape('@#$%^&*~<>|\\{}[]+=_/') + ']')

    # Work on plain Python lists and write each column back once; this is
    # positional and avoids one DataFrame lookup per row.
    texts_en = data_en['text'].tolist()
    texts_fr = data_fr['text'].tolist()

    counter = 0
    # zip() stops at the shorter list, so both positions always exist.
    for pos, (text_en, text_fr) in enumerate(zip(texts_en, texts_fr)):
        found_chars_en = set(special_pattern.findall(text_en))
        found_chars_fr = set(special_pattern.findall(text_fr))

        # Characters present on exactly one side.
        chars_to_remove = found_chars_en ^ found_chars_fr
        if not chars_to_remove:
            continue

        counter += 1
        deletion_table = {ord(char): None for char in chars_to_remove}
        texts_en[pos] = text_en.translate(deletion_table)
        texts_fr[pos] = text_fr.translate(deletion_table)

    data_en['text'] = texts_en
    data_fr['text'] = texts_fr

    print(counter)
    return data_en, data_fr

#data_en_sync, data_fr_sync = synchronize_special_characters(data_en, data_fr)

In [20]:
#######TRAINING without pretrained Embeddings############# 

In [21]:
"""
Model Architecture:
Encoder: An RNN (LSTM or GRU) that processes the input sentence and encodes it into a context vector.
Decoder: Another RNN that takes the context vector and generates the output sentence in the target language.
"""

class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder that consumes one token index per call.

    The embedding size equals ``hidden_size``. ``forward`` reshapes the
    embedded token to (1, 1, hidden_size) before it is fed to the LSTM.
    """

    def __init__(self, input_size, hidden_size):
        """
        Parameters:
            input_size (int): Number of rows of the embedding table.
            hidden_size (int): Size of the embeddings and of the LSTM state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run one LSTM step; returns ``(output, hidden)`` as produced by the LSTM."""
        step_input = self.embedding(input).view(1, 1, -1)
        return self.lstm(step_input, hidden)

    def initHidden(self):
        """Return a zero-filled ``(h0, c0)`` pair, each of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        h0 = torch.zeros(state_shape)
        c0 = torch.zeros(state_shape)
        return (h0, c0)
    

class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities for one step.

    Each call embeds one token index, applies ReLU, runs one LSTM step and
    projects the result to ``output_size`` scores followed by a log-softmax
    over dimension 1.
    """

    def __init__(self, hidden_size, output_size):
        """
        Parameters:
            hidden_size (int): Size of the embeddings and of the LSTM state.
            output_size (int): Number of rows of the embedding table and number
                of scores produced per step.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns:
            tuple: ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated LSTM state.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        activated = nn.functional.relu(embedded)
        lstm_out, hidden = self.lstm(activated, hidden)
        scores = self.out(lstm_out[0])
        log_probs = self.softmax(scores)
        return log_probs, hidden

    def initHidden(self):
        """Return a zero-filled ``(h0, c0)`` pair, each of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        h0 = torch.zeros(state_shape)
        c0 = torch.zeros(state_shape)
        return (h0, c0)
    


In [22]:
#Create Embeddings manually 
# Tokenization function
def tokenize(sentences):
    """Lower-case each sentence and split it with NLTK's ``word_tokenize``.

    Takes an iterable of strings and returns a list with one token list per
    sentence, in the same order.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(word_tokenize(sentence.lower()))
    return tokenized

# Building vocabulary
def build_vocab_1(tokenized_texts):
    """Map every distinct token to an integer id in order of first appearance.

    Ids start at 0. The returned mapping is a ``defaultdict`` whose default
    factory is ``len(vocab)``, so looking up a token that is not yet present
    inserts it with the next free id.
    """
    vocab = defaultdict()
    for sentence_tokens in tokenized_texts:
        for word in sentence_tokens:
            # setdefault only inserts when the word is new; len(vocab) is then
            # the next unused id.
            vocab.setdefault(word, len(vocab))
    vocab.default_factory = lambda: len(vocab)
    return vocab

# Convert texts to numerical indices
def numericalize(texts, vocab):
    """Tokenize ``texts`` and replace each token by ``vocab[token]``.

    Returns one list of ids per input text. ``tokenize`` is called on the
    whole collection, so this expects the list-based ``tokenize`` variant.
    """
    encoded = []
    for tokens in tokenize(texts):
        encoded.append([vocab[token] for token in tokens])
    return encoded



In [23]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=None):
    """Run one optimisation step of the token-by-token encoder/decoder on a single pair.

    The encoder is fed the input tokens one at a time; its final state
    initialises the decoder. The decoder is then run for as many steps as the
    target has tokens, always feeding back its own most likely prediction
    (no teacher forcing), and the per-step losses are summed.

    Parameters:
        input_tensor (torch.Tensor): 1-D tensor of source token ids.
        target_tensor (torch.Tensor): 1-D tensor of target token ids.
        encoder, decoder: Modules following the ``EncoderRNN`` / ``DecoderRNN``
            call convention (``encoder.initHidden()``, ``module(token, hidden)``).
        encoder_optimizer, decoder_optimizer: Optimizers for the two modules.
        criterion: Loss taking ``(decoder_output, target)`` for one step.
        max_length (int | None): Accepted for call compatibility; it is not
            used. It now defaults to ``None`` so that callers which omit it
            (e.g. ``trainIters``) no longer fail with a ``TypeError``.

    Returns:
        float: Summed loss divided by the number of target tokens, or ``0.0``
        when the target is empty (in which case no update is performed).
    """
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Nothing to predict: skip the step instead of calling backward() on the
    # integer 0 and dividing by zero.
    if target_length == 0:
        return 0.0

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    # Encoder: only the final hidden state is needed.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    # No SOS token exists yet, so index 0 is used as the first decoder input.
    # It is created on the same device as the input so indexing the embedding
    # does not mix devices.
    decoder_input = torch.zeros((1, 1), dtype=torch.long, device=input_tensor.device)
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        _, topi = decoder_output.topk(1)
        # Feed the model's own prediction back in; detach so gradients do not
        # flow through the argmax choice.
        decoder_input = topi.squeeze().detach()

        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [24]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, max_length=1000):
    """Train ``encoder``/``decoder`` on ``n_iters`` randomly drawn sentence pairs.

    Parameters:
        encoder, decoder: The modules to train.
        n_iters (int): Number of single-pair training steps.
        print_every (int): Print the average loss after this many steps.
        plot_every (int): Accepted for call compatibility; not used.
        learning_rate (float): Learning rate of the two Adam optimizers.
        max_length (int): Forwarded to ``train``.

    Changes compared with the previous version:
        * The optimizers are created here from the modules that were passed in
          and from ``learning_rate``. Before, the function silently used the
          notebook-level ``encoder_optimizer`` / ``decoder_optimizer``, which
          need not belong to these modules, and ignored ``learning_rate``.
        * ``train`` is called with all of its arguments (``max_length`` was
          missing, which raised a ``TypeError``).
        * The loop variable no longer shadows the builtin ``iter``.

    NOTE(review): ``tensorsFromPair`` and ``pairs`` are not defined in the part
    of the notebook visible here and must exist before this is called.
    NOTE(review): a later cell defines another function named ``train`` with a
    different signature; this function needs the single-pair version.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # Draw all training pairs up front (sampling with replacement).
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]

    print_loss_total = 0  # Reset every print_every

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
        print_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'Iteration: {step} Average Loss: {print_loss_avg:.4f}')

In [25]:
# TODO: takes too much time.

# 60/20/20 train/validation/test split of the sentence pairs.
# NOTE(review): data_en / data_fr are only assigned in a later cell of the
# visible notebook (In [34]); this cell presumably relies on kernel state from
# an earlier run — confirm it survives "Restart & Run All".
# NOTE(review): .tolist() is called on the split results below, so data_en /
# data_fr are presumably pandas Series of sentences — confirm.
X_train, X_test, y_train, y_test = train_test_split(data_en, data_fr, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


# Initialise the encoder, the decoder, the loss function and one optimizer per module.

# NOTE(review): vocab_size_en / vocab_size_fr are not defined in the visible
# part of the notebook. The token ids used below come from vocab_x / vocab_y,
# so the embedding tables should presumably be sized from those — verify.
input_size = vocab_size_en   # size of the English vocabulary was calculated in task 1
output_size = vocab_size_fr  # size of the French vocabulary was calculated in task 1
hidden_size = 256            # typically a power of 2 like 256 or 512

encoder = EncoderRNN(input_size, hidden_size)
decoder = DecoderRNN(hidden_size, output_size)


# NLLLoss pairs with the LogSoftmax output layer of DecoderRNN.
criterion = nn.NLLLoss()
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.01)


# Training sentences as a two-column frame (English / French).
data = {
    'X_train': X_train.tolist(),
    'y_train': y_train.tolist()
}

df = pd.DataFrame(data)



# Create tensors

# Tokenize and build vocabularies
# (this uses the list-based tokenize defined above, not the later str-based one)
tokenized_x = tokenize(df['X_train'])
tokenized_y = tokenize(df['y_train'])

# vocab maps {token (here: a word): integer id}, e.g. 'watts': 4239
vocab_x = build_vocab_1(tokenized_x)
vocab_y = build_vocab_1(tokenized_y)

# Numericalize the data
numericalized_x = numericalize(df['X_train'], vocab_x)
numericalized_y = numericalize(df['y_train'], vocab_y)

# Convert lists to PyTorch tensors and pad sequences
# NOTE: build_vocab_1 gives id 0 to the first token it sees, so padding_value=0
# is indistinguishable from that token in the padded tensors.
input_tensor = pad_sequence([torch.tensor(seq) for seq in numericalized_x], batch_first=True, padding_value=0)
target_tensor = pad_sequence([torch.tensor(seq) for seq in numericalized_y], batch_first=True, padding_value=0)





# Number of training iterations (single-pair steps) and logging interval
num_iterations = 10000
print_every = 1000

"""for iter in range(1, num_iterations + 1):
    # Randomly selecting an example each time (for simplicity in this example)
    # Ideally, use a more systematic way to create batches and cycle through data
    input_example = random.choice(input_tensor)
    target_example = random.choice(target_tensor)
    loss = train(input_example, target_example, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 1000)
    if iter % print_every == 0:
        print(f'Iter: {iter}, Loss: {loss:.4f}')"""


"for iter in range(1, num_iterations + 1):\n    # Randomly selecting an example each time (for simplicity in this example)\n    # Ideally, use a more systematic way to create batches and cycle through data\n    input_example = random.choice(input_tensor)\n    target_example = random.choice(target_tensor)\n    loss = train(input_example, target_example, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 1000)\n    if iter % print_every == 0:\n        print(f'Iter: {iter}, Loss: {loss:.4f}')"

In [26]:
####### TODO TRAIN MODEL WITH SOS AND EOS TOKEN #########

In [27]:
# TODO: add SOS and EOS tokens to the vocabularies and to the training data.
# NOTE: neither vocabulary builder in this notebook reserves these ids.
# build_vocab_1 gives ids 0 and 1 to the first two distinct tokens it sees, and
# build_vocab assigns 0 and 1 to '<pad>' and '<unk>'. These constants therefore
# collide with existing entries until SOS/EOS are added to the vocabulary.

SOS_token = 0  # Assuming 0 is the index for SOS token in your vocabulary
EOS_token = 1  # Assuming 1 is the index for EOS token in your vocabulary

def add_special_tokens(sentences, sos_token, eos_token=None):
    """Return copies of ``sentences`` wrapped in start/end markers.

    Each sentence (a list) is prefixed with ``sos_token``; when ``eos_token``
    is not ``None`` it is also appended. The input lists are not modified.
    """
    suffix = [] if eos_token is None else [eos_token]
    return [[sos_token] + sentence + suffix for sentence in sentences]


In [28]:
########################## USE DIFFERENT EMBEDDING MODELS #################



In [29]:

def load_embeddings_and_create_index(path):
    """Build a ``word -> running index`` mapping from an embeddings text file.

    Only the first whitespace-delimited field of each line (the word) is read;
    the vector values are ignored.

    Parameters:
        path (str): Path to a UTF-8 text file with one ``word v1 v2 ...`` entry
            per line (e.g. a GloVe file).

    Returns:
        dict[str, int]: Each word mapped to the 0-based position of its entry.
        If a word occurs more than once, the last position wins.

    Blank or whitespace-only lines are skipped (they used to raise
    ``IndexError``) and do not consume an index.
    """
    word_to_idx = {}
    next_idx = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=1: only the word is needed, so avoid splitting the
            # whole vector into fields.
            head = line.split(None, 1)
            if not head:
                continue
            word_to_idx[head[0]] = next_idx
            next_idx += 1
    return word_to_idx


# Tokenization
def tokenize(text):
    """Lower-case a single string and split it on whitespace.

    Note: this has the same name as the list-based ``tokenize`` defined
    earlier in the notebook and replaces it once this cell has run.
    """
    lowered = text.lower()
    return lowered.split()

# Vocabulary mapping
class Vocabulary:
    """Two-way mapping between tokens and integer ids.

    Attributes:
        token_to_idx (dict): token -> id.
        idx_to_token (dict): id -> token (inverse of ``token_to_idx``).
    """

    def __init__(self, token_to_idx=None):
        """
        Parameters:
            token_to_idx (dict | None): Optional initial token -> id mapping.
                The dict is stored as given (not copied). ``None`` starts an
                empty vocabulary.
        """
        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}

    def add_word(self, word):
        """Register ``word`` under the next id (current size) unless already present."""
        if word not in self.token_to_idx:
            idx = len(self.token_to_idx)
            self.token_to_idx[word] = idx
            self.idx_to_token[idx] = word

    def __call__(self, word):
        """Return the id of ``word``, falling back to the id of ``'<unk>'``.

        The ``'<unk>'`` entry is only consulted when ``word`` is unknown.
        Previously it was evaluated on every call, so looking up a known word
        raised ``KeyError`` whenever the vocabulary had no ``'<unk>'`` entry.

        Raises:
            KeyError: If ``word`` is unknown and ``'<unk>'`` is not registered.
        """
        if word in self.token_to_idx:
            return self.token_to_idx[word]
        return self.token_to_idx['<unk>']

    def __len__(self):
        """Number of registered tokens."""
        return len(self.token_to_idx)

# Build vocabularies for both languages
def build_vocab(sentences, existing_embeddings_word2idx):
    """Create a ``Vocabulary`` restricted to words that have a pre-trained vector.

    ``'<pad>'`` and ``'<unk>'`` are registered first (ids 0 and 1). Each
    sentence is then tokenized with ``tokenize`` and every token found in
    ``existing_embeddings_word2idx`` is added, in order of first appearance.
    Tokens without a pre-trained vector are left out.
    """
    vocab = Vocabulary()
    for reserved in ('<pad>', '<unk>'):
        vocab.add_word(reserved)

    for sentence in sentences:
        covered = (tok for tok in tokenize(sentence) if tok in existing_embeddings_word2idx)
        for tok in covered:
            vocab.add_word(tok)
    return vocab


# Dataset preparation
class TranslationDataset(Dataset):
    """Dataset of (source ids, target ids) tensor pairs built on the fly.

    Sentences are looked up with ``.iloc[index]``, tokenized with ``tokenize``
    and mapped to ids through the given vocabularies. Tokens missing from a
    vocabulary's ``token_to_idx`` are encoded as that vocabulary's ``'<unk>'``.
    """

    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab):
        self.source_sentences, self.target_sentences = source_sentences, target_sentences
        self.source_vocab, self.target_vocab = source_vocab, target_vocab

    def __len__(self):
        """Number of sentence pairs (length of the source collection)."""
        return len(self.source_sentences)

    @staticmethod
    def _encode(sentence, vocab):
        """Convert one sentence string to a list of token ids."""
        ids = []
        for token in tokenize(sentence):
            lookup = token if token in vocab.token_to_idx else '<unk>'
            ids.append(vocab(lookup))
        return ids

    def __getitem__(self, index):
        """Return ``(source_ids, target_ids)`` as two 1-D ``torch.long`` tensors."""
        source_ids = self._encode(self.source_sentences.iloc[index], self.source_vocab)
        target_ids = self._encode(self.target_sentences.iloc[index], self.target_vocab)
        return torch.tensor(source_ids, dtype=torch.long), torch.tensor(target_ids, dtype=torch.long)

# Collate function for padding
def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch-first tensors.

    The padding values are the ``'<pad>'`` ids of the notebook-level
    ``vocab_en`` and ``vocab_fr``, which must exist when this is called.
    """
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(sources, padding_value=vocab_en('<pad>'), batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=vocab_fr('<pad>'), batch_first=True)
    return padded_sources, padded_targets


In [30]:
# Example GloVe embedding file path and embedding dimension

def load_glove_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    """
    Load GloVe embeddings from a text file and align them with a word index.

    Parameters:
    - path (str): The file path to the GloVe embeddings file (one
      ``word v1 ... vN`` entry per line).
    - word2idx (Dict[str, int]): A dictionary mapping words to row indices of
      the resulting matrix. Indices must lie in ``range(len(word2idx))``.
    - embedding_dim (int): The dimensionality of the vectors in the file
      (e.g., 50, 100, 200, 300).

    Returns:
    - torch.Tensor: A float64 tensor of shape (len(word2idx), embedding_dim).
      Row ``word2idx[w]`` holds the vector of ``w``; rows of words that do not
      occur in the file stay all-zero.

    Line handling:
    - The last ``embedding_dim`` fields of a line are the vector; everything
      before them is the word. For regular lines this is the first field, as
      before. Tokens that themselves contain whitespace no longer make the
      float conversion fail.
    - Lines with fewer than ``embedding_dim + 1`` fields (blank lines, header
      lines such as "400000 100", truncated entries) are skipped. Previously a
      blank line raised ``IndexError`` and a short line was silently broadcast
      across the whole row.
    """
    # Initialize the embedding matrix with zeros (words without a vector keep this).
    embeddings = np.zeros((len(word2idx), embedding_dim))

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) <= embedding_dim:
                continue  # blank, header or truncated line

            cut = len(values) - embedding_dim
            word = ' '.join(values[:cut])

            if word in word2idx:
                # Parse as float32, then store at the row given by word2idx.
                embeddings[word2idx[word]] = np.asarray(values[cut:], dtype='float32')

    # Convert the numpy array to a PyTorch tensor (shares memory with it).
    return torch.from_numpy(embeddings)





In [31]:

class Encoder(nn.Module):
    """GRU encoder on top of trainable, pre-initialised word embeddings."""

    def __init__(self, hidden_size, pretrained_embeddings):
        """
        Build the embedding layer from ``pretrained_embeddings`` and a
        batch-first GRU whose input size equals the embedding width.

        Parameters:
            hidden_size (int): Number of features in the GRU hidden state.
            pretrained_embeddings (torch.Tensor): 2-D tensor of word vectors;
                its second dimension is used as the embedding size. It is cast
                to float32 when it has another dtype.
        """
        super().__init__()
        # .to() hands back the same tensor when it is already float32.
        weights = pretrained_embeddings.to(torch.float32)
        # freeze=False: the vectors are fine-tuned together with the GRU.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.rnn = nn.GRU(weights.shape[1], hidden_size, batch_first=True).float()

    def forward(self, x):
        """
        Encode a batch of token-id sequences.

        Parameters:
            x (torch.Tensor): Token ids, batch dimension first.

        Returns:
            hidden (torch.Tensor): The final hidden state returned by the GRU.
        """
        vectors = self.embedding(x).float()
        _, hidden = self.rnn(vectors)
        return hidden

class Decoder(nn.Module):
    """GRU decoder with trainable pre-initialised embeddings and a linear output layer."""

    def __init__(self, embed_size, hidden_size, output_size, pretrained_embeddings):
        """
        Parameters:
            embed_size (int): Input size of the GRU. It is passed separately
                and is not checked against the embedding tensor, so it has to
                equal the width of ``pretrained_embeddings``.
            hidden_size (int): Number of features in the GRU hidden state.
            output_size (int): Number of scores produced per position.
            pretrained_embeddings (torch.Tensor): 2-D tensor of word vectors,
                cast to float32 when it has another dtype.
        """
        super().__init__()
        # .to() hands back the same tensor when it is already float32.
        weights = pretrained_embeddings.to(torch.float32)
        # freeze=False: the vectors are fine-tuned during training.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True).float()
        self.fc = nn.Linear(hidden_size, output_size).float()

    def forward(self, x, hidden):
        """
        Run the GRU over ``x`` starting from ``hidden`` and score every position.

        Parameters:
            x (torch.Tensor): Token ids, batch dimension first.
            hidden (torch.Tensor): Initial GRU hidden state.

        Returns:
            predicted (torch.Tensor): Unnormalised scores (logits) from the
                linear layer, one vector of size ``output_size`` per position.
            hidden (torch.Tensor): The updated hidden state.
        """
        vectors = self.embedding(x).float()
        rnn_out, hidden = self.rnn(vectors, hidden)
        return self.fc(rnn_out), hidden
    
class Seq2Seq(nn.Module):
    """Thin wrapper that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        """
        Parameters:
            encoder (Encoder): Module called as ``encoder(src)``.
            decoder (Decoder): Module called as ``decoder(trg, hidden)``.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """
        Encode ``src``, then run the decoder over ``trg`` from the encoder's state.

        Parameters:
            src (torch.Tensor): The input sequence tensor.
            trg (torch.Tensor): The target sequence tensor fed to the decoder.

        Returns:
            outputs (torch.Tensor): The first element returned by the decoder;
                the decoder's final hidden state is discarded.
        """
        context = self.encoder(src)
        decoded = self.decoder(trg, context)
        return decoded[0]




In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train(model, loader, optimizer, criterion, epochs=10, device=device):
    """Train a sequence-to-sequence ``model`` for ``epochs`` passes over ``loader``.

    For every batch the decoder input is the target without its last position
    and the expected output is the target without its first position (i.e. the
    target shifted by one). After each epoch the mean batch loss is printed.

    Parameters:
        model: Module called as ``model(src, decoder_input)``.
        loader: Iterable of ``(src, trg)`` batches that also supports ``len()``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss taking ``(flattened_scores, flattened_targets)``.
        epochs (int): Number of passes over ``loader``.
        device: Device the batches are moved to. The default is the value of
            the notebook-level ``device`` at the time this cell was run.

    Returns:
        None.
    """
    model.train()  # enable training-mode behaviour
    for epoch in range(epochs):
        running_loss = 0
        for source_batch, target_batch in loader:
            # Embedding layers need integer (long) indices on the model's device.
            source_batch = source_batch.to(device).long()
            target_batch = target_batch.to(device).long()

            optimizer.zero_grad()

            # Decoder sees positions 0..T-2 and has to predict positions 1..T-1.
            decoder_inputs = target_batch[:, :-1]
            scores = model(source_batch, decoder_inputs).float()

            vocab_dim = scores.shape[-1]
            flat_scores = scores.contiguous().view(-1, vocab_dim)
            expected = target_batch[:, 1:].contiguous().view(-1)

            batch_loss = criterion(flat_scores, expected)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        average_loss = running_loss / len(loader)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}')
        
        


In [34]:
# Training run with pre-trained GloVe embeddings.
#
# Fixes compared with the previous version of this cell:
#   * The embedding matrices are aligned with vocab_en / vocab_fr, i.e. with the
#     vocabularies that TranslationDataset uses to produce token ids. Before,
#     both matrices were built from vocab_x, so rows did not correspond to the
#     ids fed to the model and the table size could be too small.
#   * The vocabularies are built once, on the training split, BEFORE the model,
#     the loss and the embeddings that depend on their size are created.
#   * vocab_x / vocab_y are built from this cell's own tokenization instead of
#     the stale tokenized_x / tokenized_y left over from an earlier cell.
#   * The GloVe file is scanned once for the word index instead of four times.

GLOVE_PATH = 'glove.6B/glove.6B.100d.txt'
embedding_dim = 100   # must equal the vector size stored in GLOVE_PATH
hidden_size = 256     # same value as in the LSTM experiment; set here so the cell does not depend on it

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

# 60/20/20 train/validation/test split of the sentence pairs
data_en = english_data["text"]
data_fr = french_data["text"]

X_train, X_test, y_train, y_test = train_test_split(data_en, data_fr, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

data_en = pd.DataFrame({'sentence': X_train.tolist()})
data_fr = pd.DataFrame({'sentence': y_train.tolist()})

# Tokenized view of the training pairs. Not used by the model below; kept so
# that df, vocab_x and vocab_y remain available to other cells.
data = {
    'X_train': X_train.tolist(),
    'y_train': y_train.tolist()
}

df = pd.DataFrame(data)

df['tokenized_x'] = df['X_train'].apply(tokenize)
df['tokenized_y'] = df['y_train'].apply(tokenize)

# {token (here: a word): integer id}, e.g. 'watts': 4239
vocab_x = build_vocab_1(df['tokenized_x'])
vocab_y = build_vocab_1(df['tokenized_y'])

# Words that have a pre-trained vector.
# NOTE(review): the same English GloVe file is used for the French side, so only
# French tokens that happen to occur in it get a vector or a vocabulary entry —
# confirm whether French embeddings should be used instead.
word_to_idx_en = load_embeddings_and_create_index(GLOVE_PATH)
word_to_idx_fr = word_to_idx_en  # same file, same index; only read below

# Vocabularies used for numericalisation (training split only)
vocab_en = build_vocab(data_en['sentence'], word_to_idx_en)
vocab_fr = build_vocab(data_fr['sentence'], word_to_idx_fr)

# Embedding matrices: row i holds the vector of the token with id i in the
# corresponding vocabulary ('<pad>' and '<unk>' stay all-zero).
glove_embeddings_en = load_glove_embeddings(GLOVE_PATH, vocab_en.token_to_idx, embedding_dim)
glove_embeddings_fr = load_glove_embeddings(GLOVE_PATH, vocab_fr.token_to_idx, embedding_dim)
assert glove_embeddings_en.shape == (len(vocab_en), embedding_dim)
assert glove_embeddings_fr.shape == (len(vocab_fr), embedding_dim)

# Model instantiation
encoder = Encoder(hidden_size=hidden_size, pretrained_embeddings=glove_embeddings_en)
decoder = Decoder(embed_size=embedding_dim, hidden_size=hidden_size, output_size=len(vocab_fr), pretrained_embeddings=glove_embeddings_fr)
model = Seq2Seq(encoder, decoder)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Padding positions of the target do not contribute to the loss.
criterion = CrossEntropyLoss(ignore_index=vocab_fr.token_to_idx['<pad>']).to(device)  # Move the loss function to the device

dataset = TranslationDataset(data_en['sentence'], data_fr['sentence'], vocab_en, vocab_fr)
loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

train(model, loader, optimizer, criterion, epochs=10, device=device)

Using device: mps


KeyboardInterrupt: 

In [36]:
vocab_en

<__main__.Vocabulary at 0x350c2a910>