In [1]:
import torch.nn as nn
import nltk
import random
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
import numpy as np
from typing import Dict
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from preprocessing_task_2 import prepare_data
import spacy
from collections import Counter

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download fr_core_news_sm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00

In [3]:
####### LOADING AND PREPROCESSING #############
english_data, french_data = prepare_data(fraction=0.1)

  data_en = data_en[mask]
  data_fr = data_fr[mask]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_en['text'] = data_en['text'].apply(lambda x: re.sub(regex_pattern, '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fr['text'] = data_fr['text'].apply(lambda x: re.sub(regex_pattern, '', x))


In [4]:
# Functions to tokenize data and build vocab
spacy_fr = spacy.load('fr_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def build_vocab(sentences, vocab_size, word_tokenize):
    all_words = [word for sentence in sentences for word in word_tokenize(sentence)]
    word_counts = Counter(all_words)
    vocab = [word for word, _ in word_counts.most_common(vocab_size)]
    word2idx = {word: idx for idx, word in enumerate(vocab, start=4)}
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    word2idx['<sos>'] = 2
    word2idx['<eos>'] = 3
    return word2idx

In [5]:
# Collate function for padding
def collate_fn(batch):
    source_batch, target_batch = zip(*batch)
    source_batch_padded = pad_sequence(source_batch, padding_value=vocab_en['<pad>'], batch_first=True)
    target_batch_padded = pad_sequence(target_batch, padding_value=vocab_fr['<pad>'], batch_first=True)
    return source_batch_padded, target_batch_padded

In [6]:
########################## TODO: USE DIFFERENT EMBEDDING MODELS #################


In [7]:

def load_embeddings_and_create_index(path):
    word_to_idx = {}
    idx = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            word_to_idx[word] = idx
            idx += 1
    return word_to_idx


In [8]:


# Dataset preparation
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab, tokenizer_source, tokenizer_target):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.tokenizer_source = tokenizer_source
        self.tokenizer_target = tokenizer_target
    
    def __len__(self):
        return len(self.source_sentences)
    
    def __getitem__(self, index):
        source_sentence = [self.source_vocab[token] if token in self.source_vocab else self.source_vocab['<unk>'] for token in self.tokenizer_source(self.source_sentences.iloc[index])]
        target_sentence = [self.target_vocab[token] if token in self.target_vocab else self.target_vocab['<unk>'] for token in self.tokenizer_target(self.target_sentences.iloc[index])]
        return torch.tensor(source_sentence, dtype=torch.long), torch.tensor(target_sentence, dtype=torch.long)




In [9]:
# Example GloVe embedding file path and embedding dimension

def load_glove_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    """
    Load GloVe embeddings from a specified file and align them with the given word index dictionary.

    Parameters:
    - path (str): The file path to the GloVe embeddings file.
    - word2idx (Dict[str, int]): A dictionary mapping words to their corresponding indices. This dictionary defines
      the position each word’s vector should occupy in the resulting embedding matrix.
    - embedding_dim (int): The dimensionality of the GloVe vectors (e.g., 50, 100, 200, 300).

    Returns:
    - torch.Tensor: A tensor of shape (len(word2idx), embedding_dim) containing the GloVe vectors aligned according to word2idx.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Initialize the embedding matrix with zeros
        embeddings = np.zeros((len(word2idx), embedding_dim))
        
        # Process each line in the GloVe file
        for line in f:
            values = line.split()
            word = values[0]
            
            # If the word is in the provided dictionary, update the corresponding row in embeddings
            if word in word2idx.keys():
                # Convert embedding values from strings to float32
                vector = np.asarray(values[1:], dtype='float32')
                # Place the vector in the correct index as per word2idx
                embeddings[word2idx[word]] = vector
    
    # Convert the numpy array to a PyTorch tensor
    return torch.from_numpy(embeddings)





In [10]:

class Encoder(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings):
        
        """
        Initialize the Encoder with pre-trained embeddings and a GRU layer.

        Parameters:
            hidden_size (int): The number of features in the hidden state of the GRU.
            pretrained_embeddings (torch.Tensor): A tensor containing the pre-trained word embeddings.
        """
        super(Encoder, self).__init__()
        # Ensure that the pretrained embeddings are of type float32
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        embed_size = pretrained_embeddings.shape[1]  # Embedding size is the second dimension of the embeddings tensor
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True).float()  # Ensure GRU is initialized as float32

    def forward(self, input):
        """
        Forward pass of the encoder which processes the input sequence.

        Parameters:
            input (torch.Tensor): The input sequence tensor, which should be indexed by batch.

        Returns:
            hidden (torch.Tensor): The hidden state of the GRU, representing the encoded information of the input.
        """
        embedded = self.embedding(input).float()  # Ensure embedding outputs float32
        _, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, pretrained_embeddings):
        """
        Initialize the Decoder with pre-trained embeddings, a GRU layer, and a linear output layer.

        Parameters:
            embed_size (int): The size of each embedding vector.
            hidden_size (int): The number of features in the hidden state of the GRU.
            output_size (int): The size of the output vocabulary.
            pretrained_embeddings (torch.Tensor): A tensor containing the pre-trained word embeddings.

        """
        
        super(Decoder, self).__init__()
        # Ensure that the pretrained embeddings are of type float32
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True).float()  # Ensure GRU is initialized as float32
        self.fc = nn.Linear(hidden_size, output_size).float()  # Ensure Linear is initialized as float32

    def forward(self, x, hidden):
        """
        Forward pass of the decoder that processes one timestep of the sequence.

        Parameters:
            x (torch.Tensor): The input tensor for the current timestep.
            hidden (torch.Tensor): The hidden state from the last timestep.

        Returns:
            predicted (torch.Tensor): The output logits for the next word in the sequence.
            hidden (torch.Tensor): The updated hidden state.
        """
        embedded = self.embedding(x).float()  # Ensure embedding outputs float32
        output, hidden = self.rnn(embedded, hidden)
        predicted = self.fc(output)
        return predicted, hidden
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):

        """
        Forward pass of the Seq2Seq model which processes the entire input and target sequence.

        Parameters:
            source (torch.Tensor): The input sequence tensor.
            target (torch.Tensor): The target sequence tensor used during training.

        Returns:
            outputs (torch.Tensor): The output from the decoder for each step in the sequence.
        """
        hidden = self.encoder(source)
        outputs, _ = self.decoder(target, hidden)
        return outputs




In [11]:
def predict_1(model, sentence, source_vocab, target_vocab, tokenizer_source, tokenizer_target, max_length=50, device="cpu"):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the input sentence
    tokens = tokenizer_source(sentence.lower())

    # Convert tokens to indices
    indices = [source_vocab.get(token, source_vocab['<unk>']) for token in tokens]
    # Prepare the input tensor
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device) 
    
    # Pass through the encoder
    with torch.no_grad():
        encoder_hidden = model.encoder(input_tensor)

    #print(f"encoder hidden size: {encoder_hidden.size()}")

    # Initialize the decoder input and hidden state
    decoder_input = torch.tensor([[target_vocab['<sos>']]], dtype=torch.long).to(device)
    #print(f"decoder input size: {decoder_input.size()}")
    decoder_hidden = encoder_hidden#.unsqueeze(0)  # Add batch dimension back
    #print(f"decoder hidden size: {decoder_hidden.size()}")


    # Generate the output sequence
    output_tokens = []
    for _ in range(max_length):
        with torch.no_grad():
            decoder_output, decoder_hidden = model.decoder(decoder_input, decoder_hidden)

        topv, topi = decoder_output.topk(1)
        next_token = topi.squeeze().item()

        if next_token == target_vocab['<eos>']:
            break

        output_tokens.append(next_token)
        decoder_input = topi.squeeze(0)#.detach()#.unsqueeze(0)

    # Convert indices to tokens
    output_sentence = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in output_tokens]

    return ' '.join(output_sentence)

In [12]:
def predict_2(model, sentence, source_vocab, target_vocab, tokenizer_source, tokenizer_target, device="cpu"):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        source = torch.tensor([source_vocab[token] if token in source_vocab else source_vocab['<unk>'] for token in tokenizer_source(sentence.lower())], dtype=torch.long).unsqueeze(0).to(device)
        target = torch.tensor([target_vocab['<sos>']], dtype=torch.long).unsqueeze(0).to(device)
        hidden = model.encoder(source)
        outputs = []
        for _ in range(20):  # Limit the length of the generated sequence
            output, hidden = model.decoder(target, hidden)
            output = output.squeeze(0)
            topv, target = output.topk(1)
            #target = topi.squeeze(0)#.detach()
            if target.item() == target_vocab['<eos>']:
                break
            outputs.append(target.item())
        translated = ' '.join([list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in outputs])
        return translated

In [13]:

def train(model, loader, optimizer, criterion, epochs=10, device="cpu"):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        total_loss = 0
        for src, trg in loader:
            # Move tensors to the correct device and ensure they are long type for indexing operations
            src = src.to(device).long()  # Correct type for embedding layer
            trg = trg.to(device).long()  # Correct type for embedding layer
            

            optimizer.zero_grad()

            # Forward pass: The decoder's input is all except the last word
            output = model(src, trg[:, :-1])  
            
            # Since output will be in float (from linear layers, and GRU output), ensure it's float32 if not already
            output = output.float()

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)  # Target doesn't include the first <sos> token

            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        average_loss = total_loss / len(loader)
        print(predict_1(model, "I am a student", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))
        print(predict_2(model, "I am a book", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))

        print(f'Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}')

In [14]:

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

X_train, X_test, y_train, y_test = train_test_split(english_data, french_data, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

vocab_size = 10000
vocab_en = build_vocab(X_train["text"], vocab_size, tokenize_en)
vocab_fr = build_vocab(y_train["text"], vocab_size, tokenize_fr)
print(f"English vocab size: {len(vocab_en)}, French vocab size: {len(vocab_fr)}")
embedding_dim = 100

# Load embeddings TODO: use same or different embeddings for source and target languages?
vocab_embeddings_en = load_glove_embeddings('glove.6B/glove.6B.100d.txt', vocab_en, embedding_dim)
vocab_embeddings_fr = load_glove_embeddings('glove.6B/glove.6B.100d.txt', vocab_fr, embedding_dim)

# Model instantiation
hidden_size = 1024
encoder = Encoder(hidden_size=hidden_size, pretrained_embeddings=vocab_embeddings_en)
decoder = Decoder(embed_size=embedding_dim, hidden_size=hidden_size, output_size=len(vocab_fr), pretrained_embeddings=vocab_embeddings_fr)
model = Seq2Seq(encoder, decoder)
model = model.to(device)


optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss(ignore_index=vocab_fr['<pad>']).to(device)  # Move the loss function to the device

# Use training data for the dataset
train_dataset = TranslationDataset(X_train['text'], y_train['text'], vocab_en, vocab_fr, tokenizer_source=tokenize_en, tokenizer_target=tokenize_fr)
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)  # Increased batch size

# Validation DataLoader
val_dataset = TranslationDataset(X_val['text'], y_val['text'], vocab_en, vocab_fr, tokenizer_source=tokenize_en, tokenizer_target=tokenize_fr)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)  # Increased batch size


train(model, train_loader, optimizer, criterion, epochs=10, device=device)

English vocab size: 10004, French vocab size: 10004
, il est important de la compétence de la commission de la pêche et de la commission . - et je pense que nous avons tous conscients de la commission de l' industrie , du commerce extérieur , de la recherche et de la politique de l' énergie , de
, il est important de la commission de la pêche et de la commission . - et je pense que
Epoch 1/10, Loss: 4.6885
, en ce qui concerne la question de la commission , je voudrais rappeler que la commission a approuvé une proposition de règlement . je pense que le parlement européen a été <unk> , et je pense que le parlement européen a été <unk> , et je pense que le
, je voudrais dire que la commission a approuvé une proposition de règlement . je pense que le parlement européen
Epoch 2/10, Loss: 3.8403
, monsieur le commissaire , le rapport de mme palacio <unk> , qui est présenté au parlement européen , qui est très important pour la deuxième lecture . je voudrais cependant poser une question comp

KeyboardInterrupt: 

In [None]:
print(predict_1(model, "I am a student", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))
print(predict_2(model, "I am a book", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))


In [None]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_with_bleu(model, loader, criterion, target_vocab, device="cpu"):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    total_samples = 0
    total_bleu_score = 0

    with torch.no_grad():
        for src, trg in loader:
            src = src.to(device).long()  # Move source to device and ensure it's long type
            trg = trg.to(device).long()  # Move target to device and ensure it's long type

            # Forward pass: The decoder's input is all except the last word
            output = model(src, trg[:, :-1])

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)  # Target doesn't include the first <sos> token

            loss = criterion(output, trg)
            total_loss += loss.item() * src.size(0)
            total_samples += src.size(0)

            # Calculate BLEU score
            output = output.view(src.size(0), -1, output_dim)
            for i in range(src.size(0)):
                pred_indices = output[i].argmax(dim=1).tolist()
                target_indices = trg[i].tolist()
                pred_sentence = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in pred_indices if idx in target_vocab.values()]
                target_sentence = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in target_indices if idx in target_vocab.values()]
                total_bleu_score += sentence_bleu([target_sentence], pred_sentence)

    average_loss = total_loss / total_samples
    average_bleu_score = total_bleu_score / total_samples
    return average_loss, average_bleu_score

In [None]:
# Assuming `val_loader` is the DataLoader for the validation set
val_loss, val_bleu = evaluate_with_bleu(model, val_loader, criterion, vocab_fr, device=device)
print(f'Validation Loss: {val_loss:.4f}, Validation BLEU Score: {val_bleu:.4f}')