In [7]:
import torch.nn as nn
import nltk
import random
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
import numpy as np
from typing import Dict
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from preprocessing_task_2 import prepare_data
import spacy
from collections import Counter
from tqdm import tqdm

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
#!python3 -m spacy download en_core_web_sm
#!python3 -m spacy download fr_core_news_sm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
####### LOADING AND PREPROCESSING #############
english_data, french_data = prepare_data(fraction=0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_en['text'] = data_en['text'].apply(lambda x: re.sub(regex_pattern, '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fr['text'] = data_fr['text'].apply(lambda x: re.sub(regex_pattern, '', x))


In [19]:
# Functions to tokenize data and build vocab
spacy_fr = spacy.load('fr_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def build_vocab(sentences, vocab_size, word_tokenize):
    all_words = [word for sentence in sentences for word in word_tokenize(sentence)]
    word_counts = Counter(all_words)
    vocab = [word for word, _ in word_counts.most_common(vocab_size)]
    word2idx = {word: idx for idx, word in enumerate(vocab, start=4)}
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    word2idx['<sos>'] = 2
    word2idx['<eos>'] = 3
    return word2idx

In [20]:
# Collate function for padding
def collate_fn(batch):
    source_batch, target_batch = zip(*batch)
    source_batch_padded = pad_sequence(source_batch, padding_value=vocab_en['<pad>'], batch_first=True)
    target_batch_padded = pad_sequence(target_batch, padding_value=vocab_fr['<pad>'], batch_first=True)
    return source_batch_padded, target_batch_padded

In [21]:
# Dataset preparation
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab, tokenizer_source, tokenizer_target):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.tokenizer_source = tokenizer_source
        self.tokenizer_target = tokenizer_target
    
    def __len__(self):
        return len(self.source_sentences)
    
    def __getitem__(self, index):
        source_sentence = [self.source_vocab[token] if token in self.source_vocab else self.source_vocab['<unk>'] for token in self.tokenizer_source(self.source_sentences.iloc[index])]
        target_sentence = [self.target_vocab[token] if token in self.target_vocab else self.target_vocab['<unk>'] for token in self.tokenizer_target(self.target_sentences.iloc[index])]
        return torch.tensor(source_sentence, dtype=torch.long), torch.tensor(target_sentence, dtype=torch.long)




In [22]:
# Example GloVe embedding file path and embedding dimension

def load_glove_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    """
    Load GloVe embeddings from a specified file and align them with the given word index dictionary.

    Parameters:
    - path (str): The file path to the GloVe embeddings file.
    - word2idx (Dict[str, int]): A dictionary mapping words to their corresponding indices. This dictionary defines
      the position each word’s vector should occupy in the resulting embedding matrix.
    - embedding_dim (int): The dimensionality of the GloVe vectors (e.g., 50, 100, 200, 300).

    Returns:
    - torch.Tensor: A tensor of shape (len(word2idx), embedding_dim) containing the GloVe vectors aligned according to word2idx.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Initialize the embedding matrix with zeros
        #embeddings = np.zeros((len(word2idx), embedding_dim))
        #better approach: init with random 
        embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
        # Process each line in the GloVe file
        for line in f:
            values = line.split()
            word = values[0]
            
            # If the word is in the provided dictionary, update the corresponding row in embeddings
            if word in word2idx.keys():
                # Convert embedding values from strings to float32
                vector = np.asarray(values[1:], dtype='float32')
                # Place the vector in the correct index as per word2idx
                embeddings[word2idx[word]] = vector
            else:
                pass
    # Convert the numpy array to a PyTorch tensor
    return torch.from_numpy(embeddings)



def load_word2vec_embeddings(path, word2idx, embedding_dim):
    embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            if word in word2idx:
                try:
                    vector = np.asarray(values[1:], dtype='float32')
                    embeddings[word2idx[word]] = vector
                except ValueError:
                    print(f"Error converting values for word: {word}")
                    continue
    return torch.from_numpy(embeddings)






In [24]:
class Encoder(nn.Module):
    def __init__(self, input_dim, pretrained_embeddings, hidden_dim, dropout, num_layers=1):
        super().__init__()
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        embedding_dim = pretrained_embeddings.shape[1]
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, hidden = self.rnn(embedded)  # no cell state in GRU!
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, pretrained_embeddings, dropout, num_layers=1):
        super().__init__()
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim + hidden_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.fc_out = nn.Linear(embedding_dim + hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # context = [n layers * n directions, batch size, hidden dim]
        # n layers and n directions in the decoder will both always be 1, therefore:
        # hidden = [1, batch size, hidden dim]
        # context = [1, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        emb_con = torch.cat((embedded, context), dim=2)
        # emb_con = [1, batch size, embedding dim + hidden dim]
        output, hidden = self.rnn(emb_con, hidden)
        # output = [seq len, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [1, batch size, hidden dim]
        output = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1
        )
        # output = [batch size, embedding dim + hidden dim * 2]
        prediction = self.fc_out(output)
        # prediction = [batch size, output dim]
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is the context
        context = self.encoder(src)
        # context = [n layers * n directions, batch size, hidden dim]
        # context also used as the initial hidden state of the decoder
        hidden = context
        # hidden = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and the context state
            # receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)
            # output = [batch size, output dim]
            # hidden = [1, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [25]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import CrossEntropyLoss, Module
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from collections import Counter


# Define your device based on the availability
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

# Define the vocabulary size and embedding dimension
vocab_size = 10000


# Build vocabularies based on the training set only
X_train, X_test, y_train, y_test = train_test_split(english_data, french_data, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)  # 0.25 x 0.8 = 0.2

# Building vocabularies using only the training data to prevent information leakage
vocab_en = build_vocab(X_train["text"], vocab_size, tokenize_en)
vocab_fr = build_vocab(y_train["text"], vocab_size, tokenize_fr)
print(f"size of english vocab {len(vocab_en)}, size of french vocab {len(vocab_fr)}")


size of english vocab 10004, size of french vocab 10004


In [39]:
def translate_sentence(
    sentence,
    model,
    source_tokenizer,
    source_vocab,
    target_vocab,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        sentence = sentence.lower()
        if isinstance(sentence, str):
            tokens = [token if token in source_vocab else "<unk>" for token in source_tokenizer(sentence) ]
        else:
            tokens = [token for token in sentence]
        tokens = [sos_token] + tokens + [eos_token]
        ids = [source_vocab[token] for token in tokens]
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        context = model.encoder(tensor)
        hidden = context
        inputs = [target_vocab[sos_token]]
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden = model.decoder(inputs_tensor, hidden, context)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == target_vocab[eos_token]:
                break
        tokens = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in inputs]
    return ' '.join(tokens)

def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for src, trg in tqdm(data_loader):
        src = src.to(device).long().T
        trg = trg.to(device).long().T
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        #trg = trg[1:].view(-1)
        trg = trg[1:].reshape(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg in tqdm(data_loader):
            src = src.to(device).long().T
            trg = trg.to(device).long().T
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            #trg = trg[1:].view(-1)
            trg = trg[1:].reshape(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [42]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm 


def evaluate_bleu(model, data_loader, source_tokenizer, source_vocab, target_vocab,device, quick=False):
    """
    Evaluate the BLEU score for a translation model.

    Args:
        model: The translation model (Seq2Seq).
        data_loader: DataLoader for the dataset to evaluate.
        source_vocab: Vocabulary for the source language.
        target_vocab: Vocabulary for the target language.
        tokenizer_source: Tokenization function for the source language.
        tokenizer_target: Tokenization function for the target language.
        device: Device to run the model on ('cpu' or 'cuda').

    Returns:
        float: The BLEU score for the dataset.
        :param source_tokenizer: 
    """
    model.eval()
    references = []
    hypotheses = []
    sos_token = "<sos>"
    eos_token = "<eos>"
    pad_token = "<pad>"
    unkown_token = "<unk>"
    
    counter = 0

    with torch.no_grad():
        for source, target in tqdm(data_loader):
            counter += 1
            if quick and counter == 10:
                break
            # Process each sentence in the batch
            for i in range(source.size(0)):
                source_sentence = source[i]
                target_sentence = target[i]

                # Convert source tensor to sentence
                src_sent = ' '.join([list(source_vocab.keys())[list(source_vocab.values()).index(idx)] for idx in source_sentence if idx not in [source_vocab[pad_token], source_vocab[unkown_token], source_vocab[sos_token], source_vocab[eos_token]]])
                # Generate translation
                translation = translate_sentence(src_sent, model, source_tokenizer, source_vocab, target_vocab, sos_token, eos_token, device=device)
                
                # Convert target tensor to actual sentence
                ref_sent = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in target_sentence if idx not in [target_vocab[pad_token], target_vocab[unkown_token], target_vocab[sos_token], target_vocab[eos_token]]]
                #print(f"sentence: {src_sent}")
                #print(f"reference: {ref_sent}")
                #print(f"translation: {translation}")
                # Append to lists
                hypotheses.append(translation.split())
                references.append([ref_sent])

    # Calculate BLEU score
    return corpus_bleu(references, hypotheses)

from rouge import Rouge

def evaluate_rouge(model, data_loader, source_tokenizer, source_vocab, target_vocab, device, quick=False):
    """
    Evaluate the ROUGE score for a translation model.

    Args:
        model: The translation model (Seq2Seq).
        data_loader: DataLoader for the dataset to evaluate.
        source_vocab: Vocabulary for the source language.
        target_vocab: Vocabulary for the target language.
        tokenizer_source: Tokenization function for the source language.
        tokenizer_target: Tokenization function for the target language.
        device: Device to run the model on ('cpu' or 'cuda').

    Returns:
        dict: A dictionary containing the ROUGE-1, ROUGE-2, and ROUGE-L scores.
    """
    model.eval()
    rouge = Rouge()
    references = []
    hypotheses = []
    sos_token = "<sos>"
    eos_token = "<eos>"
    pad_token = "<pad>"
    unkown_token = "<unk>"
    counter = 0

    with torch.no_grad():
        for source, target in tqdm(data_loader):
            counter += 1
            if quick and counter == 10:
                break
            for i in range(source.size(0)):
                source_sentence = source[i]
                target_sentence = target[i]
                # More readable version
                src_sent = ' '.join(str(source_vocab.get(idx.item(), source_vocab[unkown_token])) for idx in source_sentence if idx not in [source_vocab[pad_token], source_vocab[unkown_token], source_vocab[sos_token], source_vocab[eos_token]])
                translation = translate_sentence(src_sent, model, source_tokenizer, source_vocab, target_vocab, sos_token, eos_token, device=device)
                ref_sent = ' '.join(str(target_vocab.get(idx.item(), target_vocab[unkown_token])) for idx in target_sentence if idx not in [target_vocab[pad_token], target_vocab[unkown_token], target_vocab[sos_token], target_vocab[eos_token]])                
                hypotheses.append(translation)
                references.append(ref_sent)

    return rouge.get_scores(hypotheses, references, avg=True)

In [29]:
embedding_dim = 100

input_dim = len(vocab_en)
output_dim = len(vocab_fr)
hidden_dim = 512
encoder_dropout = 0.5
decoder_dropout = 0.5

#Load embeddings word2vec
vocab_embeddings_en = load_word2vec_embeddings('word2vec/english.txt', vocab_en, embedding_dim)
vocab_embeddings_fr = load_word2vec_embeddings('word2vec/france.txt', vocab_fr, embedding_dim)
print(f"size of english embeddings {vocab_embeddings_en.shape}, size of french embeddings {vocab_embeddings_fr.shape}")

encoder = Encoder(
    input_dim,
    vocab_embeddings_en,
    hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    embedding_dim,
    hidden_dim,
    vocab_embeddings_fr,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)
model = model.to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)


model.apply(init_weights)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")



# Define the optimizer and loss function
optimizer = Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss(ignore_index=vocab_fr['<pad>']).to(device)

# Create datasets for training and validation
train_dataset = TranslationDataset(X_train['text'], y_train['text'], vocab_en, vocab_fr, tokenize_en, tokenize_fr)
val_dataset = TranslationDataset(X_val['text'], y_val['text'], vocab_en, vocab_fr, tokenize_en, tokenize_fr)
test_dataset = TranslationDataset(X_test['text'], y_test['text'], vocab_en, vocab_fr, tokenize_en, tokenize_fr)

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, collate_fn=collate_fn)


Error converting values for word: communauté
size of english embeddings torch.Size([10004, 100]), size of french embeddings torch.Size([10004, 100])




The model has 15,927,940 trainable parameters


In [30]:

n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5
lower = True
sos_token = "<sos>"
eos_token = "<eos>"
best_valid_loss = float("inf")

results = pd.DataFrame(columns=["epoch", "train_loss", "valid_loss", "train_ppl", "valid_ppl", "bleu", "rouge"])

for epoch in range(n_epochs):
    train_loss = train_fn(
        model,
        train_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        val_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f"no_gpt_approach-{epoch}.pt")

    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")
    translation = translate_sentence(
        "i am a student",
        model,
        tokenize_en,
        vocab_en,
        vocab_fr,
        sos_token,
        eos_token,
        device,
    )
    print(f"Prediction: {translation}")
    bleu_score = evaluate_bleu(model, test_loader, tokenize_en, vocab_en, vocab_fr, device, quick=True)
    print(f"BLEU score: {bleu_score}")
    rouge_score = evaluate_rouge(model, test_loader, tokenize_en, vocab_en, vocab_fr, device, quick=True)
    print(f"ROUGE score: {rouge_score}")
    results.loc[-1] = {
            "epoch": epoch,
            "train_loss": train_loss,
            "valid_loss": valid_loss,
            "train_ppl": np.exp(train_loss),
            "valid_ppl": np.exp(valid_loss),
            "bleu": bleu_score,
            "rouge": rouge_score
        }
    
results.to_csv("reports/csv/no_gpt_approach_results.csv", index=False)

100%|██████████| 3756/3756 [28:15<00:00,  2.21it/s]  
100%|██████████| 1252/1252 [03:27<00:00,  6.03it/s]


	Train Loss:   5.391 | Train PPL: 219.453
	Valid Loss:   5.872 | Valid PPL: 354.812


KeyError: True

In [None]:
#############################################################################################################

In [140]:
# Example GloVe embedding file path and embedding dimension

def check_missing_words_in_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    print(len(word2idx))
    missing_word_counter = 0
    
    
        
    glove_words = set()
    with open(path, 'r', encoding='utf-8') as f:
        # Process each line in the GloVe file to collect all GloVe words
        for line in f:
            values = line.split()
            word = values[0]
            glove_words.add(word)
    
    for word in word2idx:
        if word not in glove_words:
            missing_word_counter+=1
    

    print(missing_word_counter)
    # Convert the numpy array to a PyTorch tensor

check_missing_words_in_embeddings('glove.6B/glove.6B.100d.txt', vocab_fr, embedding_dim)

check_missing_words_in_embeddings('fasttext/cc.fr.300.vec', vocab_fr, 300) 

10004
6906
10004
62


In [141]:
"""
Relevant notes:
If a word in  the vocabulary (word2idx) doesn’t exist in the GloVe or FastText embeddings file, 
its embedding vector would not be updated and would remain as initially set by a zero vector. 

update:
not found in vocubulary words are initialized with random vector


"""

'\nRelevant notes:\nIf a word in your vocabulary (word2idx) doesn’t exist in the GloVe or FastText embeddings file, \nits embedding vector would not be updated and would remain as initially set. \nGiven that you initialize the embeddings matrix with zeros, \nany word not found in the pre-trained dataset would be represented by a zero vector. \n\nupdate not found in vocubulary words are initialized with random vector\n\n\n'

In [207]:
train_dataset[5]

(tensor([ 687,  124,    7,  149,    4,  377, 1420,  195,    5,    7,  149,   13,
          926,  688,   66,  689,    9,   13,   50,   11,  452,   30,    4,  322,
            8]),
 tensor([1479,   53,  265,   14,    7,  943,   46,  944,    4,   35,   37,   76,
          197,    9,  714,    4,   35,   21,  579,  103, 1480,   93,   11,  266,
            6]))

In [208]:
def get_key_by_value(dictionary, search_value):
    for key, value in dictionary.items():
        if value == search_value:
            return key
    return None  # If the value is not found, return None or raise an exception


list_en= [ 687,  124,    7,  149,    4,  377, 1420,  195,    5,    7,  149,   13,
          926,  688,   66,  689,    9,   13,   50,   11,  452,   30,    4,  322,
            8]

print("english:")

res= ""
for i in list_en:
    res= res + " "+  get_key_by_value(vocab_en, i)
print(res)
    

##########


list_fr= [1479,   53,  265,   14,    7,  943,   46,  944,    4,   35,   37,   76,
          197,    9,  714,    4,   35,   21,  579,  103, 1480,   93,   11,  266,
            6]

print("french:")

res= ""
for i in list_fr:
    res= res + " "+  get_key_by_value(vocab_fr, i)
print(res)

english:
 They need to see the political dimension working , to see that officials accept their responsibilities and that there is communication with the citizens .
french:
 Ils ont besoin que la dimension politique fonctionne , qu' il y ait des responsabilités , qu' une communication soit établie avec les citoyens .


In [199]:
english_data

Unnamed: 0,text
0,Resumption of the session
1,I declare resumed the session of the European ...
2,"Although, as you will have seen, the dreaded '..."
3,You have requested a debate on this subject in...
4,"In the meantime, I should like to observe a mi..."
...,...
995,There is still a need to tighten up in the are...
996,This is especially necessary in relation to th...
997,"As Liberals and Greens, we clearly have differ..."
998,"Mr President, Commissioner, there are just two..."


In [156]:
french_data

Unnamed: 0,text
0,Reprise de la session
1,Je déclare reprise la session du Parlement eur...
2,"Comme vous avez pu le constater, le grand ""bog..."
3,Vous avez souhaité un débat à ce sujet dans le...
4,"En attendant, je souhaiterais, comme un certai..."
...,...
99995,Cela se trouvait dans la communication du mois...
99996,N'allez pas croire qu'il y ait eu une quelconq...
99997,Je pensais que ce dossier était bien plus avancé.
99998,Je suis désolée.


In [177]:
X_train, X_test, y_train, y_test = train_test_split(english_data, french_data, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)  # 0.25 x 0.8 = 0.2


In [181]:
X_train["text"][5]

"Please rise, then, for this minute' s silence."

In [182]:
y_train["text"][5]

'Je vous invite à vous lever pour cette minute de silence.'