In [130]:
import torch.nn as nn
import nltk
import random
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
import numpy as np
from typing import Dict
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from preprocessing_task_2 import prepare_data
import spacy
from collections import Counter

In [72]:
nltk.download('punkt')
nltk.download('stopwords')
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download fr_core_news_sm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


/Users/erikrubinov/anaconda3/bin/python3: No module named spacy
/Users/erikrubinov/anaconda3/bin/python3: No module named spacy


In [209]:
####### LOADING AND PREPROCESSING #############
#english_data, french_data = prepare_data()


""" Function to load data into a pandas DataFrame without treating any character as quotes"""


import pandas as pd
from csv import QUOTE_NONE

# Define the paths to your files
english_file_path = "fr-en/europarl-v7.fr-en.en"
french_file_path = "fr-en/europarl-v7.fr-en.fr"


def load_data_to_dataframe(file_path):
    # Read the entire file as a single column DataFrame, ignoring any quoting
    return pd.read_csv(file_path, header=None, names=['text'], encoding='utf-8', sep='\t', quoting=QUOTE_NONE, engine='python')

# Load the data
english_data = load_data_to_dataframe(english_file_path)
french_data = load_data_to_dataframe(french_file_path)

# Show the first few entries of the DataFrame
print("English DataFrame sample:")
print(english_data.head())
print("French DataFrame sample:")
print(french_data.head())

English DataFrame sample:
                                                text
0                          Resumption of the session
1  I declare resumed the session of the European ...
2  Although, as you will have seen, the dreaded '...
3  You have requested a debate on this subject in...
4  In the meantime, I should like to observe a mi...
French DataFrame sample:
                                                text
0                              Reprise de la session
1  Je déclare reprise la session du Parlement eur...
2  Comme vous avez pu le constater, le grand "bog...
3  Vous avez souhaité un débat à ce sujet dans le...
4  En attendant, je souhaiterais, comme un certai...


In [210]:
## Take fraction of data due to hardware constraints

english_data = english_data[:100000]
french_data = french_data[:100000]


In [211]:
# Functions to tokenize data and build vocab
spacy_fr = spacy.load('fr_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def build_vocab(sentences, vocab_size, word_tokenize):
    all_words = [word for sentence in sentences for word in word_tokenize(sentence)]
    word_counts = Counter(all_words)
    vocab = [word for word, _ in word_counts.most_common(vocab_size)]
    word2idx = {word: idx for idx, word in enumerate(vocab, start=4)}
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    word2idx['<sos>'] = 2
    word2idx['<eos>'] = 3
    return word2idx

In [212]:
# Collate function for padding
def collate_fn(batch):
    source_batch, target_batch = zip(*batch)
    source_batch_padded = pad_sequence(source_batch, padding_value=vocab_en['<pad>'], batch_first=True)
    target_batch_padded = pad_sequence(target_batch, padding_value=vocab_fr['<pad>'], batch_first=True)
    return source_batch_padded, target_batch_padded

In [213]:
########################## TODO: USE DIFFERENT EMBEDDING MODELS #################


In [214]:


# Dataset preparation
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab, tokenizer_source, tokenizer_target):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.tokenizer_source = tokenizer_source
        self.tokenizer_target = tokenizer_target
    
    def __len__(self):
        return len(self.source_sentences)
    
    def __getitem__(self, index):
        source_sentence = [self.source_vocab[token] if token in self.source_vocab else self.source_vocab['<unk>'] for token in self.tokenizer_source(self.source_sentences.iloc[index])]
        target_sentence = [self.target_vocab[token] if token in self.target_vocab else self.target_vocab['<unk>'] for token in self.tokenizer_target(self.target_sentences.iloc[index])]
        return torch.tensor(source_sentence, dtype=torch.long), torch.tensor(target_sentence, dtype=torch.long)




In [215]:
# Example GloVe embedding file path and embedding dimension

def load_glove_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    """
    Load GloVe embeddings from a specified file and align them with the given word index dictionary.

    Parameters:
    - path (str): The file path to the GloVe embeddings file.
    - word2idx (Dict[str, int]): A dictionary mapping words to their corresponding indices. This dictionary defines
      the position each word’s vector should occupy in the resulting embedding matrix.
    - embedding_dim (int): The dimensionality of the GloVe vectors (e.g., 50, 100, 200, 300).

    Returns:
    - torch.Tensor: A tensor of shape (len(word2idx), embedding_dim) containing the GloVe vectors aligned according to word2idx.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Initialize the embedding matrix with zeros
        #embeddings = np.zeros((len(word2idx), embedding_dim))
        #better approach: init with random 
        embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
        # Process each line in the GloVe file
        for line in f:
            values = line.split()
            word = values[0]
            
            # If the word is in the provided dictionary, update the corresponding row in embeddings
            if word in word2idx.keys():
                # Convert embedding values from strings to float32
                vector = np.asarray(values[1:], dtype='float32')
                # Place the vector in the correct index as per word2idx
                embeddings[word2idx[word]] = vector
            else:
                pass
    # Convert the numpy array to a PyTorch tensor
    return torch.from_numpy(embeddings)



def load_word2vec_embeddings(path, word2idx, embedding_dim):
    embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            if word in word2idx:
                try:
                    vector = np.asarray(values[1:], dtype='float32')
                    embeddings[word2idx[word]] = vector
                except ValueError:
                    print(f"Error converting values for word: {word}")
                    continue
    return torch.from_numpy(embeddings)






In [216]:

class Encoder(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings):
        
        """
        Initialize the Encoder with pre-trained embeddings and a GRU layer.

        Parameters:
            hidden_size (int): The number of features in the hidden state of the GRU.
            pretrained_embeddings (torch.Tensor): A tensor containing the pre-trained word embeddings.
        """
        super(Encoder, self).__init__()
        # Ensure that the pretrained embeddings are of type float32
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        embed_size = pretrained_embeddings.shape[1]  # Embedding size is the second dimension of the embeddings tensor
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True).float()  # Ensure GRU is initialized as float32

    def forward(self, input):
        """
        Forward pass of the encoder which processes the input sequence.

        Parameters:
            input (torch.Tensor): The input sequence tensor, which should be indexed by batch.

        Returns:
            hidden (torch.Tensor): The hidden state of the GRU, representing the encoded information of the input.
        """
        embedded = self.embedding(input).float()  # Ensure embedding outputs float32
        _, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, pretrained_embeddings):
        """
        Initialize the Decoder with pre-trained embeddings, a GRU layer, and a linear output layer.

        Parameters:
            embed_size (int): The size of each embedding vector.
            hidden_size (int): The number of features in the hidden state of the GRU.
            output_size (int): The size of the output vocabulary.
            pretrained_embeddings (torch.Tensor): A tensor containing the pre-trained word embeddings.

        """
        
        super(Decoder, self).__init__()
        # Ensure that the pretrained embeddings are of type float32
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True).float()  # Ensure GRU is initialized as float32
        self.fc = nn.Linear(hidden_size, output_size).float()  # Ensure Linear is initialized as float32

    def forward(self, x, hidden):
        """
        Forward pass of the decoder that processes one timestep of the sequence.

        Parameters:
            x (torch.Tensor): The input tensor for the current timestep.
            hidden (torch.Tensor): The hidden state from the last timestep.

        Returns:
            predicted (torch.Tensor): The output logits for the next word in the sequence.
            hidden (torch.Tensor): The updated hidden state.
        """
        embedded = self.embedding(x).float()  # Ensure embedding outputs float32
        output, hidden = self.rnn(embedded, hidden)
        predicted = self.fc(output)
        return predicted, hidden
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):

        """
        Forward pass of the Seq2Seq model which processes the entire input and target sequence.

        Parameters:
            source (torch.Tensor): The input sequence tensor.
            target (torch.Tensor): The target sequence tensor used during training.

        Returns:
            outputs (torch.Tensor): The output from the decoder for each step in the sequence.
        """
        hidden = self.encoder(source)
        outputs, _ = self.decoder(target, hidden)
        return outputs




In [217]:
class Encoder2(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings, num_layers=2, bidirectional=True):
        super(Encoder2, self).__init__()  # Ensure the class name here matches the class being defined
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        embed_size = pretrained_embeddings.shape[1]
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers=num_layers, 
                          batch_first=True, bidirectional=bidirectional).float()

    def forward(self, input):
        embedded = self.embedding(input).float()
        _, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = hidden.view(hidden.size(0)//2, 2, hidden.size(1), hidden.size(2))
            hidden = torch.sum(hidden, dim=1)
        return hidden

class Decoder2(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, pretrained_embeddings, num_layers=2):
        super(Decoder2, self).__init__()  # Correct use of super()
        if pretrained_embeddings.dtype != torch.float32:
            pretrained_embeddings = pretrained_embeddings.to(dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers=num_layers, batch_first=True).float()
        self.fc = nn.Linear(hidden_size, output_size).float()

    def forward(self, x, hidden):
        embedded = self.embedding(x).float()
        output, hidden = self.rnn(embedded, hidden)
        predicted = self.fc(output)
        return predicted, hidden

In [218]:
def predict_1(model, sentence, source_vocab, target_vocab, tokenizer_source, tokenizer_target, max_length=50, device="cpu"):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the input sentence
    tokens = tokenizer_source(sentence.lower())
    # Convert tokens to indices
    indices = [source_vocab.get(token, source_vocab['<unk>']) for token in tokens]
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension
    # Pass through the encoder
    with torch.no_grad():
        encoder_hidden = model.encoder(input_tensor)
    # Initialize the decoder input and hidden state
    decoder_input = torch.tensor([[target_vocab['<sos>']]], dtype=torch.long).to(device)
    decoder_hidden = encoder_hidden#.unsqueeze(0)  # Add batch dimension back

    # Generate the output sequence
    output_tokens = []
    for _ in range(max_length):
        with torch.no_grad():
            decoder_output, decoder_hidden = model.decoder(decoder_input, decoder_hidden)

        topv, topi = decoder_output.topk(1)
        next_token = topi.squeeze().item()

        if next_token == target_vocab['<eos>']:
            break

        output_tokens.append(next_token)
        decoder_input = topi.squeeze(0)#.detach()#.unsqueeze(0)

    # Convert indices to tokens
    output_sentence = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in output_tokens]

    return ' '.join(output_sentence)

In [219]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu(model, data_loader, source_vocab, target_vocab, tokenizer_source, tokenizer_target, device):
    """
    Evaluate the BLEU score for a translation model.

    Args:
        model: The translation model (Seq2Seq).
        data_loader: DataLoader for the dataset to evaluate.
        source_vocab: Vocabulary for the source language.
        target_vocab: Vocabulary for the target language.
        tokenizer_source: Tokenization function for the source language.
        tokenizer_target: Tokenization function for the target language.
        device: Device to run the model on ('cpu' or 'cuda').

    Returns:
        float: The BLEU score for the dataset.
    """
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for source, target in data_loader:
            # Process each sentence in the batch
            for i in range(source.size(0)):
                source_sentence = source[i]
                target_sentence = target[i]

                # Convert source tensor to sentence
                src_sent = ' '.join([list(source_vocab.keys())[list(source_vocab.values()).index(idx)] for idx in source_sentence if idx not in [source_vocab['<pad>'], source_vocab['<unk>'], source_vocab['<sos>'], source_vocab['<eos>']]])
                # Generate translation
                translation = predict_1(model, src_sent, source_vocab, target_vocab, tokenizer_source, tokenizer_target, device=device)
                
                # Convert target tensor to actual sentence
                ref_sent = [list(target_vocab.keys())[list(target_vocab.values()).index(idx)] for idx in target_sentence if idx not in [target_vocab['<pad>'], target_vocab['<unk>'], target_vocab['<sos>'], target_vocab['<eos>']]]
                
                # Append to lists
                hypotheses.append(translation.split())
                references.append([ref_sent])

    # Calculate BLEU score
    return corpus_bleu(references, hypotheses)



In [220]:
from rouge import Rouge

def evaluate_rouge(model, data_loader, source_vocab, target_vocab, tokenizer_source, tokenizer_target, device):
    """
    Evaluate the ROUGE score for a translation model.

    Args:
        model: The translation model (Seq2Seq).
        data_loader: DataLoader for the dataset to evaluate.
        source_vocab: Vocabulary for the source language.
        target_vocab: Vocabulary for the target language.
        tokenizer_source: Tokenization function for the source language.
        tokenizer_target: Tokenization function for the target language.
        device: Device to run the model on ('cpu' or 'cuda').

    Returns:
        dict: A dictionary containing the ROUGE-1, ROUGE-2, and ROUGE-L scores.
    """
    model.eval()
    rouge = Rouge()
    references = []
    hypotheses = []

    with torch.no_grad():
        for source, target in data_loader:
            for i in range(source.size(0)):
                source_sentence = source[i]
                target_sentence = target[i]
                # More readable version
                src_sent = ' '.join(str(source_vocab.get(idx.item(), source_vocab['<unk>'])) for idx in source_sentence if idx not in [source_vocab['<pad>'], source_vocab['<unk>'], source_vocab['<sos>'], source_vocab['<eos>']])
                translation = predict_1(model, src_sent, source_vocab, target_vocab, tokenizer_source, tokenizer_target, device=device)
                ref_sent = ' '.join(str(target_vocab.get(idx.item(), target_vocab['<unk>'])) for idx in target_sentence if idx not in [target_vocab['<pad>'], target_vocab['<unk>'], target_vocab['<sos>'], target_vocab['<eos>']])                
                hypotheses.append(translation)
                references.append(ref_sent)

    return rouge.get_scores(hypotheses, references, avg=True)
                                     


In [221]:
def train_4(model, train_loader,val_loader, optimizer, criterion, epochs=10, device="cpu"):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        total_loss = 0
        for src, trg in train_loader:
            src = src.to(device).long()
            trg = trg.to(device).long()

            optimizer.zero_grad()

            # Ensure the first token for the decoder is <sos> which is trg[:,0]
            # and the input to the decoder includes this <sos> token at the start
            # up to before the <eos> token at the end.
            decoder_input = trg[:, :-1]  # Excludes <eos> at the end
            target_output = trg[:, 1:]  # Excludes <sos> at the start

            # Pass the source and modified target to the model
            output = model(src, decoder_input)
            
            output = output.float()  # Ensure the output is in float format
            output_dim = output.shape[-1]

            # Flatten the output for computing the loss
            output = output.contiguous().view(-1, output_dim)
            target_output = target_output.contiguous().view(-1)

            # Compute the loss; we don't need to handle <eos> explicitly here as it's managed by the target sequence setup
            loss = criterion(output, target_output)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        average_loss = total_loss / len(train_loader)
        
        bleu_score = evaluate_bleu(model, val_loader, vocab_en, vocab_fr, tokenize_en, tokenize_fr, device)
        print("bleu_score:",bleu_score)
        rouge_score = evaluate_rouge(model, val_loader, vocab_en, vocab_fr, tokenize_en, tokenize_fr, device)
        print("rouge_1_score:",rouge_score["rouge-1"]["r"])
        print(predict_1(model, "Hello, this is a test", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))
        print(f'Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}')

        

In [222]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import CrossEntropyLoss, Module
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from collections import Counter


# Define your device based on the availability
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

# Define the vocabulary size and embedding dimension
vocab_size = 10000


# Build vocabularies based on the training set only
X_train, X_test, y_train, y_test = train_test_split(english_data, french_data, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)  # 0.25 x 0.8 = 0.2

# Building vocabularies using only the training data to prevent information leakage
vocab_en = build_vocab(X_train["text"], vocab_size, tokenize_en)
vocab_fr = build_vocab(y_train["text"], vocab_size, tokenize_fr)

#Load embeddings glove
#embedding_dim = 300
#vocab_embeddings_en = load_glove_embeddings('glove.6B/glove.6B.300d.txt', vocab_en, embedding_dim)
#vocab_embeddings_fr = load_glove_embeddings('fasttext/cc.fr.300.vec', vocab_fr, embedding_dim)

#Load embeddings word2vec
embedding_dim = 100
vocab_embeddings_en = load_word2vec_embeddings('word2vec/english.txt', vocab_en, embedding_dim)
vocab_embeddings_fr = load_word2vec_embeddings('word2vec/france.txt', vocab_fr, embedding_dim)



Error converting values for word: communauté


In [None]:
# Instantiate the model components
hidden_size = 1024
#encoder = Encoder(hidden_size=hidden_size, pretrained_embeddings=vocab_embeddings_en)
#decoder = Decoder(embed_size=embedding_dim, hidden_size=hidden_size, output_size=len(vocab_fr), pretrained_embeddings=vocab_embeddings_fr)
encoder = Encoder2(hidden_size=hidden_size, pretrained_embeddings=vocab_embeddings_en)
decoder = Decoder2(embed_size=embedding_dim, hidden_size=hidden_size, output_size=len(vocab_fr), pretrained_embeddings=vocab_embeddings_fr)

model = Seq2Seq(encoder, decoder)
model = model.to(device)

# Define the optimizer and loss function
optimizer = Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss(ignore_index=vocab_fr['<pad>']).to(device)

# Create datasets for training and validation
train_dataset = TranslationDataset(X_train['text'], y_train['text'], vocab_en, vocab_fr, tokenize_en, tokenize_fr)
val_dataset = TranslationDataset(X_val['text'], y_val['text'], vocab_en, vocab_fr, tokenize_en, tokenize_fr)

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, collate_fn=collate_fn)

train_4(model, train_loader, val_loader, optimizer, criterion, epochs=10, device=device)

In [None]:
print(predict_1(model, "I am a student", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))
print(predict_2(model, "I am a book", vocab_en, vocab_fr, tokenize_en, tokenize_fr, device=device))


In [None]:
#############################################################################################################

In [140]:
# Example GloVe embedding file path and embedding dimension

def check_missing_words_in_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    print(len(word2idx))
    missing_word_counter = 0
    
    
        
    glove_words = set()
    with open(path, 'r', encoding='utf-8') as f:
        # Process each line in the GloVe file to collect all GloVe words
        for line in f:
            values = line.split()
            word = values[0]
            glove_words.add(word)
    
    for word in word2idx:
        if word not in glove_words:
            missing_word_counter+=1
    

    print(missing_word_counter)
    # Convert the numpy array to a PyTorch tensor

check_missing_words_in_embeddings('glove.6B/glove.6B.100d.txt', vocab_fr, embedding_dim)

check_missing_words_in_embeddings('fasttext/cc.fr.300.vec', vocab_fr, 300) 

10004
6906
10004
62


In [141]:
"""
Relevant notes:
If a word in  the vocabulary (word2idx) doesn’t exist in the GloVe or FastText embeddings file, 
its embedding vector would not be updated and would remain as initially set by a zero vector. 

update:
not found in vocubulary words are initialized with random vector


"""

'\nRelevant notes:\nIf a word in your vocabulary (word2idx) doesn’t exist in the GloVe or FastText embeddings file, \nits embedding vector would not be updated and would remain as initially set. \nGiven that you initialize the embeddings matrix with zeros, \nany word not found in the pre-trained dataset would be represented by a zero vector. \n\nupdate not found in vocubulary words are initialized with random vector\n\n\n'

In [207]:
train_dataset[5]

(tensor([ 687,  124,    7,  149,    4,  377, 1420,  195,    5,    7,  149,   13,
          926,  688,   66,  689,    9,   13,   50,   11,  452,   30,    4,  322,
            8]),
 tensor([1479,   53,  265,   14,    7,  943,   46,  944,    4,   35,   37,   76,
          197,    9,  714,    4,   35,   21,  579,  103, 1480,   93,   11,  266,
            6]))

In [208]:
def get_key_by_value(dictionary, search_value):
    for key, value in dictionary.items():
        if value == search_value:
            return key
    return None  # If the value is not found, return None or raise an exception


list_en= [ 687,  124,    7,  149,    4,  377, 1420,  195,    5,    7,  149,   13,
          926,  688,   66,  689,    9,   13,   50,   11,  452,   30,    4,  322,
            8]

print("english:")

res= ""
for i in list_en:
    res= res + " "+  get_key_by_value(vocab_en, i)
print(res)
    

##########


list_fr= [1479,   53,  265,   14,    7,  943,   46,  944,    4,   35,   37,   76,
          197,    9,  714,    4,   35,   21,  579,  103, 1480,   93,   11,  266,
            6]

print("french:")

res= ""
for i in list_fr:
    res= res + " "+  get_key_by_value(vocab_fr, i)
print(res)

english:
 They need to see the political dimension working , to see that officials accept their responsibilities and that there is communication with the citizens .
french:
 Ils ont besoin que la dimension politique fonctionne , qu' il y ait des responsabilités , qu' une communication soit établie avec les citoyens .


In [199]:
english_data

Unnamed: 0,text
0,Resumption of the session
1,I declare resumed the session of the European ...
2,"Although, as you will have seen, the dreaded '..."
3,You have requested a debate on this subject in...
4,"In the meantime, I should like to observe a mi..."
...,...
995,There is still a need to tighten up in the are...
996,This is especially necessary in relation to th...
997,"As Liberals and Greens, we clearly have differ..."
998,"Mr President, Commissioner, there are just two..."


In [156]:
french_data

Unnamed: 0,text
0,Reprise de la session
1,Je déclare reprise la session du Parlement eur...
2,"Comme vous avez pu le constater, le grand ""bog..."
3,Vous avez souhaité un débat à ce sujet dans le...
4,"En attendant, je souhaiterais, comme un certai..."
...,...
99995,Cela se trouvait dans la communication du mois...
99996,N'allez pas croire qu'il y ait eu une quelconq...
99997,Je pensais que ce dossier était bien plus avancé.
99998,Je suis désolée.


In [177]:
X_train, X_test, y_train, y_test = train_test_split(english_data, french_data, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)  # 0.25 x 0.8 = 0.2


In [181]:
X_train["text"][5]

"Please rise, then, for this minute' s silence."

In [182]:
y_train["text"][5]

'Je vous invite à vous lever pour cette minute de silence.'