# Sequence to Sequence without Attention

In [1]:
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk import wordpunct_tokenize
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np


In [2]:
#device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device = "cpu"


In [3]:
def tokenize(text, character_flag=False):
    """Turn text into discrete tokens.

    Remove tokens that are not words.
    """
    text = str(text).lower()
    if character_flag:
        tokens =[char for char in text if char.isalpha()]

    else:   
        tokens = wordpunct_tokenize(text)
        tokens = [token for token in tokens if all(char.isalpha() for char in token)]

    return tokens


class SourceTargetTranslation(Dataset):
    #def __init__(self, path, max_vocab):
    def __init__(self, df, max_vocab):
    
        self.max_vocab = max_vocab
        
        # Extra tokens to add
        self.padding_token = '<PAD>'
        self.start_of_sequence_token = '<SOS>'
        self.end_of_sequence_token = '<EOS>'
        self.unknown_word_token = '<UNK>'
        
        # Helper function
        self.flatten = lambda x: [sublst for lst in x for sublst in lst]
        
        # Load the data into a DataFrame
        print(df.head())
        # Tokenize inputs (source) and targets (target)
        self.tokenize_df(df)

        # To reduce computational complexity, replace rare words with <UNK>
        self.replace_rare_tokens(df)
        
        # Prepare variables with mappings of tokens to indices
        self.create_token2idx(df)
        
        # Remove sequences with mostly <UNK>
        df = self.remove_mostly_unk(df)
        
        # Every sequence (input and target) should start with <SOS>
        # and end with <EOS>
        self.add_start_and_end_to_tokens(df)
        
        # Convert tokens to indices
        self.tokens_to_indices(df)
        
    def __getitem__(self, idx):
        """Return example at index idx."""
        return self.indices_pairs[idx][0], self.indices_pairs[idx][1]
    
    def tokenize_df(self, df):
        """Turn inputs and targets into tokens."""
        df['tokens_inputs'] = df.source.apply(tokenize)
        df['tokens_targets'] = df.target.apply(tokenize)
        
    def replace_rare_tokens(self, df):
        """Replace rare tokens with <UNK>."""
        common_tokens_inputs = self.get_most_common_tokens(
            df.tokens_inputs.tolist(),
        )
        common_tokens_targets = self.get_most_common_tokens(
            df.tokens_targets.tolist(),
        )
        
        df.loc[:, 'tokens_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [token if token in common_tokens_inputs 
                            else self.unknown_word_token for token in tokens]
        )
        df.loc[:, 'tokens_targets'] = df.tokens_targets.apply(
            lambda tokens: [token if token in common_tokens_targets
                            else self.unknown_word_token for token in tokens]
        )

    def get_most_common_tokens(self, tokens_series):
        """Return the max_vocab most common tokens."""
        all_tokens = self.flatten(tokens_series)
        # Substract 4 for <PAD>, <SOS>, <EOS>, and <UNK>
        common_tokens = set(list(zip(*Counter(all_tokens).most_common(
            self.max_vocab - 4)))[0])
        return common_tokens

    def remove_mostly_unk(self, df, threshold=0.99):
        """Remove sequences with mostly <UNK>."""
        def calculate_ratio(tokens):
            # Check if tokens list is empty to avoid division by zero
            if len(tokens) == 0:
                return False  # Consider returning True if you want to remove empty token lists
            return (sum(1 for token in tokens if token != '<UNK>') / len(tokens)) > threshold

        # Apply the function to the DataFrame columns
        df = df[df['tokens_inputs'].apply(calculate_ratio)]
        df = df[df['tokens_targets'].apply(calculate_ratio)]
        return df

    def create_token2idx(self, df):
        """Create variables with mappings from tokens to indices."""
        unique_tokens_inputs = set(self.flatten(df.tokens_inputs))
        unique_tokens_targets = set(self.flatten(df.tokens_targets))
        
        for token in reversed([
            self.padding_token,
            self.start_of_sequence_token,
            self.end_of_sequence_token,
            self.unknown_word_token,
        ]):
            if token in unique_tokens_inputs:
                unique_tokens_inputs.remove(token)
            if token in unique_tokens_targets:
                unique_tokens_targets.remove(token)
                
        unique_tokens_inputs = sorted(list(unique_tokens_inputs))
        unique_tokens_targets = sorted(list(unique_tokens_targets))

        # Add <PAD>, <SOS>, <EOS>, and <UNK> tokens
        for token in reversed([
            self.padding_token,
            self.start_of_sequence_token,
            self.end_of_sequence_token,
            self.unknown_word_token,
        ]):
            
            unique_tokens_inputs = [token] + unique_tokens_inputs
            unique_tokens_targets = [token] + unique_tokens_targets
            
        self.token2idx_inputs = {token: idx for idx, token
                                 in enumerate(unique_tokens_inputs)}
        self.idx2token_inputs = {idx: token for token, idx
                                 in self.token2idx_inputs.items()}
        
        self.token2idx_targets = {token: idx for idx, token
                                  in enumerate(unique_tokens_targets)}
        self.idx2token_targets = {idx: token for token, idx
                                  in self.token2idx_targets.items()}
        
    def add_start_and_end_to_tokens(self, df):
        """Add <SOS> and <EOS> tokens to the end of every input and output."""
        df['tokens_inputs'] = df['tokens_inputs'].apply(
            lambda tokens: [self.start_of_sequence_token] + tokens + [self.end_of_sequence_token]
        )
        df['tokens_targets'] = df['tokens_targets'].apply(
            lambda tokens: [self.start_of_sequence_token] + tokens + [self.end_of_sequence_token]
        )
        
    def tokens_to_indices(self, df):
        """Convert tokens to indices."""
        df['indices_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [self.token2idx_inputs[token] for token in tokens])
        df['indices_targets'] = df.tokens_targets.apply(
            lambda tokens: [self.token2idx_targets[token] for token in tokens])
             
        self.indices_pairs = list(zip(df.indices_inputs, df.indices_targets))
        
    def __len__(self):
        return len(self.indices_pairs)


In [4]:

def read_file(file_path,fraction):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()  # reads all lines into a list
        
    # Remove newline characters at the end of each line
    content = [line.strip() for line in content]
    
    # Calculate the number of lines to return based on the fraction
    if 0 < fraction < 1:
        end_index = int(len(content) * fraction)
        return content[:end_index]
    else:
        return content[:fraction]  # Return the full list



In [5]:
#### TRANSFORM OUR DATA IN THE DESIRED FORMAT#########


## Load data

In [6]:
def load_data(fraction, source_path="fr-en/europarl-v7.fr-en.en", target_path="fr-en/europarl-v7.fr-en.fr"):
    source_file_path = source_path
    target_file_path = target_path
    
    # Assuming read_file function is correctly implemented and returns the data needed
    source_data = read_file(source_file_path, fraction)
    target_data = read_file(target_file_path, fraction)
    
    return source_data, target_data


In [7]:
def prepare(source_data, target_data, max_vocab, max_words):
    # Creating new lists that only include entries with <= max_words
    filtered_source_data = []
    filtered_target_data = []
    
    for source, target in zip(source_data, target_data):
        word_source = source.split(" ")
        word_target = target.split(" ")
        if len(word_source) <= max_words and len(word_target) <= max_words:
            filtered_source_data.append(source)
            filtered_target_data.append(target)
    
    df = pd.DataFrame({
        'source': filtered_source_data,
        'target': filtered_target_data
    })
    
    dataset = SourceTargetTranslation(df, max_vocab=max_vocab)
    return dataset, df
            

In [8]:

def collate(batch):
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]
    
    # Pad sequencse so that they are all the same length (within one minibatch)
    padded_inputs = pad_sequence(inputs, padding_value=dataset.token2idx_targets[dataset.padding_token], batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=dataset.token2idx_targets[dataset.padding_token], batch_first=True)
    
    # Sort by length for CUDA optimizations
    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)



In [9]:

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, batch_size, pretrained_embeddings):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)  # Set pretrained weights
        self.embedding.weight.requires_grad = True  # Optionally, make embeddings trainable
        self.gru = nn.GRU(self.embedding_dim, self.hidden_size, batch_first=True)
    
    # Remaining methods remain unchanged
        
    def forward(self, inputs, lengths):
        self.batch_size = inputs.size(0)
        
        # Turn input indices into distributed embeddings
        x = self.embedding(inputs)

        # Remove padding for more efficient RNN application
        x = pack_padded_sequence(x, lengths, batch_first=True)
    
        # Apply RNN to get hidden state at all timesteps (output)
        # and hidden state of last output (self.hidden)
        output, self.hidden = self.gru(x, self.init_hidden())
        
        # Pad the sequences like they were before
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden

    def init_hidden(self):
        # Randomly initialize the weights of the RNN
        return torch.randn(1, self.batch_size, self.hidden_size).to(device)


In [10]:

class Decoder(nn.Module):
    def __init__(
        self, 
        vocab_size,
        embedding_dim, 
        decoder_hidden_size,
        encoder_hidden_size, 
        batch_size,
        pretrained_embeddings
    ):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_hidden_size = encoder_hidden_size
        self.decoder_hidden_size = decoder_hidden_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)  # Set pretrained weights
        self.embedding.weight.requires_grad = True  # Optionally, make embeddings trainable
        self.gru = nn.GRU(
            self.embedding_dim + self.encoder_hidden_size, 
            self.decoder_hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(self.encoder_hidden_size, self.vocab_size)
    
    def forward(self, input, hidden, context):
        # input = [batch size, 1]
        input = input.T
        # input = [1, batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # context = [n layers * n directions, batch size, hidden dim]
        # n layers and n directions in the decoder will both always be 1, therefore:
        # hidden = [1, batch size, hidden dim]
        # context = [1, batch size, hidden dim]
        # Turn target indices into distributed embeddings
        embedded = self.embedding(input)
        # embedded = [1, batch size, embedding dim]
        emb_con = torch.cat((embedded, context), dim=2)
        emb_con = emb_con.permute(1,0,2)
        # emb_con = [1, batch size, embedding dim + hidden dim]
        output, hidden = self.gru(emb_con, hidden)
        
        # Reshape the hidden states (output)
        output = output.view(-1, output.size(2))
        
        # Apply a linear layer
        x = self.fc(output)
        
        # Note: without attention, we won't return attention_weights
        return x, hidden
    
    def init_hidden(self):
        # Randomly initialize the weights of the RNN
        return torch.randn(1, self.batch_size, self.decoder_hidden_size).to(device)


In [11]:

criterion = nn.CrossEntropyLoss()

def loss_function(real, pred):
    """Calculate how wrong the model is."""
    # Use mask to only consider non-zero inputs in the loss
    mask = real.ge(1).float().to(device)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)


class EncoderDecoder(nn.Module):
    def __init__(self, inputs_vocab_size, targets_vocab_size, hidden_size,
                 embedding_dim, batch_size, targets_start_idx, targets_stop_idx,
                 encoder_embeddings, decoder_embeddings):  # Added embeddings parameters
        super(EncoderDecoder, self).__init__()
        self.batch_size = batch_size
        self.targets_start_idx = targets_start_idx
        self.targets_stop_idx = targets_stop_idx
        
        # Pass pretrained embeddings to the encoder and decoder
        self.encoder = Encoder(inputs_vocab_size, embedding_dim,
                               hidden_size, batch_size, encoder_embeddings).to(device)
        
        self.decoder = Decoder(targets_vocab_size, embedding_dim,
                               hidden_size, hidden_size, batch_size, decoder_embeddings).to(device)
        
        
    def predict(self, inputs, lengths):
        self.batch_size = inputs.size(0)
        
        encoder_output, context = self.encoder(
            inputs.to(device),
            lengths,
        )
        decoder_hidden = context

        # Initialize the input of the decoder to be <SOS>
        decoder_input = torch.LongTensor(
            [[self.targets_start_idx]] * self.batch_size,
        ).to(device)
        
        # Output predictions instead of loss
        output = []
    
        for _ in range(20):  # max sequence length, since we work with short sentences
            predictions, decoder_hidden = self.decoder(
                decoder_input,
                decoder_hidden,
                context
            )
            
            prediction = torch.multinomial(F.softmax(predictions, dim=1), 1)
            decoder_input = prediction
    
            prediction = prediction.item()
            output.append(prediction)
    
            if prediction == self.targets_stop_idx:
                return output
    
        return output

    def forward(self, inputs, targets, lengths):
        self.batch_size = inputs.size(0)
        
        encoder_output, context = self.encoder(
            inputs.to(device),
            lengths.cpu(),  # Ensure lengths are on CPU if necessary
        )
        decoder_hidden = context
        
        # Initialize the input of the decoder to be <SOS>
        decoder_input = torch.LongTensor(
            [[self.targets_start_idx]] * self.batch_size,
        ).to(device)
        
        # Use teacher forcing to train the model
        loss = 0
        for timestep in range(1, targets.size(1)):
            predictions, decoder_hidden = self.decoder(
                decoder_input,
                decoder_hidden,
                context
            )
            decoder_input = targets[:, timestep].unsqueeze(1).to(device)
            
            loss += loss_function(targets[:, timestep], predictions)
            
        return loss / targets.size(1)



In [12]:


from typing import Dict
# Example GloVe embedding file path and embedding dimension

def load_glove_embeddings(path: str, word2idx: Dict[str, int], embedding_dim: int) -> torch.Tensor:
    """
    Load GloVe embeddings from a specified file and align them with the given word index dictionary.

    Parameters:
    - path (str): The file path to the GloVe embeddings file.
    - word2idx (Dict[str, int]): A dictionary mapping words to their corresponding indices. This dictionary defines
      the position each word’s vector should occupy in the resulting embedding matrix.
    - embedding_dim (int): The dimensionality of the GloVe vectors (e.g., 50, 100, 200, 300).

    Returns:
    - torch.Tensor: A tensor of shape (len(word2idx), embedding_dim) containing the GloVe vectors aligned according to word2idx.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Initialize the embedding matrix with zeros
        #embeddings = np.zeros((len(word2idx), embedding_dim))
        #better approach: init with random 
        embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
        # Process each line in the GloVe file
        for line in f:
            values = line.split()
            word = values[0]
            
            # If the word is in the provided dictionary, update the corresponding row in embeddings
            if word in word2idx.keys():
                # Convert embedding values from strings to float32
                vector = np.asarray(values[1:], dtype='float32')
                # Place the vector in the correct index as per word2idx
                embeddings[word2idx[word]] = vector
            else:
                pass
    # Convert the numpy array to a PyTorch tensor
    return torch.from_numpy(embeddings)



def load_word2vec_embeddings(path, word2idx, embedding_dim):
    embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            if word in word2idx:
                try:
                    vector = np.asarray(values[1:], dtype='float32')
                    embeddings[word2idx[word]] = vector
                except ValueError:
                    print(f"Error converting values for word: {word}")
                    continue
    return torch.from_numpy(embeddings)

def load_random_embeddings(path: str, word2idx: int, embedding_dim: int) -> torch.Tensor:
    """
    Create random embeddings for a specified vocabulary size and embedding dimension.

    Parameters:
    - vocab_size (int): The number of unique tokens in your vocabulary.
    - embedding_dim (int): The dimensionality of each token's vector.

    Returns:
    - torch.Tensor: A tensor of shape (vocab_size, embedding_dim) containing randomly initialized vectors.
    """
    # Initialize random embeddings in a specified range
    # range -0.1 to 0.1 is a common choice for initializing embeddings
    embeddings = np.random.uniform(-0.1, 0.1, (len(word2idx), embedding_dim))

    return torch.from_numpy(embeddings)



In [13]:

from torch.utils.data import DataLoader
def load_and_init(dataset, embedding_dim, hidden_dim, batch_size, source_embeddings_path='glove.6B/glove.6B.300d.txt', target_embeddings_path='fasttext/cc.fr.300.vec', embeddings_loader_func=load_glove_embeddings):

    train_size = int(0.90 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate, drop_last=True)
    
    vocab_source = dataset.token2idx_inputs
    vocab_target = dataset.token2idx_targets
    
    vocab_embeddings_source = embeddings_loader_func(source_embeddings_path, vocab_source, embedding_dim)
    vocab_embeddings_target = embeddings_loader_func(target_embeddings_path, vocab_target, embedding_dim)
    
    
    model = EncoderDecoder(
        inputs_vocab_size=len(dataset.token2idx_inputs),
        targets_vocab_size=len(dataset.token2idx_targets),
        hidden_size=hidden_dim,
        embedding_dim=embedding_dim, 
        batch_size=batch_size, 
        targets_start_idx=dataset.token2idx_targets[dataset.start_of_sequence_token],
        targets_stop_idx=dataset.token2idx_targets[dataset.end_of_sequence_token],
        encoder_embeddings=vocab_embeddings_source,  # Pass encoder embeddings
        decoder_embeddings=vocab_embeddings_target   # Pass decoder embeddings
    ).to(device)
    
    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

    return model, optimizer, train_loader, train_dataset, test_dataset, vocab_embeddings_source, vocab_embeddings_target


In [14]:

def calc_bleu(model, translation_dataset: SourceTargetTranslation, test_dataset, collate_fn):
    smoothing_function = SmoothingFunction().method1
    test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, drop_last=True)
    model.eval()
    total_loss = total = 0
    bleu_scores = []
    with torch.no_grad():
        for inputs, targets, lengths in test_loader:
            reference = ' '.join([
                translation_dataset.idx2token_inputs[idx]
                for idx in inputs.cpu()[0].numpy()[1:-1]
            ])
            # print("source:",reference)

            target = ' '.join([
                translation_dataset.idx2token_targets[idx]
                for idx in targets.cpu()[0].numpy()[1:-1]
            ])
            # print("target:",target)

            inputs = inputs.to(device)
            outputs = model.predict(inputs, lengths)
            generated = (' '.join([
                translation_dataset.idx2token_targets[idx]
                for idx in outputs[:-1]
            ]))
            bleu_score = sentence_bleu([target], generated, smoothing_function=smoothing_function)
            bleu_scores.append(bleu_score)

    score = np.mean(bleu_scores)
    return score


In [15]:
def train(model, optimizer, translation_dataset: SourceTargetTranslation, train_loader, train_dataset, test_dataset, collate_fn, file_suffix, epochs=10):
    results_list = []
    results = pd.DataFrame(columns=["epoch", "train_loss", "train_ppl", "bleu"])
    bleu_scores=[]
    
    for epoch in range(epochs):
        
        total_loss = total = 0
        progress_bar = tqdm(train_loader, desc='Training', leave=False)
        for inputs, targets, lengths in progress_bar:
            model.train()
            inputs, targets = inputs.to(device), targets.to(device) #CHANGE ADDED
            lengths = lengths.cpu()  #CHANGE ADDED
    
            
            # Clean old gradients
            optimizer.zero_grad()
    
            # Forwards pass
            loss = model(inputs, targets, lengths)
    
            # Perform gradient descent, backwards pass
            loss.backward()
    
            # Take a step in the right direction
            optimizer.step()
    
            # Record metrics
            total_loss += loss.item()
            total += targets.size(1)
    
        train_loss = total_loss / total
        
        with torch.no_grad():
            bleu_score_train = calc_bleu(model, translation_dataset, train_dataset, collate_fn)
            bleu_score_test = calc_bleu(model, translation_dataset, test_dataset, collate_fn)
            
        tqdm.write(f'epoch #{epoch + 1:3d}\ttrain_loss: {train_loss:.2e}\tbleu_score_train: {bleu_score_train:.4f}\n')
        
        results_list.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_ppl": np.exp(train_loss),
            "train_bleu": bleu_score_train,
            "test_bleu": bleu_score_test
        })
    
        total_loss = total = 0
    results = pd.DataFrame(results_list)
    results.to_csv(f"reports/csv/task_3_without_attention_{file_suffix}.csv", index=False)


In [16]:

def translate_text(input_text, model, dataset, device='cpu'):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the input text using the same tokenizer used during training
    tokens = tokenize(input_text)
    # Convert tokens to indices using the _source  vocabulary
    indices = [dataset.token2idx_inputs.get(token, dataset.token2idx_inputs['<UNK>']) for token in tokens]

    # Add <SOS> and <EOS> tokens
    indices = [dataset.token2idx_inputs[dataset.start_of_sequence_token]] + indices + [dataset.token2idx_inputs[dataset.end_of_sequence_token]]
    
    # Convert list of indices to tensor
    indices_tensor = torch.LongTensor(indices).unsqueeze(0).to(device)  # Add batch dimension
    #print(len(indices_tensor))
    # Create lengths tensor
    lengths = torch.LongTensor([len(indices)]).to(device)

    # Get predictions from the model
    with torch.no_grad():
        outputs = model.predict(indices_tensor, lengths)

    translation = ' '.join([dataset.idx2token_targets[idx] for idx in outputs[:-1]])

    return translation


## Evaluation

In [None]:
## EXP 1: 2000 vocabulary and max. 5 words, glove + fasttext embeddings
max_words = 5
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction)
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
source_embeddings_path='glove.6B/glove.6B.300d.txt'
target_embeddings_path='fasttext/cc.fr.300.vec'
embeddings_loader_func=load_glove_embeddings
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, source_embeddings_path, target_embeddings_path, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp1", 10)

In [None]:
## EXP 2: 2000 vocabulary and max. 8 words, glove + fasttext embeddings
max_words = 8
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction)
source_embeddings_path='glove.6B/glove.6B.300d.txt'
target_embeddings_path='fasttext/cc.fr.300.vec'
embeddings_loader_func=load_glove_embeddings
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, source_embeddings_path, target_embeddings_path, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp2", 10)

In [None]:
## EXP 3: 2000 vocabulary and max. 11 words, glove + fasttext embeddings
max_words = 11
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction)
source_embeddings_path='glove.6B/glove.6B.300d.txt'
target_embeddings_path='fasttext/cc.fr.300.vec'
embeddings_loader_func=load_glove_embeddings
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, source_embeddings_path, target_embeddings_path, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp3", 10)

In [17]:
## EXP 4: 20000 vocabulary and max. 11 words, glove + fasttext embeddings
max_words = 11
fraction = 20000000
max_vocab = 20000
source_data, target_data = load_data(fraction)
source_embeddings_path='glove.6B/glove.6B.300d.txt'
target_embeddings_path='fasttext/cc.fr.300.vec'
embeddings_loader_func=load_glove_embeddings
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, source_embeddings_path, target_embeddings_path, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp4", 10)

                                              source  \
0                          Resumption of the session   
1     Please rise, then, for this minute' s silence.   
2  (The House rose and observed a minute' s silence)   
3              Madam President, on a point of order.   
4              Madam President, on a point of order.   

                                              target  
0                              Reprise de la session  
1  Je vous invite à vous lever pour cette minute ...  
2  (Le Parlement, debout, observe une minute de s...  
3  Madame la Présidente, c'est une motion de proc...  
4  Madame la Présidente, c'est une motion de proc...  


                                                             

KeyboardInterrupt: 

In [18]:
## EXP 5: 200000 vocabulary and max. 8 words, random embeddings
max_words = 8
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction)
embeddings_loader_func=load_random_embeddings
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, None, None, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp5", 10)


                                          source  \
0                      Resumption of the session   
1          Madam President, on a point of order.   
2          Madam President, on a point of order.   
3           It is the case of Alexander Nikitin.   
4  Mr Berenguer Fuster, we shall check all this.   

                                              target  
0                              Reprise de la session  
1  Madame la Présidente, c'est une motion de proc...  
2  Madame la Présidente, c'est une motion de proc...  
3              Il s'agit du cas d'Alexandre Nikitin.  
4      Cher collègue nous allons vérifier tout cela.  


                                                             

epoch #  1	train_loss: 1.14e-01	bleu_score_train: 0.4101



                                                             

epoch #  2	train_loss: 6.38e-02	bleu_score_train: 0.4920



                                                             

epoch #  3	train_loss: 5.02e-02	bleu_score_train: 0.5367



                                                             

epoch #  4	train_loss: 4.21e-02	bleu_score_train: 0.5692



                                                             

epoch #  5	train_loss: 3.62e-02	bleu_score_train: 0.5927



                                                               

epoch #  6	train_loss: 3.16e-02	bleu_score_train: 0.6109



                                                             

epoch #  7	train_loss: 2.77e-02	bleu_score_train: 0.6259



                                                             

epoch #  8	train_loss: 2.45e-02	bleu_score_train: 0.6397



                                                             

epoch #  9	train_loss: 2.17e-02	bleu_score_train: 0.6506



                                                             

epoch # 10	train_loss: 1.93e-02	bleu_score_train: 0.6637



In [19]:
## EXP 6: 200000 vocabulary and max. 8 words, word2vec embeddings
max_words = 8
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction)
source_embeddings_path='word2vec/english.txt'
target_embeddings_path='word2vec/france.txt'
embeddings_loader_func=load_word2vec_embeddings
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64, source_embeddings_path, target_embeddings_path, embeddings_loader_func)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp6", 10)


                                          source  \
0                      Resumption of the session   
1          Madam President, on a point of order.   
2          Madam President, on a point of order.   
3           It is the case of Alexander Nikitin.   
4  Mr Berenguer Fuster, we shall check all this.   

                                              target  
0                              Reprise de la session  
1  Madame la Présidente, c'est une motion de proc...  
2  Madame la Présidente, c'est une motion de proc...  
3              Il s'agit du cas d'Alexandre Nikitin.  
4      Cher collègue nous allons vérifier tout cela.  
Error converting values for word: the
Error converting values for word: of
Error converting values for word: and
Error converting values for word: to
Error converting values for word: a
Error converting values for word: in
Error converting values for word: for
Error converting values for word: is
Error converting values for word: on
Error converting value

FileNotFoundError: [Errno 2] No such file or directory: 'word2vec/french.txt'

## FR TO EN

In [None]:
## FR TO EN EXP 1: 2000 vocabulary and max. 5 words, glove + fasttext embeddings
max_words = 5
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction, "fr-en/europarl-v7.fr-en.fr", "fr-en/europarl-v7.fr-en.en")
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp1-fr-to-en", 10)

In [None]:
## FR TO EN EXP 2: 2000 vocabulary and max. 8 words, glove + fasttext embeddings
max_words = 8
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction, "fr-en/europarl-v7.fr-en.fr", "fr-en/europarl-v7.fr-en.en")
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp2-fr-to-en", 10)

In [None]:
## FR TO EN EXP 3: 2000 vocabulary and max. 8 words, glove + fasttext embeddings
max_words = 11
fraction = 20000000
max_vocab = 2000
source_data, target_data = load_data(fraction, "fr-en/europarl-v7.fr-en.fr", "fr-en/europarl-v7.fr-en.en")
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp3-fr-to-en", 10)

In [None]:
## FR TO EN EXP 4: 2000 vocabulary and max. 8 words, glove + fasttext embeddings
max_words = 11
fraction = 20000000
max_vocab = 200000
source_data, target_data = load_data(fraction, "fr-en/europarl-v7.fr-en.fr", "fr-en/europarl-v7.fr-en.en")
dataset, _ = prepare(source_data, target_data, max_vocab, max_words)
model, optimizer, train_loader, train_dataset, test_dataset, _, _ = load_and_init(dataset, 300, 256, 64)
train(model, optimizer, dataset, train_loader, train_dataset, test_dataset, collate, "exp4-fr-to-en", 10)