In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-6490d090-d8fc-d8a3-89e7-82a5794ead7c)


In [None]:
!nvidia-smi

Wed Mar 22 12:02:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    29W /  70W |   6563MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!lscpu | grep "Model name"

Model name:                      Intel(R) Xeon(R) CPU @ 2.00GHz


In [None]:
!python3 --version

Python 3.9.16


In [None]:
!pip list | grep torch

torch                         1.13.1+cu116
torchaudio                    0.13.1+cu116
torchsummary                  1.5.1
torchtext                     0.14.1
torchvision                   0.14.1+cu116


In [None]:
pip install ing_theme_matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gcc ./drive/MyDrive/colab_storage/scripts/preprocessing_tools.c -o ./drive/MyDrive/colab_storage/lib/preprocessing_tools.so -shared

In [None]:
from copy import deepcopy
import random
import ctypes
import os
import functools
from time import time
import datetime
import math
from collections import Counter, OrderedDict

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from ing_theme_matplotlib import mpl_style

import torch
from torch import nn
from torch.nn import functional as F
import torchtext

In [None]:
## Load the shared library that was compiled from preprocessing_tools.c by the gcc cell above.
## The handle is stored in the module-level name <lib>, which c_numericalize_str and c_pad_or_trim read.
## Note: a `global` statement at module scope has no effect; <lib> is already a module-level name.
global lib
lib = ctypes.CDLL('./drive/MyDrive/colab_storage/lib/preprocessing_tools.so')


def int2str(col_int, vocab, max_steps, trim=True, join=True):
    '''
    Description:
        Converts tokens from integer to string format.

    Inputs:
        <col_int>: Two-axis, integer-valued container exposing <.shape> (e.g. a tensor or an array). The first axis enumerates the examples, the second axis must hold at least <max_steps> tokens.
        <vocab>: Any object whose <get_itos()> method returns the list that maps a vocabulary index to its string token.
        <max_steps>: Type <int>. Number of leading steps that are decoded for each example.
        <trim>: Type <bool>. If True, each sequence is cut right before its first '<eos>' token.
        <join>: Type <bool>. If True, the tokens of each example are joined with single spaces into one string.

    Outputs:
        <col_str>: Type <list[<str>]> if <join> is True, otherwise <list[<list[<str>]>]>. Length: n_examples.
    '''

    ## The index-to-string table is fetched once. Fetching it per token (as was done before) rebuilds
    ## the whole vocabulary list for every single lookup.
    itos = vocab.get_itos()

    n_examples = col_int.shape[0]

    col_tokens = []
    for example_idx in range(n_examples):
        ## All <max_steps> positions are decoded first, so an out-of-range index still raises.
        tokens = [itos[col_int[example_idx][t]] for t in range(max_steps)]

        ## trim triggers the sequence's trimming from <eos> until the maximum step
        if trim and '<eos>' in tokens:
            tokens = tokens[:tokens.index('<eos>')]

        col_tokens.append(tokens)

    ## join triggers the resulting sequence's concatenation of string-tokens to form the final sentence
    if join:
        return [' '.join(tokens) for tokens in col_tokens]

    return col_tokens

def ohe2int(p_distr):
    '''
    Description:
        Picks, along the last axis of a distribution tensor, the position holding the largest value, i.e. the most likely vocabulary index.

    Inputs:
        <p_distr>: Type <torch.Tensor>. The last axis enumerates the vocabulary entries.

    Outputs:
        <most_likely_idx>: Type <torch.Tensor>. Same shape as <p_distr> without its last axis.
    '''

    most_likely_idx = p_distr.argmax(dim=-1)

    return most_likely_idx

def ohe2str(p_distr, vocab, max_steps, trim=True, join=True):
    '''
    Description:
        Converts a distribution over the vocabulary straight to string tokens, by chaining <ohe2int> and <int2str>.

    Inputs:
        <p_distr>: Forwarded to <ohe2int>.
        <vocab>, <max_steps>, <trim>, <join>: Forwarded unchanged to <int2str>.

    Outputs:
        Whatever <int2str> returns for the most likely indices.
    '''

    most_likely_int = ohe2int(p_distr)

    return int2str(most_likely_int, vocab, max_steps, trim, join)

def single_gram_to_k_grams(col_int, k):
    '''
    Description:
        Slides a window of <k> consecutive tokens over every sequence and collects the windows as tuples (k-grams).

    Inputs:
        <col_int>: Type <list[<list[<int>]>]>. One token sequence per example.
        <k>: Type <int>. Window size.

    Outputs:
        <col_kgram_int>: For k == 1 the very same <col_int> object is handed back (tokens are not wrapped in tuples). Otherwise type <list[<list[<tuple[<int>]>]>]>: each example yields max(1, len(sequence) - k + 1) tuples, so a sequence shorter than <k> yields a single, shorter tuple.
    '''

    if k == 1:
        return col_int

    col_kgram_int = []
    for sequence in col_int:
        ## At least one window is always emitted, even when the sequence is shorter than k.
        n_windows = max(1, len(sequence) - k + 1)
        col_kgram_int.append([tuple(sequence[start: start + k]) for start in range(n_windows)])

    return col_kgram_int

def trim_sequence(seq, eos):
    '''
    Description:
        Cuts every sequence right before the first occurrence of the end of sequence token. Sequences without that token are left as they are. Type <d0> is defined to either be <str> or <int>.

    Inputs:
        <seq>: Type <list[list[<d0>]]>. Length: n_examples. Modified in place.
        <eos>: Type <d0>. The end of sequence token.

    Outputs:
        <seq>: Type <list[list[<d0>]]>. Length: n_examples. The same outer list object that was passed in.
    '''

    for example_idx, tokens in enumerate(seq):
        if eos not in tokens:
            continue
        seq[example_idx] = tokens[:tokens.index(eos)]

    return seq

def purger(raw_text):
    """
    Description:
        Drops the last tab-separated field of every line. A line without any tab consists of a single field and therefore becomes empty.

    Input:
        <raw_text>: Type: <str>. Lines are separated by '\n', fields by '\t'.

    Output:
        <reduced_text>: Type: <str>. Same number of lines as <raw_text>.
    """

    reduced_lines = ['\t'.join(line.split('\t')[:-1]) for line in raw_text.split('\n')]

    return '\n'.join(reduced_lines)

def initial_text_preprocess(text):
    '''
    Description:
        Normalizes raw text: non-breaking spaces become plain spaces, the text is lowercased and a space is inserted in front of every punctuation mark in ',.!?;' that is not already preceded by one.

    Inputs:
        <text>: Type: <str>.

    Outputs:
        Type: <str>. The normalized text.
    '''

    # Replace non-breaking space with space
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')

    # Lowercase first and look up the previous character in the lowercased string. Lowercasing can
    # change the length of a string (e.g. 'İ' becomes two code points), so indexing the original
    # string with positions of the lowercased one may inspect the wrong character or run out of range.
    lowered = text.lower()

    # Insert space between words and punctuation marks
    out = []
    for i, char in enumerate(lowered):
        needs_space = i > 0 and char in ',.!?;' and lowered[i - 1] != ' '
        out.append(' ' + char if needs_space else char)

    return ''.join(out)

def tokenize(text, max_examples=None):
    '''
    Description:
        Splits a tab-separated parallel corpus into source and target token sequences. Each line is expected to look like '<source sentence>\t<target sentence>'; lines that do not consist of exactly two fields are skipped. Tokens are obtained by splitting on single spaces and '<eos>' is appended to every sequence.

    Inputs:
        <text>: Type: <str>. Lines are separated by '\n'.
        <max_examples>: Type: <int> or None. Upper bound for the number of returned examples. None (or 0) disables the bound.

    Outputs:
        <src_str>: Type: <list[<list[<str>]>]>. Source token sequences.
        <tgt_str>: Type: <list[<list[<str>]>]>. Target token sequences, aligned with <src_str>.
    '''

    src_str, tgt_str = [], []
    for line in text.split('\n'):
        ## Count accepted examples rather than raw lines; the former `i > max_examples` check on the
        ## line index let one example too many through and also counted skipped lines.
        if max_examples and len(src_str) >= max_examples:
            break

        parts = line.split('\t')
        if len(parts) != 2:
            continue

        # Skip empty tokens
        src_str.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
        tgt_str.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])

    return src_str, tgt_str

def c_numericalize_str(col_seq, vocab_itos): ## str2int
    """
    Description:
        Converts a list of string-token sequences to a list of vocabulary-index-token sequences, while maintaining their order. The sequences may have different lengths; the output mirrors the input's structure exactly.
        The conversion itself is delegated to the C function <arstr2num> of the shared library held by the module-level <lib>. This wrapper only marshals the Python lists into ctypes arrays, calls the C function and copies the integers back into Python lists.

    Inputs:
        <col_seq>: Type: <list[<list[<str>]>]>. The collection of sequences of words to be transformed/numericalized.
            Warning (author's note; the C implementation is not visible from this file): for each list-member of <col_seq>, the final string-token-element has to be exactly b'<eos>', otherwise the .so file won't be able to process the input.
        <vocab_itos>: Type: <list[<str>]>. The list containing all the vocabulary's string-tokens, each at the position equal to its vocabulary index. Per the author's note it must include the <unk> token in the beginning.

    Returns:
        <enc_col>: Type: <list[<list[<int>]>]>. Same structure as <col_seq> ("col" is for "collection"), with every string replaced by the integer that the C function wrote for it.
    """

    ## Prototype of the C function. It is (re)assigned on every call.
    lib.arstr2num.argtypes = \
    [
        ctypes.POINTER(ctypes.POINTER(ctypes.c_char_p)), # char ***col_seq
        ctypes.c_int, # int col_seq_length
        ctypes.POINTER(ctypes.c_char_p), # char **vocab_itos
        ctypes.c_int, # vocab_itos_length
        ctypes.POINTER(ctypes.POINTER(ctypes.c_int)) # int **enc_col_seq
    ]

    ## ! Define the arrays' instances and allocate memory for them: Begin

    ## Python-side result container with the same ragged shape as <col_seq>
    enc_col_seq = [[None for j in range(len(col_seq[i]))] for i in range(len(col_seq))]

    ## Input: an array of pointers, one per sequence, each pointing to an array of C strings
    col_seq_array = (ctypes.POINTER(ctypes.c_char_p) * len(col_seq))()
    for i, seq in enumerate(col_seq):
        col_seq_array[i] = (ctypes.c_char_p * len(seq))()

    ## Output: an array of pointers, one per sequence, each pointing to an int buffer of that sequence's length
    enc_col_seq_array = (ctypes.POINTER(ctypes.c_int) * len(enc_col_seq))()
    for i, enc_seq in enumerate(enc_col_seq):
        enc_col_seq_array[i] = (ctypes.c_int * len(enc_seq))()

    ## Vocabulary: a flat array of C strings
    vocab_itos_array = (ctypes.c_char_p * len(vocab_itos))()

    ## Fill the input array with the encoded tokens (str.encode() with its default encoding)
    for i, seq in enumerate(col_seq):
        for j, token_str in enumerate(seq):
            col_seq_array[i][j] = token_str.encode()

    ## Fill the vocabulary array the same way
    for i, token_str in enumerate(vocab_itos):
        vocab_itos_array[i] = token_str.encode()

    ## ! Define the arrays' instances and allocate memory for them: End

    ## Update <enc_col_seq_array>: the C function is expected to write its result into the output buffers
    lib.arstr2num(col_seq_array, len(col_seq), vocab_itos_array, len(vocab_itos), enc_col_seq_array)

    ## Copy values from <enc_col_seq_array> to <enc_col_seq>
    for i in range(len(enc_col_seq)):
        for j in range(len(enc_col_seq[i])):
            enc_col_seq[i][j] = enc_col_seq_array[i][j]

    return enc_col_seq

## Shorter alias for the same function
str2int = c_numericalize_str

def c_pad_or_trim(enc_col, eos_int, pad_int, t_bound):
    """
    Description:
        Brings every sequence of <enc_col> to exactly <t_bound> steps. The work is delegated to the C function <pad_or_trim> of the shared library held by the module-level <lib>; this wrapper only marshals the data into ctypes buffers and back.
        According to the author's description of the C function (its implementation is not visible from this file), for each sequence in <enc_col>:
        1. If the sequence contains more than <t_bound> tokens, the right side tokens beyond <t_bound> are removed.
        2. If the sequence contains less than <t_bound> tokens, the integer-encoded padding token is added to its right side until the sequence length equals <t_bound>.

    Inputs:
        <enc_col>: Type <list[list[<int>]]>. An integer-encoded collection of sequences.
            Warning (author's note): for each list-member of <enc_col>, the final integer-token-element has to be exactly the integer-encoded '<eos>' given by <eos_int>, otherwise the .so file won't be able to process the input. Note that the sequence lengths themselves are not passed to the C function.
        <eos_int>: Type <int>. The special end-of-sequence token encoded as an integer.
        <pad_int>: Type <int>. The special padding token encoded as an integer.
        <t_bound>: Type <int>. Number of time steps for the resulting sequences.

    Outputs:
        <enc_col_>: Type <list[list[<int>]]>. One list of length <t_bound> per input sequence, holding whatever the C function wrote into the output buffers.
    """

    ## Prototype of the C function. It is (re)assigned on every call.
    lib.pad_or_trim.argtypes = \
    [
        ctypes.POINTER(ctypes.POINTER(ctypes.c_int)), # input sequences
        ctypes.c_int, # number of sequences
        ctypes.c_int, # eos_int
        ctypes.c_int, # pad_int
        ctypes.c_int, # t_bound
        ctypes.POINTER(ctypes.POINTER(ctypes.c_int)) # output sequences
    ]

    ## ! Define the arrays' instances and allocate memory for them: Begin

    ## Python-side result container: len(enc_col) rows of <t_bound> entries
    enc_col_ = [[None for word_idx in range(t_bound)] for seq_idx in range(len(enc_col))]

    ## Input: an array of pointers, one per sequence, each pointing to an int buffer of that sequence's length
    enc_col_array = (ctypes.POINTER(ctypes.c_int) * len(enc_col))()
    for i, enc_seq in enumerate(enc_col):
        enc_col_array[i] = (ctypes.c_int * len(enc_seq))()

    ## Output: an array of pointers, one per sequence, each pointing to an int buffer of length <t_bound>
    enc_col_array_ = (ctypes.POINTER(ctypes.c_int) * len(enc_col_))()
    for i in range(len(enc_col_)):
        enc_col_array_[i] = (ctypes.c_int * t_bound)()

    ## Fill the input buffers with the integer tokens
    for i in range(len(enc_col)):
        for j in range(len(enc_col[i])):
            enc_col_array[i][j] = enc_col[i][j]

    ## ! Define the arrays' instances and allocate memory for them: End

    ## Update <enc_col_array_>
    lib.pad_or_trim(enc_col_array, len(enc_col), eos_int, pad_int, t_bound, enc_col_array_)

    ## Copy values from <enc_col_array_> to <enc_col_>
    for i in range(len(enc_col_)):
        for j in range(len(enc_col_[i])):
            enc_col_[i][j] = enc_col_array_[i][j]

    return enc_col_

def build_vocab(seq_string, special_tokens, min_freq=2):
    """
    Description:
        Generates a vocabulary object with respect to the decreasing order of token-frequency. Tokens with equal frequency keep the order of their first appearance.

    Inputs:
        <seq_string>: Type: <list[<str>]>. Contains the dataset's tokens.
        <special_tokens>: Type: <list[<str>]>. Contains the special tokens. They are placed in front of the regular tokens.
        <min_freq>: Type: <int>. Forwarded to <torchtext.vocab.vocab> as the minimum frequency a token needs in order to enter the vocabulary. Default: 2.

    Outputs (in this order):
        <freqs>: Type: <list[<int>]>. Frequencies of all distinct tokens in decreasing order. Tokens rarer than <min_freq> are still counted here.
        <vocab_>: Type: <torchtext.vocab.Vocab>.
    """

    ## Counter.most_common() sorts by decreasing count and is stable for ties.
    seq_string_ordered_dict = OrderedDict(Counter(seq_string).most_common())

    vocab_ = torchtext.vocab.vocab\
    (
        ordered_dict = seq_string_ordered_dict,
        min_freq = min_freq,
        specials = special_tokens,
        special_first = True
    )

    freqs = list(seq_string_ordered_dict.values())

    return freqs, vocab_

def bosify_seq(col_seq_int, bos_int):
    '''
    Description:
        Extremely important function that takes a collection of sequences, and on the step axis, it removes the final (special) token and adds the <bos> token in the beginning of the sequence. In other words every sequence is shifted one step to the right.

    Inputs:
        <col_seq_int>: Type: torch.Tensor. Shape: (n_examples, max_steps_seq).
        <bos_int>: Type: <int>. The special beginning-of-sequence token encoded as an integer.

    Outputs:
        <col_seq_int_bos>: Type: torch.Tensor. Shape: (n_examples, max_steps_seq). Lives on the same device as <col_seq_int>.
    '''

    tgt_int_bos = col_seq_int[:, :-1]

    ## The <bos> column is created on the input's device; a CPU-only column cannot be concatenated
    ## with a tensor that lives on another device.
    bos_int_expanded = torch.ones((col_seq_int.shape[0], 1), dtype=torch.long, device=col_seq_int.device) * bos_int

    col_seq_int_bos = torch.cat((bos_int_expanded, tgt_int_bos), dim=1)

    return col_seq_int_bos

class translated_text_dataset:
    '''
    Description:
        Turns a raw, tab-separated parallel corpus into padded integer tensors, splits them into training/validation(/test) subsets and stores the subsets on disk.
        The constructor only reads the raw file and fixes the configuration; the actual work is done by <generate_dataset>.
    '''

    ## Seed of the generator that drives the train/validation/test split.
    SPLIT_SEED = 42

    def __init__(self, shuffle_seed, raw_dataset_fpath='./drive/MyDrive/colab_storage/datasets/ell.txt'):
        '''
        Input:
            <shuffle_seed>: Type <int>. Used as a seed value for the RNG that shuffles the examples before the dataset is split. <shuffle_seed> must never change between training interruptions or parameter transfers. The initial seed value should be randomized.
            <raw_dataset_fpath>: Type <str>. Path of the raw corpus. The preprocessed dataset is stored next to it as 'pp_<name>.pt'.
        '''

        def parse_local_raw_data(raw_dataset_fpath):

            ## The encoding is stated explicitly so that the result does not depend on the platform's default.
            with open(raw_dataset_fpath, 'r', encoding='utf-8') as file:
                raw_dataset_str = file.read()

            ## Only a trailing newline is dropped; a file without one keeps its last character.
            if raw_dataset_str.endswith('\n'):
                raw_dataset_str = raw_dataset_str[:-1]

            return raw_dataset_str

        self.shuffle_seed = shuffle_seed

        self.instance_shuffle_generator = torch.Generator()
        self.instance_shuffle_generator.manual_seed(self.shuffle_seed)

        dataset_dir_fpath = os.path.dirname(raw_dataset_fpath)
        self.dataset_name = os.path.splitext(os.path.basename(raw_dataset_fpath))[0]

        self.max_steps_src = self.max_steps_tgt = 9

        ## Number of (shuffled) examples to keep. None or a negative value keeps all of them.
        self.n_of_instances_to_keep = -1 # For debugging use: 640

        ## [training fraction, validation fraction, test fraction].
        self.split_fractions = [0.9, 0.1, 0.0]

        self.raw_dataset_str = parse_local_raw_data(raw_dataset_fpath)

        preprocessed_dataset_fname = 'pp_' + self.dataset_name
        self.preprocessed_dataset_fpath = os.path.join(dataset_dir_fpath, preprocessed_dataset_fname + '.pt')

    def _limit_instances(self, examples):
        '''
        Description:
            Keeps the first <self.n_of_instances_to_keep> examples. None or a negative value means "keep everything"; slicing with the sentinel -1 directly would silently drop the last example.
        '''

        limit = self.n_of_instances_to_keep
        if limit is None or limit < 0:
            return examples

        return examples[:limit]

    @staticmethod
    def _stack_features_and_targets(subset):
        '''
        Description:
            Stacks the first (source) and second (target) entry of every example of <subset> along a new leading axis.
        '''

        examples = [subset[i] for i in range(len(subset))]
        features = torch.stack([example[0] for example in examples], dim=0)
        targets = torch.stack([example[1] for example in examples], dim=0)

        return features, targets

    def generate_dataset(self):
        """
        Description:
            Cleans and tokenizes the raw dataset, shuffles the examples, builds the vocabularies, encodes the tokens as integers, pads/trims the sequences to a fixed number of steps, splits the result into subsets and saves them to <self.preprocessed_dataset_fpath>. Two diagnostic plots are produced along the way.
        """

        ## Data cleaning
        self.dataset_str = purger(self.raw_dataset_str)
        self.dataset_str = initial_text_preprocess(self.dataset_str)

        ## Tokenization (string form)
        self.src_str, self.tgt_str = tokenize(self.dataset_str)

        ## Shuffle (the same permutation for both sides keeps the pairs aligned)
        perm = torch.randperm(len(self.src_str), generator=self.instance_shuffle_generator).tolist()
        self.src_str = [self.src_str[example_idx] for example_idx in perm]
        self.tgt_str = [self.tgt_str[example_idx] for example_idx in perm]

        self.src_str = self._limit_instances(self.src_str)
        self.tgt_str = self._limit_instances(self.tgt_str)

        ## Vocabulary and token-wise frequency of appearance
        src_freqs, self.src_vocab = build_vocab\
        (
            [token for sentence in self.src_str for token in sentence],
            special_tokens=['<unk>', '<pad>', '<eos>']
        )
        tgt_freqs, self.tgt_vocab = build_vocab\
        (
            [token for sentence in self.tgt_str for token in sentence],
            special_tokens=['<unk>', '<pad>', '<eos>', '<bos>']
        )

        src_itos, tgt_itos = self.src_vocab.get_itos(), self.tgt_vocab.get_itos()
        src_stoi, tgt_stoi = self.src_vocab.get_stoi(), self.tgt_vocab.get_stoi()

        ## Encode tokens from strings to integers
        self.src_int = c_numericalize_str(self.src_str, src_itos) ## = X
        self.tgt_int = c_numericalize_str(self.tgt_str, tgt_itos) ## = Y

        ## Produce plots
        plot_sentence_size(data_pair=(self.src_str, self.tgt_str), image_name=self.dataset_name+'_cnt_examples_per_sentence_len')
        plot_frequency_curves(freqs_pair=(src_freqs, tgt_freqs), image_name=self.dataset_name+'_freq')

        ## Pad or trim and conversion to tensor.
        ## The default integer dtype is kept on purpose: per the original note, a 32-bit dtype leads
        ## to an error in <torch.nn.functional.one_hot>, which requires an "index tensor".
        self.src_int = torch.tensor\
        (
            c_pad_or_trim\
            (
                enc_col=self.src_int,
                eos_int=src_stoi['<eos>'],
                pad_int=src_stoi['<pad>'],
                t_bound=self.max_steps_src
            )
        )
        self.tgt_int = torch.tensor\
        (
            c_pad_or_trim\
            (
                enc_col=self.tgt_int,
                eos_int=tgt_stoi['<eos>'],
                pad_int=tgt_stoi['<pad>'],
                t_bound=self.max_steps_tgt
            )
        )

        # self.eos_test()
        # self.test_correct_sentence()

        self.n_instances = self.src_int.shape[0]

        self.src_vocab_size = len(src_itos)
        self.tgt_vocab_size = len(tgt_itos)

        self.tgt_int_bos = bosify_seq(col_seq_int=self.tgt_int, bos_int=tgt_stoi['<bos>'])

        self.dataset = torch.utils.data.TensorDataset(self.src_int, self.tgt_int, self.tgt_int_bos)

        ## A test subset is only produced when its fraction is non-zero.
        include_test_set = self.split_fractions[-1] != 0
        fractions = self.split_fractions if include_test_set else self.split_fractions[:2]

        subsets = torch.utils.data.random_split(self.dataset, fractions, generator=torch.Generator().manual_seed(self.SPLIT_SEED))

        self.train_set, self.val_set = subsets[0], subsets[1]

        self.n_train = len(self.train_set)
        self.n_val = len(self.val_set)

        self.X_train, self.y_train = self._stack_features_and_targets(self.train_set)
        self.X_val, self.y_val = self._stack_features_and_targets(self.val_set)

        dataset = {'train': self.train_set, 'val': self.val_set, 'test': None}

        if include_test_set:
            self.test_set = subsets[2]
            self.n_test = len(self.test_set)
            self.X_test, self.y_test = self._stack_features_and_targets(self.test_set)
            dataset['test'] = self.test_set

        torch.save(obj=dataset, f=self.preprocessed_dataset_fpath)

    def eos_test(self):
        '''
        Description:
            <eos> token exists exactly once, in every source and every target sequence, iff it's a pass. The verdict is printed and also returned.

        Outputs:
            <pass_eos_test>: Type <bool>.
        '''

        ## Looked up once; the string-to-index mapping does not change between rows.
        src_eos = self.src_vocab.get_stoi()['<eos>']
        tgt_eos = self.tgt_vocab.get_stoi()['<eos>']

        pass_eos_test = all\
        (
            src_row.count(src_eos) == 1 and tgt_row.count(tgt_eos) == 1
            for src_row, tgt_row in zip(self.src_int.tolist(), self.tgt_int.tolist())
        )

        if pass_eos_test:
            print('eos test passed')
        else:
            print('eos test failed')

        return pass_eos_test

    def test_correct_sentence(self, example_idx=100):
        '''
        Description:
            Decodes one example back to string tokens, for a manual sanity check of the encoding.

        Inputs:
            <example_idx>: Type <int>. Row of <self.src_int> / <self.tgt_int> to decode. Default: 100.

        Outputs:
            <translated_str_src>, <translated_str_tgt>: Type <list[<str>]>. The decoded source and target tokens, padding included.
        '''

        src_itos = self.src_vocab.get_itos()
        tgt_itos = self.tgt_vocab.get_itos()

        translated_str_src = [src_itos[token] for token in self.src_int[example_idx].tolist()]
        translated_str_tgt = [tgt_itos[token] for token in self.tgt_int[example_idx].tolist()]

        return translated_str_src, translated_str_tgt

In [None]:
def xavier_uniform(n_in, n_out, device):
    '''
    Description:
        Creates a trainable (n_in, n_out) weight matrix whose entries are drawn uniformly from [-bound, bound), where bound = sqrt(3) * sqrt(2 / (n_in + n_out)).

    Inputs:
        <n_in>: Type <int>. Number of rows.
        <n_out>: Type <int>. Number of columns.
        <device>: The device the matrix is moved to after sampling.

    Outputs:
        Type <torch.nn.Parameter>. Shape: (n_in, n_out).
    '''

    bound = math.sqrt(3.0) * math.sqrt(2.0 / float(n_in + n_out))

    ## Samples from [0, 1) are stretched to [0, 2 * bound) and then shifted to [-bound, bound).
    unit_samples = torch.rand((n_in, n_out)).to(device)
    weights = unit_samples * (2 * bound) - bound

    return nn.Parameter(weights)

In [None]:
class dense(nn.Module):
    '''
    Description:
        Fully connected (affine) layer that is applied independently to every step of a sequence, followed by an optional feature-wise dropout during training.
    '''

    def __init__(self, n_inp, n_out, n_steps, device, dropout_rate=0.0):
        '''
        Inputs:
            <n_inp>: Type: <int>. Number of input features.
            <n_out>: Type: <int>. Number of output features.
            <n_steps>: Type: <int>. Number of steps. Only stored and reported by <__repr__>; the computation does not depend on it.
            <device>: The device the parameters are created on.
            <dropout_rate>: Type: <float>. Fraction of output features that is zeroed during training. Default: 0.0.
        '''

        super().__init__()

        self.device = device

        self.n_inp = n_inp
        self.n_out = n_out
        self.n_steps = n_steps

        self.dropout_rate = dropout_rate

        self.W = xavier_uniform(self.n_inp, self.n_out, self.device)
        self.b = nn.Parameter(torch.zeros(self.n_out).to(self.device))

        self.name = 'dense'

    def forward(self, inputs, train):
        '''
        Inputs:
            <inputs>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, n_inp) or (n_examples, n_inp).
            <train>: Type: <bool>. Dropout is only applied if this is True.

        Outputs:
            <outputs>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, n_out) if len(inputs.shape) is 3 or (n_examples, n_out) if len(inputs.shape) is 2.
        '''

        ## The same affine map is applied to every step, so one batched product over the last axis
        ## replaces the former step-by-step loop. It also covers the two-axes input directly.
        outputs = torch.matmul(inputs, self.W) + self.b

        if train and (self.dropout_rate != 0.0):
            n_features = outputs.shape[-1]

            ## Exactly round(dropout_rate * n_features) features are switched off. The same random
            ## selection is shared by all examples and all steps of this call.
            ## NOTE(review): the surviving features are not rescaled by 1 / (1 - dropout_rate) and no
            ## scaling happens at prediction time either, so activations differ in scale between
            ## training and prediction whenever dropout_rate != 0.
            switches = torch.ones(n_features).to(self.device)
            switches[0:round(self.dropout_rate * n_features)] = 0.0
            switches = switches[torch.randperm(n_features)]

            ## Broadcasting over the leading axes
            outputs = outputs * switches

        return outputs

    def __repr__(self):

        n_pars = sum(math.prod(par.shape) for par in self.parameters())

        return "%s(n_params=%d, n_inp=%d, n_out=%d, n_steps=%d)"%(self.name, n_pars, self.n_inp, self.n_out, self.n_steps)

class vanilla_recurrent(nn.Module):
    """
    Description:
        Plain (Elman style) recurrent layer: at every step the new hidden state is tanh(X_t W_xh + H_{t-1} W_hh + b_h).
    """

    def __init__(self, n_inp, n_hid, device):
        """
        Inputs:
            <n_inp>: Type: <int>. Number of input features per step.
            <n_hid>: Type: <int>. Size of the hidden state.
            <device>: The device the parameters and the default initial state are placed on.
        """

        super().__init__()

        self.device = device

        self.n_inp = n_inp
        self.n_hid = n_hid

        self.W_xh = xavier_uniform(self.n_inp, self.n_hid, self.device)
        self.W_hh = xavier_uniform(self.n_hid, self.n_hid, self.device)
        self.b_h = nn.Parameter(torch.zeros(n_hid).to(self.device))

        self.name = 'vanilla_recurrent'

    def forward(self, inputs, H_t=None):
        """
        Inputs:
            <inputs>: Type: <torch.Tensor>. Shape: (n_minibatch, n_steps, n_inp).
            <H_t>: Type: <torch.Tensor>. Shape: (n_minibatch, n_hid). The initial hidden state; zeros if None.

        Outputs:
            <outputs>: Type: torch.Tensor. Shape: (n_minibatch, n_steps, n_hid). The hidden state of every step.
            <H_t>: Type: torch.Tensor. Shape: (n_minibatch, n_hid). The hidden state after the last step.
        """

        state = H_t
        if state is None:
            state = torch.zeros((inputs.shape[0], self.n_hid)).to(self.device)

        states_per_step = []
        for X_t in inputs.unbind(dim=1):
            pre_activation = X_t @ self.W_xh + state @ self.W_hh + self.b_h
            state = torch.tanh(pre_activation)
            states_per_step.append(state)

        return torch.stack(states_per_step, dim=1), state

    def __repr__(self):

        n_pars = sum(math.prod(par.shape) for par in self.parameters())

        return "%s(n_params=%d)" % (self.name, n_pars)

class gru(nn.Module):
    '''
    Description:
        Gated recurrent unit layer, followed by an optional feature-wise dropout of its hidden states during training.
    '''

    def __init__(self, n_inp, n_hid, device, dropout_rate=0.0):
        '''
        Inputs:
            <n_inp>: Type: <int>. Number of input features per step.
            <n_hid>: Type: <int>. Size of the hidden state.
            <device>: The device the parameters and the default initial state are placed on.
            <dropout_rate>: Type: <float>. Fraction of hidden features that is zeroed during training. Default: 0.0.
        '''

        super().__init__()

        self.device = device

        self.n_inp = n_inp
        self.n_hid = n_hid

        self.dropout_rate = dropout_rate

        def gate_parameters():
            ## Input-to-hidden weights, hidden-to-hidden weights and bias of one gate.
            return \
            (
                xavier_uniform(self.n_inp, self.n_hid, self.device),
                xavier_uniform(self.n_hid, self.n_hid, self.device),
                nn.Parameter(torch.zeros(self.n_hid).to(self.device))
            )

        ## The creation order is kept fixed: it determines both the order in which random numbers are
        ## consumed and the order in which the parameters are registered.
        self.W_xz, self.W_hz, self.b_z = gate_parameters() # Update gate
        self.W_xr, self.W_hr, self.b_r = gate_parameters() # Reset gate
        self.W_xh, self.W_hh, self.b_h = gate_parameters() # Candidate hidden state

        self.name = 'gru'

    def forward(self, inputs, train, H_t=None):
        '''
        Inputs:
            <inputs>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, n_inp).
            <train>: Type: <bool>. Toggles training/predicting. Dropout is only applied if this is True.
            <H_t>: Type: <torch.Tensor>. Shape: (n_examples, n_hid). The initial hidden state. Default is None, which stands for zeros.

        Outputs:
            <outputs_>: Type: <tuple(<outputs>, <H_t>)>. Assuming that T=n_steps.
                <outputs>: <torch.Tensor>. Shape: (n_examples, n_steps, n_hid). Hidden states of all T steps.
                <H_t>: <torch.Tensor>. Shape: (n_examples, n_hid). Hidden state after the final step.
        '''

        if H_t is None:
            # Initial state with shape: (n_examples, self.n_hid)
            H_t = torch.zeros((inputs.shape[0], self.n_hid), device=self.device)

        outputs = []
        for X_t in inputs.swapaxes(0, 1):
            Z_t = torch.sigmoid(torch.matmul(X_t, self.W_xz) + torch.matmul(H_t, self.W_hz) + self.b_z)
            R_t = torch.sigmoid(torch.matmul(X_t, self.W_xr) + torch.matmul(H_t, self.W_hr) + self.b_r)
            H_tilde_t = torch.tanh(torch.matmul(X_t, self.W_xh) + torch.matmul(R_t * H_t, self.W_hh) + self.b_h)
            ## The update gate interpolates between the previous state and the candidate state.
            H_t = Z_t * H_t + (1 - Z_t) * H_tilde_t
            outputs.append(H_t)
        outputs = torch.stack(outputs).swapaxes(0, 1)

        if train and (self.dropout_rate != 0.0):

            ## Exactly round(dropout_rate * n_hid) hidden features are switched off. The same random
            ## selection is applied to all examples, all steps and the final state.
            ## NOTE(review): the surviving features are not rescaled, neither here nor at prediction time.
            switches = torch.ones(self.n_hid).to(self.device)
            switches[0:round(self.dropout_rate * self.n_hid)] = 0.0
            switches = switches[torch.randperm(self.n_hid)]

            switches_outputs = switches.repeat(outputs.shape[0], outputs.shape[1], 1)
            outputs = outputs * switches_outputs

            switches_H_t = switches.repeat(H_t.shape[0], 1)
            H_t = H_t * switches_H_t

        return outputs, H_t

    def __repr__(self):

        n_pars = sum(math.prod(par.shape) for par in self.parameters())

        return "%s(n_params=%d, n_inp=%d, n_hid=%d)"%(self.name, n_pars, self.n_inp, self.n_hid)

In [None]:
class encoder(nn.Module):
    '''
    Description:
        The encoder of a MT module: an embedding (dense) layer followed by two stacked GRU layers. The final layer is a recurrent type.
    '''

    def __init__(self, n_inp, n_max_steps, n_out, device):
        '''
        Inputs:
            <n_inp>: Type <int>. The number of input neurons per example and step, i.e. the input size of the embedding layer.
            <n_max_steps>: Type <int>. The number of source's steps including the padding size.
            <n_out>: Type <int>. For a fixed example and step, this is the number of neurons in the encoder's output layer, i.e. the hidden size of the second recurrent layer.
            <device>: Type <torch.device>.
        '''

        super().__init__()

        self.device = device

        ## Layer sizes that are fixed by the architecture: the embedding size and the hidden size of
        ## the first recurrent layer.
        self.n_emb = 600
        self.n_hid1 = 600

        ## Sizes that are chosen by the caller. <n_out> may as well be named as "self.n_hid".
        self.n_inp = n_inp
        self.n_max_steps = n_max_steps
        self.n_out = n_out

        self.emb = dense(n_inp=self.n_inp, n_out=self.n_emb, n_steps=self.n_max_steps, device=self.device)
        self.rec1 = gru(n_inp=self.n_emb, n_hid=self.n_hid1, device=self.device, dropout_rate=0.0)
        self.rec2 = gru(n_inp=self.n_hid1, n_hid=self.n_out, device=self.device, dropout_rate=0.0)

    def forward(self, X, train):
        '''
        Inputs:
            <X>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, vocab_size). The encoder's input which is the preprocessed source data.
            <train>: Type: <bool>. Forwarded to every layer.

        Outputs:
            <out>: Type: <tuple>.
                <output_of_final_layer>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, n_out). The output of the final recurrent layer containing all steps.
                <layerwise_final_states>: Type: <list[<torch.Tensor>]>. One final state per recurrent layer, in layer order: (n_examples, n_hid1) for the first and (n_examples, n_out) for the second layer.
        '''

        embedded = self.emb(inputs=X, train=train)

        rec1_outputs, rec1_final_state = self.rec1(inputs=embedded, train=train)
        rec2_outputs, rec2_final_state = self.rec2(inputs=rec1_outputs, train=train)

        return (rec2_outputs, [rec1_final_state, rec2_final_state])

class decoder(nn.Module):
    '''
    Description:
        Decoder of the sequence to sequence model: an embedding layer, two stacked recurrent layers and a softmax-activated dense output layer.
        The "primary" pipelines implement the architecture itself (one step / all steps at once). The remaining pipelines wrap them and only differ in how the input of every step is chosen. They are selected through <dec_mode> in <forward>.
    '''

    def __init__(self, n_tgt_inp, n_context, n_max_steps, n_out, bos_int, eos_int, pad_int, device):
        '''
        Inputs:
            <n_tgt_inp>: Type <int>. The target's size per example per step.
            <n_context>: Type <int>. The context's size.
            <n_max_steps>: Type <int>. The number of target's steps including the padding size.
            <n_out>: Type <int>. Size of the output dense layer, i.e. the number of classes the softmax is taken over. The seq2seq model of this file passes the size of the target's vocabulary.
            <bos_int>: Type <int>. The '<bos>' token as an integer.
            <eos_int>: Type <int>. The '<eos>' token as an integer.
            <pad_int>: Type <int>. The '<pad>' token as an integer.
            <device>: Type <torch.device>.
        '''

        super().__init__()

        self.device = device

        self.n_emb = 600
        self.n_hid1 = 600
        self.n_hid2 = 600

        self.bos_int = bos_int
        self.eos_int = eos_int
        self.pad_int = pad_int
        self.n_tgt_inp = n_tgt_inp
        self.n_context = n_context
        self.n_inp = self.n_tgt_inp
        ## The first recurrent layer receives the embedded token concatenated with the context.
        self.n_yc = self.n_context + self.n_emb
        self.n_out = n_out
        self.n_max_steps = n_max_steps

        ## One hot encoder of an integer tensor. A trailing axis of size <n_tgt_inp> is appended.
        self.tgt_ohe_modifier = lambda x: torch.stack([F.one_hot(x[seq_idx], self.n_tgt_inp) for seq_idx in range(len(x))], axis=0).type(torch.float32).to(self.device)

        self.emb = dense(n_inp=self.n_inp, n_out=self.n_emb, n_steps=self.n_max_steps, device=self.device)
        self.rec1 = gru(n_inp=self.n_yc, n_hid=self.n_hid1, device=self.device, dropout_rate=0.0)
        self.rec2 = gru(n_inp=self.n_hid1, n_hid=self.n_hid2, device=self.device, dropout_rate=0.0)
        self.dense_out = dense(n_inp=self.n_hid2, n_out=self.n_out, n_steps=self.n_max_steps, device=self.device)
        self.softmax = nn.Softmax(dim=-1)

        ## The position inside this list is the <dec_mode> value that selects the pipeline.
        self.available_pipelines = \
        [
            self.train_teacher_forcing_pipeline,
            self.train_scheduled_sampling_pipeline,
            self.prediction_stochastic_pipeline,
            self.prediction_greedy_search_pipeline,
            self.prediction_beam_search_pipeline
        ]

        ## Both have shape (1, 1, n_tgt_inp). Used to fill beam search results up to <n_max_steps>.
        self.eos_ohe = self.tgt_ohe_modifier(torch.tensor(eos_int)[None, None])
        self.pad_ohe = self.tgt_ohe_modifier(torch.tensor(pad_int)[None, None])

    ## ! Architecture: Begin

    def one_step_primary_pipeline(self, inp, context, initial_states, train):
        '''
        Description:
            Runs the whole architecture on <inp> and additionally returns the updated recurrent states, so that the caller can feed them to the next step.

        Inputs:
            <inp>: Type: <torch.Tensor>. The decoder's input for this step. Must agree with <context> on every axis except the last one.
            <context>: Type: <torch.Tensor>. Concatenated to the embedded input along the last axis.
            <initial_states>: Pair holding the initial state of <rec1> and of <rec2>.
            <train>: Type: <bool>. Forwarded to every layer.

        Outputs:
            <out_>: Type: <torch.Tensor>. Softmax output of the dense layer.
            <states>: Type: <tuple>. The pair (rec1 state, rec2 state) after this step.
        '''

        rec1_init_state, rec2_init_state = initial_states

        out_ = inp
        out_ = self.emb(inputs=out_, train=train)
        out_ = torch.cat((out_, context), -1)
        out_ = self.rec1(inputs=out_, H_t=rec1_init_state, train=train)
        rec1_init_state = out_[1]
        out_ = self.rec2(inputs=out_[0], H_t=rec2_init_state, train=train)
        rec2_init_state = out_[1]
        out_ = self.dense_out(inputs=out_[0], train=train)
        out_ = self.softmax(out_)

        return out_, (rec1_init_state, rec2_init_state)

    def multi_step_primary_pipeline(self, Y, context, initial_states, train):
        '''
        Description:
            Runs the whole architecture on all the steps of <Y> at once. Unlike <one_step_primary_pipeline>, the final recurrent states are not returned.

        Inputs:
            <Y>: Type: <torch.Tensor>. The decoder's input for every step.
            <context>: Type: <torch.Tensor>. Concatenated to the embedded input along the last axis.
            <initial_states>: Pair holding the initial state of <rec1> and of <rec2>.
            <train>: Type: <bool>. Forwarded to every layer.

        Outputs:
            <out>: Type: <torch.Tensor>. Softmax output of the dense layer.
        '''

        rec1_init_state, rec2_init_state = initial_states

        out = self.emb(inputs=Y, train=train)
        out = torch.cat((out, context), -1)
        out = self.rec1(inputs=out, H_t=rec1_init_state, train=train)
        out = self.rec2(inputs=out[0], H_t=rec2_init_state, train=train)
        out = self.dense_out(inputs=out[0], train=train)
        out = self.softmax(out)

        return out

    ## ! Architecture: End

    ## ! Wraps of the primary/main decoder's pipeline: Begin

    def train_teacher_forcing_pipeline(self, Y, context, initial_states, train, dec_config):
        '''
        Trigger:
            When <dec_mode> is 0.

        Description:
            Returns a distribution function. Given a t in {1, ..., T-1}, it's
            P(Y_t | Y_{t-1}=y_{t-1}).
            Every step receives the ground truth <Y> as its input. <dec_config> is not used.
        '''

        out = self.multi_step_primary_pipeline(Y=Y, context=context, initial_states=initial_states, train=train)

        return out

    def train_scheduled_sampling_pipeline(self, Y, context, initial_states, train, dec_config):
        '''
        Trigger:
            When <dec_mode> is 1.

        Description:
            Returns a distribution function, computed step by step. At every step the input is either the ground truth slice of <Y> (teacher forced step) or the distribution produced by the previous step.
            round((1 - dec_config['decay']) * n_max_steps) randomly positioned steps are not teacher forced. The first step always receives the ground truth, which is the <bos> token.
            Note that the one-step pipeline is always called with train=False here, regardless of <train>.
        '''

        decay = dec_config['decay']

        ## True: the step is teacher forced. False: the step receives the previous step's output.
        switches = torch.ones(self.n_max_steps).type(torch.bool)
        switches[0:round((1-decay) * (self.n_max_steps))] = False
        switches = [True] + switches[torch.randperm(self.n_max_steps)].tolist()

        ## A single step is processed per iteration, hence a single step of the context is needed.
        context = context[:,0:1,:]

        out = []
        for t in range(self.n_max_steps):
            if switches[t]:
                out_ = Y[:,t:t+1,:]  ## Remember that the first value is <bos> !!!

            out_, initial_states = self.one_step_primary_pipeline(inp=out_, context=context, initial_states=initial_states, train=False)
            out.append(out_[:,0,:]) ## To justify the slice out_[:,0,:]: <out_> has (n_examples, 1, n_tgt_inp), and we're iterating with respect to the step t. When the loops terminates, <out> will properly contain the time step axis.
        out = torch.stack(out).swapaxes(0, 1)

        return out

    def prediction_stochastic_pipeline(self, Y, context, initial_states, train, dec_config):
        '''
        Trigger:
            When <dec_mode> is 2.

        Description:
            Returns a distribution function.
            P(Y_t | Y_{t-1}, ..., Y_0, CONTEXT=context)
            The distribution produced by a step is fed, as is, to the next step. <train> and <dec_config> are not used.
        '''

        out = []
        out_ = Y
        for t in range(self.n_max_steps):
            out_, initial_states = self.one_step_primary_pipeline(inp=out_, context=context, initial_states=initial_states, train=False)
            out.append(out_[:,0,:]) ## To justify the slice out_[:,0,:]: <out_> has (n_examples, 1, n_tgt_inp), and we're iterating with respect to the step t. When the loops terminates, <out> will properly contain the time step axis.
        out = torch.stack(out).swapaxes(0, 1)

        return out

    def prediction_greedy_search_pipeline(self, Y, context, initial_states, train, dec_config):
        '''
        Trigger:
            When <dec_mode> is 3.

        Description:
            Returns a fixed token
            x_{t+1}
            inferred from
            P(X_{t+1} | X_t=x_t, ..., X_0=x_0, CONTEXT=context)
            The most probable token of a step is one hot encoded and fed to the next step. The returned tensor contains the distributions of all the steps. <train> and <dec_config> are not used.
        '''

        out = []
        out_ = Y
        for t in range(self.n_max_steps):
            out_, initial_states = self.one_step_primary_pipeline(inp=out_, context=context, initial_states=initial_states, train=False)
            out.append(out_[:,0,:])
            out_ = torch.argmax(out_, axis=-1)
            out_ = self.tgt_ohe_modifier(out_)
        out = torch.stack(out).swapaxes(0, 1)

        return out

    def prediction_beam_search_pipeline(self, Y, context, initial_states, train, dec_config):
        '''
        Trigger:
            When <dec_mode> is 4.

        Description:
            Returns a fixed token.
            The first step branches into the dec_config['beam_width'] most probable tokens. Every branch is then extended with its own most probable token per step, using its own recurrent states. Finally, per example, the prefix with the highest length-normalized log-probability among all steps and branches is returned in OHE, filled up to <n_max_steps> with <eos> followed by <pad>.
            <train> is not used.
        '''

        def score(probability, seq_length):
            ## Length-normalized log-probability.

            a = 0.75
            score_value = np.log(probability) / seq_length**a

            return score_value

        def get_scores(probabilities):
            ## Same nesting as <probabilities>: scores[t][branch_idx] has shape (n_examples, 1).

            scores = [[None for branch_idx in range(len(probabilities[t]))] for t in range(len(probabilities))]

            for t in range(len(probabilities)):
                seq_length = t + 1
                for branch_idx in range(len(probabilities[t])):
                    scores[t][branch_idx] = score(probabilities[t][branch_idx], seq_length)

            return scores

        def get_optimal_score_position(score):
            ## Per example, the (step, branch) position of the highest score. The earliest position wins a tie.

            n_examples = len(score[0][0])

            max_score = [-np.inf for i in range(n_examples)]
            t_optimal = [None for i in range(n_examples)]
            branch_idcs_optimal = [None for i in range(n_examples)]

            for i in range(n_examples):

                for t in range(len(score)):
                    for branch_idx in range(len(score[t])):
                        if score[t][branch_idx][i] > max_score[i]:
                            max_score[i] = score[t][branch_idx][i]
                            t_optimal[i] = t
                            branch_idcs_optimal[i] = branch_idx

            return t_optimal, branch_idcs_optimal

        def get_optimal_col_sequences(beam_graph, t_optimal, branch_idcs_optimal, eos_ohe, pad_ohe):
            '''
            Description:
                Finalizes the prediction tensor containing the optimal sequences among others based on given indices.

            Inputs:
                <beam_graph>: Type: <list[<list[torch.Tensor]>]>. The outer list has length equal to n_max_steps. At the outer list's position beam_step, the inner list has length equal to the number of branches there. The tensor inside has shape (n_examples, beam_step+1, n_out) and contains all the tokens produced by that branch until that step. Hence a value can be indexed like this
                e.g. beam_graph[beam_step][branch_idx][example_idx, step_idx, out_dim]
                <t_optimal>: Type: <list[<int>]>. Length: n_examples. For a given example, this list contains the steps where beam_graph contains the optimal sequence.
                <branch_idcs_optimal>: Type: <list[<int>]>. Length: n_examples. For a given example, this list contains the branch index (relative to its corresponding step) where beam_graph contains the optimal sequence.
                <eos_ohe>: Type: <torch.Tensor>. Shape: (1, 1, n_out). The end of sequence token (<eos>) in OHE.
                <pad_ohe>: Type: <torch.Tensor>. Shape: (1, 1, n_out). The padding token (<pad>) in OHE.

            Outputs:
                <optimal_col_sequences>: Type: <torch.Tensor>. Shape: (n_examples, max_n_steps, n_out).
            '''

            n_examples = len(t_optimal)

            optimal_col_sequences = []
            for i in range(n_examples):

                ## Find optimal sequences
                optimal_sequence = beam_graph[t_optimal[i]][branch_idcs_optimal[i]][i:i+1] ## The tensor has shape (x1 example, t_optimal[i]+1, n_out).

                ## ! Fill in each sequence to achieve shape homogeneity: Begin

                empty_positions = self.n_max_steps - optimal_sequence.shape[1]

                if empty_positions > 0:
                    optimal_sequence = torch.cat((optimal_sequence, eos_ohe, pad_ohe.repeat(1, empty_positions-1, 1)), axis=1)

                ## ! Fill in each sequence to achieve shape homogeneity: End

                optimal_col_sequences.append(optimal_sequence)

            optimal_col_sequences = torch.cat(optimal_col_sequences, axis=0)

            return optimal_col_sequences

        beam_width = dec_config['beam_width']
        assert isinstance(beam_width, int) and (beam_width >= 1) and (beam_width <= self.n_out), "E: Invalid beam width value."

        ## Step 0 has a single branch (<bos>). Every later step has <beam_width> branches.
        beam_graph = [[None for parent_branch_idx in range(min(beam_width, t*beam_width+1))] for t in range(self.n_max_steps+1)]
        beam_graph_probabilities = [[None for parent_branch_idx in range(min(beam_width, t*beam_width+1))] for t in range(self.n_max_steps+1)]
        ## Recurrent states per branch. Every branch has to continue from the states produced by its own parent. Sharing a single pair of states among the branches would condition a branch on the tokens of its sibling.
        beam_graph_states = [[None for parent_branch_idx in range(min(beam_width, t*beam_width+1))] for t in range(self.n_max_steps+1)]
        beam_graph[0][0] = Y
        beam_graph_probabilities[0][0] = np.ones((Y.shape[0], 1)) ## Shape: (n_examples, 1 n_possibilities@t=0)
        beam_graph_states[0][0] = initial_states

        for t in range(self.n_max_steps): ## Referring to the branch specified by <t> and <parent_branch_idx> as the parent branch, and all the brances connected with this parent branch that belong to step, <t>+1 as child branches.
            for parent_branch_idx in range(len(beam_graph[t])):

                ## ! Forward run: Begin

                out_ = beam_graph[t][parent_branch_idx][:,-1,:][:, None, :]
                out_, child_states = self.one_step_primary_pipeline(inp=out_, context=context, initial_states=beam_graph_states[t][parent_branch_idx], train=False)

                beam_graph_node = out_

                ## ! Forward run: End

                ## ! Beam Search graph update: Begin

                ## Branching out only happens at the first step. Afterwards every branch follows its single most probable token.
                if t == 0:
                    k = beam_width
                else:
                    k = 1

                child_branches_token_probability_given_parent_token, child_branches_likely_tokens_int = torch.topk(input=beam_graph_node, k=k, dim=-1) ## Shapes are both (n_examples, 1 step, k)
                child_branches_token_probability_given_parent_token = child_branches_token_probability_given_parent_token[:,0,:].cpu().detach().numpy()
                child_branches_likely_tokens_ohe = self.tgt_ohe_modifier(child_branches_likely_tokens_int) ## Shape (n_examples, 1 step, k, n_out)
                parent_branch_built_sequence_probability = np.repeat(beam_graph_probabilities[t][parent_branch_idx], k, axis=1)

                child_branches_built_sequence_probability = parent_branch_built_sequence_probability * child_branches_token_probability_given_parent_token ## Probability chain rule

                child_step_connected_branch_relative_to_parent_idx = 0
                for child_step_connected_branch_idx in range(k*parent_branch_idx, k*(parent_branch_idx+1)):
                    beam_graph[t+1][child_step_connected_branch_idx] = torch.cat((beam_graph[t][parent_branch_idx], child_branches_likely_tokens_ohe[..., child_step_connected_branch_relative_to_parent_idx, :]), -2) ## Concatenating all tokens produced until this step, for this possibility. Shape (n_examples, (t+1)+1, n_out)
                    beam_graph_probabilities[t+1][child_step_connected_branch_idx] = child_branches_built_sequence_probability[..., child_step_connected_branch_relative_to_parent_idx:child_step_connected_branch_relative_to_parent_idx+1]
                    beam_graph_states[t+1][child_step_connected_branch_idx] = child_states
                    child_step_connected_branch_relative_to_parent_idx += 1
                del child_step_connected_branch_relative_to_parent_idx

                ## ! Beam Search graph update: End

            ## The states of step <t> are not needed anymore.
            beam_graph_states[t] = None

        beam_graph = beam_graph[1:]

        ## ! Dropping <bos> entirely: Begin

        for t in range(self.n_max_steps):
            for parent_branch_idx in range(len(beam_graph[t])):
                beam_graph[t][parent_branch_idx] = beam_graph[t][parent_branch_idx][:,1:,:]

        beam_graph_probabilities = beam_graph_probabilities[1:]

        ## ! Dropping <bos> entirely: End

        ## ! Search optimal sequence on graph: Begin

        scores = get_scores(beam_graph_probabilities)
        t_optimal, branch_idcs_optimal = get_optimal_score_position(scores)
        optimal_col_sequences = get_optimal_col_sequences\
        (
            beam_graph=beam_graph,
            t_optimal=t_optimal,
            branch_idcs_optimal=branch_idcs_optimal,
            eos_ohe=self.eos_ohe,
            pad_ohe=self.pad_ohe
        )

        ## ! Search optimal sequence on graph: End

        return optimal_col_sequences

    ## ! Wraps of the primary/main decoder's pipeline: End

    def forward(self, Y, context, initial_states, dec_mode, dec_config, train):
        '''
        Inputs:
            <Y>: Type: <torch.Tensor>. Shape: (n_examples, max_steps_tgt+1, n_tgt_inp). Targets of collection-examples in OHE starting with the <bos> token. The prediction pipelines are meant to receive the <bos> step only.
            <context>: Type: <torch.Tensor>. Shape: (n_examples, Y.shape[1], n_context). Context tensor. Repeated across axis 1.
            <initial_states>: Pair holding the initial state of each of the two recurrent layers. Assumed that the number of recurrent layers in the encoder is the same as in the decoder. Additionally it was assumed that the configuration of these layers are identical.
            <dec_mode>: Type: <int>. Specifies which of the decoder's pipeline will be utilized. It's the position of the pipeline inside <self.available_pipelines>.
                0: Teacher forcing pipeline
                1: Scheduled sampling pipeline
                2: Stochastic model sampling
                3: Greedy Search
                4: Beam Search
            <dec_config>: Type: <dict>. Contains the configuration values of its respective pipeline ('decay' for mode 1, 'beam_width' for mode 4).
            <train>: Type: <bool>.

        Outputs:
            <out>: Output of corresponding pipeline.
        '''

        pipeline = self.available_pipelines[dec_mode]

        out = pipeline(Y=Y, context=context, initial_states=initial_states, dec_config=dec_config, train=train)

        return out

In [None]:
class seq2seq(nn.Module):
    '''
    Description:
        This is the Sequence to Sequence model based on some encoder and decoder.

    Specification (High abstraction):
        (S1) This pipeline's training process utilizes the Teacher-Forcing methodology.
        (S2) The context tensor is set to equal the final state of the encoder's output layer (which is a recurrent layer).
        (S3) In case that the model is being trained, for each given decoder's state, the input is a concatenation of the target tensor slice and the context tensor. Otherwise if the model is used to predict, for each given decoder's state, the input is a concatenation of a previous state's output with the context tensor. The decoder's initial input is the OHE of '<bos>'.
    '''

    def __init__(self, n_src_inp, max_steps_src, n_tgt_inp, max_steps_tgt, bos_int, eos_int, pad_int, device):
        '''
        Inputs:
            <n_src_inp>: Type <int>. The number of source inputs per example per step.
            <max_steps_src>: Type <int>. The number of source's steps including the padding size.
            <n_tgt_inp>: Type <int>. The number of target inputs per example per step.
            <max_steps_tgt>: Type <int>. The number of target's steps including the padding size.
            <bos_int>: Type <int>. The '<bos>' token expressed as an integer i.e. encoded as the target vocabulary's index.
            <eos_int>: Type <int>. The '<eos>' token expressed as an integer i.e. encoded as the target vocabulary's index.
            <pad_int>: Type <int>. The '<pad>' token expressed as an integer i.e. encoded as the target vocabulary's index.
            <device>: Type <torch.device>.
        '''

        super().__init__()

        self.device = device

        ## Input size, coinciding with the respective vocabularies' lengths.
        self.n_src_inp = n_src_inp
        self.n_tgt_inp = n_tgt_inp

        ## Number of steps including all paddings.
        self.max_steps_src = max_steps_src
        self.max_steps_tgt = max_steps_tgt

        self.bos_int = bos_int
        self.eos_int = eos_int
        self.pad_int = pad_int

        ## These assignments are established by (S2). The "enc" in the name "enc_n_out" refers to the encoder.
        self.context_n = self.enc_n_out = 600

        self.encoder = encoder(n_inp=self.n_src_inp, n_max_steps=self.max_steps_src, n_out=self.enc_n_out, device=self.device)
        self.decoder = decoder(n_tgt_inp=self.n_tgt_inp, n_context=self.context_n, n_max_steps=self.max_steps_tgt, n_out=self.n_tgt_inp, device=self.device, bos_int=self.bos_int, eos_int=self.eos_int, pad_int=self.pad_int)

        self.name = 'seq2seq'

    def init_state(self, enc_output):
        '''
        Description:
            Hook between the encoder and the decoder. Currently the encoder's output is handed over untouched. Assumed that the encoder returns the output of a recurrent layer along with the final states of its recurrent layers.

        Inputs:
            <enc_output>: Type: <tuple>. The encoder's output.
                <output_of_enc_final_layer>: Type: <torch.Tensor>. Shape: (n_examples, n_steps, n_out). The output of the final recurrent layer in the encoder containing all steps.
                <layerwise_final_states>: The final state of each of the encoder's recurrent layers, indexable per layer. Each state has shape (n_examples, n_out).

        Outputs:
            <enc_output>: The very same object that was received.
        '''

        return enc_output

    def forward(self, dec_mode, dec_config, train, X, Y=None):
        '''
        Inputs:
            <dec_mode>: Type: <int>. Specifies which of the decoder's pipeline will be utilized.
            <dec_config>: Type: <dict>. Decoder's pipeline configuration.
            <train>: Type: <bool>.
            <X>: Type: <torch.Tensor>. Shape: (n_examples, max_steps_src, n_src_inp). Sources of collection-examples in OHE.
            <Y>: Type: <torch.Tensor> or <NoneType>. Shape: (n_examples, max_steps_tgt+1, n_tgt_inp). Targets of collection-examples in OHE starting with the <bos> token. If it's None, a single <bos> step per example is used in its place.

        Outputs:
            <Y_hat>: Type: <torch.Tensor>. Shape (n_examples, max_steps_tgt, n_tgt_inp). Predicted sequence. Estimation of Y.
        '''

        enc_out = self.encoder(X=X, train=train)
        _, initial_states = self.init_state(enc_out)

        ## (S2): the context is the final state of the encoder's last recurrent layer. Shape: (n_examples, n_out).
        context = initial_states[-1]

        ## Identity check. An equality check against None is not reliable on tensors.
        if Y is None:
            bos_ohe = torch.zeros((X.shape[0], 1, self.n_tgt_inp), device=self.device)
            bos_ohe[:,:,self.bos_int] = 1
            Y = bos_ohe

        ## Repeating the context once per decoder step. Shape: (n_examples, Y.shape[1], n_out).
        context = context.repeat(Y.shape[1], 1, 1).swapaxes(0, 1)

        Y_hat = self.decoder(Y=Y, context=context, initial_states=initial_states, dec_mode=dec_mode, dec_config=dec_config, train=train)

        return Y_hat

In [None]:
def categ_cross_entropy(p_distr_pred, p_distr_ground_truth, device, ignore_index=None):
    '''
    Description:
        Masked categorical cross entropy, summed over all the axes and scaled by 400 / (number of unmasked entries).

    Inputs:
        <p_distr_pred>: Type: <torch.Tensor>. Shape: (n_examples, max_steps_tgt, tgt_vocab_size). Estimation of probability distribution.
        <p_distr_ground_truth>: Type: <torch.Tensor>. Shape: (n_examples, max_steps_tgt, tgt_vocab_size). Underlying probability distribution.
        <device>: Type: <torch.device>. The device on which the mask is created. It has to be the device of the two distributions.
        <ignore_index>: Type: <int> or <NoneType>. If <ignore_index> is an integer then the loss function's computation will exclude the class with index <ignore_index>. In case <ignore_index> is set to None then the loss function will be computed normally, i.e. over all indices of <p_distr_pred> and <p_distr_ground_truth>. By default <ignore_index> is set to None.

    Outputs:
        <loss_>: Type <torch.Tensor>. Shape: (). The loss function. If the <grad_fn> attribute is None then there is an issue.
    '''

    ## Keeps the logarithm finite wherever the predicted probability is exactly 0.
    numerical_stabilizer = 10**-20

    ## Constant factor applied on the normalized loss.
    loss_scale = 400.

    mask = torch.ones(p_distr_ground_truth.shape, dtype=torch.float32, device=device)
    ## Identity check, so that the class with index 0 can be ignored as well and no equality operator is involved.
    if ignore_index is not None:
        mask[:,:,ignore_index] = 0

    ## The normalization counts every unmasked entry, i.e. n_examples * max_steps_tgt * (number of unmasked classes).
    scaler = loss_scale / mask.sum()
    loss_ = - scaler * torch.sum((mask * p_distr_ground_truth) * torch.log(p_distr_pred+numerical_stabilizer), axis=(0, 1, 2))

    return loss_

def bleu_k(col_pred, col_ground_truth, max_k, unk, eos):
    '''
    Description:
        BLEU function defined with respect to 1-grams, ..., max_k-grams, averaged over the examples. <d0> is defined to either be <int> or <str>. Both collections are trimmed in here through <trim_sequence>, using <eos>.
        Per example, the score is
            brevity_penalty * prod_k precision_k ** (1/2)**k
        where precision_k is the clipped k-gram precision of the prediction.
        The unknown token never counts as a match.

    Examples:
        bleu = bleu_k(col_pred=pred_str, col_ground_truth=tgt_str, max_k=2, unk='<unk>', eos='<eos>')

    Inputs:
        <col_pred>: Type: <list[<list[<d0>]>]>. The outer list's length is n_examples. The inner list's length can vary depending on the corresponding number of steps.
        <col_ground_truth>: Type: <list[<list[<d0>]>]>. The outer list's length is n_examples. The inner list's length can vary depending on the corresponding number of steps.
        <max_k>: Type: <int>. Maximum number of gram mergings.
        <unk>: Type: <d0>. The unknown token.
        <eos>: Type: <d0>. The end-of-sequence token.

    Outputs:
        <bleu_k_value>: Type <float>.
    '''

    assert len(col_pred) == len(col_ground_truth), 'E: Input collections do not share a common multitude of examples.'

    n_examples = len(col_pred)

    ## Trimming
    col_pred = trim_sequence(seq=col_pred, eos=eos)
    col_ground_truth = trim_sequence(seq=col_ground_truth, eos=eos)

    ## Make the unkown token differ in the sequences to prevent it from contributing to the score. There are, of course, many ways to achieve this.
    col_ground_truth = [[token if (token != unk) else -1 for token in example] for example in col_ground_truth]

    ## col_*_kgrams[k_idx][i] holds the (k_idx+1)-grams of example i.
    col_pred_kgrams = []
    col_ground_truth_kgrams = []

    for k_idx in range(max_k):
        k = k_idx+1

        col_pred_kgrams.append(single_gram_to_k_grams(col_pred, k))
        col_ground_truth_kgrams.append(single_gram_to_k_grams(col_ground_truth, k))

    ## Largest k for which every scored example owns at least one k-gram in both of its sequences. Computed once, instead of once per example and per k.
    ## Examples with an empty sequence are left out: they are scored separately below, and taking them into account would set this limit to 0, which would in turn reduce every other example's score to its brevity penalty.
    ## NOTE(review): the limit is shared among all the examples of the collection, hence a single short example lowers the k-gram order for all of them. Kept as it was -- confirm that this is intended.
    shortest_lengths = [min(len(col_pred[i]), len(col_ground_truth[i])) for i in range(n_examples)]
    max_stable_k = min((length for length in shortest_lengths if length > 0), default=0)

    bleu_ = [None for i in range(n_examples)]

    for i in range(n_examples):

        product_of_weighted_precisions = 1
        n_pred = len(col_pred[i])
        n_ground_truth = len(col_ground_truth[i])

        if min(n_pred, n_ground_truth) == 0:
            if n_ground_truth == 0:
                bleu_[i] = 1
            else: ## Here the prediction is empty while the ground truth is not.
                bleu_[i] = 0
        else:
            brevity_penalty = min(1.0, math.exp(1 - n_ground_truth/n_pred))

            for k_idx in range(max_k):
                k = k_idx+1

                if k > max_stable_k:
                    break

                pred_kgrams = col_pred_kgrams[k_idx][i]
                ground_truth_kgrams = col_ground_truth_kgrams[k_idx][i]

                ## Clipped precision: every distinct predicted k-gram that exists in the ground truth contributes the minimum of its number of occurences inside the two sequences. The sum is then divided by the number of predicted k-grams.
                kgrams_precision = 0.
                for valid_kgram in set(pred_kgrams):
                    if valid_kgram in ground_truth_kgrams:
                        kgrams_precision += min(pred_kgrams.count(valid_kgram), ground_truth_kgrams.count(valid_kgram))

                kgrams_precision /= len(pred_kgrams)

                product_of_weighted_precisions *= kgrams_precision**((1/2)**k)

            bleu_[i] = brevity_penalty * product_of_weighted_precisions

    bleu = sum(bleu_)/n_examples

    return bleu

In [None]:
class figure:
    '''
    Description:
        Grid of axes used to monitor the training. Every row corresponds to a metric and every column to a pipeline, i.e. the axis of a given metric and pipeline is self.ax[metric_idx][pipeline_idx].
    '''

    def __init__(self, metric_names, pipeline_names):
        '''
        Inputs:
            <metric_names>: Type: <list[<str>]>. One row of axes is created per metric.
            <pipeline_names>: Type: <list[<str>]>. One column of axes is created per pipeline.
        '''

        self.metric_names = metric_names
        self.pipeline_names = pipeline_names

        ## Figure's size in pixels, assuming 96 dots per inch.
        asp_ratio = 0.7
        l = 1400
        h = asp_ratio * l
        self.fig = plt.figure(figsize=(l/96, h/96), dpi=96)
        mpl_style(dark=True)

        n_rows = len(self.metric_names)
        n_cols = len(self.pipeline_names)

        ## The grid is laid out the way <plot> indexes it. Laying it out as (pipelines x metrics) only works if the two counts are equal.
        ## The (n_rows, n_cols, index) form is used because the 3-digit integer form breaks for 10 or more rows/columns. The subplot index is 1-based and row-major.
        self.ax = \
        [
            [
                self.fig.add_subplot(n_rows, n_cols, metric_idx*n_cols + pipeline_idx + 1)
                for pipeline_idx in range(n_cols)
            ] for metric_idx in range(n_rows)
        ]

    def plot(self, hor_seq, metrics_history):
        '''
        Description:
            Redraws every axis with the training and validation curves of its metric and pipeline.

        Inputs:
            <hor_seq>: Values of the horizontal axis (epochs).
            <metrics_history>: Type: <dict>. The curves are read from metrics_history[metric_name][pipeline_name]['train'] and metrics_history[metric_name][pipeline_name]['val'].
        '''

        def plot_(hor_seq, ax_idx, train_seq, val_seq, ver_name, title_name):

            pipeline_idx, metric_idx = ax_idx
            axis = self.ax[metric_idx][pipeline_idx]

            ## Clearing previous frame
            axis.clear()

            ## To show only integer numbers on the x-axis
            axis.xaxis.set_major_locator(MaxNLocator(integer=True))

            axis.set_xlabel('epoch')
            axis.set_ylabel(ver_name)

            axis.plot(hor_seq, train_seq, color='red', label='etr %s'%(ver_name))
            axis.plot(hor_seq, val_seq, color='cyan', label='val %s'%(ver_name))

            axis.set_title(title_name)

            axis.legend()

        for (pipeline_idx, pipeline_name) in enumerate(self.pipeline_names):
            ## The pipeline's name is only shown on top of its column, i.e. on the first metric's axis.
            pipeline_name_ = pipeline_name
            for (metric_idx, metric_name) in enumerate(self.metric_names):
                plot_\
                (
                    hor_seq=hor_seq,
                    ax_idx=(pipeline_idx, metric_idx),
                    train_seq=metrics_history[metric_name][pipeline_name]['train'],
                    val_seq=metrics_history[metric_name][pipeline_name]['val'],
                    ver_name=self.metric_names[metric_idx],
                    title_name=pipeline_name_,
                )
                pipeline_name_ = ''


def plot_sentence_size(data_pair, image_name):
    '''
    Description:
        Plots, for the source and for the target, how many sentences exist per sentence length, and saves the image inside the datasets directory.

    Inputs:
        <data_pair>: Pair (source, target). Each of them is a collection of sentences, where a sentence's length is its number of tokens.
        <image_name>: Type: <str>. Name of the saved image file, appended to the datasets directory's path.
    '''

    mpl_style(dark=True)

    source_sentences, target_sentences = data_pair

    ## Sentence length -> number of sentences having that length.
    source_length_counts = Counter(len(sentence) for sentence in source_sentences)
    target_length_counts = Counter(len(sentence) for sentence in target_sentences)

    longest_length = max(max(source_length_counts), max(target_length_counts))

    ## Both curves share the horizontal axis 0, ..., longest_length. Lengths that do not occur are counted as 0.
    lengths_axis = list(range(longest_length + 1))
    source_curve = [source_length_counts.get(length, 0) for length in lengths_axis]
    target_curve = [target_length_counts.get(length, 0) for length in lengths_axis]

    ## Figure's size in pixels, assuming 96 dots per inch.
    width = 700
    height = 0.8 * width
    plt.figure(figsize=(width/96, height/96), dpi=96)

    for curve, color, label in ((source_curve, 'orange', 'source'), (target_curve, 'lime', 'target')):
        plt.plot(lengths_axis, curve, color=color, label=label)

    plt.xticks(lengths_axis[::5])
    plt.xlabel('Token Count')
    plt.ylabel('# of Instances')
    plt.legend()
    plt.grid(visible=True, color='gray', alpha=0.5)

    plt.savefig('./drive/MyDrive/colab_storage/datasets/'+image_name)

def plot_frequency_curves(freqs_pair, image_name):
    '''
    Description:
        Plots the source's and the target's token frequencies against the tokens' positions, on logarithmic axes, and saves the image inside the datasets directory.

    Inputs:
        <freqs_pair>: Pair (source frequencies, target frequencies). Each of them is a sequence of numbers, plotted against the positions 0, ..., len-1.
        <image_name>: Type: <str>. Name of the saved image file, appended to the datasets directory's path.
    '''

    mpl_style(dark=True)

    ## Figure's size in pixels, assuming 96 dots per inch.
    width = 700
    height = 0.8 * width
    plt.figure(figsize=(width/96, height/96), dpi=96)

    source_frequencies, target_frequencies = freqs_pair

    for frequencies, color, label in ((source_frequencies, 'orange', 'source'), (target_frequencies, 'lime', 'target')):
        token_positions = list(range(len(frequencies)))
        plt.plot(token_positions, frequencies, color=color, label=label)

    plt.xlabel('Token Index')
    plt.ylabel('Frequency')
    plt.grid(visible=True, color='gray', alpha=0.5)

    ## Both axes are logarithmic.
    plt.semilogx()
    plt.semilogy()

    plt.legend()

    plt.savefig('./drive/MyDrive/colab_storage/datasets/'+image_name)

In [None]:
class rnn_trainer(nn.Module):
    """
    Description:
        Runs the optimisation loop of a model and keeps track of its evaluation. In every epoch the
        loss and the BLEU score are measured on the training and on the validation set for two
        decoder pipelines:
            'training_pipeline': the model is called with dec_mode=1 and receives the third tensor of
                                 every dataset item (named Y_bos in the code) as decoder input.
            'id2_pipeline':      the model is called with dec_mode=2 and receives no target tensor.
        Only the training pipeline's loss on the training minibatches is backpropagated; every other
        measurement is used for monitoring and for selecting the best checkpoint.
    """

    def __init__(self, model_, data, device):
        """
        Inputs:
            <model_>: Type: <torch.nn.module>. Specifies the trained model's architecture and parameters.
            <data>: Type: <class>. Contains all the necessary dataset's information.
            <device>: Type: <torch.device>.
        """

        super().__init__()

        self.device = device

        self.model = model_
        self.data = data

        self.epochs = 1000
        self.lr = 0.005
        self.minibatch_size = 2**10

        # self.decay_rate = 16.666 # DEFAULT: 16.666
        # self.min_decay = 5
        # self.decay_scheduler = lambda epoch: max(self.min_decay, self.decay_rate/(self.decay_rate+math.exp(epoch/self.decay_rate)))
        ## Constant schedule: every epoch gets a decay of 1.0 (the variant above is disabled).
        self.decay_scheduler = lambda epoch: 9/9

        ## Backup frequence used for live training monitoring
        self.bkp_freq = 10

        ## Epochs at which a checkpoint that is never removed by <save_training> is written. Empty: none.
        ## Earlier choice: {10, 20, 30, 60, 90, 120, 150, 200, 250, 300, 350, 400, 500, 600, 700, 800, 1000}
        self.scheduled_checkpoints = set()

        self.metric_names = ['loss', 'bleu']
        self.pipeline_names = ['training_pipeline', 'id2_pipeline']
        self.data_subset_names = ['train', 'val']

        self.training_dir_path, self.training_format = './drive/MyDrive/colab_storage/training/', '.pt'
        self.criterion = categ_cross_entropy ## Has to be masked
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=self.lr)

        self.train_dataloader = torch.utils.data.DataLoader(self.data.train_set, batch_size=self.minibatch_size, shuffle=True)
        self.val_dataloader = torch.utils.data.DataLoader(self.data.val_set, batch_size=2**10, shuffle=False)

        ## One hot encoder of an integer tensor <tsr> with <n_classes> classes. A <tsr> of shape (n_examples, n_steps) yields a float32 tensor of shape (n_examples, n_steps, n_classes) placed on <self.device>.
        self.ohe_modifier = lambda tsr, n_classes: torch.stack([F.one_hot(tsr[:,seq_idx], n_classes) for seq_idx in range(tsr.shape[-1])], axis=0).swapaxes(0, 1).type(torch.float32).to(self.device)

        ## A few evenly spaced examples (starting at index 51) of each subset; their translations are printed after every epoch.
        n_examples_display = 5
        X_train_small_idcs = torch.round(torch.linspace(51, self.data.n_train-1, n_examples_display)).type(torch.int)
        self.X_train_small = self.ohe_modifier(self.data.X_train[X_train_small_idcs.to(torch.long)], self.data.src_vocab_size)
        X_val_small_idcs = torch.round(torch.linspace(51, self.data.n_val-1, n_examples_display)).type(torch.int)
        self.X_val_small = self.ohe_modifier(self.data.X_val[X_val_small_idcs.to(torch.long)], self.data.src_vocab_size)

    def train(self, metrics_history=None):
        """
        Description:
            Trains a model. Training starts at the epoch that follows the last one recorded in
            <metrics_history> and runs until <self.epochs> is reached. After every epoch the averaged
            minibatch metrics are appended to <metrics_history>, the live figure is redrawn, the
            checkpoints are written through <save_training> and a few example translations are printed.

        Inputs:
            <metrics_history>: Type: <dict>. Holds all training's history evaluation measurements, indexed as
                               metrics_history[metric_name][pipeline_name][data_subset] -> list with one value per epoch.
                               When it is None an empty history is created.
        """

        def print_metrics_per_pipeline(pipeline_name):
            ## Prints the latest loss and BLEU of one pipeline.
            print('> '+pipeline_name)
            print('>> '+'Train loss: %f | Val loss: %f'%(metrics_history['loss'][pipeline_name]['train'][-1], metrics_history['loss'][pipeline_name]['val'][-1]))
            print('>> '+'Train BLEU: %f | Val BLEU: %f'%(metrics_history['bleu'][pipeline_name]['train'][-1], metrics_history['bleu'][pipeline_name]['val'][-1]))

        def get_metrics(prediction, ground_truth):
            ## Returns the pair (loss, bleu) of one minibatch. <pad> is ignored by the loss; the BLEU tensor is detached.

            loss = self.criterion\
            (
                p_distr_pred=prediction,
                p_distr_ground_truth=ground_truth,
                device=self.device,
                ignore_index=self.data.tgt_vocab.get_stoi()['<pad>']
            )
            bleu = torch.tensor\
            (
                bleu_k\
                (
                    col_pred=ohe2int(prediction).tolist(), 
                    col_ground_truth=ohe2int(ground_truth).tolist(),
                    max_k=2,
                    unk=self.data.tgt_vocab.get_stoi()['<unk>'],
                    eos=self.data.tgt_vocab.get_stoi()['<eos>']
                )
            ).to(device=self.device).detach()

            return loss, bleu

        if metrics_history is None:
            metrics_history = \
            {
                metric_name:
                {
                    pipeline_name:
                    {
                        data_subset: [] for data_subset in self.data_subset_names
                    } for pipeline_name in self.pipeline_names
                } for metric_name in self.metric_names
            }

        print('Number of training instances: %d'%(self.data.n_train))
        print('Training Status:')

        ## Baseline of the per-epoch change that is printed for the training pipeline's validation BLEU.
        ## It follows the same series as the printed value; without any history the baseline is +inf.
        bleu_history_training_pipeline_val = metrics_history['bleu']['training_pipeline']['val']
        if not bleu_history_training_pipeline_val:
            bleu_epoch_training_pipeline_val_hat_prev = float('inf')
        else:
            bleu_epoch_training_pipeline_val_hat_prev = bleu_history_training_pipeline_val[-1]

        figure_ = figure(metric_names=self.metric_names, pipeline_names=self.pipeline_names)
        ## Resuming: the number of recorded epochs is the index of the next epoch.
        initial_epoch = len(metrics_history['loss']['training_pipeline']['val'])
        t_before_training = time()
        for epoch in range(initial_epoch, self.epochs):
            t_i = time()

            losses_minibatch_training_pipeline_train = []
            losses_minibatch_training_pipeline_val = []
            losses_minibatch_id2_pipeline_train = []
            losses_minibatch_id2_pipeline_val = []

            bleus_minibatch_training_pipeline_train = []
            bleus_minibatch_training_pipeline_val = []
            bleus_minibatch_id2_pipeline_train = []
            bleus_minibatch_id2_pipeline_val = []

            for (i, instance_i) in enumerate(self.train_dataloader):

                ## ! Training step initialization: Begin

                X_train_minibatch_i = self.ohe_modifier(instance_i[0], self.data.src_vocab_size)
                Y_train_minibatch_i = self.ohe_modifier(instance_i[1], self.data.tgt_vocab_size)
                Y_bos_train_minibatch_i = self.ohe_modifier(instance_i[2], self.data.tgt_vocab_size)

                ## Resetting gradient tensor variables
                self.optimizer.zero_grad()

                ## ! Training step initialization: End

                ## Forward propagation
                prediction_minibatch_training_pipeline_train = self.model(dec_mode=1, dec_config={'decay': self.decay_scheduler(epoch)}, train=True, X=X_train_minibatch_i, Y=Y_bos_train_minibatch_i)
                ## The id2 pipeline is monitored only and never backpropagated, hence no autograd graph is built for it.
                with torch.no_grad():
                    prediction_minibatch_id2_pipeline_train = self.model(dec_mode=2, dec_config=None, train=False, X=X_train_minibatch_i, Y=None)

                ## ! Loss and other evaluation metrics: Begin

                ## ! Training pipeline: Begin

                loss_minibatch_training_pipeline_train, \
                bleu_minibatch_training_pipeline_train \
                = get_metrics\
                (
                    prediction=prediction_minibatch_training_pipeline_train, ground_truth=Y_train_minibatch_i
                )

                losses_minibatch_training_pipeline_train.append(loss_minibatch_training_pipeline_train.detach())
                bleus_minibatch_training_pipeline_train.append(bleu_minibatch_training_pipeline_train)

                ## ! Training pipeline: End

                ## ! Prediction pipeline with ID == 2: Begin

                loss_minibatch_id2_pipeline_train, \
                bleu_minibatch_id2_pipeline_train \
                = get_metrics\
                (
                    prediction=prediction_minibatch_id2_pipeline_train, ground_truth=Y_train_minibatch_i
                )

                losses_minibatch_id2_pipeline_train.append(loss_minibatch_id2_pipeline_train.detach())
                bleus_minibatch_id2_pipeline_train.append(bleu_minibatch_id2_pipeline_train)

                ## ! Prediction pipeline with ID == 2: End

                ## ! Loss and other evaluation metrics: End

                ## Backpropagation - Gradients computation
                loss_minibatch_training_pipeline_train.backward()

                self.clip_gradients(grad_clip_val=1, model=self.model)

                ## Gradient Descent - Gradient update
                self.optimizer.step()

            with torch.no_grad():

                for (j, instance_j) in enumerate(self.val_dataloader):

                    ## ! Validation/Test step initialization: Begin

                    X_val_minibatch_j = self.ohe_modifier(instance_j[0], self.data.src_vocab_size)
                    Y_val_minibatch_j = self.ohe_modifier(instance_j[1], self.data.tgt_vocab_size)
                    Y_bos_val_minibatch_j = self.ohe_modifier(instance_j[2], self.data.tgt_vocab_size)

                    ## ! Validation/Test step initialization: End

                    ## Forward propagation
                    prediction_minibatch_training_pipeline_val = self.model(dec_mode=1, dec_config={'decay': self.decay_scheduler(epoch)}, train=False, X=X_val_minibatch_j, Y=Y_bos_val_minibatch_j)
                    prediction_minibatch_id2_pipeline_val = self.model(dec_mode=2, dec_config=None, train=False, X=X_val_minibatch_j, Y=None).detach()

                    ## ! Loss and other evaluation metrics: Begin

                    ## ! Training pipeline: Begin

                    loss_minibatch_training_pipeline_val, \
                    bleu_minibatch_training_pipeline_val \
                    = get_metrics\
                    (
                        prediction=prediction_minibatch_training_pipeline_val, ground_truth=Y_val_minibatch_j
                    )

                    losses_minibatch_training_pipeline_val.append(loss_minibatch_training_pipeline_val)
                    bleus_minibatch_training_pipeline_val.append(bleu_minibatch_training_pipeline_val)

                    ## ! Training pipeline: End

                    ## ! Prediction pipeline with ID == 2: Begin

                    loss_minibatch_id2_pipeline_val, \
                    bleu_minibatch_id2_pipeline_val \
                    = get_metrics\
                    (
                        prediction=prediction_minibatch_id2_pipeline_val, ground_truth=Y_val_minibatch_j
                    )

                    losses_minibatch_id2_pipeline_val.append(loss_minibatch_id2_pipeline_val)
                    bleus_minibatch_id2_pipeline_val.append(bleu_minibatch_id2_pipeline_val)

                    ## ! Prediction pipeline with ID == 2: End

                    ## ! Loss and other evaluation metrics: End

            ## Epoch metrics: plain average of the minibatch values (every minibatch weighs the same).

            ## ! Training pipeline: Begin

            loss_epoch_training_pipeline_train_hat = torch.sum(torch.stack(losses_minibatch_training_pipeline_train)) / len(losses_minibatch_training_pipeline_train)
            metrics_history['loss']['training_pipeline']['train'].append(loss_epoch_training_pipeline_train_hat.item())
            bleu_epoch_training_pipeline_train_hat = torch.sum(torch.stack(bleus_minibatch_training_pipeline_train)) / len(bleus_minibatch_training_pipeline_train)
            metrics_history['bleu']['training_pipeline']['train'].append(bleu_epoch_training_pipeline_train_hat.item())

            loss_epoch_training_pipeline_val_hat = torch.sum(torch.stack(losses_minibatch_training_pipeline_val)) / len(losses_minibatch_training_pipeline_val)
            metrics_history['loss']['training_pipeline']['val'].append(loss_epoch_training_pipeline_val_hat.item())
            bleu_epoch_training_pipeline_val_hat = torch.sum(torch.stack(bleus_minibatch_training_pipeline_val)) / len(bleus_minibatch_training_pipeline_val)
            metrics_history['bleu']['training_pipeline']['val'].append(bleu_epoch_training_pipeline_val_hat.item())

            ## ! Training pipeline: End

            ## ! Prediction pipeline with ID == 2: Begin

            loss_epoch_id2_pipeline_train_hat = torch.sum(torch.stack(losses_minibatch_id2_pipeline_train)) / len(losses_minibatch_id2_pipeline_train)
            metrics_history['loss']['id2_pipeline']['train'].append(loss_epoch_id2_pipeline_train_hat.item())
            bleu_epoch_id2_pipeline_train_hat = torch.sum(torch.stack(bleus_minibatch_id2_pipeline_train)) / len(bleus_minibatch_id2_pipeline_train)
            metrics_history['bleu']['id2_pipeline']['train'].append(bleu_epoch_id2_pipeline_train_hat.item())

            loss_epoch_id2_pipeline_val_hat = torch.sum(torch.stack(losses_minibatch_id2_pipeline_val)) / len(losses_minibatch_id2_pipeline_val)
            metrics_history['loss']['id2_pipeline']['val'].append(loss_epoch_id2_pipeline_val_hat.item())
            bleu_epoch_id2_pipeline_val_hat = torch.sum(torch.stack(bleus_minibatch_id2_pipeline_val)) / len(bleus_minibatch_id2_pipeline_val)
            metrics_history['bleu']['id2_pipeline']['val'].append(bleu_epoch_id2_pipeline_val_hat.item())

            ## ! Prediction pipeline with ID == 2: End

            figure_.plot\
            (
                hor_seq=np.arange(0, epoch+1),
                metrics_history = metrics_history
            )

            ## The current value is already part of the history, so this is true when it ties or beats every earlier epoch.
            current_parameters_are_optimal = bleu_epoch_id2_pipeline_val_hat.item() >= max(metrics_history['bleu']['id2_pipeline']['val'])

            self.save_training(epoch, metrics_history=metrics_history, thparams=(self.lr, self.minibatch_size), t_before_training=t_before_training, figure_=figure_, current_parameters_are_optimal=current_parameters_are_optimal)

            t_f = time()
            delta_t = round(t_f-t_i)
            ## The next epoch is assumed to last as long as this one.
            est_next_epoch_time = datetime.datetime.utcfromtimestamp(t_f) + datetime.timedelta(seconds=delta_t)

            print('[Epoch %d @ UTC %s]'%(epoch, datetime.datetime.utcfromtimestamp(t_f).strftime("%H:%M:%S")))

            print_metrics_per_pipeline(pipeline_name='training_pipeline')
            print_metrics_per_pipeline(pipeline_name='id2_pipeline')

            print('Δt: %ds | Δ training pipeline\'s val bleu: %f\nNext epoch @ ~UTC %s'%(delta_t, metrics_history['bleu']['training_pipeline']['val'][-1]-bleu_epoch_training_pipeline_val_hat_prev, est_next_epoch_time.strftime("%H:%M:%S")))

            ## Example translations are for display only, hence no autograd graph is needed.
            with torch.no_grad():
                pred_train_small_ohe = self.model(dec_mode=2, dec_config=None, train=False, X=self.X_train_small, Y=None).detach()
                pred_val_small_ohe = self.model(dec_mode=2, dec_config=None, train=False, X=self.X_val_small, Y=None).detach()

            pred_train_small_int = ohe2int(pred_train_small_ohe)
            pred_train_small_str = int2str(col_int=pred_train_small_int, vocab=self.data.tgt_vocab, max_steps=self.data.max_steps_tgt)
            X_train_small_str = ohe2str(self.X_train_small, self.data.src_vocab, self.data.max_steps_src)

            pred_val_small_int = ohe2int(pred_val_small_ohe)
            pred_val_small_str = int2str(col_int=pred_val_small_int, vocab=self.data.tgt_vocab, max_steps=self.data.max_steps_tgt)
            X_val_small_str = ohe2str(self.X_val_small, self.data.src_vocab, self.data.max_steps_src)

            print('Training set examples:')
            for example_idx in range(len(pred_train_small_str)):
                print('%-4s'%(str(example_idx)), end='')
                print('%-40s   '%(X_train_small_str[example_idx]), end='')
                print('%s'%(pred_train_small_str[example_idx]))
            print('Validation set examples:')
            for example_idx in range(len(pred_val_small_str)):
                print('%-4s'%(str(example_idx)), end='')
                print('%-40s   '%(X_val_small_str[example_idx]), end='')
                print('%s'%(pred_val_small_str[example_idx]))

            print()

            bleu_epoch_training_pipeline_val_hat_prev = metrics_history['bleu']['training_pipeline']['val'][-1]

            torch.cuda.empty_cache()

        print('Training completed.')

    def clip_gradients(self, grad_clip_val, model):
        """
        Description:
            Rescales, in place, the gradients of <model> so that their global L2 norm does not exceed <grad_clip_val>.
            Parameters without a gradient (e.g. parameters that did not take part in the last forward pass) are skipped.

        Inputs:
            <grad_clip_val>: Type: <int> or <float>. Maximum allowed L2 norm of all the gradients together.
            <model>: Type: <torch.nn.module>. The model whose parameters' gradients are clipped.
        """

        grads = [p.grad for p in model.parameters() if p.requires_grad and (p.grad is not None)]
        if not grads:
            return

        norm = torch.sqrt(sum(torch.sum(grad ** 2) for grad in grads))
        if norm > grad_clip_val:
            scale = grad_clip_val / norm
            for grad in grads:
                ## mul_ also handles 0-dim gradients, which cannot be sliced with [:]
                grad.mul_(scale)

    def save_training(self, epoch, metrics_history, thparams, t_before_training, figure_, current_parameters_are_optimal):
        """
        Description:
            Writes the checkpoints of the current epoch and the live figure inside <self.training_dir_path>.
            Three kinds of checkpoint exist:
                '<model name>_ep<epoch>':      written when <epoch> belongs to <self.scheduled_checkpoints>.
                '<model name>_live_ep<epoch>': written every <self.bkp_freq> epochs; replaces the previous live checkpoint.
                '<model name>_opt':            written when <current_parameters_are_optimal> is True.

        Inputs:
            <epoch>: Type: <int>. Index of the epoch that has just finished.
            <metrics_history>: Type: <dict>. Holds all training's history evaluation measurements.
            <thparams>: Type: <tuple>. The pair (learning rate, minibatch size).
            <t_before_training>: Type: <float>. Timestamp, as returned by time(), of the moment training started.
            <figure_>: The live figure object; its <fig> attribute is saved as an image.
            <current_parameters_are_optimal>: Type: <bool>. Whether the current parameters have to be stored as the best ones.
        """

        def get_training_information(epoch, metrics_history, thparams, t_before_training):
            ## Gathers everything that is stored inside a checkpoint file.

            lr, minibatch_size = thparams

            training_information = \
            {
                'model_params': self.model.state_dict(),
                'src_vocab': self.data.src_vocab,
                'src_vocab_size': self.data.src_vocab_size,
                'tgt_vocab': self.data.tgt_vocab,
                'tgt_vocab_size': self.data.tgt_vocab_size,
                'max_steps_src': self.data.max_steps_src,
                'max_steps_tgt': self.data.max_steps_tgt,
                'data_info':
                {
                    'n_train': self.data.n_train,
                    'n_val': self.data.n_val
                },
                'metrics_history': metrics_history,
                'training_hparams':
                {
                    'epoch': epoch,
                    'learning_rate': lr,
                    'minibatch_size': minibatch_size
                },
                'delta_t': time()-t_before_training,
                'shuffle_seed': self.data.shuffle_seed,
                'dataset_name': self.data.dataset_name
            }

            return training_information

        ## Scheduled backup - model thread
        if epoch in self.scheduled_checkpoints:
            training_information = get_training_information(epoch, metrics_history, thparams, t_before_training)
            training_scheduled_backup_path = self.training_dir_path + self.model.name + '_ep' + str(epoch) + self.training_format
            torch.save(training_information, training_scheduled_backup_path)

        ## Latest frequent backups
        if (epoch != 0) and ((epoch % self.bkp_freq) == 0):
            training_information = get_training_information(epoch, metrics_history, thparams, t_before_training)
            live_training_backup_path = self.training_dir_path + self.model.name + '_live_ep' + str(epoch) + self.training_format
            prev_live_training_backup_path = self.training_dir_path + self.model.name + '_live_ep' + str(epoch-self.bkp_freq) + self.training_format
            ## The new backup is written first, so that a failed save never leaves the run without a live backup.
            torch.save(training_information, live_training_backup_path)
            if os.path.exists(prev_live_training_backup_path):
                os.remove(prev_live_training_backup_path)

        if current_parameters_are_optimal:
            training_information = get_training_information(epoch, metrics_history, thparams, t_before_training)
            training_opt_backup_path = self.training_dir_path + self.model.name + '_opt' + self.training_format
            if os.path.exists(training_opt_backup_path):
                os.remove(training_opt_backup_path)
            torch.save(training_information, training_opt_backup_path)

        figure_.fig.savefig(self.training_dir_path + self.model.name + '_live')

In [None]:
def trivialize_irrelevant_possibilities(p_distr, irrelevant_possibility_indices):
    '''
    Description:
        The values corresponding to indices of <p_distr> that match the index-values specified by <irrelevant_possibility_indices> are substituted by 0. The removed probability quantity is added up and distributed equally to the rest of the probabilities, so the sum over the last axis is preserved.
        Note that this is a *uniform* redistribution: every remaining entry receives the same amount. It is not the proportional renormalisation that defines a conditional distribution.
        If every possibility is irrelevant, the result consists of zeros only.

    Inputs:
        <p_distr>: Type: <torch.Tensor>. Shape: (..., n_possibilities). Any number of leading axes and any floating point dtype. It is not modified.
        <irrelevant_possibility_indices>: Type: <list[<int>]>. Indices along the last axis. Duplicates are counted once and negative indices are interpreted as in ordinary indexing.

    Outputs:
        <updated_p_distr>: Type: <torch.Tensor>. Shape: (..., n_possibilities). New tensor with the dtype and the device of <p_distr>.
    '''

    n_possibilities = p_distr.shape[-1]

    ## Boolean mask over the last axis. It is built on the CPU, so that counting its entries does not force a device synchronisation.
    irrelevant_idcs = torch.tensor(list(irrelevant_possibility_indices), dtype=torch.long)
    is_irrelevant = torch.zeros(n_possibilities, dtype=torch.bool)
    is_irrelevant[irrelevant_idcs] = True
    n_relevant_possibilities = n_possibilities - int(is_irrelevant.sum())
    is_irrelevant = is_irrelevant.to(p_distr.device)

    ## Share of the removed probability that every relevant possibility receives. Shape: (..., 1).
    redistributable_probability = p_distr[..., is_irrelevant].sum(dim=-1, keepdim=True) / n_relevant_possibilities

    updated_p_distr = torch.where(is_irrelevant, torch.zeros_like(p_distr), p_distr + redistributable_probability)

    return updated_p_distr

class predictor:
    """
    Description:
        Wraps a trained model together with its vocabularies and turns source sentences into target sentences.
        Calling an instance stores the model's output and the derived translation on the instance; the <display_*> methods print them.
    """

    def __init__(self, model, src_vocab, src_vocab_size, max_steps_src, tgt_vocab, tgt_vocab_size, max_steps_tgt, tgt_name, device):
        """
        Inputs:
            <model>: The trained model. It is called as model(X=..., dec_mode=..., dec_config=..., train=False).
            <src_vocab>, <tgt_vocab>: Vocabulary objects that provide get_stoi() and get_itos().
            <src_vocab_size>, <tgt_vocab_size>: Type: <int>. Number of classes of the respective one hot encoding.
            <max_steps_src>, <max_steps_tgt>: Type: <int>. Number of steps of a source/target sequence.
            <tgt_name>: Type: <str>. Label of the target language used in the printed output.
            <device>: Type: <torch.device>.
        """

        self.device = device

        self.model = model

        self.beam_width = 2

        self.src_vocab = src_vocab
        self.src_vocab_size = src_vocab_size
        self.max_steps_src = max_steps_src

        self.tgt_vocab = tgt_vocab
        self.tgt_vocab_size = tgt_vocab_size
        self.max_steps_tgt = max_steps_tgt

        self.tgt_name = tgt_name

        ## Source text preprocess functions
        self.initial_text_preprocess = initial_text_preprocess
        ## One sentence -> [[token, ..., '<eos>']]; empty tokens produced by repeated spaces are dropped.
        self.tokenizer = lambda x: [[t for t in f'{x} <eos>'.split(' ') if t]]
        self.src_numericalize = functools.partial(c_numericalize_str, vocab_itos=self.src_vocab.get_itos())
        self.src_c_pad_or_trim = functools.partial(c_pad_or_trim, eos_int=self.src_vocab.get_stoi()['<eos>'], pad_int=self.src_vocab.get_stoi()['<pad>'], t_bound=self.max_steps_src)
        ## One hot encoder applied example by example. F.one_hot accepts int64 only, hence the cast.
        self.src_ohe_modifier = lambda x: torch.stack([F.one_hot(x[seq_idx].long(), self.src_vocab_size) for seq_idx in range(len(x))], axis=0).type(torch.float32).to(self.device)

    def __call__(self, x, dec_mode, dec_config):
        '''
        Description:
            Translates <x>. The results are stored on the instance: <src_str>, <src_ohe>, <p_distr>, <adjusted_p_distr>, <translated_int>, <translated_str> and <n_examples>.

        Inputs:
            <x>: Contains the input's information. Can be a list of strings, or a torch.Tensor of token indices where its contents' shared data type may be integer or float.
            <dec_mode>: Type: <int>. Decoder's pipeline ID.
            <dec_config>: Type: <dict>. Decoder's pipeline configuration.

        Raises:
            TypeError: <x> is neither a sequence of strings nor a torch.Tensor.
        '''

        self.dec_mode = dec_mode

        if isinstance(x[0], str):
            self.src_str = x
            self.src_ohe = self.preprocess(self.src_str).to(self.device)
        elif torch.is_tensor(x):
            self.src_str = int2str(col_int=x, vocab=self.src_vocab, max_steps=self.max_steps_src)
            self.src_ohe = self.src_ohe_modifier(x)
        else:
            ## Without this branch the attributes of a previous call would silently be translated again.
            raise TypeError('<x> has to be a list of strings or a torch.Tensor, not %s.'%(type(x).__name__))

        with torch.no_grad():
            self.p_distr = self.model(X=self.src_ohe, dec_mode=self.dec_mode, dec_config=dec_config, train=False)

        self.n_examples = len(self.p_distr)

        ## Special tokens that must never be chosen as a translated token
        tgt_remove_tokens_str = ['<unk>', '<pad>', '<bos>']
        tgt_stoi = self.tgt_vocab.get_stoi()
        tgt_remove_tokens_int = [tgt_stoi[token] for token in tgt_remove_tokens_str]
        self.adjusted_p_distr = trivialize_irrelevant_possibilities(self.p_distr, tgt_remove_tokens_int)

        ## Deterministic translation
        self.translated_int = ohe2int(self.adjusted_p_distr)
        self.translated_str = int2str(col_int=self.translated_int, vocab=self.tgt_vocab, max_steps=self.max_steps_tgt)

    def preprocess(self, x):
        '''
        Description:
            Converts raw sentences to the model's one hot encoded input: text clean-up, tokenization, numericalization, padding/trimming and one hot encoding, one sentence at a time.

        Inputs:
            <x>: Type: <list[<str>]>. The source sentences.

        Outputs:
            <preprocessed_x>: Type: <torch.Tensor>. The stacked encodings; the first axis indexes the sentences.
        '''

        preprocessed_x = [None for i in range(len(x))]
        for i in range(len(x)):
            preprocessed_x[i] = self.initial_text_preprocess(x[i])
            preprocessed_x[i] = self.tokenizer(preprocessed_x[i])
            preprocessed_x[i] = self.src_numericalize(preprocessed_x[i])
            preprocessed_x[i] = self.src_c_pad_or_trim(preprocessed_x[i])
            preprocessed_x[i] = torch.tensor(preprocessed_x[i])
            ## The tokenizer wraps the sentence in a batch of one, hence the [0].
            preprocessed_x[i] = self.src_ohe_modifier(preprocessed_x[i])[0]
        preprocessed_x = torch.stack(preprocessed_x)

        return preprocessed_x

    def display_translation(self):
        '''
        Description:
            Prints the source sequences next to their translations. Requires a previous call of the instance.
        '''

        if self.n_examples == 1:
            print('Source sequence [eng]:\n%s'%(self.src_str[0]))
            print('Target sequence [%s]:\n%s'%(self.tgt_name, self.translated_str[0]))
        else:
            print(' '*4+'%-40s   %s'%('[eng]', '['+self.tgt_name+']'))
            for example_idx in range(self.n_examples):
                print('%-4s'%(str(example_idx)), end='')
                print('%-40s   '%(self.src_str[example_idx]), end='')
                print('%s'%(self.translated_str[example_idx]))

    def display_n_likely(self, n=None):
        '''
        Description:
            Prints, for every example and every step, the <n> most probable target tokens along with their probabilities. Requires a previous call of the instance.
            The probability value displayed for a given example i and step t, for the corresponding token x_{i,t} is actually
            P(x_{i,t} | x_{i,t-1}, ..., x_{i,0}).

        Inputs:
            <n>: Type: <int> or None. Number of tokens displayed per step. None selects <self.beam_width>. When the decoder's pipeline ID is 4, <n> is limited to <self.beam_width>. <n> never exceeds the number of possibilities of <self.p_distr>.
        '''

        if n is None:
            n = self.beam_width
        if self.dec_mode == 4:
            n = min(n, self.beam_width)
        n = min(n, self.p_distr.shape[-1])

        predicted_words_distr, predicted_words_int = torch.topk(input=self.p_distr, k=n, dim=-1) # val, ind tensors of shape (n_examples, n_steps, n) each
        ## The vocabulary list and the indices are fetched once, instead of once per displayed token.
        tgt_itos = self.tgt_vocab.get_itos()
        predicted_words_int = predicted_words_int.tolist()
        predicted_words_str = \
        [
            [
                [
                    tgt_itos[predicted_words_int[example_idx][step][possibility]]
                    for possibility in range(n)
                ]
                for step in range(self.max_steps_tgt)
            ]
            for example_idx in range(len(predicted_words_int))
        ]
        predicted_words_distr = predicted_words_distr.tolist()

        for example_idx in range(len(predicted_words_str)):
            print('Source [eng]:')
            print(self.src_str[example_idx])
            print('Target token possibilities [%s]:'%(self.tgt_name))
            for step in range(self.max_steps_tgt):
                print('tkn%d: '%(step), end='')
                for possibility in range(n):
                    print('%15s: %.4f'%(predicted_words_str[example_idx][step][possibility], predicted_words_distr[example_idx][step][possibility]), end='')
                    if possibility != n-1:
                        print(' |', end='')
                ## The very last line is terminated by the final print() below.
                if (example_idx != len(predicted_words_str)-1) or (step != self.max_steps_tgt-1):
                    print()
            if example_idx != len(predicted_words_str)-1:
                print()
        print()

In [None]:
def train_from_scratch(device):
    """
    Description:
        Generates the dataset (shuffle seed 5000), builds an untrained seq2seq model that matches it and trains the model from the first epoch.

    Inputs:
        <device>: Type: <torch.device>. Device handed to the model and to the trainer.
    """

    dataset = translated_text_dataset(shuffle_seed=5000)
    dataset.generate_dataset()

    print('Source vocabulary size: %d'%(dataset.src_vocab_size))
    print('Target vocabulary size: %d'%(dataset.tgt_vocab_size))

    ## The special tokens' indices come from the target vocabulary
    tgt_stoi = dataset.tgt_vocab.get_stoi()
    model_config = dict\
    (
        n_src_inp=dataset.src_vocab_size,
        max_steps_src=dataset.max_steps_src,
        n_tgt_inp=dataset.tgt_vocab_size,
        max_steps_tgt=dataset.max_steps_tgt,
        bos_int=tgt_stoi['<bos>'],
        eos_int=tgt_stoi['<eos>'],
        pad_int=tgt_stoi['<pad>'],
        device=device
    )
    model = seq2seq(**model_config)
    print(model)

    rnn_trainer(model_=model, data=dataset, device=device).train()

def load_and_train_model(training_path, device):
    """
    Description:
        Resumes a training. The checkpoint's shuffle seed is used to regenerate the dataset, the saved parameters are loaded into a new seq2seq model and training continues from the saved metrics history.
        The trainer is created anew, so it starts with a new optimizer; only the model's parameters and the metrics history come from the checkpoint.

    Inputs:
        <training_path>: Type: <str>. Path of a checkpoint file written by the trainer. torch.load unpickles it, so it has to come from a trusted source.
        <device>: Type: <torch.device>. Device on which the checkpoint is mapped and on which training runs.
    """

    checkpoint = torch.load(training_path, map_location=device)

    dataset = translated_text_dataset(shuffle_seed=checkpoint['shuffle_seed'])
    dataset.generate_dataset()

    print('Source vocabulary size: %d'%(dataset.src_vocab_size))
    print('Target vocabulary size: %d'%(dataset.tgt_vocab_size))

    ## The special tokens' indices come from the target vocabulary
    tgt_stoi = dataset.tgt_vocab.get_stoi()
    model_config = dict\
    (
        n_src_inp=dataset.src_vocab_size,
        max_steps_src=dataset.max_steps_src,
        n_tgt_inp=dataset.tgt_vocab_size,
        max_steps_tgt=dataset.max_steps_tgt,
        bos_int=tgt_stoi['<bos>'],
        eos_int=tgt_stoi['<eos>'],
        pad_int=tgt_stoi['<pad>'],
        device=device
    )
    model = seq2seq(**model_config)
    model.load_state_dict(checkpoint['model_params'])

    resumed_trainer = rnn_trainer(model_=model, data=dataset, device=device)
    resumed_trainer.train(metrics_history=checkpoint['metrics_history'])


## Use the GPU when the runtime provides one; otherwise fall back to the CPU instead of failing on the first tensor transfer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
## Starts a new training run. The commented-out call below resumes from a saved checkpoint instead.
train_from_scratch(device=device)
# load_and_train_model(training_path='./drive/MyDrive/colab_storage/training/seq2seq_opt.pt', device=device)

In [None]:
def print_metrics_per_pipeline(pipeline_name, metrics):
    """
    Description:
        Prints the most recent training and validation value of the loss and of the BLEU score of one pipeline.

    Inputs:
        <pipeline_name>: Type: <str>. Key of the pipeline inside <metrics>.
        <metrics>: Type: <dict>. Indexed as metrics[metric_name][pipeline_name][data_subset] -> list of values; the last value of each list is printed.
    """

    print('> '+pipeline_name)
    ## (label shown to the user, key inside <metrics>)
    for shown_name, metric_key in (('loss', 'loss'), ('BLEU', 'bleu')):
        per_subset = metrics[metric_key][pipeline_name]
        latest_train = per_subset['train'][-1]
        latest_val = per_subset['val'][-1]
        print('>> '+'Train %s: %f | Val %s: %f'%(shown_name, latest_train, shown_name, latest_val))

def user_input_loop(translator):
    """
    Description:
        Repeatedly asks the user for a text, translates it with decoder pipeline 2 and prints the translation along with the 3 most probable tokens of every step.
        The loop ends when the user interrupts it (Ctrl+C, or the notebook's stop button) or when the input stream is exhausted. The function then returns normally instead of calling exit(), so neither the interpreter nor the notebook's kernel is shut down.

    Inputs:
        <translator>: A callable that accepts (x, dec_mode, dec_config) and provides display_translation() and display_n_likely(n).
    """

    while True:

        try:
            x = [input('Give a text:\n')]#['Hello world!', 'What\'s up?']

            translator(x=x, dec_mode=2, dec_config={})

            translator.display_translation()
            print()
            translator.display_n_likely(n=3)
            print()
        except (KeyboardInterrupt, EOFError): ## Ctrl+C or end of input triggers it
            print('\nProcess terminated.')
            return

def data_sample_evaluation(translator, data_fpath='../datasets/pp_ell.pt', n_train=5, n_val=5):
    """
    Description:
        Translates the first source examples of the stored training and validation set and prints the translations.

    Inputs:
        <translator>: A callable that accepts (x, dec_mode, dec_config) and provides display_translation().
        <data_fpath>: Type: <str>. Path of the stored dataset. The file holds a mapping with the keys 'train' and 'val'; indexing a subset with [:] has to return a sequence whose first element contains the source examples. torch.load unpickles the file, so it has to come from a trusted source.
        <n_train>: Type: <int>. Number of training examples translated.
        <n_val>: Type: <int>. Number of validation examples translated.
    """

    dataset = torch.load(f=data_fpath)
    ## Only the source side is needed; the targets are produced by the translator.
    train_src_int = dataset['train'][:][0][:n_train]
    val_src_int = dataset['val'][:][0][:n_val]

    print('\nSample from the training set:', end=2*'\n')
    translator(x=train_src_int, dec_mode=2, dec_config={})
    translator.display_translation()
    # print()
    # translator.display_n_likely(n=3)
    print(end=2*'\n')
    print('Sample from the validation set:', end=2*'\n')
    translator(x=val_src_int, dec_mode=2, dec_config={})
    translator.display_translation()
    # print()
    # translator.display_n_likely(n=3)

## Use the GPU when the runtime provides one; otherwise fall back to the CPU (the checkpoint is remapped through map_location).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## The file containing a model
training_path = '../training/ell/s2s_ell_ep80.pt'

## NOTE: torch.load unpickles the file, so only checkpoints from a trusted source should be loaded.
training = torch.load(training_path, map_location=device)

print('Source vocabulary size: %d'%(training['src_vocab_size']))
print('Target vocabulary size: %d'%(training['tgt_vocab_size']), end=2*'\n')

## Epochs are counted from 0, so the last recorded epoch is the history's length minus 1.
print('Current epoch is %d.'%(len(training['metrics_history']['loss']['training_pipeline']['train'])-1))
print('Translator\'s evaluation:')
print_metrics_per_pipeline(pipeline_name='training_pipeline', metrics=training['metrics_history'])
print_metrics_per_pipeline(pipeline_name='id2_pipeline', metrics=training['metrics_history'])

## The model is rebuilt from the sizes and the vocabularies stored inside the checkpoint.
s2s = seq2seq\
(
    n_src_inp=training['src_vocab_size'],
    max_steps_src=training['max_steps_src'],
    n_tgt_inp=training['tgt_vocab_size'],
    max_steps_tgt=training['max_steps_tgt'],
    bos_int=training['tgt_vocab'].get_stoi()['<bos>'],
    eos_int=training['tgt_vocab'].get_stoi()['<eos>'],
    pad_int=training['tgt_vocab'].get_stoi()['<pad>'],
    device=device
)
s2s.load_state_dict(training['model_params'])

## <predictor> is the class defined in this notebook (not a module), so it is instantiated directly.
translator = predictor\
(
    model=s2s,
    src_vocab=training['src_vocab'],
    src_vocab_size=training['src_vocab_size'],
    max_steps_src=training['max_steps_src'],
    tgt_vocab=training['tgt_vocab'],
    tgt_vocab_size=training['tgt_vocab_size'],
    max_steps_tgt=training['max_steps_tgt'],
    tgt_name=training['dataset_name'],
    device=device
)

# user_input_loop(translator=translator)
data_sample_evaluation(translator=translator)