In [1]:
import math, copy, sys, logging, json, time, random, os, string, pickle, re

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Seed every random number generator used in this notebook so that the
# randomly initialised word vectors are reproducible after a kernel restart.
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(0)

# Report the PyTorch build and the visible CUDA devices.
for _label, _value in (
    ('torch.version', torch.__version__),
    ('torch.cuda.is_available()', torch.cuda.is_available()),
    ('torch.cuda.device_count()', torch.cuda.device_count()),
):
    print(_label, _value)

torch.version 1.7.0
torch.cuda.is_available() True
torch.cuda.device_count() 2


# The Elastic Vocab Class

The next cell defines the `Vocab` class, which the rest of this notebook explains.

We make a few simplifying assumptions that can be modified. First we assume that the input and output vocabulary are the same, which is why we keep only one word vector embedding matrix. We also assume that the dimension of the input and output to your sequence to sequence model is the same length as the word vector length.

## Embedding 

The embedding module takes a token index (or a sequence of token indices) and returns a vector (or a sequence of vectors). If I want to pass "hi" into the model as a vector, I need to pass the token index for "hi" into the embedding. In the diagram, the token index for "hi" is `3` and the `emb_dim` of our word vectors `W` is `4`. The current vocab size of the grey embedding matrix is 5 tokens, indexed 0,1,2,3,4. The blue vector is the vector for "hi" and is the input to the seq2seq model.

<img src="files/Input_Elastic_Vocab.png" height="1000" width="800" />

In [3]:
class Vocab:
    """Elastic vocabulary: a word<->index mapping whose embedding matrix and
    output projection grow and shrink together with the set of known words."""

    # Reserved tokens; they receive indices 0..3 in a fresh vocabulary and are
    # never removed by prunevocab (tokens2tensor relies on "<UNK>").
    SPECIAL_TOKENS = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")

    def __init__(self, emb_dim, word2index = None, embedding = None, 
                 word2count = None, emb2vocab = None):
        
        '''
        This is a class that handles the transformation between 
        embedding space and vocabulary space. The methods of this class update both 
        the number of word embeddings and also the size of the matrix that maps
        the vector output of a neural network module to a vector the length of
        the output vocabulary for selecting the next token to output.
        
        emb_dim (integer): number of dimensions to represent words/tokens with
        word2index (dict): a dictionary mapping a word-string to its unique integer index 
                    that represents it in the embedding matrix. Defaults to the four
                    special tokens. The dict is used (and later mutated) as-is, not copied.
        embedding (nn.Embedding): module that keeps a matrix where each row is a trainable
                    word vector with the row index corresponding to the token index.
                    When omitted, one is created with len(word2index) rows.
        word2count (dict): a dictionary mapping a word-string to the number of times it's been 
                    used, either in the input or the output. Defaults to an empty dict.
        emb2vocab (nn.Linear): module that takes a vector of length emb_dim 
                    (note: there is no reason that this has to be the same length as
                    the word vectors) and transforms this to a vector of length
                    vocab_size, aka self.embedding.weight.shape[0] or len(self.word2index).
                    When omitted, a bias-free layer with len(word2index) outputs is created.
                    Only its weight is resized by this class, so a bias term (if any)
                    is NOT kept in sync.
        '''
        
        self.emb_dim = emb_dim

        if word2index is None:
            word2index = {tok: idx for idx, tok in enumerate(self.SPECIAL_TOKENS)}
        self.word2index = word2index
        self.word2count = {} if word2count is None else word2count

        # Default modules are sized from the actual vocabulary so that a
        # caller-supplied word2index never ends up with too few rows.
        # The embedding is created before emb2vocab so that a default
        # Vocab draws its initial weights in a fixed order under a seed.
        vocab_size = len(self.word2index)
        if embedding is None:
            embedding = nn.Embedding(vocab_size, self.emb_dim)
        if emb2vocab is None:
            emb2vocab = nn.Linear(self.emb_dim, vocab_size, bias=False)
        self.embedding = embedding
        self.emb2vocab = emb2vocab

    def string2tokens(self, string):
        """ this function is to change the string according to 
        the substitution rules you apply below and split it into
        a list of lowercase token strings (never containing empty tokens) """
        # puts space between words and ,.!?
        string = re.sub(r"([,.!?])", r" \1", string) 
        # collapse words like you're and don`t, into youre and dont 
        string = re.sub(r"([`'])", r"", string) 
        # turn every run of characters outside a-zA-Z0-9,.!?# into one space
        string = re.sub(r"[^a-zA-Z0-9,.!?#]+", r" ", string) 
        # lowercase, then split on whitespace; split() without an argument
        # drops the empty strings that leading/trailing spaces (or an empty
        # input) would otherwise produce and that would pollute the vocabulary
        return string.lower().split()
    
    def tokens2tensor(self, list_o_strings):
        """takes a list of strings, looks each up with word2index
           and returns a torch long tensor of those indices with shape
           (1, len(list_o_strings)); unknown words map to the "<UNK>" index"""
        integer_sequence = [
            self.word2index[wrdstr] if wrdstr in self.word2index
            else self.word2index["<UNK>"]
            for wrdstr in list_o_strings
        ]
        return torch.tensor([integer_sequence], dtype=torch.long)
    
    def string2tensor(self, sentence_str):
        """takes string sentence, returns tensor integer sentence
        without adding new words to the vocabulary"""
        return self.tokens2tensor(self.string2tokens(sentence_str))
    
    def string2embedding(self, sentence_str):
        """takes a sentence as a string and increments the
        wordcount of each word in the string, if word has never been
        seen, it is added to the word2index and a new row is appended to
        both the embedding and the emb2vocab weight matrices.
        Returns the list of token strings."""
        list_o_strings = self.string2tokens(sentence_str)
        for wrdstr in list_o_strings:
            # Only words without an index grow the matrices. A word that is
            # already indexed but not yet counted keeps its existing row.
            if wrdstr not in self.word2index:
                self._add_token(wrdstr)
            self.word2count[wrdstr] = self.word2count.get(wrdstr, 0) + 1
        return list_o_strings
    
    def string2embed2tensor(self, sentence_str):
        """ takes string sentence, returns tensor integer sentence
         adding new words to the vocabulary"""
        list_o_strings = self.string2embedding(sentence_str)
        return self.tokens2tensor(list_o_strings)
    
    def prunevocab(self, mincount):
        """ removes every word whose count in word2count is below mincount
        (special tokens are never removed). The corresponding rows are dropped
        from the embedding and emb2vocab weight matrices, the surviving words
        are re-indexed 0..V'-1 in their previous order so that the
        word-index-vector relationship is restored, and the pruned words are
        deleted from word2index and word2count. Both dicts are updated in place. """
        words2del = {wrdstr for wrdstr, count in self.word2count.items()
                     if count < mincount and wrdstr not in self.SPECIAL_TOKENS}
        if not words2del:
            return

        # Surviving (old_index, word) pairs in row order; the old indices are
        # exactly the rows to keep, and the position in this list is the new index.
        survivors = sorted((wrdidx, wrdstr)
                           for wrdstr, wrdidx in self.word2index.items()
                           if wrdstr not in words2del)
        keep_rows = torch.tensor([wrdidx for wrdidx, _ in survivors],
                                 dtype=torch.long)

        self.embedding.weight = self._select_rows(self.embedding.weight, keep_rows)
        self.emb2vocab.weight = self._select_rows(self.emb2vocab.weight, keep_rows)
        self._sync_module_sizes()

        self.word2index.clear()
        self.word2index.update(
            (wrdstr, newidx) for newidx, (_, wrdstr) in enumerate(survivors))
        for wrdstr in words2del:
            del self.word2count[wrdstr]

    def _add_token(self, wrdstr):
        """Give wrdstr the next free index and append one random row to each matrix."""
        self.word2index[wrdstr] = len(self.word2index)
        # Embedding row first, then emb2vocab row: the two random draws
        # always happen in this order for every new word.
        self.embedding.weight = self._append_random_row(self.embedding.weight)
        self.emb2vocab.weight = self._append_random_row(self.emb2vocab.weight)
        self._sync_module_sizes()

    @staticmethod
    def _append_random_row(weight):
        """Return a new trainable Parameter equal to weight plus one N(0, 1) row.

        NOTE: this creates a new Parameter object; an optimizer built on the
        old one has to be re-created (or given the new parameter) afterwards."""
        new_row = torch.randn(1, weight.shape[1],
                              dtype=weight.dtype, device=weight.device)
        return nn.Parameter(torch.cat((weight.detach(), new_row), dim=0),
                            requires_grad=True)

    @staticmethod
    def _select_rows(weight, keep_rows):
        """Return a new trainable Parameter holding only the given rows of weight."""
        kept = weight.detach().index_select(0, keep_rows.to(weight.device))
        return nn.Parameter(kept, requires_grad=True)

    def _sync_module_sizes(self):
        """Keep the modules' size attributes (used in their repr) equal to the weight shapes."""
        self.embedding.num_embeddings = self.embedding.weight.shape[0]
        self.emb2vocab.out_features = self.emb2vocab.weight.shape[0]

## Initialize a Vocabulary

For simplicity we assume that the vector output of our neural network will be the same length as the word vectors

Notice that there are 4 initial tokens: padding, start of sentence, end of sentence and unknown

In [4]:
vocab = Vocab(emb_dim = 4)

# Show the freshly initialised state: both weight matrices and the
# word->index map, each followed by a spacer line, then the (empty) counts.
for _state in (vocab.embedding.weight, vocab.emb2vocab.weight, vocab.word2index):
    print(_state)
    print(' ')
print(vocab.word2count)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473]], requires_grad=True)
 
Parameter containing:
tensor([[ 0.1977,  0.3000, -0.3390, -0.2177],
        [ 0.1816,  0.4152, -0.1029,  0.3742],
        [-0.0806,  0.0529,  0.4527, -0.4638],
        [-0.3148, -0.1266, -0.1949,  0.4320]], requires_grad=True)
 
{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
 
{}


## Demonstrate Input Vocab Functions

We will start by inputting a sentence into the Vocab

A. 

We use the `string2tokens` function to tokenize the sentence into a
list of strings.

All the tokens in this string are unknown to our vocabulary at first. A token is "known" if and only if its string is a key in the `word2index` dictionary.

Since none of the string tokens are known, the token indices returned by the `tokens2tensor`
function are all the index of the unknown token `<UNK>`. Note that this is a tensor of indices, NOT a tensor of word vectors.

In [6]:
# Tokenize without touching the vocabulary, then look the tokens up;
# every token is still unknown at this point.
list_o_strings = vocab.string2tokens("hi![] you're coming TONIGHT right?!")
tensorsentence = vocab.tokens2tensor(list_o_strings)

print(list_o_strings, tensorsentence, sep='\n')

['hi', '!', 'youre', 'coming', 'tonight', 'right', '?', '!']
tensor([[3, 3, 3, 3, 3, 3, 3, 3]])


B.

`string2embedding` adds these new tokens to both the vocabulary embedding matrix and the embedding-to-vocab matrix by appending one row vector to each for every new word. Growing `embedding` and `emb2vocab` gives the seq2seq model access to these new tokens as inputs and outputs respectively.

The diagram shows growing of a matrix, this growing process applies both to the `vocab.embedding.weight` and `vocab.emb2vocab.weight` matrices.

<img src="files/Expand_Elastic_Vocab.png" height="1000" width="800" />

The diagram shows an example of growing the matrix by one token only, the vocab size expands from 5 tokens to 6, the emb_dim is still the same at 4 dimensions. V' is the new Vocab size. 

In [7]:
# Same sentence as before, but this time the unseen tokens are registered in
# the vocabulary, so the lookup returns their newly assigned indices.
list_o_strings = vocab.string2embedding("hi![] you're coming TONIGHT right?!")
tensorsentence = vocab.tokens2tensor(list_o_strings)

print(list_o_strings, tensorsentence, sep='\n')

['hi', '!', 'youre', 'coming', 'tonight', 'right', '?', '!']
tensor([[ 4,  5,  6,  7,  8,  9, 10,  5]])


In [9]:
# One new row per new token now appears in both the embedding matrix and the
# emb2vocab matrix; the two dictionaries list the new tokens as well.
print(
    vocab.embedding.weight,
    vocab.emb2vocab.weight,
    vocab.word2index,
    vocab.word2count,
    sep='\n \n',
)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473],
        [-0.5663,  0.3731, -0.8920, -1.5091],
        [ 0.1919,  1.2638, -1.2904, -0.7911],
        [ 0.1920,  0.5428, -2.2188,  0.2590],
        [-0.0404,  0.2881, -0.0075, -0.9145],
        [ 2.0576, -0.0354,  0.0627, -0.7663],
        [-1.5055, -0.6610,  1.3232,  0.0371],
        [-0.4584, -0.3360, -1.5700,  1.2315]], requires_grad=True)
 
Parameter containing:
tensor([[ 0.1977,  0.3000, -0.3390, -0.2177],
        [ 0.1816,  0.4152, -0.1029,  0.3742],
        [-0.0806,  0.0529,  0.4527, -0.4638],
        [-0.3148, -0.1266, -0.1949,  0.4320],
        [ 0.3704,  1.4565,  0.9398,  0.7748],
        [-0.0209, -0.7185,  0.5186, -1.3125],
        [-1.0297, -0.5008,  0.2734, -0.9181],
        [-1.0886, -0.2666,  0.1894, -0.2190],
        [ 1.0993,  2.7565,  0.1753, -0.9315],
        [-0.2

## Demonstrate Output functions

I am going to make a random vector `seq2seq_ouput`, pretend this is the output from our seq2seq model.

I am going to pass this vector to emb2vocab to give us our `token_selection_vector`, which should be the same length as the length of our vocabulary. 

In the diagram `vocab.emb2vocab.weight` is in green and `seq2seq_ouput` is our red vector. By matrix multiplication, `emb2vocab` transforms it into `token_selection_vector` (purple), which, once passed through a softmax, can be interpreted as a probability distribution over all the tokens of our vocabulary (yellow).

<img src="files/Output_Elastic_Vocab.png" height="1000" width="800" />

In [13]:
# Stand-in for a seq2seq model output: one random vector of length emb_dim (4).
seq2seq_ouput = torch.randn(1, 4)

# Project it onto the vocabulary: one score per known token.
token_selection_vector = vocab.emb2vocab(seq2seq_ouput)

print(seq2seq_ouput, token_selection_vector, ' ', sep='\n')

# The number of scores must equal the current vocabulary size.
print(token_selection_vector.shape, len(vocab.word2index))

tensor([[-0.9759,  0.7112, -0.7582, -0.6436]])
tensor([[ 0.4176, -0.0447,  0.0715,  0.0869, -0.5368, -0.0392,  1.0324,  0.8701,
          1.3544, -3.2543,  0.2593]], grad_fn=<MmBackward>)
 
torch.Size([1, 11]) 11
