In [3]:
import math, copy, sys, logging, json, time, random, os, string, pickle, re

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn

%matplotlib inline
%load_ext autoreload
%autoreload 2

np.random.seed(0) 
random.seed(0)
torch.manual_seed(0)

print('torch.version', torch.__version__)
print('torch.cuda.is_available()', torch.cuda.is_available())
print('torch.cuda.device_count()', torch.cuda.device_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
torch.version 1.7.1
torch.cuda.is_available() False
torch.cuda.device_count() 0


In [4]:
class Vocab:
    """Growable vocabulary that keeps a word->index map aligned with two weight matrices.

    ``embedding`` maps a token index to an ``emb_dim`` vector and ``emb2vocab``
    maps an ``emb_dim`` vector back to one score per token.  Row ``i`` of both
    weight matrices belongs to the word whose ``word2index`` value is ``i``.

    Growing or pruning the vocabulary replaces ``embedding.weight`` and
    ``emb2vocab.weight`` with *new* ``nn.Parameter`` objects; anything that
    still holds a reference to the previous Parameter objects is not updated.
    """

    # Tokens created with every default vocabulary; never pruned because
    # tokens2tensor relies on "<UNK>" being present.
    _RESERVED_TOKENS = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")

    def __init__(self, emb_dim, word2index = None, embedding = None, 
                 word2count = None, emb2vocab = None):
        '''
        emb_dim: integer, number of dimensions to represent words/tokens with
        word2index: a dictionary mapping a word-string to its unique integer index
                    that represents it in the embedding matrix; defaults to the
                    four reserved tokens <PAD>, <SOS>, <EOS>, <UNK>
        embedding: module whose ``weight`` has one row per word (defaults to a
                    fresh nn.Embedding sized to word2index)
        word2count: a dictionary mapping a word-string to the number of times it's been
                    used, either in the input or the output
        emb2vocab: module whose ``weight`` has one row per word (defaults to a
                    fresh bias-free nn.Linear sized to word2index)

        Supplied dictionaries are stored by reference, not copied.
        '''
        self.emb_dim = emb_dim

        if word2index is None:
            word2index = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        self.word2index = word2index
        self.word2count = {} if word2count is None else word2count

        # Size the default layers from the effective vocabulary so a supplied
        # word2index with more than four entries still gets one row per word.
        vocab_size = len(self.word2index)
        if embedding is None:
            embedding = nn.Embedding(vocab_size, self.emb_dim)
        self.embedding = embedding
        if emb2vocab is None:
            emb2vocab = nn.Linear(self.emb_dim, vocab_size, bias=False)
        self.emb2vocab = emb2vocab

    def _replace_weights(self, embedding_weight, emb2vocab_weight):
        """Install new weight matrices and keep the layers' size attributes in sync."""
        self.embedding.weight = nn.Parameter(embedding_weight, requires_grad=True)
        self.emb2vocab.weight = nn.Parameter(emb2vocab_weight, requires_grad=True)
        self.embedding.num_embeddings = embedding_weight.shape[0]
        self.emb2vocab.out_features = emb2vocab_weight.shape[0]

    def string2tokens(self, string):
        """Normalise a sentence string and split it into a list of token strings.

        Returns a list of lowercase tokens; empty or whitespace-only input
        yields an empty list (never an empty-string token).
        """
        # put a space in front of each of , . ! ? so it separates from the preceding word
        string = re.sub(r"([,.!?])", r" \1", string) 
        # collapse words like you're and don`t, into youre and dont 
        string = re.sub(r"([`'])", r"", string) 
        # turn every run of characters outside a-zA-Z0-9,.!?# into a single space
        string = re.sub(r"[^a-zA-Z0-9,.!?#]+", r" ", string) 
        # make all text lowercase
        string = string.lower()
        # split on whitespace; split() without an argument drops the empty
        # strings that leading/trailing spaces would otherwise produce
        return string.split()
    
    def tokens2tensor(self, list_o_strings):
        """takes a list of strings, looks each up with word2index
           and returns a torch long tensor of those indices with shape
           (1, len(list_o_strings)); unknown words map to the <UNK> index"""
        integer_sequence = [
            self.word2index[wrdstr] if wrdstr in self.word2index
            else self.word2index["<UNK>"]
            for wrdstr in list_o_strings
        ]
        return torch.LongTensor([integer_sequence])
    
    def string2tensor(self, sentence_str):
        """ takes string sentence, returns tensor integer sentence
        without adding new words to the vocabulary"""
        list_o_strings = self.string2tokens(sentence_str)
        tnsr_int_sntnc = self.tokens2tensor(list_o_strings)
        return tnsr_int_sntnc
    
    def string2embedding(self, sentence_str):
        """takes a sentence as a string and increments the
        wordcount of each word in the string, if word has never been
        seen, it is added to the word2index and a randomly initialised
        row is appended to both weight matrices.

        Returns the list of token strings of the sentence.
        """
        list_o_strings = self.string2tokens(sentence_str)
        emb_w = self.embedding.weight.detach()
        out_w = self.emb2vocab.weight.detach()
        new_emb_rows = []
        new_out_rows = []
        for wrdstr in list_o_strings:
            # Counting and indexing are independent: a word that already has an
            # index (e.g. from a supplied word2index) keeps it and is only counted.
            self.word2count[wrdstr] = self.word2count.get(wrdstr, 0) + 1
            if wrdstr not in self.word2index:
                self.word2index[wrdstr] = len(self.word2index)
                # Rows are drawn per word, embedding first, so the random
                # stream is consumed in the same order as one-at-a-time growth.
                new_emb_rows.append(torch.randn(1, self.emb_dim,
                                                dtype=emb_w.dtype, device=emb_w.device))
                new_out_rows.append(torch.randn(1, self.emb_dim,
                                                dtype=out_w.dtype, device=out_w.device))
        if new_emb_rows:
            # One concatenation per sentence instead of one per new word.
            self._replace_weights(torch.cat([emb_w] + new_emb_rows, dim=0),
                                  torch.cat([out_w] + new_out_rows, dim=0))
        return list_o_strings
    
    def string2embed2tensor(self, sentence_str):
        """ takes string sentence, returns tensor integer sentence
         adding new words to the vocabulary"""
        
        list_o_strings = self.string2embedding(sentence_str)
        tnsr_int_sntnc = self.tokens2tensor(list_o_strings)
        return tnsr_int_sntnc
    
    def prunevocab(self, mincount):
        """Remove every counted word used fewer than ``mincount`` times.

        The rows of the removed words are dropped from both weight matrices,
        the removed words are deleted from word2index and word2count, and each
        surviving word's index is lowered by the number of removed rows that
        sat below it, so word2index keeps pointing at the same vectors.
        Reserved tokens and words without a count are never removed.
        """
        words2del = [wrdstr for wrdstr, count in self.word2count.items()
                     if count < mincount and wrdstr not in self._RESERVED_TOKENS]
        if not words2del:
            return

        n_rows = self.embedding.weight.shape[0]
        # Rows to drop; a counted word with no index (or an out-of-range one)
        # has no row to remove.
        doomed_rows = sorted({self.word2index[wrdstr] for wrdstr in words2del
                              if wrdstr in self.word2index
                              and 0 <= self.word2index[wrdstr] < n_rows})

        for wrdstr in words2del:
            self.word2index.pop(wrdstr, None)
            del self.word2count[wrdstr]

        if not doomed_rows:
            return

        emb_w = self.embedding.weight.detach()
        out_w = self.emb2vocab.weight.detach()
        keep = torch.ones(n_rows, dtype=torch.bool, device=emb_w.device)
        keep[doomed_rows] = False
        # new_row[i] is the position row i occupies once the doomed rows are gone.
        new_row = (torch.cumsum(keep.to(torch.long), dim=0) - 1).tolist()

        for wrdstr, old_idx in self.word2index.items():
            if 0 <= old_idx < n_rows:
                self.word2index[wrdstr] = new_row[old_idx]
            else:
                # Index past the matrix: every removed row lies below it.
                self.word2index[wrdstr] = old_idx - len(doomed_rows)

        self._replace_weights(emb_w[keep], out_w[keep.to(out_w.device)])

## initialize a vocabulary

Notice that there are 4 initial tokens: padding (`<PAD>`), start of sentence (`<SOS>`), end of sentence (`<EOS>`) and unknown (`<UNK>`).

In [5]:
# Build a vocabulary with 4-dimensional vectors, then show its initial state:
# both weight matrices, the word->index map and the (still empty) word counts.
vocab = Vocab(emb_dim = 4)
for initial_state in (vocab.embedding.weight, vocab.emb2vocab.weight,
                      vocab.word2index, vocab.word2count):
    print(initial_state)

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473]], requires_grad=True)
Parameter containing:
tensor([[ 0.1977,  0.3000, -0.3390, -0.2177],
        [ 0.1816,  0.4152, -0.1029,  0.3742],
        [-0.0806,  0.0529,  0.4527, -0.4638],
        [-0.3148, -0.1266, -0.1949,  0.4320]], requires_grad=True)
{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
{}


In [6]:
# Example of the tokenizer function: string2tokens only normalises and splits
# the text; it does not add anything to the vocabulary.
list_o_strings = vocab.string2tokens("hi![] you're coming TONIGHT right?!")
print(list_o_strings)

['hi', '!', 'youre', 'coming', 'tonight', 'right', '?', '!']

All the tokens in this string are unknown to our vocabulary at first

In [7]:
# Look the tokens up without growing the vocabulary: any word missing from
# word2index is mapped to the <UNK> index.
tensorsentence = vocab.tokens2tensor(list_o_strings)
print(tensorsentence)

tensor([[3, 3, 3, 3, 3, 3, 3, 3]])


add a sentence to the vocabulary

In [8]:
# Add the sentence to the vocabulary: string2embedding counts every token and
# gives each previously unseen word its own index and weight rows.
list_o_strings = vocab.string2embedding("hi![] you're coming TONIGHT right?!")
print(list_o_strings)
# The same tokens now resolve to their own indices instead of <UNK>.
tensorsentence = vocab.tokens2tensor(list_o_strings)
print(tensorsentence)

['hi', '!', 'youre', 'coming', 'tonight', 'right', '?', '!']
tensor([[ 4,  5,  6,  7,  8,  9, 10,  5]])


<img src='files/elastic_vocab.jpg' height=800 width=500>