In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
import nltk
import re
from collections import Counter # collections library; counter: dict subclass for counting hashable objects
import itertools

In [3]:
from utils import my_sentence_tokenizer, get_all_tokens, encode_sentence

# Getting Corpus Data to test

In [4]:
from nltk.corpus import gutenberg

In [5]:
# Load Hamlet as a sequence of word tokens through NLTK's Gutenberg corpus reader
hamlet_file_id = 'shakespeare-hamlet.txt'
hamlet_corpus = gutenberg.words(hamlet_file_id)
print(type(hamlet_corpus), len(hamlet_corpus))

<class 'nltk.corpus.reader.util.StreamBackedCorpusView'> 37360


In [6]:
# Glue all word tokens back into one string, separated by single spaces
word_separator = ' '
corpus_str = word_separator.join(hamlet_corpus)
print(type(corpus_str), len(corpus_str))

<class 'str'> 166764


In [7]:
# Collapse every run of the characters , ! ? ; - into a single '.'
punctuation_run = re.compile(r'[,!?;-]+')
corpus_str = punctuation_run.sub('.', corpus_str)

In [8]:
# Special vocabulary markers: sentence start / sentence end ...
tag_start, tag_end = '<s>', '</s>'
# ... and out-of-vocabulary word / padding
tag_oov, tag_pad = '<unk>', '<pad>'

# Kept as an ordered list: the position of each tag is reused as its vocabulary index
tags = [tag_start, tag_end, tag_oov, tag_pad]

In [9]:
# Split the cleaned text into sentences with the project helper from utils.
# The start/end tags are handed over as well — presumably to wrap each sentence;
# confirm against utils.my_sentence_tokenizer.
doc = my_sentence_tokenizer(
    corpus_str,
    tag_start,
    tag_end,
)

In [10]:
# Gather the tokens of all sentences in `doc` via the project helper from utils.
# Judging by the preview printed further down, the result is one flat list that
# still contains the <s> / </s> markers.
tokens = get_all_tokens(doc)

In [11]:
n_print = 100  # number of leading tokens to show
preview = tokens[:n_print]
print(f'After cleaning:  {len(tokens)} tokens, first {n_print}  {preview}')

After cleaning:  47309 tokens, first 100  ['<s>', 'the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare', 'actus', 'primus', '.', '</s>', '<s>', 'scoena', 'prima', '.', '</s>', '<s>', 'enter', 'barnardo', 'and', 'francisco', 'two', 'centinels', '.', '</s>', '<s>', 'barnardo', '.', '</s>', '<s>', 'who', 's', 'there', '.', '</s>', '<s>', 'fran', '.', '</s>', '<s>', 'nay', 'answer', 'me', 'stand', 'vnfold', 'your', 'selfe', 'bar', '.', '</s>', '<s>', 'long', 'liue', 'the', 'king', 'fran', '.', '</s>', '<s>', 'barnardo', '.', '</s>', '<s>', 'bar', '.', '</s>', '<s>', 'he', 'fran', '.', '</s>', '<s>', 'you', 'come', 'most', 'carefully', 'vpon', 'your', 'houre', 'bar', '.', '</s>', '<s>', 'tis', 'now', 'strook', 'twelue', '.', '</s>', '<s>', 'get', 'thee', 'to', 'bed', 'francisco', 'fran', '.', '</s>', '<s>']


In [12]:
# Count how often each token occurs (collections.Counter is a dict subclass for counting)
word_count_vocab = Counter(tokens)

# The sentence-boundary markers are not real words, so drop them from the counts.
# A default is passed to pop() so the cell does not raise if a marker is absent.
for boundary_tag in (tag_start, tag_end):
    word_count_vocab.pop(boundary_tag, None)

print(word_count_vocab.most_common(10))
print('count : ',len(word_count_vocab))

[('.', 5672), ('the', 993), ('and', 863), ('to', 685), ('of', 610), ('i', 574), ('you', 527), ('a', 511), ('my', 502), ('it', 419)]
count :  4699


In [13]:
# Build the (index, word) pairs of the vocabulary:
#   - the special tags come first, so each tag's index equals its position in `tags`
#   - the counted words follow in sorted order, starting at index len(tags)
# Concatenating before enumerating avoids hard-coding the number of tags and
# avoids repeated inserts at the front of the list.
vocabulary = list(enumerate(tags + sorted(word_count_vocab)))
vocabulary[:10] # sorted vocabulary

[(0, '<s>'),
 (1, '</s>'),
 (2, '<unk>'),
 (3, '<pad>'),
 (4, '.'),
 (5, 'a'),
 (6, 'abhominably'),
 (7, 'abhorred'),
 (8, 'abilitie'),
 (9, 'aboord')]

In [14]:
# Index -> word lookup, built directly from the (index, word) pairs
idx2word = dict(vocabulary)
list(itertools.islice(idx2word.items(), 10))  # peek at the first ten entries

[(0, '<s>'),
 (1, '</s>'),
 (2, '<unk>'),
 (3, '<pad>'),
 (4, '.'),
 (5, 'a'),
 (6, 'abhominably'),
 (7, 'abhorred'),
 (8, 'abilitie'),
 (9, 'aboord')]

In [18]:
# Word -> index lookup: the inverse of idx2word
word2idx = {token: index for index, token in idx2word.items()}
list(itertools.islice(word2idx.items(), 10))  # peek at the first ten entries

[('<s>', 0),
 ('</s>', 1),
 ('<unk>', 2),
 ('<pad>', 3),
 ('.', 4),
 ('a', 5),
 ('abhominably', 6),
 ('abhorred', 7),
 ('abilitie', 8),
 ('aboord', 9)]

In [19]:
# Encode every sentence of `doc` as a list of vocabulary indices using the
# project helper from utils; the options are shared by all sentences.
# NOTE(review): the sample printed in the next cell starts with two 0s (<s>) and
# contains two 1s (</s>). It looks like the sentences in `doc` already carry the
# boundary tags and encode_sentence adds them a second time — confirm in utils.
encode_opts = dict(max_len_sentence=30, tag_oov=tag_oov, tag_pad=tag_pad)
coded_corpus = [encode_sentence(sentence, word2idx, **encode_opts) for sentence in doc]

In [22]:
# Show one encoded sentence (arbitrary pick) and the number of encoded sentences
sample_sentence = 202
print(coded_corpus[sample_sentence])
len(coded_corpus)

[0, 0, 2802, 1484, 4, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


5672

In [21]:
# S: number of encoded sentences
# T: length of one encoded sentence (max_len_sentence + 2)
# V: vocabulary size
S, T, V = len(coded_corpus), len(coded_corpus[0]), len(word2idx)

# Embeddings in PyTorch

Next, we'll see how to use PyTorch's ``Embedding`` layer. <br>
It converts categorical data with $V$ classes into dense vectors with $N_d$ dimensions. <br>
Suppose $c\in\mathbb{F}_2^V$ is a one-hot encoded vector. <br>
An embedding is a mapping $e:\mathbb{F}_2^V\to\mathbb{R}^{N_d}$ (from a sparse, one-hot encoded vector to a dense real vector).

In [None]:
# nn.Embedding initialises its weights randomly; fix the seed first so that the
# vectors shown in the following cells are the same on every Restart & Run All.
torch.manual_seed(0)

Nd = 2                   # number of dimensions of the dense embedding
e = nn.Embedding(V, Nd)  # (vocab_size, num_of_dimensions_of_embedding)

Let's first convert the coded sentences to Torch Tensors.

In [None]:
# Build an integer tensor with one row per encoded sentence (rows of length T),
# then transpose so that every column holds one sentence.
# torch.tensor with an explicit dtype replaces the legacy torch.LongTensor constructor.
coded_sentences = torch.tensor(coded_corpus, dtype=torch.long).reshape(-1, T).T
print(f' T x S : {coded_sentences.shape} (max. sentence size + 2 x num. of sentences)')
print(f' Tensors of type {coded_sentences.dtype}')
print(' All encoded sentences as tensors (each column is a sentence):')
print(coded_sentences)

Let's select, for example, the first sentence as the variable $c$.

In [None]:
# Column 0 of the (T x S) tensor is the first encoded sentence
c = coded_sentences[:, 0]
print(c.shape)
print(c)

Next, we map the sentence, given as a sequence of word indices (the compact form of one-hot-encoded words), to its sequence of embedding vectors.<br>
When printing the resulting sequence, we see that:
- each index (one-hot-encoded word) is converted into a real row-vector of $N_d$ dimensions
- the initialized embedding vectors are just random values.

In [None]:
# Look up the embedding vector of every index in the sentence
e_seq = e(c)

# Print the shape, the vectors of the first ten positions, and a truncation notice
for shown in (e_seq.shape, e_seq[:10], '... and more words (truncated in the 10-th word).'):
    print(shown)

Choose a word to check its embedding vector.

In [None]:
word    = 'queen'

# Map the word to its vocabulary index; a word that is not in the vocabulary
# falls back to the index of the <unk> tag instead of raising KeyError.
idx     = word2idx.get(word, word2idx[tag_oov])
# Map the index to its dense Nd-vector. torch.tensor with an explicit dtype
# replaces the legacy torch.LongTensor constructor.
emb_vec = e(torch.tensor([idx], dtype=torch.long))

print(f' Word "{word}" corresponds to index {idx}')
# detach() drops the autograd graph so the values can be flattened into a plain list
print(f' Index {idx} maps to embedding vector "{emb_vec.detach().reshape(-1).tolist()}"')

In [None]:
# The embedding matrix (one row per vocabulary entry), viewed without autograd tracking.
# detach() is used instead of the discouraged `.data` attribute: both share storage
# with the parameter, but in-place changes to a detached tensor are still seen by
# autograd's correctness checks.
W1 = e.weight.detach()
W1.shape, W1.dtype, W1