In [7]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn

In [2]:
# Toy corpus: three short sentences used below to build the vocabulary
# and the per-sentence index tensors.
dataset = [
    "I like cats",
    "I hate dogs",
    "I'm impartial to hippos",
]

In [3]:
# spaCy-backed tokenizer; `language` names the spaCy English model to use.
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def yield_tokens(data_itr):
    """Lazily yield the token list of every sample in ``data_itr``.

    Each sample is passed to the module-level ``tokenizer`` and its result
    is yielded, one sample at a time, in iteration order.
    """
    yield from map(tokenizer, data_itr)

# One-shot iterator over the corpus: once a consumer has drained it,
# it yields nothing further.
data_iter = iter(dataset)

In [4]:
# Build the vocabulary straight from `dataset` rather than from the one-shot
# `data_iter`: a list can be iterated again, so re-running this cell rebuilds
# the same vocabulary instead of reading from an already exhausted iterator.
vocab = build_vocab_from_iterator(yield_tokens(dataset))
# Display the token -> index mapping.
vocab.get_stoi()

{'to': 8,
 'hippos': 5,
 'cats': 2,
 'I': 0,
 "'m": 1,
 'like': 7,
 'dogs': 3,
 'hate': 4,
 'impartial': 6}

In [5]:
def input_ids(x):
    """Encode each text sample in ``x`` as a tensor of vocabulary indices.

    Every sample is tokenized with the module-level ``tokenizer``, mapped to
    indices with the module-level ``vocab`` and wrapped in a ``torch.tensor``.

    Args:
        x: iterable of raw text samples.

    Returns:
        A list with one index tensor per sample, in input order.
    """
    # Iterate over the argument itself; the previous lambda ignored `x` and
    # always encoded the global `dataset`.
    return [torch.tensor(vocab(tokenizer(data_sample))) for data_sample in x]

In [6]:
# Encode the corpus; the result is a list holding one index tensor per sentence.
index = input_ids(dataset)
index

[tensor([0, 7, 2]), tensor([0, 4, 3]), tensor([0, 1, 6, 8, 5])]

In [8]:
# Seed the global RNG before the first randomly initialised layer is built:
# nn.Embedding draws its initial weights at random, so without a fixed seed
# the vectors displayed below change on every kernel restart.
torch.manual_seed(0)

embedding_dim = 3         # length of each embedding vector
n_embedding = len(vocab)  # one embedding row per vocabulary entry
embeds = nn.Embedding(n_embedding, embedding_dim)
embeds

In [16]:
# Look up the embedding rows of the first and third sentences, then show
# them one after the other.
i_like_cats, hippos = (embeds(index[position]) for position in (0, 2))
print(i_like_cats, hippos, sep="\n")

tensor([[-0.4790, -0.6981,  0.6997],
        [-2.2306, -0.1350, -0.8695],
        [-1.5711, -0.2814, -0.2066]], grad_fn=<EmbeddingBackward0>)
tensor([[-0.4790, -0.6981,  0.6997],
        [-1.1978,  0.5505, -0.2858],
        [-0.5615,  2.6911,  0.2075],
        [ 0.4864,  0.4192, -0.0505],
        [-0.2478,  1.7505, -0.3507]], grad_fn=<EmbeddingBackward0>)


Embedding Bag Layer

In [15]:
# EmbeddingBag sized like the vocabulary: one 3-dimensional vector per entry.
n_embedding, embedding_dim = len(vocab), 3
embedding_bag = nn.EmbeddingBag(
    num_embeddings=n_embedding,
    embedding_dim=embedding_dim,
)

In [17]:
# Treat the whole first sentence as a single bag: its only start offset is 0.
cats_bag = embedding_bag(index[0], offsets=torch.zeros(1, dtype=torch.long))
cats_bag

tensor([[-0.0340, -0.2746,  0.6256]], grad_fn=<EmbeddingBagBackward0>)

In [18]:
# Concatenate the per-sentence index tensors into one flat tensor.
index_flat = torch.cat(index)
index_flat

tensor([0, 7, 2, 0, 4, 3, 0, 1, 6, 8, 5])

In [27]:
# Sentence lengths preceded by a 0, i.e. the increments between bag starts.
offset = [0, *(len(sample) for sample in index)]
offset

[0, 3, 3, 5]

In [28]:
# Start position of every sentence inside the flattened index tensor: the
# running total of the preceding sentence lengths. The final total is dropped
# because it marks the end of the data, not the start of a bag.
# The lengths are taken from `index` directly instead of from the previous
# value of `offset`, so re-running this cell cannot accumulate the offsets a
# second time.
offset = torch.cumsum(torch.tensor([0] + [len(sample) for sample in index]), 0)[:-1]
offset

tensor([0, 3, 6])

In [29]:
# Pool every sentence in a single call: `offset` gives the position in
# `index_flat` where each sentence starts, so the result has one row per sentence.
my_embedding_bag = embedding_bag(index_flat, offsets=offset)
my_embedding_bag

tensor([[-0.0340, -0.2746,  0.6256],
        [ 0.4936, -0.2994,  0.0887],
        [-0.4871,  0.6245,  1.1183]], grad_fn=<EmbeddingBagBackward0>)