<a href="https://colab.research.google.com/github/dogukartal/IBM_AI_Labs/blob/main/Gen%20AI%20Foundational%20Models%20for%20NLP%20%26%20Language%20Understanding/Embeddings_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embeddings
---

In [None]:
!pip install -Uqq torch==2.3.0
!pip install -Uqq torchtext==0.18.0
!pip install -Uqq spacy
!python -m spacy download en_core_web_sm -qq

In [12]:
import torchtext
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [9]:
# Toy corpus: three short sentences used throughout the notebook
dataset = [
  "I like cats",
  "I hate dogs",
  "I'm impartial to hippos",
]

# spaCy-backed tokenizer using the small English pipeline
tokenizer = get_tokenizer(tokenizer="spacy", language="en_core_web_sm")

def yield_tokens(data_iter):
  """Lazily yield the token list of each sample drawn from `data_iter`."""
  yield from map(tokenizer, data_iter)

# Single-pass iterator over the corpus, consumed while the vocab is built
data_iter = iter(dataset)

# Build the token -> index vocabulary from the tokenized corpus
vocab = build_vocab_from_iterator(iterator=yield_tokens(data_iter))

7

In [10]:
# Look up the integer index the vocabulary assigned to the token "like"
vocab["like"]

7

In [22]:
# Assigning IDs to tokens
def input_ids(x):
  """Convert an iterable of raw sentences into a list of token-ID tensors.

  Args:
    x: Iterable of strings; each one is tokenized with `tokenizer` and its
      tokens are mapped to indices through `vocab`.

  Returns:
    A list containing one 1-D `torch.long` tensor of token IDs per sentence,
    in the same order as `x`.
  """
  # Iterate over the argument `x`, not the global `dataset`, so the function
  # works for any input it is given. The explicit dtype keeps an empty
  # sentence as an integer tensor (embedding layers need integer indices)
  # instead of letting torch default an empty list to float.
  return [
    torch.tensor(vocab(tokenizer(data_sample)), dtype=torch.long)
    for data_sample in x
  ]

# Token-ID tensors for the corpus, one tensor per sentence
index = input_ids(dataset)
print(index)

[tensor([0, 7, 2]), tensor([0, 4, 3]), tensor([0, 1, 6, 8, 5])]


## Embedding Layer

In [15]:
# Creating Embedding Layer
# The lookup table has one row per vocabulary entry, each row a vector of
# size `embedding_dim`.
embedding_dim = 3
num_embeddings = len(vocab)

embeds = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
print(embeds)

Embedding(9, 3)


In [16]:
# Applying the Embedding to First Sentence
# index[0] holds the token IDs of the first sentence; the layer returns one
# embedding row per token ID.
i_like_cats = embeds(index[0])
i_like_cats

tensor([[-0.5457,  0.9364,  0.0987],
        [-0.8796, -1.6541,  0.4330],
        [ 1.7350, -1.1639, -1.1612]], grad_fn=<EmbeddingBackward0>)

## Embedding Bag Layer

In [17]:
# Creating Embedding Bag Layer
# Same table dimensions as the plain embedding layer above in this notebook's
# variables: `num_embeddings` rows of size `embedding_dim`.
embedding_bag = nn.EmbeddingBag(
  num_embeddings=num_embeddings,
  embedding_dim=embedding_dim,
)
print(embedding_bag)

EmbeddingBag(9, 3, mode='mean')


In [18]:
# Treat the whole first sentence as a single bag: one offset, starting at
# position 0, so the layer returns one pooled vector for the sentence.
i_like_cats = embedding_bag(index[0], offsets=torch.zeros(1, dtype=torch.long))
i_like_cats

tensor([[-0.6630,  0.8253,  0.3493]], grad_fn=<EmbeddingBagBackward0>)

In [34]:
# Applying Embedding Bag to multiple Samples
# Flatten every sentence's token IDs into one 1-D tensor.
index_flat = torch.cat(index)
print(index_flat)

# Each bag starts where the previous sentences end: the running sum of the
# lengths of all sentences before it, with the first bag starting at 0.
offset = torch.cumsum(
  torch.tensor([0] + [len(sample) for sample in index[:-1]]),
  dim=0,
)

i_like_cats = embedding_bag(index_flat, offsets=offset)
i_like_cats

tensor([0, 7, 2, 0, 4, 3, 0, 1, 6, 8, 5])


tensor([[-0.6630,  0.8253,  0.3493],
        [-0.5844, -0.1395,  0.4568],
        [ 0.1389,  0.2569, -0.4592]], grad_fn=<EmbeddingBagBackward0>)