# One-Hot encoding

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from rich.console import Console
from rich.table import Table

# Two short example sentences to encode
corpus = [
    "The cat slept on the soft rug.",
    "The dog played on the rug.",
]

# Learn the vocabulary from the corpus; fit() hands back the fitted vectorizer
vectorizer = CountVectorizer(binary=True).fit(corpus)

# Vocabulary terms, used below as the column labels
vocabulary = vectorizer.get_feature_names_out()

# Encode every sentence against the vocabulary learned above
one_hot_encoded = vectorizer.transform(corpus)

# Build a frame with one row per sentence, the sentence text in the first column
df = pd.DataFrame(one_hot_encoded.toarray(), columns=vocabulary)
df.insert(0, "Sentence", corpus)

# Convert every cell to a string before handing the rows to the rich table
df = df.astype(str)

# Render the frame as a rich table: one column per frame column, one row per sentence
table = Table(title="One-Hot Encoding")
for col in df.columns:
    table.add_column(col)
for row in df.to_numpy():
    table.add_row(*row)

console = Console()
console.print(table)

In [5]:
from rich.markdown import Markdown

# Show the same frame again, this time as a Markdown table rendered by rich
md = Markdown(df.to_markdown())
console = Console()
console.print(md)

# Dense embeddings from BERT models

In [32]:
from transformers import BertTokenizer, BertModel

# Load the tokenizer and the model from the same pretrained checkpoint
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# function to get BERT embeddings
def get_bert_embeddings(sentence):
    """Return the per-token BERT embeddings for ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed
    through the module-level ``model``.

    Args:
        sentence: The text to embed.

    Returns:
        The first (and only) batch entry of the model's
        ``last_hidden_state``: one row per encoded token position. No
        position is skipped, so the rows for the special tokens the
        tokenizer adds ([CLS] and [SEP] for BERT) are included.
    """
    inputs = tokenizer(sentence, return_tensors='pt')
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    # Drop only the batch axis; every token position (special tokens
    # included) is kept, no offset is applied.
    embedding = last_hidden_states[0, :]
    return embedding

# Embed every sentence in the corpus, keeping corpus order
bert_embeddings = list(map(get_bert_embeddings, corpus))

# Shape of the first sentence's embedding tensor
print(f"# of dimensions in the sentence 1 embeddings = {tuple(bert_embeddings[0].shape)}")
# First five rows of the second sentence's embedding tensor (index 1)
print(f"Sample embeddings[:5] = {bert_embeddings[1][:5]}")

# of dimensions in the sentence 1 embeddings = (10, 768)
Sample embeddings[:5] = tensor([[-0.0146,  0.0706, -0.3223,  ..., -0.6195,  0.4784,  0.3209],
        [-0.3229,  0.1109, -0.4878,  ..., -0.4270,  0.3301, -0.2171],
        [ 0.0209,  0.3718, -0.1737,  ..., -0.4981,  0.2606,  0.5935],
        [ 0.5665, -0.4937, -0.2265,  ..., -0.8287,  0.1734,  0.0389],
        [ 0.1579, -0.2385,  0.2189,  ..., -0.0540,  0.2653,  0.0139]],
       grad_fn=<SliceBackward0>)


In [22]:
# Recompute the embeddings for the whole corpus and display the resulting list
bert_embeddings = [get_bert_embeddings(sentence) for sentence in corpus]
bert_embeddings

[tensor([[-0.2126, -0.2405, -0.5741,  ..., -0.4036,  0.7588,  0.4445],
         [-0.4847, -0.5858, -0.2595,  ..., -0.7802,  0.9080, -0.0304],
         [-0.2277, -0.1399,  0.0161,  ..., -1.0611,  0.3858,  0.6314],
         ...,
         [ 0.5969,  0.0696,  0.0393,  ..., -0.4466,  0.1420, -0.1435],
         [-0.0929, -0.5271, -0.2424,  ...,  0.1697,  0.6789, -0.6303],
         [ 0.5555, -0.0605, -0.3327,  ...,  0.0226, -0.3393, -0.3695]],
        grad_fn=<SliceBackward0>),
 tensor([[-0.0146,  0.0706, -0.3223,  ..., -0.6195,  0.4784,  0.3209],
         [-0.3229,  0.1109, -0.4878,  ..., -0.4270,  0.3301, -0.2171],
         [ 0.0209,  0.3718, -0.1737,  ..., -0.4981,  0.2606,  0.5935],
         ...,
         [ 0.8921,  0.2020,  0.2865,  ..., -0.2967,  0.0288, -0.6615],
         [ 0.1118, -0.6143, -0.1392,  ...,  0.3322,  0.5662, -0.6797],
         [ 0.4856,  0.1500, -0.1191,  ...,  0.0294, -0.1803, -0.3612]],
        grad_fn=<SliceBackward0>)]

In [23]:
# Inspect the shape of the embedding tensor for the second sentence (index 1)
bert_embeddings[1].shape

torch.Size([9, 768])