# Comparison of Text Embeddings: Bag of Words, Word2Vec, and BERT

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api
from transformers import BertModel, BertTokenizer
import torch


In [None]:

# Toy corpus: a single sentence, reused by the Word2Vec and BERT cells below.
text = ["I want a dog"]

# Bag of Words
# Learn the vocabulary and count term occurrences in one pass, then convert
# the resulting matrix to a dense array so it prints as plain numbers.
vectorizer = CountVectorizer()
bow_sparse = vectorizer.fit_transform(text)
bow_representation = bow_sparse.toarray()

# Note: CountVectorizer's default token pattern only keeps tokens of two or
# more characters, so "I" and "a" never enter the vocabulary.
vocabulary = vectorizer.get_feature_names_out()

print("Bag of Words Representation:", bow_representation)
print("Bag of Words Vocabulary:", vocabulary)
print(f"Shape of the vector embedding {bow_representation.shape}")


Bag of Words Representation: [[1 1]]
Bag of Words Vocabulary: ['dog' 'want']
Shape of the vector embedding (1, 2)


In [None]:

# Word2Vec
# Build one sentence vector by averaging the pretrained vectors of the words
# in `text[0]`. Loading the pretrained model may fail; any failure is reported
# with a message instead of stopping the notebook.
try:
    word2vec_model = api.load("word2vec-google-news-300")

    # Whitespace tokenisation; words missing from the model's vocabulary are skipped.
    known_words = [word for word in text[0].split() if word in word2vec_model]
    if not known_words:
        # Averaging an empty array would silently produce NaN; report it instead.
        raise ValueError("none of the words in the text are in the Word2Vec vocabulary")

    # One row per in-vocabulary word.
    word2vec_representation = np.array([word2vec_model[word] for word in known_words])
    # Collapse the word axis to obtain a single vector for the whole sentence.
    avg_representation = np.mean(word2vec_representation, axis=0)

    print("\nWord2Vec Representation (averaged):", avg_representation)
    # Report the shape of the averaged vector shown above, not of the per-word matrix.
    print(f"Shape of the vector embedding {avg_representation.shape}")
except Exception as e:
    print(f"\nCould not load Word2Vec model or process text: {e}")
    print("Please try again later or with a different model.")



Word2Vec Representation (averaged): [ 0.08902995  0.04035441  0.01790365  0.14794922 -0.04020182  0.01212565
  0.12491862 -0.0797526  -0.01969401  0.02441406  0.02881114 -0.26985678
 -0.00665283 -0.13582356 -0.11905924  0.12174479 -0.01155599  0.1451009
  0.0172526  -0.10123698 -0.02775065  0.0390625   0.18945312 -0.05777995
 -0.01171875  0.09675089 -0.10576773  0.0867513   0.12882487 -0.03503418
  0.03792318  0.08268229 -0.03808594 -0.11669922  0.01147461  0.0659078
  0.10587565  0.07470703  0.03924561  0.19921875  0.09041341 -0.1398112
  0.16164143  0.02799479 -0.06738281 -0.00813802  0.10469564 -0.02921549
 -0.01257324 -0.0061849  -0.05794271  0.13102214  0.09729004  0.06083171
  0.05541992  0.06467692  0.08243815 -0.05696614  0.17020671 -0.05094401
  0.14697266  0.1007487  -0.08300781  0.01670329  0.03487142 -0.06624349
 -0.10990397  0.14526367 -0.05529785 -0.00066121  0.21940105  0.12379964
 -0.00854492  0.02742513 -0.29101562 -0.14650472  0.16243489  0.02676392
  0.15901692  0.0

In [None]:

# BERT
# Sentence embedding taken from the final-layer hidden state of the first
# ([CLS]) token. Any failure while loading or running the model is reported
# with a message instead of stopping the notebook.
try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Tokenise the sentence(s) into PyTorch tensors.
    encoded_input = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**encoded_input)

    # Keep sequence position 0 (the [CLS] token) for every item in the batch.
    cls_hidden_state = output.last_hidden_state[:, 0]
    bert_representation = cls_hidden_state.numpy()

    print("\nBERT Representation ([CLS] token):", bert_representation)
    print(f"Shape of the vector embedding {bert_representation.shape}")
except Exception as e:
    print(f"\nCould not load BERT model or process text: {e}")
    print("Please try again later or with a different model.")

In [None]:
# BERT: per-token embeddings
# Unlike the previous cell, which keeps only the [CLS] vector, this cell shows
# the final-layer hidden state of every token in the sentence.

# Reuses `tokenizer` and `model` created in the previous BERT cell; run that
# cell first, otherwise these names are undefined. Uncomment the two lines
# below to make this cell load them itself.
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    out = model(**encoded)  # out.last_hidden_state: [batch, seq_len, hidden=768]

# Token-level embeddings only: no pooling (mean or otherwise) is applied here,
# so the result holds one 768-dim vector per token, special tokens included.
token_embeddings = out.last_hidden_state                  # [1, L, 768]
print(token_embeddings)
print("Shape:", token_embeddings.shape)


tensor([[[ 0.1212,  0.3516, -0.0730,  ..., -0.3403,  0.2087,  0.3752],
         [ 0.2480,  0.3334, -0.1770,  ..., -0.2022,  0.5678,  0.1120],
         [ 0.8236, -0.1229,  1.1042,  ..., -0.1084,  0.4627, -0.2217],
         [ 0.2773, -0.1071,  0.9779,  ..., -1.5229,  0.5047,  0.9493],
         [ 0.3514,  0.5343, -0.2563,  ..., -1.0669,  0.0827,  0.1577],
         [ 0.7256,  0.1635, -0.3260,  ...,  0.0015, -0.5861, -0.4024]]])
Shape: torch.Size([1, 6, 768])


In [None]:
# Inspect the raw tokenizer output (input_ids, token_type_ids, attention_mask)
# that was passed to the model in the previous cell.
encoded

{'input_ids': tensor([[ 101, 1045, 2215, 1037, 3899,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}