In [21]:
from IPython.display import HTML, display

def set_css(info=None):
  """Render the notebook's HTML snippet into the output of the current cell.

  The function is registered as a 'pre_run_cell' callback. IPython documents
  that event as calling its callbacks with an ``info`` argument, so the
  parameter is accepted (and ignored); it defaults to None so the function
  can still be called by hand with no arguments.

  Args:
    info: Execution details supplied by IPython for the cell about to run.
      Unused.
  """
  # NOTE(review): the markup between the quotes is blank in this copy of the
  # notebook -- confirm the intended <style> block is present in the real file.
  display(HTML('''
  
  '''))

# Hook set_css into IPython's 'pre_run_cell' event so it is invoked before each cell runs.
# NOTE(review): re-running this cell defines and registers a fresh set_css object each
# time -- presumably the callbacks accumulate; confirm and restart the kernel if so.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
import pandas as pd
import torch
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel

In [7]:
# Location of the saved dataset; the path is relative, so it depends on where the kernel was started.
dataset_path = '../data/block0'

# Load the dataset stored at that location (load_from_disk comes from the `datasets` package).
# The result is indexed by split name further down (block0['train']).
block0 = load_from_disk(dataset_path)

In [13]:
# Work with the 'train' split only.
train_dataset = block0['train']
# Number of leading rows to pull out for inspection.
num_examples_to_view = 5
# 'Title' values of the first num_examples_to_view rows of the training split.
example_text = train_dataset.select(range(num_examples_to_view))['Title']


In [32]:
# Display the selected 'Title' values.
example_text

 '75 Texas Tech football players have tested positive for COVID-19 since testing began in June Facebook buying REI\'s brand-new HQ for $368M Why Fastly Stock Surged Today Jim Cramer reveals playbook for Snowflake IPO: \'It\'s a great concept\' Here\'s some of the places Boston Sports Club owes money to Amazon to Hire 5,500 New Employees in Chicago Area, Signing Bonuses Offered What\'s Behind Amazon\'s Hiring Spree Ford Ranger Tremor: Meet the Midsize Off-Road Adventure Truck Snowflake Boosts Expected IPO Pricing Range Texas changes how it reports coronavirus positivity rate VA data breach exposes personal information for 46,000 veterans CenturyLink to Get a New Name Home Depot co-founder Arthur Blank on corporate profits, social change Hobby Lobby raises full-time hourly minimum wage to $17 New York City\'s 2nd tallest office building unveiled at ceremony in Manhattan Taco Bell introduces Jalapeno Noir to its menu Why TikTok\'s Oracle deal is a green light for advertisers to keep spend

In [15]:
# Load the pretrained 'bert-base-uncased' tokenizer and model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Only the first selected title is embedded in this walkthrough.
text = example_text[0]
# Tokenize into PyTorch tensors. truncation=True with max_length=512 cuts the input off
# at 512 tokens, so longer text is only partially represented in the embeddings.
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Generate token embeddings.
# Gradient tracking is switched off: only the forward activations are needed here.
with torch.no_grad():
    outputs = model(**inputs)


In [33]:
# Compute the context vector (sentence embedding).
# First step: take the per-token embeddings produced by the model's last hidden layer.
token_embeddings = outputs.last_hidden_state
token_embeddings


tensor([[[-0.4843, -0.2927,  0.3627,  ..., -0.2957,  0.2738, -0.0632],
         [-0.1309,  0.2477,  0.3474,  ..., -0.2088,  0.1294, -0.2448],
         [-0.5710, -0.0039,  1.2619,  ..., -0.0632, -0.6504, -0.2742],
         ...,
         [ 0.5651,  0.1234, -0.2354,  ...,  0.4797, -0.3500, -0.4519],
         [-0.3651, -0.2252,  0.1992,  ..., -0.2913, -1.3470, -0.7258],
         [ 0.5177,  0.4043,  0.0338,  ..., -0.0112, -0.4431, -0.2221]]])

In [35]:
# Axes are [batch_size, sequence_length, embedding_size].
token_embeddings.shape

torch.Size([1, 512, 768])

In [30]:
# Prepare the mask used to average over tokens while ignoring padding tokens.
# Indexing with a trailing None appends a size-1 axis to the attention mask;
# expand_as then repeats it along that axis to match token_embeddings, and
# .float() makes it usable as a multiplicative weight.
input_mask_expanded = inputs.attention_mask[..., None].expand_as(token_embeddings).float()
input_mask_expanded

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])

In [38]:
# The mask must have the same shape as token_embeddings so the two can be multiplied element-wise.
input_mask_expanded.shape

torch.Size([1, 512, 768])

In [36]:
# The element-wise product with the mask zeroes every value that belongs to a
# padding token. Summing the result over the token axis (dim 1) then folds the
# 512 token vectors of 768 features each into a single 768-feature vector.
sum_embeddings = (token_embeddings * input_mask_expanded).sum(dim=1)
sum_embeddings

tensor([[-1.0818e+02, -5.9293e+01,  2.3933e+02,  1.4442e+00,  2.4621e+02,
         -2.5807e+01,  1.8217e+01,  1.6991e+02,  4.7850e+01,  2.1589e+01,
          4.2217e+01, -2.2120e+02, -8.5419e+01,  1.0467e+02, -8.1140e+01,
          2.0444e+02,  5.3340e+01,  5.5553e+01, -8.9378e+01,  1.4397e+02,
          2.2045e+02, -4.9439e+01,  1.7987e+02,  2.7302e+02,  1.8653e+02,
         -3.6538e+01, -7.4321e+01, -1.3496e+02, -1.3454e+02,  1.7655e+01,
          2.5665e+02,  3.7903e+01, -3.0103e+01, -1.2576e+02,  1.1899e+02,
          2.8119e+01, -6.5225e+01, -1.1827e+02,  4.9310e+01,  1.6466e+02,
         -3.0646e+02, -1.8274e+02, -8.6946e+01,  5.1672e+01, -1.3257e+02,
         -6.8176e+01,  3.3928e+01, -3.3435e+00, -1.8446e+00, -3.1792e+01,
         -2.0191e+02,  1.1228e+02, -3.0497e+01, -1.0522e+02, -1.9890e+01,
          3.0791e+02, -4.7782e+01, -2.6257e+02, -1.6712e+02, -1.7255e+02,
          1.0261e+02, -1.1033e+02,  5.9920e+01, -8.6325e+01, -1.8116e+01,
          3.8220e+00, -2.6691e+01,  2.

In [37]:
# The token axis has been summed away, leaving one vector per batch entry.
sum_embeddings.shape

torch.Size([1, 768])

In [24]:
# Denominator of the mean: the number of tokens that are not masked out,
# obtained by summing the expanded mask over the token axis (dim 1), so the
# count is repeated for every feature and lines up with sum_embeddings.
# The count is clamped to a tiny positive floor so that a sequence whose mask
# is entirely zero produces zeros instead of NaN when used as a divisor.
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
sum_mask

tensor([[512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512., 512.,
         512., 512., 512., 5

In [28]:
# Mean pooling: divide the masked sum of token embeddings by the per-feature
# count of unmasked tokens to obtain the sentence-level context vector.
context_vector = torch.div(sum_embeddings, sum_mask)
context_vector

tensor([[-2.1128e-01, -1.1581e-01,  4.6745e-01,  2.8208e-03,  4.8088e-01,
         -5.0404e-02,  3.5580e-02,  3.3186e-01,  9.3456e-02,  4.2167e-02,
          8.2456e-02, -4.3204e-01, -1.6683e-01,  2.0443e-01, -1.5848e-01,
          3.9930e-01,  1.0418e-01,  1.0850e-01, -1.7457e-01,  2.8119e-01,
          4.3057e-01, -9.6561e-02,  3.5131e-01,  5.3325e-01,  3.6432e-01,
         -7.1363e-02, -1.4516e-01, -2.6359e-01, -2.6278e-01,  3.4483e-02,
          5.0126e-01,  7.4030e-02, -5.8794e-02, -2.4563e-01,  2.3239e-01,
          5.4919e-02, -1.2739e-01, -2.3099e-01,  9.6308e-02,  3.2160e-01,
         -5.9856e-01, -3.5692e-01, -1.6982e-01,  1.0092e-01, -2.5893e-01,
         -1.3316e-01,  6.6266e-02, -6.5303e-03, -3.6027e-03, -6.2095e-02,
         -3.9436e-01,  2.1929e-01, -5.9565e-02, -2.0551e-01, -3.8847e-02,
          6.0138e-01, -9.3325e-02, -5.1283e-01, -3.2641e-01, -3.3701e-01,
          2.0041e-01, -2.1548e-01,  1.1703e-01, -1.6860e-01, -3.5383e-02,
          7.4648e-03, -5.2131e-02,  5.

In [29]:
# One pooled vector per batch entry, with the same feature width as the token embeddings.
context_vector.shape

torch.Size([1, 768])