In [8]:
from transformers import BertModel, BertTokenizer
import torch
import random
from sklearn.metrics.pairwise import cosine_similarity

# Seed Python's and PyTorch's random number generators so repeated runs match.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Directory from which both the tokenizer and the model are loaded.
MODEL_DIR = "E:/code/dta/cls/exam/histbert"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
# eval() returns the module itself, so loading and switching to evaluation
# mode can be chained into one statement.
model = BertModel.from_pretrained(MODEL_DIR).eval()

# Use the GPU when one is available.
if torch.cuda.is_available():
    model.to("cuda")
    print("Model moved to GPU.")




Model moved to GPU.


In [None]:
def get_embedding_for_timeslice(target_word, sentences, tokenizer, model):
    """
    Build one aggregated contextual embedding for ``target_word`` from ``sentences``.

    Each sentence is encoded on its own (truncated to 128 tokens). Every position
    where the target word's sub-word token ids occur as a contiguous run is
    collected, the second-to-last entry of the model's ``hidden_states`` is
    averaged over those positions, and the resulting per-sentence vectors are
    averaged into a single vector.

    Args:
        target_word: Word to look up. It is tokenized with ``tokenizer`` and may
            therefore span several sub-word tokens
            (e.g. "broadcast" -> "broad", "##cast").
        sentences: Iterable of strings in which to search for the word.
        tokenizer: Callable tokenizer that also provides ``tokenize`` and
            ``convert_tokens_to_ids``.
        model: Model that is called with the encoded inputs and
            ``output_hidden_states=True``.

    Returns:
        The mean embedding as a tensor (on the same device as the model's
        parameters), or ``None`` if the word was not found in any sentence.
    """
    # The target's sub-word ids do not depend on the sentence: compute them once.
    target_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(target_word))
    span = len(target_token_ids)
    if span == 0:
        # Nothing to search for (e.g. an empty target word).
        return None

    # The encoded inputs are created on the CPU; they must be moved to wherever
    # the model's weights live, otherwise the forward pass fails on a GPU model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    all_embeddings = []

    for sentence in sentences:
        # 1. Tokenize the sentence and locate the target word's token run(s).
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128)
        # Plain Python ints make the sliding-window comparison cheap and exact.
        input_ids = inputs["input_ids"][0].tolist()

        token_indices = []
        for start in range(len(input_ids) - span + 1):
            if input_ids[start:start + span] == target_token_ids:
                token_indices.extend(range(start, start + span))

        if not token_indices:
            # Skip the (expensive) forward pass when the word is absent.
            continue

        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # 2. Run the model. Hidden states are only returned when explicitly
        #    requested; without the flag ``outputs.hidden_states`` is None.
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-2]

        # 3. Average the vectors at the target word's token positions.
        word_embedding = hidden_states[0, token_indices, :].mean(dim=0)
        all_embeddings.append(word_embedding)

    if not all_embeddings:
        return None  # Return None if the word was not found in any sentence

    # 4. Aggregate all contextual instances into a single vector.
    return torch.stack(all_embeddings).mean(dim=0)

In [9]:
# Sample passage used to inspect what the tokenizer produces for running text.
sentence = """Nothing is more usual and more natural for those, who pretend to discover
 anything new to the world in philosophy and the sciences, than to
 insinuate the praises of their own systems, by decrying all those, which
 have been advanced before them."""

# Encode the passage as PyTorch tensors, truncating to at most 128 tokens.
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128)

In [10]:
# Display the encoding: input_ids, token_type_ids and attention_mask tensors.
inputs

{'input_ids': tensor([[  101,  2498,  2003,  2062,  5156,  1998,  2062,  3019,  2005,  2216,
          1010,  2040,  9811,  2000,  7523,  2505,  2047,  2000,  1996,  2088,
          1999,  4695,  1998,  1996,  4163,  1010,  2084,  2000, 16021,  2378,
         20598,  1996, 27128,  1997,  2037,  2219,  3001,  1010,  2011, 11703,
          2854,  2075,  2035,  2216,  1010,  2029,  2031,  2042,  3935,  2077,
          2068,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}

In [None]:
# Input text
text = "GeeksforGeeks is a computer science portal"

# Tokenize and encode the text by calling the tokenizer directly; this is the
# documented replacement for the deprecated `batch_encode_plus`.
# The returned mapping contains (among other fields) the token IDs and the attention mask.
encoding = tokenizer(
    [text],                    # List of input texts (here a batch of one)
    padding=True,              # Pad every sequence to the longest one in the batch
    truncation=True,           # Truncate to the model's maximum input length if necessary
    return_tensors='pt',       # Return PyTorch tensors
    add_special_tokens=True    # Add special tokens CLS and SEP
)

input_ids = encoding['input_ids']  # Token IDs
# print input IDs
print(f"Input ID: {input_ids}")
attention_mask = encoding['attention_mask']  # Attention mask
# print attention mask
print(f"Attention mask: {attention_mask}")

In [None]:
# Generate embeddings using BERT model
# The encoded tensors are created on the CPU, while the setup cell moves the model
# to the GPU when one is available. Copy the inputs to the model's device first;
# otherwise the forward pass fails with a device-mismatch error.
with torch.no_grad():
    outputs = model(
        input_ids.to(model.device),
        attention_mask=attention_mask.to(model.device),
    )
    word_embeddings = outputs.last_hidden_state  # Per-token vectors from the last layer

# Output the shape of word embeddings
print(f"Shape of Word Embeddings: {word_embeddings.shape}")