# Testing BERT for embeddings

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Name of the pre-trained BERT checkpoint selected for this notebook.
bert_version: str = "bert-base-uncased"

# Other checkpoints to try:
#   - google-bert/bert-base-multilingual-cased
#   - dbmdz/bert-base-italian-xxl-uncased

In [None]:
# Load the pre-trained BERT tokenizer and model for the checkpoint named in
# `bert_version`, so switching checkpoints only requires editing that variable
# (tokenizer and model must always come from the same checkpoint).
tokenizer = BertTokenizer.from_pretrained(bert_version)
model = BertModel.from_pretrained(bert_version)

In [None]:
# Function to get sentence embedding
def get_sentence_embedding(text):
    """Return a mean-pooled BERT sentence embedding for ``text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A sentence (``str``) or a sequence of sentences to embed.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input sentence; each row is
        the average of that sentence's last-layer token vectors. The 2-D
        layout is what ``np.vstack`` / ``cosine_similarity`` expect.
    """
    # Pad so several sentences can share one batch; truncate so over-long
    # input is cut to the model's maximum length instead of raising.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():  # Inference only: no gradients needed
        outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state

    # Average over real tokens only: the attention mask is 1 for real tokens
    # and 0 for padding, so padded positions contribute nothing to the mean.
    # For a single sentence the mask is all ones and this is a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by 0
    sentence_embedding = (last_hidden_states * mask).sum(dim=1) / token_counts

    return sentence_embedding.numpy()

In [None]:
# Embed one sample sentence to sanity-check the helper.
emb = get_sentence_embedding("The quick brown fox jumps over the lazy dog.")

In [None]:
# Display the shape of the embedding computed for the sample sentence.
emb.shape

In [None]:
# Reference sentences: two near-paraphrases and one unrelated sentence
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a sleepy dog.",
    "This sentence is completely different from the others.",
]

# Embed every reference sentence, keeping the same order as `texts`
embeddings = list(map(get_sentence_embedding, texts))

In [None]:
# Display the current value of `embeddings` for inspection.
embeddings

In [None]:
# Query sentence: the first example sentence with "brown" replaced by "red"
query_text = "The quick red fox jumps over the lazy dog."
# Embed the query with the same helper used for the example sentences
query_embedding = get_sentence_embedding(query_text)

In [None]:
# Alternative query: identical to the first example sentence.
# NOTE: there must be no trailing comma after the literal — a trailing comma
# would make `query_text` a one-element tuple instead of a string.
query_text = "The quick brown fox jumps over the lazy dog."
query_embedding = get_sentence_embedding(query_text)

In [None]:
# Compute cosine similarities between query and example sentences.
# The per-sentence embeddings are stacked row-wise with np.vstack; the result
# is read later as similarities[0][i] for the i-th example sentence.
similarities = cosine_similarity(query_embedding, np.vstack(embeddings))

In [None]:
# Print the query so the scores below can be read in context
print(f"Query text: {query_text}")

# Print one line per example sentence with its similarity to the query;
# row 0 of `similarities` is used, indexed by the sentence's position in `texts`
for i, text in enumerate(texts):
    print(f"Similarity with '{text}': {similarities[0][i]}")

In [None]:
from local_llm_model import *

In [None]:
# fetch_model(model_selected_index=2, current_path="./") # already downloaded

In [None]:
# Call load_model for model index 2, with the current directory as its path.
# NOTE(review): here the result is used as a single callable, while a later
# cell unpacks the same call into (local_model, local_tokenizer). Both usages
# cannot match load_model's return value — confirm which one is current.
feature_extraction = load_model(model_selected_index=2, current_path="./")

In [None]:
# Run a sample sentence through the loaded feature extractor.
# NOTE(review): the sample text is abusive — presumably chosen to probe a
# toxicity-oriented model; confirm it is needed before sharing this notebook.
embed = feature_extraction("You are a fucking retarded")

# Keep only the vector at batch index 0, position 0
# (presumably the [CLS] token — TODO confirm against the extractor's output).
embed = embed[0,0, :]

In [None]:
# Embed the same sample sentence with the stock BERT helper, for comparison
# with the local model's output.
embed_original = get_sentence_embedding("You are a fucking retarded")

In [None]:
# Display the shape of the local model's embedding.
embed.shape

In [None]:
# Display the shape of the stock BERT helper's embedding.
embed_original.shape

In [None]:
# Re-run the feature extractor on the sample sentence, this time keeping the
# full output (no indexing), so its shape can be inspected.
embed = feature_extraction("You are a fucking retarded")


In [None]:
# Display the shape of the full feature-extractor output.
embed.shape

In [None]:
# Call load_model for model index 2 and unpack the result into a model and a
# tokenizer.
# NOTE(review): an earlier cell assigns the same call to a single callable
# (`feature_extraction`) — confirm which return shape load_model really has.
local_model, local_tokenizer = load_model(model_selected_index=2, current_path="./")
# Sample sentence to encode with the local model
text = "You are a fucking retarded"

In [None]:
# Tokenize the sample text into PyTorch tensors.
inputs = local_tokenizer(text, return_tensors="pt")

# Run only the `bert` sub-module of the local model. Gradients are disabled,
# as in get_sentence_embedding, because this is inference only: it avoids
# building an autograd graph and yields a tensor that does not require grad.
with torch.no_grad():
    outputs = local_model.bert(**inputs)

# Last-layer hidden states of the encoder.
# NOTE: this rebinds `embeddings`, replacing the list built in an earlier cell.
embeddings = outputs.last_hidden_state


In [None]:
# Display the hidden states produced by the local model.
embeddings

In [None]:
# Display the shape of the hidden states produced by the local model.
embeddings.shape

In [None]:
# Input text
text = "Voglio generare gli embedding per questa frase."

# Get the embeddings using the pipeline
embeddings = feature_extraction(text)

# The output shape will be [batch_size, sequence_length, hidden_size]
print(f"Embeddings shape: {embeddings.shape}")

# If you want a single embedding for the whole sentence:
# Option 1: use the [CLS] token (first token)
sentence_embedding_cls = embeddings[0, 0, :]
# Option 2: mean over all tokens
sentence_embedding_mean = torch.mean(embeddings, dim=1)

print(f"Sentence embedding shape (CLS): {sentence_embedding_cls.shape}")
print(f"Sentence embedding shape (Mean): {sentence_embedding_mean.shape}")

In [None]:
# Example sentences (same three as used with the stock BERT helper)
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a sleepy dog.",
    "This sentence is completely different from the others."
]

# Generate embeddings for each example sentence: take the vector at batch
# index 0, position 0 of the extractor output and reshape it to a single row
# so the rows can be stacked later.
embeddings = [feature_extraction(text)[0, 0, :].reshape(1, -1)  for text in texts]

In [None]:
# Query sentence: the first example sentence with "brown" replaced by "red"
query_text = "The quick red fox jumps over the lazy dog."
# Same extraction as for the example sentences: vector at [0, 0], as one row
query_embedding = feature_extraction(query_text)[0, 0, :].reshape(1, -1) 

In [None]:
# Alternative query: identical to the first example sentence.
# NOTE: there must be no trailing comma after the literal — a trailing comma
# would make `query_text` a one-element tuple instead of a string.
query_text = "The quick brown fox jumps over the lazy dog."
query_embedding = feature_extraction(query_text)[0, 0, :].reshape(1, -1)

In [None]:
# Compute cosine similarities between query and example sentences.
# The single-row embeddings are stacked with np.vstack; the result is read
# later as similarities[0][i] for the i-th example sentence.
similarities = cosine_similarity(query_embedding, np.vstack(embeddings))

In [None]:
# Print the query so the scores below can be read in context
print(f"Query text: {query_text}")

# Print one line per example sentence with its similarity to the query;
# row 0 of `similarities` is used, indexed by the sentence's position in `texts`
for i, text in enumerate(texts):
    print(f"Similarity with '{text}': {similarities[0][i]}")