In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the embedding model and its tokenizer from the HuggingFace Hub.
# The checkpoint name lives in one constant so both always refer to the
# same checkpoint.
EMBEDDING_MODEL_NAME = 'BAAI/bge-large-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Load the precomputed document embeddings from disk.
# - map_location="cpu": the model above is never moved to another device in
#   this notebook, so the stored tensor is mapped to the CPU as well. This
#   avoids a load failure (or a device mismatch later on) if the file was
#   saved from a GPU.
# - weights_only=True: torch.load unpickles the file; restricting it to
#   tensors and plain containers prevents arbitrary code execution from an
#   untrusted or tampered .pt file.
doc_embeddings = torch.load(
    "embeddings/document_embeddings.pt",
    map_location="cpu",
    weights_only=True,
)




In [None]:
def generate_question_embedding(question, tokenizer, model):
    """Embed a question (or a batch of questions) by mean pooling token states.

    The pooling is weighted by the tokenizer's attention mask so that padding
    tokens, which appear when a batch of differently sized questions is
    encoded with ``padding=True``, do not dilute the average. For a single
    question (no padding) the result equals a plain mean over the tokens.

    Args:
        question: Text handed to ``tokenizer`` as-is (a string or whatever
            batch form the tokenizer accepts).
        tokenizer: Callable that returns a mapping of model inputs when
            called with ``return_tensors="pt"``.
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor of shape
            ``(batch, seq_len, hidden)``.

    Returns:
        Tensor of shape ``(batch, hidden)`` with one pooled embedding per
        input sequence.
    """
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    # Without an attention mask there is nothing to weight by; fall back to
    # the unweighted mean over the sequence axis.
    if "attention_mask" not in inputs.keys():
        return token_embeddings.mean(dim=1)

    # (batch, seq_len) -> (batch, seq_len, 1) so it broadcasts over hidden.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp avoids a division by zero for a (degenerate) fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts

In [None]:
import torch
from torch.nn.functional import cosine_similarity

# Embed the question typed by the user. generate_question_embedding returns
# one row per input, so a single question gives a (1, hidden_dim) tensor.
question_embedding = generate_question_embedding(input(), tokenizer, model)

# doc_embeddings is expected to be a 2D tensor with one row per document
# excerpt. The question embedding is NOT unit-normalized here; that is fine
# because cosine_similarity normalizes both operands itself.

# Normalize each excerpt embedding to unit length. clamp_min keeps an
# all-zero row from producing NaNs through a division by zero.
doc_embeddings = doc_embeddings / doc_embeddings.norm(dim=1, keepdim=True).clamp_min(1e-12)

# Cosine similarity between the question and every excerpt.
# The (1, hidden_dim) question broadcasts against (num_excerpts, hidden_dim)
# and the reduction runs over the last (embedding) axis, giving one score per
# excerpt. Adding an extra leading axis to the question and relying on the
# default dim=1 would instead reduce over the excerpt axis and return
# hidden_dim meaningless values.
similarities = cosine_similarity(question_embedding, doc_embeddings, dim=-1)

# Guarantee a flat 1D vector of scores for the indexing below.
similarities = similarities.flatten()

# Excerpt indices ordered from most to least similar.
sorted_indices = torch.argsort(similarities, descending=True)

# Keep the N best matches (slicing tolerates fewer than N excerpts).
N = 5
top_n_indices = sorted_indices[:N]

# Scores that belong to the selected excerpts.
top_n_similarities = similarities[top_n_indices]

print(f"Indices of top {N} most similar excerpts: {top_n_indices}")
print(f"Similarities of top {N} excerpts: {top_n_similarities}")



In [None]:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Extractive question-answering checkpoint (per its name: BERT-large,
# fine-tuned on SQuAD), shared by the model and its tokenizer.
QA_MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# NOTE: this rebinds `model` and `tokenizer`, which previously referred to
# the embedding model. Re-running the embedding cells after this one would
# therefore pick up the QA model instead.
model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)



In [None]:
# Question and context passage for the extractive QA model.
question = "What is the question?"
context = "This is the context passage that contains the answer."

# Encode the pair as a single sequence by calling the tokenizer directly.
# truncation="only_second" shortens only the context when the pair is too
# long, so the question is never cut; max_length=512 keeps the input within
# the position limit of the BERT checkpoint (an over-long context would
# otherwise make the forward pass fail).
inputs = tokenizer(
    question,
    context,
    return_tensors='pt',
    truncation="only_second",
    max_length=512,
)


In [None]:
# Run the QA model without tracking gradients (inference only).
with torch.no_grad():
    outputs = model(**inputs)
answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

# Choose the start/end token positions jointly instead of taking two
# independent argmaxes: independent maxima can put the end before the start,
# which yields an empty answer. Whenever the independent maxima already form
# a valid span, the joint maximum selects exactly that span.
# The inputs hold a single question/context pair, so use batch row 0.
start_logits = answer_start_scores[0]
end_logits = answer_end_scores[0]
seq_len = start_logits.shape[0]

# span_scores[i, j] = score of an answer starting at token i and ending at j.
span_scores = start_logits.unsqueeze(1) + end_logits.unsqueeze(0)

# Rule out spans whose start lies after their end.
positions = torch.arange(seq_len, device=start_logits.device)
invalid_spans = positions.unsqueeze(1) > positions.unsqueeze(0)
span_scores = span_scores.masked_fill(invalid_spans, float("-inf"))

# argmax works on the flattened matrix; recover (row, column) from it.
best_span = torch.argmax(span_scores)
answer_start = torch.div(best_span, seq_len, rounding_mode="floor")
answer_end = best_span % seq_len + 1  # add 1 to include the end token


In [None]:
# Cut the predicted span out of the first (only) encoded sequence, map the
# ids back to tokens, and join the tokens into the answer string.
span_ids = inputs["input_ids"][0][answer_start:answer_end]
span_tokens = tokenizer.convert_ids_to_tokens(span_ids)
answer = tokenizer.convert_tokens_to_string(span_tokens)

In [None]:
# Show the answer text extracted from the context passage.
print(answer)