#### Using Embeddings in RAG: Inference

In [1]:
# Warning control: install a global filter that ignores every Python warning,
# which keeps the notebook output free of library warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np

from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, \
                         DPRContextEncoder, DPRQuestionEncoder

import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [3]:
def cosine_similarity_matrix(features):
    """Return the pairwise cosine-similarity matrix of the rows of ``features``.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_dims)
        One vector per row; a NumPy array or a nested sequence of numbers.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Entry ``[i, j]`` is the cosine similarity between row ``i`` and row
        ``j``, rounded to 4 decimal places. An all-zero row has no direction,
        so its similarity to every row (itself included) is reported as 0.0
        rather than NaN.
    """
    features = np.asarray(features)
    if not np.issubdtype(features.dtype, np.inexact):
        # Integer/bool input: promote so the normalised rows can hold fractions.
        features = features.astype(float)

    norms = np.linalg.norm(features, axis=1, keepdims=True)

    # Divide only where the norm is non-zero. Zero rows stay zero, avoiding the
    # 0/0 -> NaN (and RuntimeWarning) that a plain division would produce and
    # that would otherwise poison a later argmax over the similarities.
    normalized_features = np.divide(
        features, norms, out=np.zeros_like(features), where=norms != 0
    )

    similarity_matrix = np.inner(normalized_features, normalized_features)
    return np.round(similarity_matrix, 4)

#### Pure similarity

In [4]:
# The query to answer.
question = 'What is the tallest mountain in the world?'

# Candidate texts to rank against the query. The first candidate is the query
# itself, so a pure similarity search will score it highest.
answers = [
    question,
    "The tallest mountain in the world is Mount Everest.",
    "Mount Shasta",
    "I like my hike in the mountains",
    "I am going to a yoga class",
]

In [5]:
# Embed the question with a general-purpose sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
question_embedding = list(model.encode(question))

# Score every candidate by its cosine similarity to the question.
sim = []
for answer in answers:
    answer_embedding = list(model.encode(answer))
    pair = np.stack([question_embedding, answer_embedding])
    pair_similarity = cosine_similarity_matrix(pair)
    # Off-diagonal entry of the 2x2 matrix: question vs. this candidate.
    sim.append(pair_similarity[0, 1])

print(sim)

# The candidate with the highest similarity is reported as the best answer.
best_inx = np.argmax(sim)
print(f"Question = {question}")
print(f"Best answer = {answers[best_inx]}")

[1.0, 0.7976, 0.4001, 0.3559, 0.0972]
Question = What is the tallest mountain in the world?
Best answer = What is the tallest mountain in the world?


#### Dual-Encoder inference

In [7]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

In [8]:
# Load the pre-trained DPR context encoder and its tokenizer.
# NOTE: HF_CACHE_DIR is a machine-specific download cache; adjust it for your
# environment.
HF_CACHE_DIR       = r'D:\AI-DATASETS\07-Hugging-Face-Data'
DPR_CTX_CHECKPOINT = 'facebook/dpr-ctx_encoder-multiset-base'

model     = DPRContextEncoder.from_pretrained(DPR_CTX_CHECKPOINT,
                                              cache_dir = HF_CACHE_DIR)

# AutoTokenizer instantiates the tokenizer class recorded in the checkpoint.
# Forcing DPRContextEncoderTokenizer here makes transformers log "The tokenizer
# class you load from this checkpoint is not the same type as the class this
# function is called from", because the checkpoint declares a different class.
tokenizer = AutoTokenizer.from_pretrained(DPR_CTX_CHECKPOINT,
                                          cache_dir = HF_CACHE_DIR)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


`Encoding Passages:`
Use the model to encode text passages into dense vectors.

In [12]:
# Turn a single passage into a dense vector with the context encoder
passage = "This is an example passage."
inputs = tokenizer(
    passage,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

outputs = model(**inputs)
embeddings = outputs.pooler_output  # pooled encoder output used as the passage vector

# Show the embedding dimensions as the cell output
embeddings.shape

torch.Size([1, 768])

`Retrieval Process:`
For retrieval, encode both the query and the passages, then compute similarities to find the most relevant passages.

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
def _embed_text(text):
    """Tokenize ``text``, run the encoder and return its pooled output as a NumPy array."""
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    pooled = model(**encoded).pooler_output
    # squeeze() drops the size-1 batch axis; detach() is required before .numpy()
    return pooled.squeeze().detach().numpy()


# Encode a query
# NOTE(review): `model` is the DPR *context* encoder, so the query and the
# passages share one encoder here. DPR also provides a separate question
# encoder for queries; confirm that sharing the encoder is intended for this demo.
query = "What is an example passage?"
query_embedding = _embed_text(query)

# Encode passages, one 1D vector per passage
passages = ["This is an example passage.", "Another passage here."]
passage_embeddings = []
for passage in passages:
    passage_embeddings.append(_embed_text(passage))

In [19]:
# Stack the per-passage vectors into one 2D array (one row per passage)
passage_embeddings = np.array(passage_embeddings)

# cosine_similarity works on 2D inputs, so present the query as a one-row matrix
query_matrix = query_embedding.reshape(1, -1)
similarities = cosine_similarity(query_matrix, passage_embeddings)
print(similarities)

[[0.8666056 0.7041036]]


#### Differences Between DPR and General Dual Encoders

**Dual Encoders:**
- **General Concept:** Dual encoders use two separate neural network encoders to process two types of text (e.g., queries and documents) into dense vector embeddings. These embeddings are then compared to compute similarities.
- **Use Cases:** General dual encoders can be used for a variety of tasks, including retrieval, question-answering, and sentence similarity. The design and efficiency depend on the specific use case and how the encoders and embeddings are managed.
- **Training:** Dual encoders can be trained with various loss functions, including contrastive loss, but the effectiveness and efficiency can vary based on the architecture and training data.

**Dense Passage Retrieval (DPR):**
- **Specific Design for Retrieval:** DPR is a specialized type of dual encoder architecture designed specifically for dense retrieval tasks.
- **Optimized Architecture:** DPR uses two separate BERT-based encoders (or similar models) to create dense embeddings for queries and passages. These embeddings are then compared using efficient nearest neighbor search techniques.
- **Training Strategy:** DPR employs a contrastive loss function during training, where it focuses on aligning query and passage embeddings to ensure that relevant pairs are close in the vector space while irrelevant pairs are far apart. This training strategy enhances the retrieval quality.
- **Indexing and Retrieval:**
  - **Approximate Nearest Neighbor Search:** DPR-style retrieval typically indexes the passage embeddings with a similarity-search library such as FAISS, using either an exact index or an approximate nearest neighbor (ANN) index such as HNSW, to search efficiently through large collections of passage embeddings. This allows for fast retrieval even with large-scale datasets.
  - **Efficiency:** The use of specialized retrieval libraries and optimized indexing techniques makes DPR faster in retrieving relevant passages compared to some general dual encoder setups.

**Why DPR Might Be Faster:**
1. **Optimized Indexing:**
   - **DPR:** Utilizes efficient indexing techniques (e.g., FAISS, HNSW) for quick retrieval from large-scale datasets. These techniques accelerate the search process by organizing embeddings in a way that allows for fast approximate nearest neighbor queries.
   - **General Dual Encoders:** May not always include such optimized indexing or might use simpler similarity search methods that can be slower for large-scale retrieval.

2. **Contrastive Training:**
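   - **DPR:** Specifically trained with contrastive loss to ensure that the query and passage embeddings are highly effective for retrieval. This means the embeddings are fine-tuned for accurate retrieval; the training objective improves ranking quality, while the retrieval speed itself comes from the index used at search time.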
   - **General Dual Encoders:** While they can use contrastive loss, the training may not be as specialized for retrieval efficiency as DPR.

3. **Specialization:**
   - **DPR:** Designed with retrieval as its primary task, including architecture and training optimizations aimed at improving retrieval speed and accuracy.
   - **General Dual Encoders:** Might be designed for a range of tasks, which can lead to less optimization for any single task like retrieval.

**Summary:**
While both DPR and general dual encoders involve computing similarities between queries and documents using dense embeddings, DPR is tailored specifically for efficient dense retrieval. DPR’s speed advantages come from its specialized indexing techniques and training strategies optimized for large-scale retrieval tasks. General dual encoders might not always have these specific optimizations, leading to potential differences in retrieval speed and efficiency.


... back to the main code

In [21]:
# Hugging Face checkpoints for the two halves of the DPR dual encoder
CTX_CHECKPOINT      = "facebook/dpr-ctx_encoder-multiset-base"
QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-multiset-base"

# Context (answer/passage) side
answer_tokenizer = AutoTokenizer.from_pretrained(CTX_CHECKPOINT)
answer_encoder   = DPRContextEncoder.from_pretrained(CTX_CHECKPOINT)

# Question (query) side
question_tokenizer = AutoTokenizer.from_pretrained(QUESTION_CHECKPOINT)
question_encoder   = DPRQuestionEncoder.from_pretrained(QUESTION_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [22]:
# Embed the question with the DPR question encoder
question_tokens = question_tokenizer(question, return_tensors="pt")["input_ids"]
question_output = question_encoder(question_tokens)
question_embedding = question_output.pooler_output.flatten().tolist()

# Preview the first ten components together with the embedding length
print(question_embedding[:10], len(question_embedding))

[0.07758244127035141, 0.25172561407089233, 0.18663954734802246, 0.22120100259780884, 0.02641526237130165, -0.1578557789325714, 0.32760268449783325, 0.26732853055000305, -0.08503071963787079, 0.12929508090019226] 768


In [23]:
# Score every candidate: embed it with the DPR context encoder and compare it
# with the question embedding produced by the DPR question encoder.
sim = []
for answer in answers:
    answer_tokens = answer_tokenizer(answer, return_tensors="pt")["input_ids"]
    answer_output = answer_encoder(answer_tokens)
    answer_embedding = answer_output.pooler_output.flatten().tolist()

    pair = np.stack([question_embedding, answer_embedding])
    # Off-diagonal entry of the 2x2 matrix: question vs. this candidate.
    sim.append(cosine_similarity_matrix(pair)[0, 1])

print(sim)

# The candidate with the highest similarity is reported as the best answer.
best_inx = np.argmax(sim)
print(f"Question = {question}")
print(f"Best answer = {answers[best_inx]}")

[0.6253, 0.7472, 0.5506, 0.3687, 0.25]
Question = What is the tallest mountain in the world?
Best answer = The tallest mountain in the world is Mount Everest.
