## Setup for Hybrid Retrieval

In [1]:
!pip install transformers
!pip install 'accelerate>=0.26.0'
#!pip install -U datasets huggingface_hub
#!pip install fsspec==2023.9.2
!pip install datasets

!pip install SentenceTransformer
!pip install faiss-cpu

!pip install Whoosh

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate>=0.26.0)
  Downloading nvidia_cuff

In [2]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import load_dataset 
from sentence_transformers import SentenceTransformer
import faiss

2025-08-10 23:25:29.899748: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754868330.110079      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754868330.168723      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
#Load DPR question encoder and tokenizer
dpr_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
dpr_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Load a sample dataset
wiki_data = load_dataset('wikimedia/wikipedia', '20231101.en', split='train[:100000]', trust_remote_code=True)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/6407814 [00:00<?, ? examples/s]

In [13]:
# Use SentenceTransformers to create dense embeddings for the document corpus
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
document_texts = [doc['text'] for doc in wiki_data]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Create document embeddings
document_embeddings = sentence_model.encode(document_texts, convert_to_tensor=True)

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [15]:
# Build FAISS index for fast retrieval
document_embeddings_np = document_embeddings.cpu().numpy()
index = faiss.IndexFlatL2(document_embeddings_np.shape[1])
index.add(document_embeddings_np)

## BM25 with Whoosh

In [16]:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser

import os

In [17]:
# Define schema for indexing documents with BM25
schema = Schema(content=TEXT(stored=True))

In [18]:
# Create BM25 index
if not os.path.exists("index"): 
    os.mkdir("index")
    ix = create_in("index", schema)

In [19]:
# Index documents using BM25
writer = ix.writer()
for doc in document_texts: 
    writer.add_document(content=doc)
    
writer.commit()

In [20]:
# Query the indexwith 
with ix.searcher() as searcher: 
    query = QueryParser("content", ix.schema).parse("What is the capital of France?") 
    results = searcher.search(query, limit=5) 
    for result in results: 
        print(result['content'])

A public execution is a form of capital punishment which "members of the general public may voluntarily attend." This definition excludes the presence of only a small number of witnesses called upon to assure executive accountability. The purpose of such displays has historically been to deter individuals from defying laws or authorities. Attendance at such events was historically encouraged and sometimes even mandatory.

While today most countries regard public executions with distaste, they have been practiced at some point in history nearly everywhere. At many points in the past, public executions were preferred to executions behind closed doors because of their capacity for deterrence. However, the actual efficacy of this form of terror is disputed. They also allowed the convicted the opportunity to make a final speech, gave the state the chance to display its power in front of those who fell under its jurisdiction, and granted the public what was considered to be a great spectacle

## Perform Hybrid Retrieval

In [21]:
def hybrid_retrieve(query): 
    # Use SentenceTransformer to get dense embedding of the query 
    query_embedding = sentence_model.encode(query, convert_to_tensor=True) 

    # Convert the query embedding to numpy array for FAISS 
    query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1) 

    # FAISS expects 2D array # Use FAISS to retrieve documents based on query embedding 
    _, dense_indices = index.search(query_embedding_np, k=5) # k is the number of results 
    dense_results = [document_texts[i] for i in dense_indices[0]] 

    # Perform BM25 retrieval 
    with ix.searcher() as searcher: 
        bm25_results = searcher.search(QueryParser("content", ix.schema).parse(query), limit=5) 
        sparse_results = [result['content'] for result in bm25_results] 
        return dense_results, sparse_results

## Fine Tuning BART

In [22]:
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset

In [23]:
#Load pre-trained BART model and tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [24]:
# Fine-tune data (you need input_ids and labels for training)
fine_tuning_data = ["The capital of France is Paris.", "The capital of Germany is Berlin."]
inputs = tokenizer(fine_tuning_data, return_tensors="pt", padding=True, truncation=True, max_length=128)

In [25]:
# Create labels (BART expects labels during training)
labels = inputs.input_ids.clone()

In [26]:
# Create a custom dataset with both input_ids and labels
dataset = Dataset.from_dict({"input_ids": inputs.input_ids, "labels": labels})

In [27]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Match data size
    save_steps=1000,  # Don't save every few steps
    logging_steps=50,
    logging_dir="./logs",
    report_to="none"  # Avoid TensorBoard overhead
)

In [28]:
# Initialize Trainer
trainer = Trainer( model=model, args=training_args, train_dataset=dataset, 
                  eval_dataset=dataset, # Using the same data for evaluation here for simplicity
                 )

In [29]:
# Train the model
trainer.train()

Step,Training Loss




TrainOutput(global_step=6, training_loss=0.017778154462575912, metrics={'train_runtime': 11.7462, 'train_samples_per_second': 0.511, 'train_steps_per_second': 0.511, 'total_flos': 32154071040.0, 'train_loss': 0.017778154462575912, 'epoch': 3.0})

## Generating Responses

In [30]:
# Call hybrid retrieval to get dense and sparse results
dense_res, sparse_res = hybrid_retrieve("What is the capital of France?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Check if dense_res has valid results before using it
if dense_res:
    # Truncate the retrieved document if it's too long
    relevant_info = dense_res[0][:1024]
    input_text = f"Query: What is the capital of France?\nRelevant Information: {relevant_info}"

    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)

    # Move inputs to the same device as the model
    device = next(model.parameters()).device  # Get model device (e.g., cuda:0)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to same device

    # Generate response
    output = model.generate(inputs["input_ids"])

    # Decode and print the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Generated Response:", response)

else:
    print("No results found in dense retrieval.")


Generated Response: Query: What is the capital of France?Relevant Information: The Val de Seine


### Due to limitations in processing the full dataset and the time required for fine-tuning, this project currently includes a working model that can be trained on specific, targeted data to improve accurate answer. This approach allows for flexible adaptation to domain-specific tasks while keeping training time manageable