# RAG (Retrieval Augmented Generation) pipeline using FAISS and LangChain

### Install Required Packages

In [1]:
#!pip install faiss-cpu

In [2]:
#!pip install langchain langchain-community langchain-groq python-dotenv

In [3]:
#pip install -r requirements.txt

### Import Modules & Set Logging

In [None]:
import os
import math
import pickle
from typing import List, Union
import logging
from dotenv import load_dotenv

import pandas as pd
from tqdm import tqdm
from groq import Groq
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from getpass import getpass
from rouge_score import rouge_scorer
from bert_score import score as bert_score
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# ========= Config ========= #

# Prompt for the Groq API key only when it is not already present in the
# environment, so a non-interactive "Restart & Run All" works when the key has
# been exported beforehand. The key itself is never written into the notebook.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ API Key: ")

# Directory the FAISS index is saved to and loaded from.
INDEX_DIR = "faiss_index_bioasq_full"
# Pickle file holding the number of the next batch to embed (resume checkpoint).
STATE_FILE = "checkpoint_all_chunks.pkl"
# Number of chunks embedded and checkpointed per batch.
BATCH_SIZE = 3000
# Model name passed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Model name passed to ChatGroq.
MODEL_NAME = "llama3-8b-8192"

Enter your GROQ API Key: ··········


In [None]:
# ========= Data Loading ========= #
def load_data():
    """Read the rag-mini-bioasq passages and test parquet files.

    Returns:
        A ``(passages, test)`` tuple holding the two DataFrames produced by
        ``pd.read_parquet`` for the ``hf://`` paths below, in that order.
    """
    passages_uri = "hf://datasets/rag-datasets/rag-mini-bioasq/data/passages.parquet/part.0.parquet"
    test_uri = "hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet"

    # Passages are read first, then the test split, matching the return order.
    passages_df, test_df = (pd.read_parquet(uri) for uri in (passages_uri, test_uri))
    return passages_df, test_df


In [None]:
# ========= Text Processing ========= #
def chunk_documents(passages: pd.DataFrame, chunk_size=1000, chunk_overlap=100) -> List[Document]:
    """Wrap every non-null passage in a ``Document`` and split it into chunks.

    Args:
        passages: DataFrame with a ``"passage"`` column; null entries are dropped.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The documents returned by the splitter's ``split_documents``.
    """
    non_null_texts = passages["passage"].dropna()

    documents = []
    for passage_text in non_null_texts:
        documents.append(Document(page_content=passage_text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [None]:
# ========= FAISS Indexing ========= #
def build_faiss_index(chunks, index_dir=INDEX_DIR, state_file=STATE_FILE) -> FAISS:
    """Embed ``chunks`` in batches of ``BATCH_SIZE`` into a FAISS index, checkpointing each batch.

    When both ``state_file`` and ``index_dir`` exist, the saved index is loaded
    and embedding continues from the batch number stored in ``state_file``;
    otherwise a new index is started from batch 0. After every batch the index
    is saved to ``index_dir`` and the next batch number is pickled to
    ``state_file``.

    Args:
        chunks: Sliceable sequence of documents to embed.
        index_dir: Directory the index is saved to / resumed from.
        state_file: Pickle file holding the next batch number.

    Returns:
        The FAISS store, or ``None`` if no index was loaded and no batch ran.
    """
    batch_count = math.ceil(len(chunks) / BATCH_SIZE)
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, encode_kwargs={"batch_size": 64})

    can_resume = os.path.exists(state_file) and os.path.exists(index_dir)
    if not can_resume:
        first_batch, store = 0, None
        print("Starting fresh FAISS index...")
    else:
        # NOTE: both the checkpoint and the index are unpickled here; only
        # resume from files this notebook produced itself.
        with open(state_file, "rb") as checkpoint:
            first_batch = pickle.load(checkpoint)
        store = FAISS.load_local(index_dir, embedder, allow_dangerous_deserialization=True)
        print(f"Resuming from batch {first_batch}")

    for batch_no in tqdm(range(first_batch, batch_count), desc="Embedding batches"):
        lower = batch_no * BATCH_SIZE
        upper = (batch_no + 1) * BATCH_SIZE
        current = chunks[lower:upper]

        # The very first batch creates the store; later batches extend it.
        if store is not None:
            store.add_documents(current)
        else:
            store = FAISS.from_documents(current, embedder)

        # Persist the index first, then record that the next batch is due.
        store.save_local(index_dir)
        with open(state_file, "wb") as checkpoint:
            pickle.dump(batch_no + 1, checkpoint)

    print("Index ready.")
    return store

In [None]:
# ========= Load Model and Chain ========= #
def get_rag_chain(index_dir=INDEX_DIR) -> RetrievalQA:
    """Build a ``RetrievalQA`` chain over the FAISS index saved in ``index_dir``.

    The index is loaded with the ``EMBEDDING_MODEL_NAME`` embeddings and exposed
    as a retriever with ``k=10``; answers come from a ``ChatGroq`` model named
    ``MODEL_NAME``, authenticated with the ``GROQ_API_KEY`` environment variable.

    Args:
        index_dir: Directory containing the saved FAISS index.

    Returns:
        A chain configured with the custom prompt and ``return_source_documents=True``.
    """
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, encode_kwargs={"batch_size": 64})
    vector_store = FAISS.load_local(index_dir, embedder, allow_dangerous_deserialization=True)
    doc_retriever = vector_store.as_retriever(search_kwargs={"k": 10})

    chat_model = ChatGroq(api_key=os.environ["GROQ_API_KEY"], model_name=MODEL_NAME)

    # The template text (including its indentation) is sent to the model verbatim.
    template_text = """
    You are a biomedical expert AI. Based on the provided documents, answer the question concisely.
    If the answer is not explicitly stated, do not make assumptions.

    Context:
    {context}

    Question: {question}
    Answer:
    """
    qa_prompt = PromptTemplate.from_template(template_text)

    chain_options = dict(
        llm=chat_model,
        retriever=doc_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": qa_prompt},
    )
    return RetrievalQA.from_chain_type(**chain_options)


In [None]:
def evaluate_rag(rag_chain: "RetrievalQA", test_queries: List[dict], min_set_matches: int = 4):
    """Run every test query through the chain and report accuracy, ROUGE-L and BERTScore.

    Args:
        rag_chain: Chain whose ``invoke({"query": ...})`` returns a mapping with
            a ``"result"`` string.
        test_queries: Dicts with a ``"question"`` string and an ``"expected"``
            answer. A string answer counts as correct when it occurs
            (case-insensitively) in the prediction; a set of terms counts as
            correct when at least ``min_set_matches`` of them occur.
        min_set_matches: Number of set terms that must be found. It is clamped
            to the size of the expected set so small sets remain satisfiable.

    Returns:
        A dict with ``"accuracy"`` (percent), ``"avg_rouge_l"`` and
        ``"avg_bert_f1"``. All three are ``0.0`` when ``test_queries`` is empty.
    """
    if not test_queries:
        # Nothing to average: avoid a ZeroDivisionError and skip loading the scorers.
        print("No test queries supplied; nothing to evaluate.")
        return {"accuracy": 0.0, "avg_rouge_l": 0.0, "avg_bert_f1": 0.0}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    print("📋 List of Test Queries:")
    for i, test in enumerate(test_queries, 1):
        print(f"{i}. {test['question']}")
    print("\nRunning evaluation...\n")

    predictions = []
    references = []
    verdicts = []
    rouge_l_scores = []

    for test in test_queries:
        output = rag_chain.invoke({"query": test["question"]})
        predicted = output["result"]
        result = predicted.lower()
        expected = test["expected"]

        if isinstance(expected, set):
            matched = {term for term in expected if term.lower() in result}
            # Clamp so a set with fewer terms than the threshold can still pass,
            # while an empty set can never be counted as correct.
            required = max(1, min(min_set_matches, len(expected)))
            is_correct = len(matched) >= required
            # Sets iterate in arbitrary order; sort so the reference text (and
            # therefore the ROUGE/BERT scores) is reproducible across runs.
            expected_text = " ".join(sorted(expected))
        else:
            is_correct = expected.lower() in result
            expected_text = expected

        predictions.append(predicted)
        references.append(expected_text)
        verdicts.append(is_correct)
        rouge_l_scores.append(scorer.score(expected_text, predicted)["rougeL"].fmeasure)

    # Score all pairs in one call so BERTScore is invoked once rather than per query.
    _, _, f1 = bert_score(predictions, references, lang="en", verbose=False)
    bert_scores = f1.tolist()

    per_query = zip(test_queries, predictions, verdicts, rouge_l_scores, bert_scores)
    for i, (test, predicted, is_correct, rouge_l, bert_f1) in enumerate(per_query, 1):
        print(f"Q{i}: {test['question']}")
        print(f"Expected: {test['expected']}")
        print(f"Predicted: {predicted}")
        print(f"Correct: {'Yes ✅' if is_correct else 'No ❌'}")
        print(f"ROUGE-L Score: {rouge_l:.4f}")
        print(f"BERTScore F1: {bert_f1:.4f}\n")

    # Summary
    accuracy = sum(verdicts) / len(test_queries) * 100
    avg_rouge = sum(rouge_l_scores) / len(rouge_l_scores)
    avg_bert = sum(bert_scores) / len(bert_scores)

    print("======== Final Evaluation ========")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average ROUGE-L Score: {avg_rouge:.4f}")
    print(f"Average BERTScore F1: {avg_bert:.4f}")

    return {"accuracy": accuracy, "avg_rouge_l": avg_rouge, "avg_bert_f1": avg_bert}


In [None]:
# ========= Run Pipeline ========= #
def main():
    """Run the whole pipeline: load data, chunk, index, build the chain, evaluate.

    The five evaluation questions are defined inline; expected answers are
    either a single string or a set of acceptable terms.
    """
    passages_df, test_df = load_data()
    doc_chunks = chunk_documents(passages_df)
    print("Total Chunks:", len(doc_chunks))

    build_faiss_index(doc_chunks)

    chain = get_rag_chain()

    # (question, expected answer) pairs, expanded into the dict shape evaluate_rag reads.
    qa_pairs = [
        ("Is Hirschsprung disease a mendelian or a multifactorial disorder?", "multifactorial"),
        (
            "List signaling molecules (ligands) that interact with the receptor EGFR?",
            {"EGF", "TGF-α", "AREG", "EPR", "HB-EGF", "BTC", "EPG"},
        ),
        ("Is the protein Papilin secreted?", "yes"),
        ("Is RANKL secreted from the cells?", "yes"),
        ("Are long non-coding RNAs spliced?", "yes"),
    ]
    test_queries = [
        {"question": question, "expected": expected}
        for question, expected in qa_pairs
    ]

    evaluate_rag(chain, test_queries)


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Total Chunks: 69217


  embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, encode_kwargs={"batch_size": 64})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting fresh FAISS index...


Embedding batches: 100%|██████████| 24/24 [1:39:05<00:00, 247.73s/it]


Index ready.
📋 List of Test Queries:
1. Is Hirschsprung disease a mendelian or a multifactorial disorder?
2. List signaling molecules (ligands) that interact with the receptor EGFR?
3. Is the protein Papilin secreted?
4. Is RANKL secreted from the cells?
5. Are long non-coding RNAs spliced?

Running evaluation...



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Q1: Is Hirschsprung disease a mendelian or a multifactorial disorder?
Expected: multifactorial
Predicted: Based on the provided documents, it can be concluded that Hirschsprung's disease is a multifactorial disorder. The text states that "non-syndromic non-familial, short-segment HSCR appears to represent a non-Mendelian condition with variable expression and sex-dependent penetrance" and that "the genetics of Hirschsprung's disease are highly complex with the majority of known genetic sites relating to the main susceptibility pathways (RET and EDNRB)". Additionally, the text mentions that "low-penetrance mutations would be necessary but not sufficient and the additional presence of the 'Hirschsprung disease haplotype' could contribute to the manifestation of the disease", suggesting that multiple genetic and environmental factors contribute to the development of the disease.
Correct: Yes ✅
ROUGE-L Score: 0.0174
BERTScore F1: 0.7924



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Q2: List signaling molecules (ligands) that interact with the receptor EGFR?
Expected: {'EGF', 'BTC', 'TGF-α', 'EPG', 'AREG', 'HB-EGF', 'EPR'}
Predicted: Based on the provided documents, the following signaling molecules (ligands) interact with the receptor EGFR:

1. Epidermal growth factor (EGF)
2. HB-EGF (heparin-binding EGF-like growth factor)
3. TGF-α (transforming growth factor-alpha)
4. BTC (betacellulin)
5. EPR (epiregulin)
6. EPG (epigen)
7. AR (amphiregulin)
8. PEPD (a novel ligand of EGFR)

Note that the document does not explicitly state the binding affinities or dissociation constants for each ligand, but it mentions that the affinities range from sub-nanomolar to near micromolar.
Correct: Yes ✅
ROUGE-L Score: 0.0870
BERTScore F1: 0.8012



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Q3: Is the protein Papilin secreted?
Expected: yes
Predicted: According to the provided documents, Papilin is a secreted extracellular matrix protein that is found in basement membranes. It is also mentioned that Papilin was isolated from the culture media of Drosophila Kc cells, indicating that it is secreted by cells. Additionally, it is stated that Papilin forms oligomers linked by disulfide bridges, which is consistent with its secreted nature. Therefore, the answer to the question is:

Yes, the protein Papilin is secreted.
Correct: Yes ✅
ROUGE-L Score: 0.0270
BERTScore F1: 0.7935



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Q4: Is RANKL secreted from the cells?
Expected: yes
Predicted: The answer is: No, the provided documents do not mention RANKL (Receptor Activator of NF-κB Ligand). The documents discuss protein secretion and the role of the KDEL receptor in protein sorting, but RANKL is not mentioned.
Correct: No ❌
ROUGE-L Score: 0.0000
BERTScore F1: 0.7863



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Q5: Are long non-coding RNAs spliced?
Expected: yes
Predicted: Based on the provided documents, the answer is:

Not explicitly stated. The documents discuss the splicing of RNAs, but do not specifically mention the splicing of long non-coding RNAs.
Correct: No ❌
ROUGE-L Score: 0.0000
BERTScore F1: 0.8167

Accuracy: 60.00%
Average ROUGE-L Score: 0.0263
Average BERTScore F1: 0.7980


# Tuning the Retriever

### Load Environment Variables

##### ➡️ Make sure your .env file contains:

In [None]:
# Example .env entry. Replace the placeholder with your own key in a local
# .env file; never commit a real key to the notebook.
GROQ_API_KEY="<your-groq-api-key>"

In [None]:
# from dotenv import load_dotenv
# import os

# load_dotenv()  # Loads .env file into environment

# # Check if the variable is available
# print("GROQ_API_KEY:", os.getenv("GROQ_API_KEY"))


In [None]:
# Load the API key from the environment instead of hardcoding it in the notebook.
# load_dotenv() reads a local .env file (if one exists) into the process environment.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    raise ValueError(" GROQ_API_KEY not found in environment variables!")
else:
    logging.info(" GROQ_API_KEY loaded successfully.")

In [None]:
# Confirm the key is available without echoing the secret into the saved output.
print("GROQ_API_KEY:", "set" if GROQ_API_KEY else "missing")

GROQ_API_KEY: [REDACTED]


### Load FAISS Index & Checkpoint

In [None]:
def load_index(index_path, checkpoint_path, embedding_model_name):
    """Load a saved FAISS index and the batch number stored in its checkpoint.

    Args:
        index_path: Directory passed to ``FAISS.load_local``.
        checkpoint_path: Pickle file holding a batch number; optional on disk.
        embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        ``(faiss_index, batch)`` where ``batch`` is the unpickled checkpoint
        value, or ``0`` when ``checkpoint_path`` does not exist.
    """
    logging.info("Loading embedding model...")
    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

    logging.info("Loading FAISS index...")
    for label, value in (("index_path", index_path), ("embeddings", embedder)):
        print(f"{label}--->{value!s}")
    # NOTE: loading unpickles data from disk; only point this at an index you built.
    store = FAISS.load_local(index_path, embedder, allow_dangerous_deserialization=True)
    logging.info("FAISS index loaded successfully.")

    print(f"faiss_index--->{store!s}")

    if not os.path.exists(checkpoint_path):
        logging.info("No checkpoint found. Starting fresh.")
        return store, 0

    with open(checkpoint_path, "rb") as checkpoint:
        saved_batch = pickle.load(checkpoint)
        logging.info(f"Resuming from batch {saved_batch}")
    return store, saved_batch

In [None]:
def tune_retriever(faiss_index, k=5):
    """Return a retriever over ``faiss_index`` that uses MMR search with ``k`` results.

    Args:
        faiss_index: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        k: Value placed under ``"k"`` in ``search_kwargs``.

    Returns:
        Whatever ``faiss_index.as_retriever`` returns for those options.
    """
    mmr_options = {"search_type": "mmr", "search_kwargs": {"k": k}}
    tuned = faiss_index.as_retriever(**mmr_options)
    logging.info(f"Retriever tuned with k={k} and MMR search.")
    return tuned

### Create RAG Chain with Groq + LLaMA3

In [None]:
def create_rag_chain(retriever, groq_api_key, model_name="llama3-8b-8192"):
    """Build a ``RetrievalQA`` "stuff" chain backed by a Groq-hosted chat model.

    Args:
        retriever: Retriever that supplies the context documents.
        groq_api_key: API key forwarded to ``ChatGroq``.
        model_name: Groq model identifier. The default uses the same lowercase
            spelling as the notebook's ``MODEL_NAME`` constant, so both chains
            request the identical model id.

    Returns:
        A chain created with ``return_source_documents=True``.
    """
    # Log the model that is actually requested rather than a fixed label.
    logging.info(f"Initializing LLM ({model_name})...")
    llm = ChatGroq(
        model_name=model_name,
        groq_api_key=groq_api_key
    )

    logging.info("Creating RAG chain with RetrievalQA...")
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff"
    )
    return qa_chain

### Run a Query

In [None]:
def run_query(chain, question):
    """Send ``question`` through ``chain`` and print the answer with its sources.

    Args:
        chain: Object whose ``invoke({"query": ...})`` returns a mapping with a
            ``"result"`` entry and, optionally, a ``"source_documents"`` list of
            objects exposing ``metadata`` (a dict) and ``page_content`` (a str).
        question: The query text.
    """
    logging.info(f"Running query: {question}")
    # Use invoke(), as evaluate_rag does; calling the chain object directly is
    # the deprecated LangChain entry point.
    result = chain.invoke({"query": question})

    print("\n Question:")
    print(question)

    print("\n Answer:")
    print(result["result"])

    print("\n Sources:")
    # Tolerate chains that were built without return_source_documents=True.
    for doc in result.get("source_documents", []):
        print("- Source:", doc.metadata.get("source", "[no source]"))
        # Only the first 200 characters are shown to keep the output short.
        print("  Content Sample:", doc.page_content[:200], "...\n")

### Main Execution Block

In [None]:
# === SET PATHS ===
# Directory handed to FAISS.load_local by load_index; "." is the current working directory.
# NOTE(review): the indexing section saves to INDEX_DIR ("faiss_index_bioasq_full") —
# confirm that "." is the intended location of the saved index files.
index_path = "."
# Pickle file load_index reads the checkpointed batch number from, when it exists.
checkpoint_path = "checkpoint_all_chunks.pkl"
# Model name passed to HuggingFaceEmbeddings inside load_index.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# === LOAD & RUN ===
# Load the saved index, wrap it in an MMR retriever with k=5, then build the
# Groq-backed RetrievalQA chain. last_batch is returned but not used in this cell.
faiss_index, last_batch = load_index(index_path, checkpoint_path, embedding_model)
retriever = tune_retriever(faiss_index, k=5)
qa_chain = create_rag_chain(retriever, GROQ_API_KEY)

# === SAMPLE QUESTION ===
# Prints the question, the generated answer, and a 200-character sample of each source.
run_query(qa_chain, "What is the relationship between metabolism and the immune system?")


index_path--->.
embeddings--->client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False
faiss_index---><langchain_community.vectorstores.faiss.FAISS object at 0x79c8349f87d0>

🔍 Question:
What is the relationship between metabolism and the immune system?

✅ Answer:
According to the provided context, there is a relationship between metabolism and the immune system. The text states that "metabolism of T4 and T3 by rat hepatocytes in primary culture was measured in t