# RAG Advanced Retrieval


## Notebook Setup

In [1]:
# Importing the necessary Python libraries
import os
import json
import time
import yaml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import Dataset
from langchain.vectorstores import FAISS
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_community.document_loaders import DataFrameLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision
)

  from .autonotebook import tqdm as notebook_tqdm



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


In [2]:
# Loading in the data from the CSV files
df_kis = pd.read_csv('../data/synthetic_knowledge_items.csv')
df_validation = pd.read_csv('../data/rag_sample_qas_from_kis.csv')

# Dropping alt_ki_text from the df_kis DataFrame
df_kis.drop(columns = ['alt_ki_text'], inplace = True)

# Dropping any unnecessary columns from the validation DataFrame
df_validation.drop(columns = ['ki_topic', 'ki_text'], inplace = True)

# Renaming the remaining columns
df_validation.rename(columns = {
    'sample_question': 'question',
    'sample_ground_truth': 'ground_truth'
}, inplace = True)

In [3]:
# Setting the embedding algorithm
embedding_algorithm = OpenAIEmbeddings(model = 'text-embedding-3-large')

# Setting the chat model
chat_model = ChatOpenAI(model = 'gpt-4o')

In [4]:
# Creating the ground truth simulation prompt template
ANSWER_GENERATION_PROMPT = '''You are an expert evaluator for question-answering systems. Your task is to provide the ideal answer based on the given question and context. Please follow these guidelines:

1. Question: {question}

2. Context: {context}

3. Instructions:
   - Carefully analyze the question and the provided context.
   - Formulate a comprehensive and accurate answer based solely on the information given in the context.
   - Ensure your answer directly addresses the question.
   - Include all relevant information from the context, but do not add any external knowledge.
   - If the context doesn't contain enough information to fully answer the question, state this clearly and provide the best possible partial answer.
   - Use a formal, objective tone.

Remember, your goal is to provide the ideal answer that should be used as the benchmark for evaluating the AI's performance.'''

# Creating the prompt engineering emplate to generate the simulated ground truth
answer_generation_prompt = ChatPromptTemplate.from_messages(messages = [
    HumanMessagePromptTemplate.from_template(template = ANSWER_GENERATION_PROMPT)
])

## Vector Database Setup
Since we already covered advanced ingestion techniques in a different notebook, we'll quickly set ourselves up here with a vector database that we can use for practicing our advanced retrieval techniques.

In [5]:
# Setting the filepath for the index file
index_file = '../data/semantic_index.bin'

# Checking if the index file exists
if os.path.exists(index_file):

    # Load the index from file
    faiss_index = FAISS.load_local(index_file,
                                   embeddings = OpenAIEmbeddings(),
                                   allow_dangerous_deserialization = True)

# Creating the FAISS index from scratch
else:

    # Loading the documents
    documents = DataFrameLoader(df_kis, page_content_column = 'ki_text').load()

    # Creating a semantic text splitter
    text_splitter = SemanticChunker(embeddings = embedding_algorithm)

    # Splitting the documents into chunks
    chunks = text_splitter.split_documents(documents)

    # Creating FAISS index for the current chunk size
    faiss_index = FAISS.from_documents(chunks, embedding_algorithm)

    # Save the index to file
    faiss_index.save_local(index_file)

In [6]:
# Getting the number of documents in the FAISS index
num_documents = faiss_index.index.ntotal

# Printing the number of documents
print(f'Number of documents in the FAISS index: {num_documents}')

Number of documents in the FAISS index: 269


## Advanced Retrieval Techniques
Let's move into talking about the advanced retrieval techniques we'll use! For your own use case, you may want to use a different combination of these various techniques. Keep in mind: some of these techniques WILL add latency your pipeline. In addition to applying these techniques to get better results, you will want to ensure that you are balancing the trade-offs between speed and accuracy.

### HyDE (Hypothetical Document Embeddings)
Don't let the name fool you: this is actually a relatively simple technique. The idea is to take a query and generate a hypothetical document embedding for it. This hypothetical document embedding is then used to retrieve documents from the vector database. This is a great technique to use when you have a query that is very different from the documents in your vector database.

In [7]:
# Setting up a prompt to help us produce the HyDE generation
HyDE_PROMPT = '''Generate a brief, factual paragraph that answers the following question: {question}'''

# Setting up the model to use for HyDE generation
hyde_model = ChatOpenAI(model = 'gpt-4o-mini')

# Creating the prompt engineering template to generate the HyDE information
hyde_prompt_template = ChatPromptTemplate.from_messages(messages = [
    HumanMessagePromptTemplate.from_template(template = HyDE_PROMPT)
])

In [8]:
def retrieve_doc_w_hyde(inputs):
    '''
    Retrieves a document from the FAISS index using the HyDE generation method

    Inputs:
    - inputs (dict): A dictionary containing LangChain stuff

    Returns:
    - context (str): The retrieved document based on the HyDE generation
    '''

    # Setting the question and HyDe document appropriately
    question = inputs['question']
    hyde_doc = inputs['hyde_doc']

    # Combining the question and HyDE document
    combined_query = f'{question}\n\n{hyde_doc}'

    # Creating a retriever from the FAISS index
    retriever = faiss_index.as_retriever(search_kwargs = {'k': 1})

    # Retrieving the document from the FAISS index
    context = retriever.invoke(combined_query)[0].page_content

    print(context)

    return {'context': context}

In [9]:
# Creating the HyDE chain
hyde_chain = (
    {
        'question': RunnablePassthrough(),
        'hyde_doc': hyde_prompt_template | hyde_model | StrOutputParser(),
    }
    | RunnableLambda(retrieve_doc_w_hyde)
    | answer_generation_prompt
    | chat_model
    | StrOutputParser()
)

In [10]:
query = df_validation.iloc[0]['question']

In [11]:
query

'"How do I set up my company email on my mobile device?"'

In [12]:
hyde_chain.invoke({'question': query})

AssertionError: 