# RAG Advanced Retrieval


## Notebook Setup

In [5]:
# Importing the necessary Python libraries
import os
import json
import time
import yaml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import Dataset
from langchain.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DataFrameLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision
)

In [6]:
# Loading in the data from the CSV files
df_kis = pd.read_csv('../data/synthetic_knowledge_items.csv')
df_validation = pd.read_csv('../data/rag_sample_qas_from_kis.csv')

# Dropping alt_ki_text from the df_kis DataFrame
df_kis.drop(columns = ['alt_ki_text'], inplace = True)

# Dropping any unnecessary columns from the validation DataFrame
df_validation.drop(columns = ['ki_topic', 'ki_text'], inplace = True)

# Renaming the remaining columns
df_validation.rename(columns = {
    'sample_question': 'question',
    'sample_ground_truth': 'ground_truth'
}, inplace = True)

In [7]:
# Setting the filepath for the index file
index_file = '../data/semantic_index.bin'

# Checking if the index file exists
if os.path.exists(index_file):

    # Load the index from file
    faiss_index = FAISS.load_local(index_file,
                                   embeddings = OpenAIEmbeddings(),
                                   allow_dangerous_deserialization = True)

# Creating the FAISS index from scratch
else:

    # Setting the embedding algorithm
    embedding_algorithm = OpenAIEmbeddings()

    # Loading the documents
    documents = DataFrameLoader(df_kis, page_content_column = 'ki_text').load()

    # Creating a semantic text splitter
    text_splitter = SemanticChunker(embeddings = embedding_algorithm)

    # Splitting the documents into chunks
    chunks = text_splitter.split_documents(documents)

    # Creating FAISS index for the current chunk size
    faiss_index = FAISS.from_documents(chunks, embedding_algorithm)

    # Save the index to file
    faiss_index.save_local(index_file)

In [8]:
import os

file_size_kb = os.path.getsize(index_file) / 1024
print(f"The file size of {index_file} is {file_size_kb} kilobytes.")


The file size of ../data/semantic_index.bin is 4.0 kilobytes.
