<a href="https://colab.research.google.com/github/edquestofficial/Gen-AI-Cohort/blob/main/2024/april/Level_2/RAG_Generative_Search_with_GeminiPro_and_ChromaDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber

In [None]:
!pip install google.generativeai

In [None]:
!pip install chromaDB

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [21]:
import os

base_path = "/content/drive/MyDrive/Gen AI Course/RAG_For_HDFC_Policy"
filepath = f"{base_path}/gemini_api_key.txt"
with open(filepath, "r") as f:
  api_key = ' '.join(f.readlines())
  os.environ["GEMINI_API_KEY"] = api_key

## Read Document and Create <b>Chunks</b>

In [4]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json

In [5]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def load_text_from_pdf(pdf_path):
    full_text = []
    p = 1

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"page {p}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))

            page_text = " ".join(lines)
            full_text.append([page_no, page_text])
            p += 1

    return full_text

In [8]:
pdf_directory_str = "/content/drive/MyDrive/Gen AI Course/RAG_For_HDFC_Policy/data"
pdf_directory_path = Path(pdf_directory_str)

In [9]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [10]:
# Define the directory containing the PDF files
pdf_directory = Path(pdf_directory_path)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = load_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
Finished processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
All PDFs have been processed.


In [11]:
insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [12]:
insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [None]:
insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]

In [33]:
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,page 1,Part A <<Date>> <<Policyholder’s Name>> <<Poli...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,508
1,page 2,Agency/Intermediary Contact Details: <<Agency/...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,85
2,page 3,POLICY DOCUMENT- HDFC LIFE EASY HEALTH Unique ...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,298
3,page 4,"[[null, ""<< dd/mm/yyyy >>""], [""Appointee's Add...",HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,63
4,page 5,Part B Definitions The following capitalised t...,HDFC-Life-Easy-Health-101N110V03-Policy-Bond-S...,514


## Create Embeddings and store them in Vector DB

In [16]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

In [17]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/Gen AI Course/Open AI/ChromaDB_Data'

In [18]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
      raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = "models/embedding-001"
    title = "Custom query"
    return genai.embed_content(model=model,content=input, task_type="retrieval_document",title=title)["embedding"]

In [19]:
import chromadb

name="rag_experiment"
chroma_client = chromadb.PersistentClient()
db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

In [22]:
for i, d in enumerate(insurance_pdfs_data["Page_Text"]):
   db.add(documents=d, ids=str(i))

In [23]:
db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

In [None]:
# Let's take a look at the first few entries in the collection

db.get(
    ids = ['0'],
    include = ['embeddings', 'documents', 'metadatas']
)

## Semantic Search

In [25]:
# Read the user query

user_query = "what are Daily Hospital Cash Benefit Option"

In [26]:
def get_semantic_search(collection, query):
  results = collection.query(
    query_texts=user_query,
    n_results=6
  )
  ids = results["ids"][0]
  documents = results["documents"][0]
  distances = results["distances"][0]
  metadatas = results["metadatas"][0]

  results_df = pd.DataFrame({
    'IDs': ids,
    'Documents': documents,
    'Distances': distances,
    'Metadatas': metadatas
  })

  top_3_semantic = results_df.sort_values(by='Distances')
  top_3_semantic = top_3_semantic[:3]
  return top_3_semantic


In [27]:
top_3_semantic = get_semantic_search(db, user_query)

In [28]:
top_3_semantic

Unnamed: 0,IDs,Documents,Distances,Metadatas
0,9,"[[""Plan option"", ""Benefits covered""], [""A"", ""D...",0.391275,
1,8,(16) Intensive Care Unit (ICU) - means an iden...,0.434364,
2,13,"Part E (Applicable charges, Fund name, fund op...",0.504596,


## Generative Search

In [29]:
context = ""
for doc in top_3_semantic["Documents"]:
  context += doc+"\n\n"

In [30]:
model = genai.GenerativeModel('gemini-pro')
chat = model.start_chat(history = [])

In [31]:
prompt = f"You are a helpful assistant in the tax domain who can effectively answer user queries about tax and documents. user query :{user_query}. please do not use your own knowledge take context from given below contex  {context}"
response = chat.send_message(prompt).text

In [32]:
from IPython.display import Markdown

Markdown(response)

The Daily Hospital Cash Benefit Option (DHCB) provides financial assistance to the insured individual in the event of hospitalization due to injury, sickness, or disease. The benefit is paid out daily for the duration of the hospital stay, subject to certain limits and conditions.

**Key Points:**

* The benefit is payable for both ICU and non-ICU hospital admissions.
* The benefit amount is 1% of the sum insured for non-ICU admissions and 2% of the sum insured for ICU admissions.
* The maximum payable period is 20 days for non-ICU admissions and 10 days for ICU admissions, with an overall maximum of 60 days and 30 days, respectively, during the entire policy term.
* A waiting period of 60 days applies before the DHCB benefit can be claimed.
* The benefit is paid as a lump sum after the completion of each continuous hospitalization for more than 24 hours.
* The benefit is calculated as follows: Daily Hospital Cash Benefit * (Number of Days admitted - 1)
* If the maximum benefit limits are reached during the policy term, the DHCB coverage ceases for the remaining policy term, while other benefits (Surgical Benefit and Critical Illness Benefit) remain active.