## 1. Install and Import the Required Libraries

In [87]:
# install required packages
!pip install -q chromadb faiss-cpu pypdf tiktoken docarray pdfplumber

In [88]:
!pip install -U langchain-openai langchain-community



In [89]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [90]:
# import the necessary libraies
import os
import openai
from langchain_openai import ChatOpenAI, OpenAI
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import chromadb

In [91]:
# Set the API key
filepath = "/content/drive/MyDrive/GenAI/"

with open(filepath + "OPENAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

# Update the OpenAI API key by updating the environment variable
  os.environ["OPENAI_API_KEY"] = openai.api_key

##2. Model Input Output

In [92]:
# instantiate OpenAI's Chat Model
llm_chat = ChatOpenAI()

##3. Data Connections and Retrieval

In [93]:
pdf_path = "/content/drive/MyDrive/GenAI"

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

    # function to extract text from a PDF file.

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [94]:
# Define the directory containing the PDF files
pdf_directory = Path(pdf_path)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing Principal-Sample-Life-Insurance-Policy.pdf
Finished processing Principal-Sample-Life-Insurance-Policy.pdf
All PDFs have been processed.


In [95]:
# Concatenate all the DFs in the list 'data' together

pdfs_data = pd.concat(data, ignore_index=True)
pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf
1,Page 2,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf
3,Page 4,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf
...,...,...,...
59,Page 60,I f a Dependent who was insured dies during th...,Principal-Sample-Life-Insurance-Policy.pdf
60,Page 61,Section D - Claim Procedures Article 1 - Notic...,Principal-Sample-Life-Insurance-Policy.pdf
61,Page 62,A claimant may request an appeal of a claim de...,Principal-Sample-Life-Insurance-Policy.pdf
62,Page 63,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf


In [96]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

pdfs_data['Text_Length'] = pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))
pdfs_data['Text_Length']

Unnamed: 0,Text_Length
0,30
1,5
2,230
3,5
4,110
...,...
59,285
60,418
61,322
62,5


In [97]:
# Retain only the rows with a text length of at least 10

pdfs_data = pdfs_data.loc[pdfs_data['Text_Length'] >= 10]
pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf,171
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf,387
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf,251
10,Page 11,(2) has been placed with the Member or spouse ...,Principal-Sample-Life-Insurance-Policy.pdf,299
11,Page 12,An institution that is licensed as a Hospital ...,Principal-Sample-Life-Insurance-Policy.pdf,352


In [98]:
pdfs_data['Metadata'] = pdfs_data.loc[:, ['Document Name', 'Page No.']].apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)
pdfs_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdfs_data['Metadata'] = pdfs_data.loc[:, ['Document Name', 'Page No.']].apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)


Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30,{'Policy_Name': 'Principal-Sample-Life-Insuran...
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230,{'Policy_Name': 'Principal-Sample-Life-Insuran...
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110,{'Policy_Name': 'Principal-Sample-Life-Insuran...
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153,{'Policy_Name': 'Principal-Sample-Life-Insuran...
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176,{'Policy_Name': 'Principal-Sample-Life-Insuran...
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf,171,{'Policy_Name': 'Principal-Sample-Life-Insuran...
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf,387,{'Policy_Name': 'Principal-Sample-Life-Insuran...
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf,251,{'Policy_Name': 'Principal-Sample-Life-Insuran...
10,Page 11,(2) has been placed with the Member or spouse ...,Principal-Sample-Life-Insurance-Policy.pdf,299,{'Policy_Name': 'Principal-Sample-Life-Insuran...
11,Page 12,An institution that is licensed as a Hospital ...,Principal-Sample-Life-Insurance-Policy.pdf,352,{'Policy_Name': 'Principal-Sample-Life-Insuran...


In [99]:
# Import the OpenAI Embeddings class from LangChain
from langchain.embeddings import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [100]:
documents_list = pdfs_data["Page_Text"].tolist()
metadata_list = pdfs_data['Metadata'].tolist()

In [70]:
pdfs_data['text_with_metadata'] = pdfs_data.apply(
    lambda row: f"Policy: {row['Metadata']['Policy_Name']}, Page: {row['Metadata']['Page_No.']}",
    axis=1
)

# 3. Create embeddings
embeddings = pdfs_data['text_with_metadata'].apply(lambda x: embeddings_model.embed_query(x))

# 4. Store the embeddings back in your DataFrame
pdfs_data['embeddings'] = embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdfs_data['text_with_metadata'] = pdfs_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdfs_data['embeddings'] = embeddings


In [71]:
print(pdfs_data[['text_with_metadata', 'embeddings']].head())

                                  text_with_metadata  \
0  Policy: Principal-Sample-Life-Insurance-Policy...   
2  Policy: Principal-Sample-Life-Insurance-Policy...   
4  Policy: Principal-Sample-Life-Insurance-Policy...   
5  Policy: Principal-Sample-Life-Insurance-Policy...   
6  Policy: Principal-Sample-Life-Insurance-Policy...   

                                          embeddings  
0  [0.08173544551763982, -0.007572397206735298, 0...  
2  [0.07976849074714773, -0.00427094923902927, 0....  
4  [0.07455167497946907, 0.010947963535281054, 0....  
5  [0.07910483113174938, 0.002671969282612175, 0....  
6  [0.07164621515850239, 0.00662232089976654, 0.0...  


In [72]:
pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata,text_with_metadata,embeddings
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Policy: Principal-Sample-Life-Insurance-Policy...,"[0.08173544551763982, -0.007572397206735298, 0..."
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Policy: Principal-Sample-Life-Insurance-Policy...,"[0.07976849074714773, -0.00427094923902927, 0...."
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Policy: Principal-Sample-Life-Insurance-Policy...,"[0.07455167497946907, 0.010947963535281054, 0...."
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Policy: Principal-Sample-Life-Insurance-Policy...,"[0.07910483113174938, 0.002671969282612175, 0...."
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Policy: Principal-Sample-Life-Insurance-Policy...,"[0.07164621515850239, 0.00662232089976654, 0.0..."


In [73]:
from langchain.vectorstores import Chroma
# Initialize OpenAIEmbeddings
openai_embeddings = OpenAIEmbeddings()

In [74]:
PERSIST_DIR = "chroma_policies"   # folder will be created/updated on disk
COLLECTION  = "policies"

def row_to_text(row):
    meta = row["Metadata"]
    header = f"Policy: {meta['Policy_Name']} | Page: {meta['Page_No.']}"
    body = row["Text"] if "Text" in row and isinstance(row["Text"], str) else ""
    return f"{header}\n\n{body}".strip()

texts = pdfs_data.apply(row_to_text, axis=1).tolist()
metadatas = pdfs_data["Metadata"].tolist()
ids = pdfs_data.index.astype(str).tolist()

In [75]:
# 3) Create/load persistent Chroma collection and add data
db = Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings_model,
    persist_directory=PERSIST_DIR,
)

In [76]:
db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
# 4) Make it persistent on disk
db.persist()
print(f"Chroma persisted to: {PERSIST_DIR}")

Chroma persisted to: chroma_policies


##Retriver

In [77]:
retriever = db.as_retriever(
    search_type="similarity",   # or "mmr" for diversity
    search_kwargs={"k": 3}      # how many documents to retrieve
)

In [78]:
query = "What is the claim settlement process?"
results = retriever.invoke(query)

for doc in results:
    print(doc.metadata, "\n", doc.page_content[:300], "\n---")

{'Page_No.': 'Page 56', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'} 
 Policy: Principal-Sample-Life-Insurance-Policy | Page: Page 56 
---
{'Policy_Name': 'Principal-Sample-Life-Insurance-Policy', 'Page_No.': 'Page 27'} 
 Policy: Principal-Sample-Life-Insurance-Policy | Page: Page 27 
---
{'Policy_Name': 'Principal-Sample-Life-Insurance-Policy', 'Page_No.': 'Page 29'} 
 Policy: Principal-Sample-Life-Insurance-Policy | Page: Page 29 
---


In [79]:
results[0]

Document(metadata={'Page_No.': 'Page 56', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, page_content='Policy: Principal-Sample-Life-Insurance-Policy | Page: Page 56')

In [80]:
# method for combining all relevant page content
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [81]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

##4. Chains

In [82]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [83]:
# test a query
query = "what is the minimum age for doing a term insurance?"
rag_chain.invoke(query)

'The minimum age for doing a term insurance is typically 18 years old. This information can be found in the Principal-Sample-Life-Insurance-Policy document on Page 25.'

In [84]:
# test a query
query = "what is the maximum age for doing a term insurance?"
rag_chain.invoke(query)

'The maximum age for term insurance can vary depending on the insurance company and the specific policy. Without more specific information from the retrieved context, the maximum age for term insurance cannot be determined. It is best to consult the specific policy documents or contact the insurance company directly for this information.'

In [85]:
# test a query
query = "Can a 50 year plus person do a term insurance?"
rag_chain.invoke(query)

'Yes, a 50-year-old person can typically still qualify for a term insurance policy. It is important to compare different insurance providers to find the best options for coverage and premiums. Reviewing the specific terms and conditions outlined in the policy will provide more detailed information.'

In [86]:
# test another query
query = "what are HDFC Life Sanchay Plus Life Long Income Option ?"
rag_chain.invoke(query)

'HDFC Life Sanchay Plus Life Long Income Option is a life insurance policy that provides a regular income for life. It offers guaranteed income and bonuses to the policyholder. This option aims to provide financial security and stability in retirement.'