In [1]:
# !pip install -U -q pdfplumber tiktoken chromaDB sentence-transformers

In [2]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb
import re
import os
import warnings
from sentence_transformers import CrossEncoder, util
import ast
# Suppress warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm, trange





In [3]:
# Setting openAI secret Key
openai.api_key = open("../OpenAISeceretKey.txt", "r").read().strip()
os.environ['OPENAI_API_KEY'] = openai.api_key

In [4]:
single_pdf_path ='Principal-Sample-Life-Insurance-Policy.pdf'

In [5]:
# Testing if We are able to read the PDF file or not
tile=[]
with pdfplumber.open(single_pdf_path) as pdf:

    
    # reading 9th Page
    single_page = pdf.pages[9]

    # Extract text from the 9th page
    text = single_page.extract_text()
    

    # Extract tables from the 9th page
    tables = single_page.extract_tables()

    # Splitting tile in PDF text to find topic of page Print the extracted text
    tile=text.split('This policy has been updated effective January 1, 2014')

print(tile)

['T he legally recognized union of two eligible individuals of the same sex established according to\nlaw.\nCivil Union Partner\nFor two persons to establish a Civil Union in Rhode Island, it shall be necessary that they satisfy\nall of the following criteria:\na. not be a party to another Civil Union or marriage in Rhode Island;\nb. be of the same sex and therefore be excluded from the marriage laws of Rhode Island or\nany other state;\nc. be at least 18 years of age;\nd. not be related to the other proposed party to the Civil Union.\nNOTE: For the purposes of this Group Policy, the term "spouse" will include Civil Union\nPartner, except as otherwise provided in this Group Policy.\nDate of Issue\nThe date this Group Policy is placed in force: November 1, 2007.\nDependent\na. A Member\'s spouse, if that spouse:\n(1) is legally married to the Member; and\n(2) is not in the Armed Forces of any country; and\n(3) is not insured under this Group Policy as a Member.\nA Member\'s spouse will 

#### Function to verify if a word is present inside table or not

In [6]:
# This function tries to find if a word is present inside table or not
# word box coordinates are defined as (x0,top,x1,bottom)
# table box coordinates are defined as (x0,bottom,x1,top)
# While bottom left corner of the page is considered as origin (0,0)

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    w = word['x0'], word['top'], word['x1'], word['bottom']
    # table(x0,bottom,x1,top)
    t = table_bbox
    return w[0]>t[0] and w[0]<t[0] and w[1]<t[3] and w[3]>t[1]
    # return w[0] > t[0] and w[1] > t[1] and w[2] < t[2] and w[3] < t[3]

#### This functions "extract_text_from_pdf" extracts data from each page and checks if tables are present in the table. 
#### If table is present convert its data into list of list where outer list represent the table and each inner list represent the rows in table
## Chunking strategy : Per page
- Since this insurance document doesn't contains table and diagrams in the text
- Document is also of just 64 pages
- Hence to keep complete context of the data per page chunking strategy was adopted
- This strategy helps improve performace as chat we need to do less joins to concatinate all the pages when passes to chat completion api and it also provides complete context to API to have efficient results

In [7]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            # Bottom tile of page contains page topic, hence extracting page topic
            page_titleLst=text.split('This policy has been updated effective January 1, 2014')
            if(len(page_titleLst)>1):
                page_title=page_titleLst[1].replace('\n',' ').strip()
                page_title=page_title.upper().replace('PAGE','Subpage')
            else:
                # If Page has no title this shows there is no contetnt on the page
                # hence assigning page title as page no to identify pages with no data
                page_title=page_no
                
                
            

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[3]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no,page_title, " ".join(lines)])
            p +=1

    return full_text

#### Removing first 8 pages as they are blank pages or index

In [8]:
extracted_text=extract_text_from_pdf(single_pdf_path)[8:]

#### Creating Dataframe by extracting page no, page topic and page text for each page in the PDF
- Also removing pages which are blank i.e no topic is present on the page

In [9]:
pdfDf=pd.DataFrame(extracted_text, columns=['Page No','Page Title', 'Page_Text'])
print('Before removing blank pages ',pdfDf.shape)
# Removing PDF blank PDF pages (Pages have same page title and page number)
pdfDf=pdfDf[pdfDf['Page No'] != pdfDf['Page Title']]
print('After removing blank pages ',pdfDf.shape)
pdfDf.head(5)

Before removing blank pages  (56, 3)
After removing blank pages  (54, 3)


Unnamed: 0,Page No,Page Title,Page_Text
0,Page 9,"GC 6002 PART I - DEFINITIONS, Subpage 1",P ART I - DEFINITIONS When used in this Group ...
1,Page 10,"GC 6002 PART I - DEFINITIONS, Subpage 2",T he legally recognized union of two eligible ...
2,Page 11,"GC 6002 PART I - DEFINITIONS, Subpage 3",(2) has been placed with the Member or spouse ...
3,Page 12,"GC 6002 PART I - DEFINITIONS, Subpage 4",An institution that is licensed as a Hospital ...
4,Page 13,"GC 6002 PART I - DEFINITIONS, Subpage 5",a . A licensed Doctor of Medicine (M.D.) or Os...


### Creating new column "Metadata" having a dictionary of page no and pagetitle
- example->    {'pagetitle': 'Benefits', 'pageNo': 'Page 7'}

In [10]:
pdfDf['Metadata'] = pdfDf.apply(lambda x: {'pagetitle': x['Page Title'], 'pageNo': x['Page No']}, axis=1)

In [11]:
pdfDf.head(5)

Unnamed: 0,Page No,Page Title,Page_Text,Metadata
0,Page 9,"GC 6002 PART I - DEFINITIONS, Subpage 1",P ART I - DEFINITIONS When used in this Group ...,"{'pagetitle': 'GC 6002 PART I - DEFINITIONS, S..."
1,Page 10,"GC 6002 PART I - DEFINITIONS, Subpage 2",T he legally recognized union of two eligible ...,"{'pagetitle': 'GC 6002 PART I - DEFINITIONS, S..."
2,Page 11,"GC 6002 PART I - DEFINITIONS, Subpage 3",(2) has been placed with the Member or spouse ...,"{'pagetitle': 'GC 6002 PART I - DEFINITIONS, S..."
3,Page 12,"GC 6002 PART I - DEFINITIONS, Subpage 4",An institution that is licensed as a Hospital ...,"{'pagetitle': 'GC 6002 PART I - DEFINITIONS, S..."
4,Page 13,"GC 6002 PART I - DEFINITIONS, Subpage 5",a . A licensed Doctor of Medicine (M.D.) or Os...,"{'pagetitle': 'GC 6002 PART I - DEFINITIONS, S..."


In [13]:
# Importing Open AI embeddings and model

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

model = "text-embedding-ada-002"

embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

### Creating ChromaDB at specified location

In [14]:
client = chromadb.PersistentClient(path="../chromavectorDb")

### Creating Main and Cache Collection in chromaDB for the insurance policy 

In [15]:
life_insurance_collection = client.get_or_create_collection(name='RAG_on_LifeInsurance', embedding_function=embedding_function,       
                                                            metadata={"hnsw:space": "cosine"} # l2 is the defaul
                                                           )

life_insurance_collection_cache = client.get_or_create_collection(name='RAG_on_LifeInsurance_cache', embedding_function=embedding_function,       
                                                            metadata={"hnsw:space": "cosine"} # l2 is the defaul
                                                           )

### Extracting PDF doc's Pages text and metadata columns into List
### Also creating index for page present as row in the PDF doc

In [16]:
documents_list = pdfDf["Page_Text"].tolist()
metadata_list = pdfDf['Metadata'].tolist()
ids=[str(i) for i in range(0, pdfDf.shape[0])]

#### Adding Dataframe columns such as pages text and metadata into ChromaDB collection, main collection created 

In [17]:
life_insurance_collection.add(
    documents= documents_list,
    ids = ids,
    metadatas = metadata_list
)


In [18]:
# Verifying total number of pages or record added in the collection
life_insurance_collection.count()

54

#### Instantiating Cross encoder to compare similarity between two text. This will result in a score
- score can range from-infinity to +infinity
- Positive score shows strong relation
- Zero is neutral relation
- negative shows negative relation

In [19]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Cache Implementation
### This function "checkCache" Identifies if query or related query is already present in cache

In [20]:
def checkCache(query,threshold=.2):
    # Querying Cache collection
    cacheResults=life_insurance_collection_cache.query(           
            query_texts=query,
            n_results=1
        )
    
    print('Distance of result searched in cache: ',cacheResults['distances'][0])

    # If cache is empty and distance of result from cache is greater than .2 (Threshold)
    #  It means whatever is resulted from CACHE is no related to query, hence need to check main collection
    if len(cacheResults['distances'][0])==0 or cacheResults['distances'][0][0]>threshold:
        
        cacheResults=dict()
        cacheResults['found']=False
        
    else:

        # Second level of safeguard (checking via Cross Encoder) 
        # By comparing query which is asked and related query which is returned from cache
        score=cross_encoder.predict([query,cacheResults['ids'][0][0]])

        # If score is negative or zero that means data returned from cache is not related to asked query
        print('Cross Encoder Score for cache search: ', score)
        if score <=0:
            cacheResults=dict()
            cacheResults['found']=False
        else:
            cacheResults['found']=True
            
    return cacheResults 
    

## Cache utilization , data extraction from main collection, and Cross encoder ranking
- This function checks if related data to the asked query is present in cache or not
- If data is present in cache than  data is return from cache.
- If Data is not present in the cache than data is return from main collection and new query is added in cache


In [21]:
def extractDocsFrmDb(queriesLst,chromaDbresults):
    responses=[]
    
    for index,query in enumerate(queriesLst):
        
        print('checking query in cache','\n')
        print(index+1,'.',query,'\n')
        cacheResults=checkCache(query)

        # Related Query data is found in the cache
        if cacheResults['found']==True:
            
            tempResCopyStr=cacheResults['metadatas'][0][0]["metadatastring"]
            tempResCopy = ast.literal_eval(tempResCopyStr)
            print('found result in cache: -> ', tempResCopy['query'])
            tempResCopy['query']=query
            responses.append(tempResCopy)
        else:

            print('Result not found in Cache, hence looking the main collection','\n')
            res=life_insurance_collection.query(
                query_texts=query,n_results=chromaDbresults)
            
            crossenscoreLst=[]
            
            # Extracting PDF pages text from the search result from collection
            for key,value in res.items():
                if key=='documents':
                    # Predicting cross encoder score
                    for doc in value[0]:
                        score=cross_encoder.predict([query,doc])
                        crossenscoreLst.append(score)
                    break

            # Creating a new copy of dictionary returned from search in collection 
            # Adding cross encoder scores as well in this new dictionary
            responseCopy=dict()
            responseCopy["ids"]=res['ids']
            responseCopy["queryindex"]=index
            responseCopy["query"]=query
            responseCopy["distances"]=res['distances']
            responseCopy["crossenscores"]=crossenscoreLst
            responseCopy["metadatas"]=res['metadatas']
            responseCopy["documents"]=res['documents']
            # responseCopy['embeddings']=res['embeddings']
            
            print('Adding result of query to the cache')
            
            flattnedMetadata=[]
            flattnedMetadata.append({"metadatastring":str(responseCopy)})

            # Adding result in the cache as data related to query was not found in the cache
            # Here I have done innovation that instead of iterting through the search result and making it simple dict
            # and then adding to metadata. We have creted a list which contains a flattened dict in form of string
            # making eaiser to extract when data is fetched from cache (result found in cache)
            life_insurance_collection_cache.add(
                        ids = [query],
                        documents= [query],
                        metadatas = flattnedMetadata
                        )
                            
            responses.append(responseCopy)
            print('\n')
            print('-'*100)
    return responses
    

In [22]:
# List of 4 queries asked to the system
queriesLst=["In what order will payments be made if no beneficiary survives the Member or if the Member has not named a beneficiary ?",
            "What is the time frame for a Member to request changes in benefits due to a change in family status ?",
            "What is the amount needs to be paid to purchase the policy ?",
            "Will the member's spouse receive any payment if the member has not survived and has not named a beneficiary ?"]

# Calling Function to extract related data fom chromadb

In [23]:
# We are returning 5 best results
chromaDbresults=5

# Calling function to extract data from chromadb for each queries asked
responses=extractDocsFrmDb(queriesLst,chromaDbresults)
queryResultsDfLst=[]

# creating dataframe of each response for a query and saving in list
for response in responses:
    
    queryLst=[]
    for i in range(0,chromaDbresults):
        queryLst.append(response['query'])
        
    
    queryResultsDf=pd.DataFrame(
        {
        'Query': queryLst,
        'IDs': response['ids'][0],
        'Distances': response['distances'][0],
        'Crossenscores': response['crossenscores'],
        'Metadatas': response['metadatas'][0],
        'Documents': response['documents'][0]
      }
    )
    # List having all the dataframes having results for each query
    queryResultsDfLst.append(queryResultsDf)
    

checking query in cache 

1 . In what order will payments be made if no beneficiary survives the Member or if the Member has not named a beneficiary ? 

Distance of result searched in cache:  []
Result not found in Cache, hence looking the main collection 

Adding result of query to the cache


----------------------------------------------------------------------------------------------------
checking query in cache 

2 . What is the time frame for a Member to request changes in benefits due to a change in family status ? 

Distance of result searched in cache:  [0.1980872829164495]
Cross Encoder Score for cache search:  -9.603874
Result not found in Cache, hence looking the main collection 

Adding result of query to the cache


----------------------------------------------------------------------------------------------------
checking query in cache 

3 . What is the amount needs to be paid to purchase the policy ? 

Distance of result searched in cache:  [0.19255646020249273]
Cros

In [24]:
# Verifying total number of queries or record added in the cache collection
print('Cache Size: ',life_insurance_collection_cache.count())
print('Main collection Size: ',life_insurance_collection.count())

Cache Size:  3
Main collection Size:  54


## Generative Layer
### This function "generate_response" defines prompt , passes joinied text of searched pages and query to the chat completion API

In [25]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, text):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    system_message=f"""
    "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents. \n"
    "You have a question" '{query}' asked by the user and you have a text: '{text}' from policy document . \n"
    "This text is essentially are pages of an insurance document that may be relevant to the user query ."
                        
    "The text may also contain tables in the format of a list of lists" 
    "where each of the nested lists indicates a row.\n"

    "Use the text provided to answer the query '{query}'.\n"
    "Frame an informative answer. \n"
                    

    "Follow the mandatory guidelines below when performing the task. \n"
    "you need to provide relevant/accurate data from the text. \n"
    "You need to provide detail answer to the query asked. \n"
    "You should always answer what is asked in question. Do not include irrelevant information from the text provided. \n"
    "If you are not able to find the exact answer to the query, you should always mention exact answer is not given in the text. \n"
    "You don’t have to necessarily use all the information in the text. Only choose information that is relevant.\n"
    "If the text has tables with relevant information, please reformat the table and return the final information point wise.\n"
    "You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly. \n"
    "You should never say refer this section of the document/text instead reword the exact data from the text and answer the query. \n"
    "You are given a text of a document and a query. Your task is to answer the query based on the content in Documents. Please **reword the relevant information from the text** and include all specific details in your response. Avoid referring to sections or articles (e.g., do not write "Section D outlines the claim procedure"). Instead, explain the procedures directly and in good English, ensuring the response is comprehensive and clear. \n"
    "Here are samples of text ,query ,and answer. \n"
    "Sample 1: \n"
    "Text 1: "The minimum amount required to purchase the policy is $5,000. This amount must be paid in full at the time of purchase. For premium policies, additional amounts may apply depending on the chosen benefits. The policy also includes a 30-day grace period for payments. \n"
    "Query 1: What is the minimum amount to purchase the policy? \n"
    "Answer 1: The minimum amount required to purchase the policy is $5,000, and this amount must be paid in full at the time of purchase. \n"
    "Sample 2: \n"
    "Text 2: The claim procedures are outlined in Section D of the document. Article 1 specifies that the claimant must notify the insurance company in a timely manner when filing a claim. The claim can be appealed if denied, and the appeal process is mentioned in the document. Finally, the Principal holds discretion over claim approvals and settlements. \n"
    "Query 2: "What are the claim procedures? \n"
    "Answer 2: The claim procedures require the claimant to provide timely notice to the insurance company when filing a claim. If a claim is denied, the claimant has the right to appeal the decision. The final approval and settlement of the claim are at the discretion of the Principal, who holds the authority to make the decision. \n"
    


    
    "The generated response should answer the query directly addressing the user and avoiding additional information. \n"
    "The generated response pointwise format written in good english and citations and references from text should be reworded in good english in points. \n"
    "You also needs to reword all the points in good english. \n"
    "If you think that the query is not relevant to the text, reply that the query is irrelevant. \n"
    "If you believe the query cannot be answered based on the text, respond by stating that an exact answer to the query cannot be derived from the document. \n"
    "Provide the final response as a well-formatted and easily readable text along with the citation. \n"
    "Always include summary paragraph in the final response and summary should start from the next line after writing word "Summary:""
    
    
    """

    # print(system_message)
    messages = [{"role": "system", "content":system_message}]
                 
                 
                 
              
    
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        seed = 1234,
        temperature=0,
        top_p=0
    )

    return response.choices[0].message.content.split('\n')

## Displaying results of search layer in dataframe for each query
- this dataframe contain cosine distances of chromadb and scross encoder scores for a query asked
- These search results are sorted based on cross encoder score in descending order. Higher the score higer the relevance of text to query
- In next section we will pick top 3 records based on scross encoder score

In [39]:
pd.set_option('display.max_colwidth', 100)

In [44]:
print('Query1. ',queryResultsDfLst[0]['Query'][0],'\n')

queryResultsDfLst[0][['IDs', 'Distances', 'Crossenscores', 'Metadatas', 'Documents']].sort_values(by='Crossenscores',ascending=False)

Query1.  In what order will payments be made if no beneficiary survives the Member or if the Member has not named a beneficiary ? 



Unnamed: 0,IDs,Distances,Crossenscores,Metadatas,Documents
0,39,0.134496,8.474698,"{'pageNo': 'Page 48', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","c . If a beneficiary dies at the same time or within 15 days after the Member dies, but before T..."
2,51,0.14489,7.154831,"{'pageNo': 'Page 60', 'pagetitle': 'PART IV - BENEFITS GC 6016 SECTION C - DEPENDENT LIFE INSURA...",I f a Dependent who was insured dies during the 31-day individual purchase period described in P...
1,38,0.137638,5.536239,"{'pageNo': 'Page 47', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","M ember's death, the Death Benefits Payable may be withheld until additional information has bee..."
3,45,0.172123,1.581529,"{'pageNo': 'Page 54', 'pagetitle': 'PART IV - BENEFITS GC 6015 SECTION B - MEMBER ACCIDENTAL DEA...","f . claim requirements listed in PART IV, Section D, must be satisfied; and g. all medical evide..."
4,23,0.187822,-5.061757,"{'pageNo': 'Page 32', 'pagetitle': 'PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS GC 6007 SECTIO...",(1) marriage or establishment of a Civil Union partnership or divorce or termination of a Civil ...


In [45]:
print('Query2. ',queryResultsDfLst[1]['Query'][0],'\n')
queryResultsDfLst[0][['IDs', 'Distances', 'Crossenscores', 'Metadatas', 'Documents']].sort_values(by='Crossenscores',ascending=False)

Query2.  What is the time frame for a Member to request changes in benefits due to a change in family status ? 



Unnamed: 0,IDs,Distances,Crossenscores,Metadatas,Documents
0,39,0.134496,8.474698,"{'pageNo': 'Page 48', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","c . If a beneficiary dies at the same time or within 15 days after the Member dies, but before T..."
2,51,0.14489,7.154831,"{'pageNo': 'Page 60', 'pagetitle': 'PART IV - BENEFITS GC 6016 SECTION C - DEPENDENT LIFE INSURA...",I f a Dependent who was insured dies during the 31-day individual purchase period described in P...
1,38,0.137638,5.536239,"{'pageNo': 'Page 47', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","M ember's death, the Death Benefits Payable may be withheld until additional information has bee..."
3,45,0.172123,1.581529,"{'pageNo': 'Page 54', 'pagetitle': 'PART IV - BENEFITS GC 6015 SECTION B - MEMBER ACCIDENTAL DEA...","f . claim requirements listed in PART IV, Section D, must be satisfied; and g. all medical evide..."
4,23,0.187822,-5.061757,"{'pageNo': 'Page 32', 'pagetitle': 'PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS GC 6007 SECTIO...",(1) marriage or establishment of a Civil Union partnership or divorce or termination of a Civil ...


In [46]:
print('Query3. ',queryResultsDfLst[2]['Query'][0],'\n')
queryResultsDfLst[0][['IDs', 'Distances', 'Crossenscores', 'Metadatas', 'Documents']].sort_values(by='Crossenscores',ascending=False)

Query3.  What is the amount needs to be paid to purchase the policy ? 



Unnamed: 0,IDs,Distances,Crossenscores,Metadatas,Documents
0,39,0.134496,8.474698,"{'pageNo': 'Page 48', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","c . If a beneficiary dies at the same time or within 15 days after the Member dies, but before T..."
2,51,0.14489,7.154831,"{'pageNo': 'Page 60', 'pagetitle': 'PART IV - BENEFITS GC 6016 SECTION C - DEPENDENT LIFE INSURA...",I f a Dependent who was insured dies during the 31-day individual purchase period described in P...
1,38,0.137638,5.536239,"{'pageNo': 'Page 47', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","M ember's death, the Death Benefits Payable may be withheld until additional information has bee..."
3,45,0.172123,1.581529,"{'pageNo': 'Page 54', 'pagetitle': 'PART IV - BENEFITS GC 6015 SECTION B - MEMBER ACCIDENTAL DEA...","f . claim requirements listed in PART IV, Section D, must be satisfied; and g. all medical evide..."
4,23,0.187822,-5.061757,"{'pageNo': 'Page 32', 'pagetitle': 'PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS GC 6007 SECTIO...",(1) marriage or establishment of a Civil Union partnership or divorce or termination of a Civil ...


In [47]:
print('Query4. ',queryResultsDfLst[3]['Query'][0],'\n')
queryResultsDfLst[0][['IDs', 'Distances', 'Crossenscores', 'Metadatas', 'Documents']].sort_values(by='Crossenscores',ascending=False)

Query4.  Will the member's spouse receive any payment if the member has not survived and has not named a beneficiary ? 



Unnamed: 0,IDs,Distances,Crossenscores,Metadatas,Documents
0,39,0.134496,8.474698,"{'pageNo': 'Page 48', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","c . If a beneficiary dies at the same time or within 15 days after the Member dies, but before T..."
2,51,0.14489,7.154831,"{'pageNo': 'Page 60', 'pagetitle': 'PART IV - BENEFITS GC 6016 SECTION C - DEPENDENT LIFE INSURA...",I f a Dependent who was insured dies during the 31-day individual purchase period described in P...
1,38,0.137638,5.536239,"{'pageNo': 'Page 47', 'pagetitle': 'PART IV - BENEFITS GC 6013 SECTION A - MEMBER LIFE INSURANCE...","M ember's death, the Death Benefits Payable may be withheld until additional information has bee..."
3,45,0.172123,1.581529,"{'pageNo': 'Page 54', 'pagetitle': 'PART IV - BENEFITS GC 6015 SECTION B - MEMBER ACCIDENTAL DEA...","f . claim requirements listed in PART IV, Section D, must be satisfied; and g. all medical evide..."
4,23,0.187822,-5.061757,"{'pageNo': 'Page 32', 'pagetitle': 'PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS GC 6007 SECTIO...",(1) marriage or establishment of a Civil Union partnership or divorce or termination of a Civil ...


### This section of code calls chat completion api with query and relevant text data (top 3 re-ranked based on cross encoder) to generate output
- Generated output of chat completion contains description ,summary and document references
- This code also picks top 3 re-raked search result based on cross encoder score

In [29]:
for index, queryResultDf in enumerate(queryResultsDfLst):
    # Sorting and picking top 3 pages from searched result based on cross encoder score
    tempdf=queryResultDf.sort_values(by='Crossenscores',ascending=False)[:3][['Metadatas','Documents']]
    tempQuery=queryResultDf['Query'][0]
    text='.\n'.join(tempdf['Documents'])
    finalAns=generate_response(tempQuery,text)
    
    print('Query'+str(index+1)+'.',tempQuery, '\n')

    metadataLst=[]
    finalAns.append('Document References')
    for metadata in tempdf['Metadatas']:
        
        finalAns.append(metadata['pageNo']+' - Topic: '+metadata['pagetitle'])
        finalAns.append('\n')
        
    
    for ans in finalAns:
        if ans.strip() !='': 
            print(ans,'\n')
    print('-'*150)

Query1. In what order will payments be made if no beneficiary survives the Member or if the Member has not named a beneficiary ? 

Based on the information provided in the policy document, here is the answer to your query: 

- If no beneficiary survives the Member or if the Member has not named a beneficiary, payments will be made in the following order of precedence: 

  1. To the Member's spouse 

  2. To the Member's children born to or legally adopted by the Member 

  3. To the Member's parents 

  4. To the Member's brothers and sisters 

  5. If none of the above, to the executor or administrator of the Member's estate 

- If The Principal believes a person is not legally able to give a valid receipt for a payment, and no guardian has been appointed, payment may be made to whoever has assumed the care and support of the person. 

Summary: 

In the event that no beneficiary survives the Member or if the Member has not named a beneficiary, payments will be made in a specific order