In [1]:
import os
from tabulate import tabulate
import pdfplumber
from operator import itemgetter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community import vectorstores
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.llms import Ollama

In [2]:
# Folder that is scanned for the source PDF files. Keep the trailing slash: file paths
# are built from this value by plain string concatenation.
DIRECTORY_PATH = './PDFs/'

In [3]:
# Paths of the PDF files to ingest. os.listdir() returns every directory entry in
# arbitrary order, so the listing is filtered to '.pdf' files (stray entries such as
# '.DS_Store' or sub-folders would otherwise be handed to pdfplumber) and sorted so the
# resulting document order is the same on every run.
pdfs = [
    DIRECTORY_PATH + pdf
    for pdf in sorted(os.listdir(DIRECTORY_PATH))
    if pdf.lower().endswith('.pdf')
]

def check_bboxes(word, table_bbox):
    """Tell whether a word's box lies strictly inside a table's bounding box.

    Args:
        word: mapping providing the 'x0', 'top', 'x1' and 'bottom' coordinates.
        table_bbox: indexable of four coordinates, compared positionally against
            the word's (x0, top, x1, bottom).

    Returns:
        True only when every edge of the word is strictly within the table box;
        a word touching the table boundary is reported as outside.
    """
    x0, top, x1, bottom = (word[key] for key in ('x0', 'top', 'x1', 'bottom'))
    starts_inside = x0 > table_bbox[0] and top > table_bbox[1]
    return starts_inside and x1 < table_bbox[2] and bottom < table_bbox[3]

def format_table(table):
    """Render an extracted table (list of rows, each a list of cells) as an HTML string.

    Empty cells in the first row are replaced by the closest non-empty cell to their
    left, so a header label is repeated across the blank columns that follow it
    (presumably the columns spanned by a merged header cell).

    Unlike the previous version, the caller's table is not modified in place, and an
    empty table yields an empty string instead of raising IndexError.

    Args:
        table: sequence of rows; the first row is treated as the header.

    Returns:
        The HTML produced by ``tabulate(..., tablefmt='html')``, or '' for an empty table.
    """
    if not table:
        return ''

    # Work on a copy of the header so the extracted table object stays untouched.
    header = list(table[0])
    label = header[0] if header else None
    for column, cell in enumerate(header):
        if cell:
            label = cell
        else:
            header[column] = label

    return str(tabulate([header, *table[1:]], tablefmt='html'))

def clean_content(x):
    """Strip the page header/footer tokens from a page's text and append the ' ####' marker.

    The text is split on whitespace. The last token is always dropped (the extraction
    loop reads that token as the printed page number), and the first token is dropped
    too when it contains 'IRC:'. The remaining tokens are re-joined with single spaces
    and terminated with ' ####', the separator the parent text splitter is configured with.

    Empty and whitespace-only input both yield ' ####'; the previous lambda raised
    IndexError on whitespace-only input.

    Args:
        x: raw page text.

    Returns:
        The cleaned page text ending in ' ####'.
    """
    tokens = x.split()
    first_kept = 1 if tokens and 'IRC:' in tokens[0] else 0
    return ' '.join(tokens[first_kept:-1]) + ' ####'

def clean_documents(documents):
    """Keep only the numbered body pages of each document.

    Scanning in order, every page whose ``metadata['page']`` equals 1 starts a run; the
    run extends over the following pages for as long as their page number is an int and
    stops at the first page whose number is not an int (e.g. None). Pages outside such
    runs are discarded.

    Fixes over the previous version:
      * positions come from ``enumerate`` instead of ``documents.index(document)``,
        which is an equality lookup (wrong position for equal documents) and made the
        loop quadratic;
      * a page numbered 1 that is already inside the current run no longer starts a
        second run, which used to append the tail of the run twice.

    Args:
        documents: sequence of objects exposing a ``metadata`` mapping with a 'page' key.

    Returns:
        A new list with the retained documents in their original order.
    """
    final_docs = []
    run_end = 0  # index just past the last page consumed by the current run
    for start, document in enumerate(documents):
        if start < run_end or document.metadata['page'] != 1:
            continue
        index = start
        while index < len(documents) and isinstance(documents[index].metadata['page'], int):
            final_docs.append(documents[index])
            index += 1
        run_end = index
    return final_docs

# Build one Document per PDF page: words outside tables are kept as plain text, tables
# are rendered as HTML, and both are emitted in top-to-bottom order.
documents = []
for file in pdfs:
    # File name without directory and extension, e.g. './PDFs/NAME.pdf' -> 'NAME'.
    doc_name = os.path.splitext(os.path.basename(file))[0]
    # The context manager closes the PDF file handle once its pages are processed.
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            doc_page = ''
            found_tables = page.find_tables()
            table_bboxes = [table.bbox for table in found_tables]
            # Tables carry a 'top' key so they can be clustered together with the words.
            table_items = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
            non_table_words = [
                word for word in page.extract_words()
                if not any(check_bboxes(word, table_bbox) for table_bbox in table_bboxes)
            ]
            # Group words/tables whose 'top' coordinates are within 5 units into one line.
            for cluster in pdfplumber.utils.cluster_objects(non_table_words + table_items, itemgetter('top'), tolerance=5):
                if 'text' in cluster[0]:
                    try:
                        doc_page += ' ' + ' '.join([i['text'] for i in cluster])
                    except (KeyError, TypeError):
                        # The cluster mixes words with a table entry (no 'text' key); the
                        # line is skipped as before. SOME PAGES ARE HORIZONTAL, FIX LATER
                        pass
                elif 'table' in cluster[0]:
                    doc_page += ' ' + format_table(cluster[0]['table'])
            # The last token of the page is taken as the printed page number when numeric.
            tokens = doc_page.split()
            page_number = int(tokens[-1]) if tokens and tokens[-1].isdigit() else None
            # strip() makes a page without extractable text reach clean_content as ''.
            documents.append(Document(metadata={'source': doc_name, 'page': page_number}, page_content=clean_content(doc_page.strip())))

documents = clean_documents(documents)
documents

[Document(metadata={'source': 'IRC:37-2018', 'page': 1}, page_content='GUIDELINEs FOR THE DEsIGN OF FLEXIBLE PAVEMENTs 1. INTRODUCTION 1.1 The first guidelines for the design of flexible pavements, published in 1970, were based on (i) subgrade (foundation) strength (California Bearing Ratio) and (ii) traffic, in terms of number of commercial vehicles (having a laden weight of 3 tonnes or more) per day. These guidelines were revised in 1984 considering the design traffic in terms of cumulative number of equivalent standard axle load of 80 kN and design charts were provided for design traffic volumes up to 30 million standard axle (msa) repetitions. The 1970 and 1984 versions of the guidelines were based on empirical (experience based) approach. 1.2 The second revision was carried out in 2001 [1] using semi-mechanistic (or mechanistic- empirical) approach based on the results available from R-6[2], R-56[3] and other research schemes of the Ministry of Road Transport and Highways (MoRTH).

In [None]:
# from unstructured.partition.pdf import partition_pdf
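# #OCR -- disabled alternative extraction path (unstructured's "hi_res" strategy).
# NOTE: `FILES` used below is not defined in the visible cells of this notebook, and the
# loop rebinds `pdfs`; define the input list and rename the result before re-enabling.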
# pdfs = []
# for file in FILES:
#     pdf = partition_pdf(
#         file, 
#         strategy="hi_res",
#         infer_table_structure=True, 
#         mode="elements", 
#         chunking_strategy="by_title",
#         max_characters=1000,
#         combine_text_under_n_chars=500
#     )
#     for pdf_chunks in pdf:
#         pdfs.append(pdf_chunks)
# pdfs

In [4]:
# Embedding model (all-MiniLM-L6-v2 from sentence-transformers), loaded on the CPU.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device':'cpu'})
# FAISS index created from the cleaned page documents.
# NOTE(review): the same store is handed to ParentDocumentRetriever below and receives
# further entries via add_documents(); confirm that keeping the whole-page entries next
# to them is intended.
vectorstore = vectorstores.FAISS.from_documents(documents=documents, embedding=embeddings)

# Child splitter: splits on '\n' only, target size 50 characters, no overlap.
# NOTE(review): page text is assembled with spaces in the extraction cell, so '\n'
# presumably occurs only inside the HTML tables -- verify the resulting chunk sizes.
child_splitter = RecursiveCharacterTextSplitter(separators=['\n'], chunk_size=50, chunk_overlap=0, length_function=len)   
# Parent splitter: splits on the '####' marker that clean_content appends to each page,
# target size 2000 characters with 100 characters of overlap.
parent_splitter = RecursiveCharacterTextSplitter(separators=['####'], chunk_size=2000, chunk_overlap=100, length_function=len)   

# Retriever backed by the vector store plus an in-memory docstore, using the two
# splitters configured above.
parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=InMemoryStore(),
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
parent_retriever.add_documents(documents=documents)

# Keyword (BM25) retriever over the parent-splitter chunks, returning 2 results per query.
chunks = parent_splitter.split_documents(documents)
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 2

# Hybrid retriever: combines the two retrievers above with equal weights.
hybrid_retriever = EnsembleRetriever(retrievers=[parent_retriever,keyword_retriever], weights=[0.5, 0.5])

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [5]:
# Smoke test: run a sample question through the hybrid retriever and display the
# documents it returns.
hybrid_retriever.invoke('explain Proforma 1 for visual inspection of roads?')

[Document(metadata={'source': 'IRC:82-2015', 'page': 62}, page_content='APPENDIX 1 PAVEMENT CONDITION ASSESSMENT/VISUAL SURFACE CONDITION ASSESSMENT Surface distress is a measure or indicator of the structural and resulting functional state of a pavement structure and is generally given the prime importance by concerned agencies. Physical distress is identified and a quantitative visual assessment of distress is made by a well experienced/trained team of 3-4 persons while moving in a vehicle at a speed of 8-10 km/hr. While recording the distresses, the details for the following are visually noted and recorded in individual percentage for each km length in Proforma 1. 1. Alligator or fatigue cracking 2. Longitudinal and transverse cracking 3. Bleeding 4. Depressions/Settlements 5. Potholes 6. Patching 7. Ravelling 8. Rutting ####'),
 Document(metadata={'source': 'IRC:79-2019', 'page': 5}, page_content='Fig. 4 Typical Illustration of utility of Delineators during Day and Night Time Crite

In [None]:
# Prompt for the answering model. It has three placeholders: {context}, {chat_history}
# and {input}; the chain is invoked with "input" and "chat_history" keys, and {context}
# is presumably filled with the retrieved documents by the stuff-documents chain.
template = """
You are a Chatbot and are given information about Public Safety Standards of the Republic of India in the context below.
The context includes tables in html format.
<context>
{context}
</context>
Answer only from the context, if you don't know the answer, just reply "I am sorry, I dont have answer to your query. Please try rephrasing your question."
Dont mention to the user about  where you got the answer from in the context.
Chat History:
{chat_history}
Follow Up Question: {input}
"""
prompt = PromptTemplate.from_template(template)

In [None]:
# Ollama-served 'phi3:mini' model with temperature set to 0.
llm = Ollama(model='phi3:mini', temperature=0)
# Chain that feeds documents plus the prompt to the LLM.
stuff_documents_chain = create_stuff_documents_chain(llm, prompt)
# Full pipeline: hybrid retrieval followed by the chain above. The UI cell reads the
# 'context' and 'answer' entries of its result.
chain = create_retrieval_chain(hybrid_retriever, stuff_documents_chain)

In [None]:
# UI
import time

import gradio as gr

# Stylesheet that lets the chat window grow to fill the page height.
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""


def format_chat_history(turns):
    """Render completed [user, bot] turns as plain text for the prompt's {chat_history} slot.

    Turns or halves of turns that are empty/None are skipped; no turns gives ''.
    """
    lines = []
    for user_text, bot_text in turns:
        if user_text:
            lines.append(f'User: {user_text}')
        if bot_text:
            lines.append(f'Assistant: {bot_text}')
    return '\n'.join(lines)


with gr.Blocks(css = CSS) as demo:
    gr.Markdown("# RoadGPT 🛣️")

    chatbot = gr.Chatbot(label="Chat history", elem_id="chatbot")
    message = gr.Textbox(label="Ask me a question!")
    clear = gr.Button("Clear")

    def user(user_message, chat_history):
        """Clear and lock the textbox, and append the question as a pending turn."""
        return gr.update(value="", interactive=False), chat_history + [[user_message, None]]

    def bot(chat_history):
        """Answer the pending question and stream the reply into the last chat turn.

        Yields the updated chat history after each appended character so the UI shows
        the answer being typed out.
        """
        user_message = chat_history[-1][0]
        #Logging
        print(f'QUESTION: \n{user_message}')
        # Pass the earlier turns to the prompt; the last entry is the pending question
        # itself. (Previously an empty list was always sent, so the prompt's
        # "Chat History" section never contained the conversation.)
        llm_response = chain.invoke({
            "input" : user_message,
            "chat_history" : format_chat_history(chat_history[:-1]),
        })
        #Logging
        print('CONTEXT: \n')
        for doc in llm_response['context']:
            print('<--------------------Page ', doc.metadata['page'], '-------------------->')
            print(doc.page_content)
        # Double-quoted f-string: reusing the outer quote inside the braces is a
        # SyntaxError on Python versions before 3.12.
        print(f"RESPONSE: \n{llm_response['answer']}")
        bot_message = llm_response["answer"]
        chat_history[-1][1] = ""
        for character in bot_message:
            chat_history[-1][1] += character
            time.sleep(0.005)
            yield chat_history

    response = message.submit(user, [message, chatbot], [message, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    # Unlock the textbox once the answer has been streamed.
    response.then(lambda: gr.update(interactive=True), None, [message], queue=False)

demo.queue()
demo.launch()

-----------------------------