In [6]:
import sys
sys.path.append('src')
import os 
import src.mapping as m
import src.classes
from src.classes.PDFReader import PDFReader as pdf_reader
from src.classes.Prompter import Prompter
from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.documents import Document
import json
from openpyxl import load_workbook
from pprint import pprint

In [19]:

# Open the evaluation-metrics workbook and pick the worksheet that holds the mapping.
wb = load_workbook('ESA eval metrics.xlsx')
ws = wb['Sheet1']

# Build a lookup keyed by the column B cell, with the column A cell of the same row
# as its value. Every row from 1 through ws.max_row is read, so a header row or
# blank rows (if the sheet has any) end up in the dictionary as well.
field_to_question = {
    col_b: col_a
    for col_a, col_b in ws.iter_rows(
        min_row=1, max_row=ws.max_row, min_col=1, max_col=2, values_only=True
    )
}
print(f"length of field_to_question: {len(field_to_question)}")

length of field_to_question: 59


In [21]:
# Count every datafield produced by the mapping module and collect the ones whose
# field name also appears as a key of the spreadsheet lookup.
datafields = m.execute()

num_datafields = 0
matches_dict = {}
# Subsections are visited in sorted key order; if the same field name shows up in
# more than one subsection, the question from the last one visited is kept.
for _, section_fields in sorted(datafields.items()):
    num_datafields += len(section_fields)
    matches_dict.update(
        (name, text)
        for name, text in section_fields.items()
        if name in field_to_question
    )

print(f"num_datafields: {num_datafields}")
print(f"length of matches_dict: {len(matches_dict)}")

21
{'3.4', '7.0', '2.3', '2.4', '4.2', '6.1', '3.1', '3.5', '6.3', '2.2', '2.1', '6.2', '1.5', '4.1', '5.1', '1.0', '3.3', '5.2', '6.0', '3.2', '6.4'}
num_datafields: 488
length of matches_dict: 58


In [8]:
# Ollama-served phi3 model; format="json" asks the model for JSON output and
# num_predict=40 caps the number of generated tokens.
llm = Ollama(
    model="phi3",
    temperature=0,
    num_predict=40,
    top_k=5,
    top_p=0.3,
    mirostat_tau=0,
    format="json",
)

# System instructions shared by both prompts below. {context} and {input} are
# template variables filled in when the chain is invoked.
context_system_prompt = """Retrieve the datafield defined in input within the context given. 
Anything legal can be found in the context and chat history.
If you cannot answer the question with the context, respond with N/A.

<context>
{context}
</context>

{input} is the prompt.
Again, if you cannot answer the question respond N/A.
Simply respond with only the answer that matches the datafield.
There must only be one JSON object.
Do not include the datafield in your response. Keep the key for the json exactly the same as the input.
"""

# First-stage prompt: chat history may be omitted.
context_prompt = ChatPromptTemplate.from_messages([
    ("system", context_system_prompt),
    MessagesPlaceholder("chat_history", optional=True),
    ("human", "{input}"),
])

# Second-stage prompt: same system text, but chat history is a required variable.
qa_system_prompt = context_system_prompt
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Both stages pipe their prompt through the same model and parse the reply as JSON.
history_aware_retriever = context_prompt | llm | JsonOutputParser()
qa_document_chain = qa_prompt | llm | JsonOutputParser()

# NOTE(review): the object passed in the retriever position is an LLM chain, not a
# document retriever -- confirm this two-stage LLM call (and what ends up in
# `context` for the second stage) is what is intended.
retrieval_chain = create_retrieval_chain(history_aware_retriever, qa_document_chain)

# Conversation history handed to every invocation; starts empty.
chat_history = []


In [9]:
# Parse the source PDF. The extraction loop later indexes the result as
# subsection_dict[section_number][subsection].
pdf_path = 'pdf_new_641566.pdf'
pdf_parser = pdf_reader()
subsection_dict = pdf_parser.process_file(pdf_path)


KeyboardInterrupt



In [None]:
def _report_subsection_error(subsection, error):
    """Print a highlighted error banner for a failure in the given subsection."""
    print("!!!!!!!!!!!!!!!")
    print(f'Error in Subsection {subsection}: {error}')
    print("!!!!!!!!!!!!!!!")


# Maps each subsection to a {question: answer} dict. Subsections that fail (or
# have a falsy key) keep an empty dict so every subsection appears in the output.
answers_dict = {}

for subsection, field_to_questions in sorted(datafields.items()):
    subsection_answers = {}
    answers_dict[subsection] = subsection_answers
    if not subsection:      # (subsection == *selected section*)
        continue

    # The leading component of "X.Y" selects the top-level section in subsection_dict.
    section_number = subsection.split('.')[0]

    # The context text is the same for every question in this subsection, so it
    # is looked up once; a missing subsection is reported once, not per field.
    try:
        subsection_context = subsection_dict[section_number][subsection]
    except Exception as e:
        # Best-effort run: report and move on so one bad subsection cannot
        # abort the whole extraction.
        _report_subsection_error(subsection, e)
        continue

    for question in field_to_questions.values():
        try:
            # -----------------------------INVOKE LLM---------------------------------------
            output_dict = retrieval_chain.invoke({
                'input': question,
                'context': [Document(page_content=subsection_context)],
                'chat_history': chat_history,
            })
            # -----------------------------INVOKE LLM---------------------------------------
            answer_payload = output_dict['answer']
            if not isinstance(answer_payload, dict) or not answer_payload:
                raise ValueError(
                    f"expected a non-empty JSON object, got {answer_payload!r}"
                )
            # The model picks the JSON key; only the first value is kept.
            answer = next(iter(answer_payload.values()))
            subsection_answers[question] = answer
            print(f'> Answer: {answer}\n')
        except Exception as e:
            # Best-effort run: a failed question is reported and skipped.
            _report_subsection_error(subsection, e)

# Write to JSON
# Use the file name without directories or its final extension, so paths such as
# "./x.pdf" or "dir/report.v2.pdf" still give a sensible output name.
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_dir = "data/processed"
# Create the target directory up front so the results are not lost on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/{pdf_name}.json", "w") as outfile:
    json.dump(answers_dict, outfile)