## PDF Parsing

In [5]:
import sys
import os 
import src.mapping as m
import src.classes
from src.classes.PDFReader import PDFReader as pdf_reader
from src.classes.Prompter import Prompter
import importlib

In [6]:
# Extract by Section
pdf_path = "4444_East.pdf" # ESA BL Data row 5736 - 6666

In [7]:
pdf_path2 = "20-301704.3.pdf" # ESA BL Data row 5736 - 6666

In [8]:
# Extract by Subsection
subsection_dict = pdf_reader().process_file(pdf_path)
for key in subsection_dict:
    print(subsection_dict[key])

{'1.0': ' 1.0 INTRODUCTION Partner Engineering and Science, Inc. (Partner) has performed a Phase I Environmental Site Assessment (ESA) in conformance with the scope and limitations of ASTM Standard Practice E1527-13 and the Environmental Protection Agency Standards and Practices for All Appropriate Inquiries (AAI) (40 CFR Part 312) for the property located at 4444 East 26th Street in Vernon, Los Angeles County, California (the “subject property”).  Any exceptions to, or deletions from, this scope of work are described in the report.', '1.1': ' 1.1 Purpose The purpose of this ESA is to identify existing or potential Recognized Environmental Conditions (as defined by ASTM Standard E-1527-13) affecting the subject property that: 1) constitute or result in a material violation or a potential material violation of any applicable environmental law; 2) impose any material constraints on the operation of the subject property or require a material change in the use thereof; 3) require clean-up,

In [9]:
subsection_dict2 = pdf_reader().process_file(pdf_path2)
#for key in subsection_dict:
    #print(subsection_dict[key])

## LLM Setup + Prompt

In [10]:
prompter = Prompter()

In [11]:
%%time
datafields = m.execute()

CPU times: user 5.67 s, sys: 408 ms, total: 6.08 s
Wall time: 6.49 s


In [33]:
from langchain_community.llms import Ollama
llm = Ollama(model="phi3", temperature=0, num_predict=40, top_k=5, top_p=.3, mirostat_tau=0, format="json")

In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser

output_parser = StrOutputParser()
json_parser = JsonOutputParser()

In [35]:
import json

In [36]:
%%time
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# prompt = ChatPromptTemplate.from_template("""Retrieve the datafield defined in input within the context given.
# Anything legal can be found in the context.
# If you cannot answer the question with the context, respond with N/A.

# <context>
# {context}
# </context>

# {input} is the prompt.
# Again, if you cannot answer the question respond N/A.
# Simply respond with only the answer that matches the datafield.
# There must only be one JSON object.
# Do not include the datafield in your response. Keep the key for the json exactly the same as the input.
# """)

#document_chain = create_stuff_documents_chain(llm, prompt, output_parser=output_parser)

CPU times: user 16 µs, sys: 1 µs, total: 17 µs
Wall time: 23.8 µs


In [49]:
from langchain.chains import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import JsonOutputParser


# System Chat History
#context_system_prompt = """Given a chat history and the latest question \
#which might reference context in the chat history, answer the question. """
context_system_prompt = """Retrieve the datafield defined in input within the context given. 
Anything legal can be found in the context and chat history.
If you cannot answer the question with the context, respond with N/A.

<context>
{context}
</context>

{input} is the prompt.
Again, if you cannot answer the question respond N/A.
Simply respond with only the answer that matches the datafield.
There must only be one JSON object.
Do not include the datafield in your response. Keep the key for the json exactly the same as the input.
"""
# ChatPromptTemplate.from_template()

context_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", context_system_prompt),
        MessagesPlaceholder("chat_history", optional=True)#,
        ,("human", "{input}")
    ]
)


qa_system_prompt = context_system_prompt #"""Reading through a PDF and giving only the answers found within it. If you do not know an answer, you respond N/A."""


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history")#,
        ,("human", "{input}")
    ]
)


qa_document_chain = qa_prompt | llm | JsonOutputParser()
history_aware_retriever = context_prompt | llm | JsonOutputParser()
retrieval_chain = create_retrieval_chain(
    history_aware_retriever, qa_document_chain)

chat_history = []


In [50]:
#response = retrieval_chain.invoke({'input':input, 'chat_history':chat_history})


In [58]:
%%time
from langchain_core.documents import Document
answers_dict = {}

for items in datafields.items():
    try:
    
        item = {items[0]: items[1]} 
        # Item is of shape : {'1.1' : {'ets_1p_32a': 'Purpose:', 'esa_foia_1': 'ESA Jurisdiction 1', 
        #                     'esa_foia_contact_1': 'ESA Jurisdiction Contact 1', 'ESA Jurisdiction Contact 3'}}
        # print("ITEM IS " , item)
    except Exception as e:
        print('ERROR')
        print(items)
        print(e)
        print()
    for subsection in item:
        print("SUBSECTION:" ,subsection)
        print(subsection == '.1')
        section_number = subsection.split('.')[0]
        print(section_number)
        # TAKE THIS OUT: JUST FOR DEMO PURPOSES:
        if section_number == '2':
            for field in item[subsection]:
                print(field)
                # print(item[subsection][field])
                # for key in subsection_dict.keys():
                #     print('.' + key + '.')
                #     print(type(key))
                    
                #print(subsection_dict.keys())
               # section = subsection.
                #print(' 3' in subsection_dict.keys())
                #print(item[subsection][field])
                try:
                    print(subsection_dict[section_number].keys())
                    subsection_context = subsection_dict[section_number][subsection]
                    print("SUBSECTION CONTEXT: ", subsection_context)
                    print("INPUT QUESTION IS ", item[subsection][field])
                    output_dict = retrieval_chain.invoke({'input':item[subsection][field], 'context': [Document(page_content=subsection_context)], 
                                                       'chat_history':chat_history})
                    key = list(output_dict['answer'].keys())[0]
                    print('KEY', key)
                    #real_output_dict = {}
                    answers_dict[field] = output_dict['answer'][key]
                    print('MODEL OUTPUTS' , output_dict)
                    print('PROCESSED OUTPUT' , field, ":", answers_dict[field])
                    #print(output_dict.keys[0])
                except Exception as e:
                    print('exception: ', e)
                #answers_dict[field] = output_dict[item[subsection][field]]
                #print(answers_dict[field])
print("answers dict is ", answers_dict)
with open("data/processed/phi3_output.json", "w") as outfile: 
    json.dump(answers_dict, outfile)

SUBSECTION: 4.1.11
False
4
SUBSECTION: 5.4.4
False
5
SUBSECTION: 7
False
7
SUBSECTION: 6.3
False
6
SUBSECTION: 6
False
6
SUBSECTION: 4.1.10
False
4
SUBSECTION: 2
False
2
ets_2p_36bc
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_1a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_2a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_3a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_4a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_5a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_6a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.3', '2.4.4'])
exception:  '2'
po_2_7a
dict_keys(['2.0', '2.1', '2.2', '2.3', '2.4', '2.4.1', '2.4.2', '2.4.

In [62]:
# from langchain_community.llms import Ollama
# system = """Reading through a PDF and giving only the answers found within it. If you do not know an answer, you respond N/A."""
# template = """[INST] {{ .System }} {{ .Prompt}} [/INST]"""
# stop_tokens = ["[INST]", "[/INST]"]
# llm = Ollama(model="phi3", system=system, template=template,
#              stop=stop_tokens, temperature=0, num_predict=20, top_k=3, top_p=.03, mirostat_tau=1, format="json")
# document_chain = create_stuff_documents_chain(llm, prompt, output_parser=json_parser)

## Responses

In [63]:
# fields_dict = {'ets_2p_32a': 'Name of Assessor/Appraisal District Agency', 
#                'ets_2p_33a': 'Property Legal Description', 
#                'ets_2p_34a': 'Property Owner Name'}
# #               'ets_2p_35a': "When did the owner acquire the property?",
#  #              "ets_2p_36a": "What was the source of the property owner name and acquisition date?"}

In [65]:
# %%time
# from langchain_core.documents import Document
# answers_dict = {}

# for items in datafields.items():
#     try:
    
#         item = {items[0]: items[1]} 
#         # Item is of shape : {'1.1' : {'ets_1p_32a': 'Purpose:', 'esa_foia_1': 'ESA Jurisdiction 1', 
#         #                     'esa_foia_contact_1': 'ESA Jurisdiction Contact 1', 'ESA Jurisdiction Contact 3'}}
#         # print("ITEM IS " , item)
#     except Exception as e:
#         print('ERROR')
#         print(items)
#         print(e)
#         print()
#     for subsection in item:
#         print("SUBSECTION:" ,subsection)
#         print(subsection == '.1')
#         section_number = subsection.split('.')[0]
#         print(section_number)
#         # TAKE THIS OUT: JUST FOR DEMO PURPOSES:
#         if not section_number == '6':
#             for field in item[subsection]:
#                 print(field)
#                 # print(item[subsection][field])
#                 # for key in subsection_dict.keys():
#                 #     print('.' + key + '.')
#                 #     print(type(key))
                    
#                 #print(subsection_dict.keys())
#                # section = subsection.
#                 #print(' 3' in subsection_dict.keys())
#                 #print(item[subsection][field])
#                 try:
#                     print(subsection_dict[section_number].keys())
#                     subsection_context = subsection_dict[section_number][subsection]
#                     print("SUBSECTION CONTEXT: ", subsection_context)
#                     print("INPUT QUESTION IS ", item[subsection][field])
#                     output_dict = document_chain.invoke({
#                         "input": item[subsection][field],
#                         "context": [Document(page_content=subsection_context)]
#                     })
#                     key = list(output_dict.keys())[0]
#                     #real_output_dict = {}
#                     answers_dict[field] = output_dict[key]
#                     print('MODEL OUTPUTS' , output_dict)
#                     print('PROCESSED OUTPUT' , field, ":", answers_dict[field])
#                     #print(output_dict.keys[0])
#                 except Exception as e:
#                     print('exception: ', e)
#                 #answers_dict[field] = output_dict[item[subsection][field]]
#                 #print(answers_dict[field])
# print("answers dict is ", answers_dict)
# with open("data/processed/phi3_output.json", "w") as outfile: 
#     json.dump(answers_dict, outfile)

SUBSECTION: 5.3.1
False
5
ets_5_96a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'
ets_5_97a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'
ets_5_98a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'
ets_5_99a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'
ets_5_100a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'
ets_5_101a
dict_keys(['5.0', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5', '5.2', '5.2.1', '5.2.2', '5.2.3', '5.2.4', '5.2.5', '5.2.6'])
exception:  '5.3.1'


KeyboardInterrupt: 

In [110]:
%%time
from langchain_core.documents import Document

print(type(document_chain.invoke({
    "input": "Property Legal Description",
    "context": [Document(page_content=one_subsection)]
})))


KeyboardInterrupt



In [23]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Owner Name",
    "context": [Document(page_content=one_subsection)]
})


KeyboardInterrupt



In [24]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "When did the owner acquire the property?",
    "context": [Document(page_content=one_subsection)]
})

NameError: name 'one_subsection' is not defined

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "What was the source of the property owner name and acquisition date?",
    "context": [Document(page_content=one_subsection)]
})