## PDF Parsing

In [1]:
import sys
sys.path.append('../src/')
import ReadPDF as pdfReader
import mapping
from classes.PDFReader import PDFReader
from classes.Prompter import Prompter

In [2]:
# Input report to parse. The path has no directory component, so it is
# resolved relative to the notebook's working directory.
# Source reference: ESA BL Data row 5736 - 6666
pdf_path = '20-301704.3.pdf'

In [3]:
# Extract by Subsection
# Parse the PDF into a dict keyed by subsection number; the saved output below
# shows keys such as '1.0' and '1.1' mapping to that subsection's text.
parser = PDFReader()
subsection_dict = parser.process_file(pdf_path)

# Select the single subsection that the LLM cells below use as context.
# Its assignment used to be commented out, which left `one_subsection`
# undefined on a fresh kernel (NameError under Restart & Run All).
# NOTE(review): the legacy call was
#   pdfReader.select_Subsection(subsection_dict, 2, 1)
# i.e. section 2, subsection 1. Given the '<section>.<subsection>' key format
# seen in the output, "2.1" is assumed to be the equivalent key - confirm.
SUBSECTION_KEY = "2.1"
one_subsection = subsection_dict[SUBSECTION_KEY]  # KeyError here means the key assumption is wrong

subsection_dict

{'1.0': '1.0    INTRODUCTION Partner Engineering and Science, Inc. (Partner) has performed a Phase I Environmental Site Assessment (ESA) in conformance with the scope and limitations of ASTM Practice E1527-21 and the Environmental Protection Agency Standards and Practices for All Appropriate Inquiries (AAI) (40 CFR Part 312) for the property located at 115 West Anapamu Street in Santa Barbara, Santa Barbara County, California (the “subject property”).  Any exceptions to, or deletions from, this scope of work are described in the report.',
 '1.1': '1.1    Purpose The purpose of this ESA is to identify existing or potential Recognized Environmental Conditions (as defined by ASTM Standard E1527-21) affecting the subject property that: 1) constitute or result in a material violation or a potential material violation of any applicable environmental law; 2) impose any material constraints on the operation of the subject property or require a material change in the use thereof; 3) require cle

## LLM Setup + Prompt

In [None]:
# Instantiate the project's Prompter helper (imported from classes.Prompter).
# It is used in the next cell via prompter.invoke(<field question>, <context text>).
prompter = Prompter()

In [None]:
%%time
# Ask the Prompter for one data field, passing the field's question and the
# subsection text as context, then print whatever it returns.
# Requires `one_subsection` to be defined in the kernel before this cell runs.
response = prompter.invoke('Name of Assessor/Appraisal District Agency', one_subsection)
print(response)

In [None]:
from langchain_community.llms import Ollama

# LLM handle for the "phi" model served through Ollama.
# The sampling settings are kept tight (temperature 0, small top_k / top_p),
# generation length is bounded by num_predict, and JSON output is requested
# through the `format` option.
llm = Ollama(
    model="phi",
    temperature=0,
    num_predict=40,
    top_k=5,
    top_p=0.5,
    mirostat_tau=0,
    format="json",
)

In [None]:
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

# One parser of each kind: `json_parser` is the one handed to the document
# chain further down; `output_parser` is a plain-string alternative.
json_parser = JsonOutputParser()
output_parser = StrOutputParser()

In [None]:
%%time
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate

# Prompt template with two placeholders:
#   {context} - filled from the documents passed under the "context" key
#   {input}   - the data-field question passed under the "input" key
# The instructions ask the model for a single JSON object, or N/A when the
# context does not contain the answer.
prompt = PromptTemplate.from_template("""Retrieve the datafield defined in input within the context given.
Anything legal can be found in the context.
If you cannot answer the question with the context, respond with N/A.

<context>
{context}
</context>

{input} is the key in the dictionary where the value is the prompt for the datafield.
Simply respond with only the answer that matches the datafield.
There must only be one JSON object.
Do not include the datafield in your response. Keep the key for the json exactly the same as the input.
""")

# Combine the LLM and prompt into a chain; the model output is run through
# json_parser, so invoke() is expected to hand back parsed JSON rather than raw text.
document_chain = create_stuff_documents_chain(llm, prompt, output_parser=json_parser)

## Responses

In [None]:
# Data fields to extract: field identifier -> question/prompt text for that field.
fields_dict = {
    "ets_2p_32a": "Name of Assessor/Appraisal District Agency",
    "ets_2p_33a": "Property Legal Description",
    "ets_2p_34a": "Property Owner Name",
    "ets_2p_35a": "When did the owner acquire the property?",
    "ets_2p_36a": "What was the source of the property owner name and acquisition date?",
}

In [None]:
# Build the field mappings by calling execute() on the project's `mapping`
# module (imported in the first cell).
# NOTE(review): the result is presumably shaped {subsection_num: {field: question}},
# per the author's note in the following cell - confirm against mapping.execute().
mappings = mapping.execute()

In [None]:
mappings
# Displayed for inspection. Expected shape, per the original author's note
# (not verified in this notebook): {subsection_num: {field: question}}

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": 'Name of Assessor/Appraisal District Agency',
    "context": [Document(page_content=one_subsection)]
})

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Legal Description",
    "context": [Document(page_content=one_subsection)]
})

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Owner Name",
    "context": [Document(page_content=one_subsection)]
})

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "When did the owner acquire the property?",
    "context": [Document(page_content=one_subsection)]
})

In [None]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "What was the source of the property owner name and acquisition date?",
    "context": [Document(page_content=one_subsection)]
})