## PDF Parsing

In [67]:
import sys
sys.path.append('../src/')
import ReadPDF as r

In [68]:
# Extract by Section
pdf_path = "4444_East.pdf"
sections = r.process_into_sections(pdf_path)
sections_text = r.select_Section(sections, 2)
one_section = '\n'.join(sections[1].text)

In [69]:
# Extract by Subsection
subsection_dict = r.process_file(pdf_path)
one_subsection = r.select_Subsection(subsection_dict, 2, 1)
print(one_subsection)

The subject property at 4444 East 26th Street, Vernon, California is located on the southwestern intersection of East 26th Street and Ayers Avenue.  The subject property was inspected by Joseph Kim of Partner on October 28, 2021.  The weather at the time of the site visit was sunny and in the mid-70s (degrees Fahrenheit). According to the Los Angeles County Assessor, the subject property is legally described as OM 3-19-27 EX OF R/W AND STS LOT 3 DIV 105 REG 48 and is owned by LBA RVI – Company VIII, LLC. Please refer to Figure 1: Site Location Map, Figure 2: Site Plan, Figure 3: Topographic Map, and Appendix A: Site Photographs for the location and site characteristics of the subject property.


## LLM Setup + Prompt

In [142]:
from langchain_community.llms import Ollama
llm = Ollama(model="phi", temperature=0, num_predict=40, top_k=5, top_p=5, mirostat_tau=0, format="json")

In [152]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser

output_parser = StrOutputParser()
json_parser = JsonOutputParser()

In [154]:
%%time
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template("""Retrieve the datafield defined in input within the context given.
Anything legal can be found in the context.
If you cannot answer the question with the context, respond with N/A.

<context>
{context}
</context>

{input} is the datafield to search for.
Simply respond with only the answer that matches the datafield.
Do not include the datafield in your response.
""")

document_chain = create_stuff_documents_chain(llm, prompt, output_parser=json_parser)

CPU times: total: 0 ns
Wall time: 1 ms


## Responses

In [155]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Name of Assessor/Appraisal District Agency",
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 31.2 ms
Wall time: 19.9 s


{'name': 'Los Angeles County Assessor',
 'agency': 'Los Angeles County Assessor'}

In [156]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Legal Description",
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 15.6 ms
Wall time: 6.9 s


{'property_legal_description': 'OM 3-19-27 EX OF R/W AND STS LOT 3 DIV 105 REG 48'}

In [157]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Owner Name",
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 46.9 ms
Wall time: 5.52 s


{'property_owner': 'LBA RVI – Company VIII, LLC'}

In [148]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "When did the owner acquire the property?",
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 31.2 ms
Wall time: 6.5 s


'{\n  "data": {\n    "property_name": "OM 3-19-27 EX OF R/W AND STS LOT 3 DIV 105 REG 48",\n'

In [149]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "What was the source of the property owner name and acquisition date?",
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 15.6 ms
Wall time: 5.94 s


'{"source": "LBA RVI – Company VIII, LLC", "acquisition_date": "October 28, 2021"}\n'