## PDF Parsing

In [13]:
import sys
sys.path.append('../src/')
import ReadPDF as r

In [14]:
# Extract by Section
pdf_path = "20-301704.3.pdf"
sections = r.process_into_sections(pdf_path)
# sections
# sections_text = r.select_Section(sections, 2)
# one_section = '\n'.join(sections[1].text)

In [15]:
# Extract by Subsection
subsection_dict = r.process_file(pdf_path)
one_subsection = r.select_Subsection(subsection_dict, 2, 1)
print(one_subsection)

2.1    Site Location and Legal Description The subject property at 115 West Anapamu Street in Santa Barbara, California is located on the southeast side of West Anapamu Street, between Chapala Street and De La Vina Street. According to the Santa Barbara County Assessor, the subject property is legally described as Assessor Parcel Numbers 039-222- 028 and 039-222-029. Parcel 028 was a public alley on the southwest side of the subject property deeded by the City of Santa Barbara to Sanctuary Centers of Santa Barbara in May 2019. According to the property owner, ownership is currently vested in Sanctuary Centers of Santa Barbara, which acquired the subject property as Sanctuary House of Santa Barbara, Inc., a California nonprofit public benefit corporation dba Sanctuary Psychiatric Centers of Santa Barbara since 1995. Please refer to Figure 1: Site Location Map, Figure 2: Site Plan, Figure 3: Topographic Map, and Appendix A: Site Photographs for the location and site characteristics of th

## LLM Setup + Prompt

In [16]:
from langchain_community.llms import Ollama
llm = Ollama(model="phi", temperature=0, num_predict=40, top_k=5, top_p=.5, mirostat_tau=0, format="json")

In [17]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser

output_parser = StrOutputParser()
json_parser = JsonOutputParser()

In [18]:
%%time
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template("""Retrieve the datafield defined in input within the context given.
Anything legal can be found in the context.
If you cannot answer the question with the context, respond with N/A.

<context>
{context}
</context>

{input} is the key in the dictionary where the value is the prompt for the datafield.
Simply respond with only the answer that matches the datafield.
Do not include the datafield in your response. Keep the key for the json exactly the same as the input.
""")

document_chain = create_stuff_documents_chain(llm, prompt, output_parser=json_parser)

CPU times: total: 1.02 s
Wall time: 1.28 s


## Responses

In [19]:
fields_dict = {'Name of Assessor/Appraisal District Agency': 'ets_2p_32a', 'Property Legal Description' : 'ets_2p_33a', 'Property Owner Name' : 'ets_2p_34a'}

In [20]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": 'Name of Assessor/Appraisal District Agency',
    "context": [Document(page_content=one_subsection)]
})

CPU times: total: 31.2 ms
Wall time: 23.5 s


{'name': 'Santa Barbara County Assessor',
 'agency': 'Santa Barbara County Assessor'}

In [25]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Legal Description",
    "context": [Document(page_content=one_subsection)]
})

CPU times: user 31.2 ms, sys: 7.4 ms, total: 38.6 ms
Wall time: 8.64 s


{'property_legal_description': 'OM 3-19-27 EX OF R/W AND STS LOT 3 DIV 105 REG 48'}

In [26]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "Property Owner Name",
    "context": [Document(page_content=one_subsection)]
})

CPU times: user 24 ms, sys: 7.18 ms, total: 31.2 ms
Wall time: 6.75 s


{'Property Owner Name': 'LBA RVI – Company VIII, LLC'}

In [27]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "When did the owner acquire the property?",
    "context": [Document(page_content=one_subsection)]
})

CPU times: user 29.7 ms, sys: 7.12 ms, total: 36.9 ms
Wall time: 8.79 s


{'datafield': 'owner', 'prompt': 'The owner acquired the property on'}

In [28]:
%%time
from langchain_core.documents import Document

document_chain.invoke({
    "input": "What was the source of the property owner name and acquisition date?",
    "context": [Document(page_content=one_subsection)]
})

CPU times: user 28.9 ms, sys: 6.28 ms, total: 35.2 ms
Wall time: 7.63 s


{'source': 'LBA RVI – Company VIII, LLC',
 'acquisition_date': 'October 28, 2021'}