In [2]:
import fitz
import re
from nltk import tokenize
from nltk.stem import PorterStemmer
from typing import List, Tuple, Dict

In [3]:
class Section():
  def __init__(self):
    self.section_num = "0"
    self.section_title = ""
    self.text = []

  def __str__(self):
    return f"Section {self.section_num} {self.section_title}:"

  def display_text(self):
    for t in self.text:
      print(t)


In [146]:
def extract_by_section(pages: List[Tuple[str, List[str]]]) -> List[Section]:
  """
  Pages is structured as: [('1', ListofLines)]

  Returns: List of Tuples. In each tuple is its section and the lines within the section

  """
  sections = []
  current_section = None
  section_found  = False
  for page, lines in pages:
      for line in lines[4:]:
          if line.endswith(".0"):
              if current_section:
                  sections.append(current_section)
              current_section = Section()
              current_section.section_num = line
              # print(f"Section_num: {current_section.section_num}")
              section_found = True
          if line.isupper() and section_found:
              current_section.section_title = line
          elif section_found:
              # print("Appended: ", line)
              current_section.text.append(line)
  return sections

In [50]:
def extract_subsections(sections: List[Section]) -> Dict[str, List[Section]]:
  section_map = dict()
  current_subsection = None
  nextLineTitle = False
  for section in sections:
    section_num = section.section_num[0]
    subsections = []
    # print(f"Section_num: {section_num}")
    for line in section.text:
      if line.startswith(section_num):
        # print(f"SubSection_num: {line}")
        if current_subsection:
          subsections.append(current_subsection)
        current_subsection = Section()
        subsection_number = line.split()
        current_subsection.section_num = subsection_number[0]
        if len(subsection_number) >= 2:
          for word in subsection_number[1:]:
            current_subsection.section_title += word
        else:
          current_subsection.section_num = line
          nextLineTitle = True
          # print(f"SubSection_num: {current_subsection.section_num}")
          continue

      if nextLineTitle:
        current_subsection.section_title = line
        nextLineTitle = False
      else:
        current_subsection.text.append(line)
    section_map[section_num] = subsections

  return section_map




In [145]:
def display_one_section(sections: List[Section], section_num):
    print(sections[section_num - 1])
    print(sections[section_num - 1].section_num)
    print(sections[section_num - 1].section_title)
    sections[section_num - 1].display_text()

In [128]:
doc = fitz.open("4444 East.pdf")
out = open("output.txt", "wb")
pages = []
for page in doc:
    text = page.get_text().encode("utf8")
    pages.append(text)
    out.write(text)
    out.write(bytes((12,)))
    # pages.append(text)
out.close()

In [129]:
output_file = 'output.txt'
pattern = r'Page\s+([\d]+)'
pages = []
page_num = ""
with open(output_file, 'r', encoding='utf8') as f:
    page = []
    for line in f:
        if line.strip() != "":
            page.append(line.strip())
        matches = re.findall(pattern, line)
        if len(matches) > 0:
            page_num = matches[0]
        if '\f' in line:
            pages.append((page_num, page))
            page = []
            page_num = ""


In [154]:
def remove_table_of_contents(pages):
    return [i for i in pages if i[0] != ""]

In [155]:
# page 1 starts at index 7
content = remove_table_of_contents(pages)
# content[4]
# Header is found in first 4 lines
header = content[0][1][:4]
# header

sections = extract_by_section(content)
# sections[5].section_title

In [157]:
# display_one_section(sections, 4)

In [158]:
section_map = extract_subsections(sections)
display_one_section(section_map["5"], 2)

Section 5.0 In order to qualify for one of the Landowner Liability Protections (LLPs) offered by the Small Business:
5.0
In order to qualify for one of the Landowner Liability Protections (LLPs) offered by the Small Business
Liability Relief and Brownfields Revitalization Act of 2001 (the Brownfields Amendments), the User must
provide the following information (if available) to the environmental professional.  Failure to provide this
information could result in a determination that all appropriate inquiry is not complete.  The user is asked
to provide information or knowledge of the following:
•
Environmental cleanup liens that are filed or recorded against the site.
•
Activity and land use limitations that are in place on the site or that have been filed or recorded in a
registry.
•
Specialized knowledge or experience of the person seeking to qualify for the LLPs.
•
Relationship of the purchase price to the fair market value of the property if it were not contaminated.
•
Commonly know

In [None]:
display_one_section(section_map["7"], 0)

In [85]:
content[30]

('28',
 ['Phase I Environmental Site Assessment',
  'Project No. 21-342562.35',
  'November 15, 2021',
  'Page 28',
  '6.0',
  'SITE RECONNAISSANCE',
  'The weather at the time of the site visit was sunny and clear.  Refer to Section 1.5 for limitations',
  'encountered during the field reconnaissance and Sections 2.1 and 2.2 for subject property operations.',
  'The table below provides the site assessment details:',
  'Site Assessment Data',
  'Site Assessment Performed By:',
  'Joseph Kim',
  'Site Assessment Conducted On:',
  'October 28, 2021',
  'The table below provides the subject property personnel interviewed during the field reconnaissance:',
  'Site Visit Personnel for 4444 East 26th Street (Subject Property)',
  'Name',
  'Title/Role',
  'Contact Number',
  'Site Walk*',
  'Yes/No',
  'Ana Darino',
  'Key Site Manager',
  '(213) 507-0638',
  'Yes',
  '* Accompanied Partner during the field reconnaissance activities and provided information pertaining to',
  'the current op