In [2]:
import fitz
import re
from nltk import tokenize
from nltk.stem import PorterStemmer
from typing import List, Tuple, Dict

In [3]:
class Section():
  def __init__(self):
    self.section_num = "0"
    self.section_title = ""
    self.text = []

  def __str__(self):
    return f"Section {self.section_num} {self.section_title}:"

  def display_text(self):
    for t in self.text:
      print(t)


In [291]:
def extract_by_section(pages: List[Tuple[str, List[str]]]) -> List[Section]:
  """
  Pages is structured as: [('1', ListofLines)]

  Returns: List of Tuples. In each tuple is its section and the lines within the section

  """
  sections = []
  current_section = None
  section_found  = False
  for page, lines in pages:
      for line in lines[4:]:
          if line.endswith(".0"):
              if current_section:
                  sections.append(current_section)
              current_section = Section()
              current_section.section_num = line
              # print(f"> Section_num: {current_section.section_num}")
              section_found = True
          elif line.isupper() and section_found and len(line) > 9:
              current_section.section_title = line
              # print(f"> Section_Title: {current_section.section_title}")
          if section_found:
              # print("Appended: ", line)
              current_section.text.append(line)
  return sections

In [347]:
def extract_subsections(sections: List[Section]) -> Dict[str, List[Section]]:
    section_map = dict()
    current_subsection = None
    nextLineTitle = False
    
    for section in sections:
        section_num = section.section_num[0]
        subsections = []
        print(f"Section_num: {section_num}")
        for line in section.text:
            print(f"Line: {line}")
            if line.startswith(section_num):
                print(f"! SubSection_num: {line}")
                if current_subsection:
                    subsections.append(current_subsection)
                current_subsection = Section()
                subsection_number = line.split()
                current_subsection.section_num = subsection_number[0]
                # Check if the title is in the same line as the section number
                print(subsection_number)
                if len(subsection_number) >= 2:
                    for word in subsection_number[1:]:
                        current_subsection.section_title += word
                    print(f"> 1Section_Title: {current_subsection.section_title}")
                    nextLineTitle = False
                else:
                    current_subsection.section_num = line
                    nextLineTitle = True
                    print(f"SubSection_num: {current_subsection.section_num}")
                    continue

            if nextLineTitle:
                current_subsection.section_title = line
                print(f"> 2Section_Title: {current_subsection.section_title}")
                nextLineTitle = False
            else:
                current_subsection.text.append(line)
        subsections.append(current_subsection)
        section_map[int(section_num)] = subsections
        current_subsection = None

    return section_map


In [352]:
def display_all_sections(sections, displayText):
    for section in sections:
        print("--------------------------------------------------------------------------------------")
        print(f"> Section number: {section}")
        print(f"- Title: {section.section_title}")
        if displayText:
            section.display_text()

In [353]:
def display_all_subsections(section_map, displayText):
    for section in section_map.keys():
        print("--------------------------------------------------------------------------------------")
        for subsection in section_map[section]:
            print(f"> Section number: {section}")
            print(f"- Subsection number: {subsection.section_num}")
            print(f"- Title: {subsection.section_title}")
            if displayText:
                subsection.display_text()

In [342]:
def display_one_section(sections: List[Section], section_num):
    print(f"Section number: {sections[section_num].section_num}")
    print(f"Title: {sections[section_num ].section_title}")
    sections[section_num].display_text()

In [320]:
def display_one_sub_section(subsections: Dict[int, List[Section]], section_num, subsection_index):
    print(f"Section number: {section_num}")
    print(f"Subsection number: {subsections[section_num][subsection_index].section_num}")
    print(f"Title: {subsections[section_num ][subsection_index].section_title}")
    subsections[section_num][subsection_index].display_text()

In [310]:
def remove_table_of_contents(pages):
    return [i for i in pages if i[0] != ""]

In [311]:
doc = fitz.open("4444 East.pdf")
out = open("output.txt", "wb")
pages = []
for page in doc:
    text = page.get_text().encode("utf8")
    pages.append(text)
    out.write(text)
    out.write(bytes((12,)))
    # pages.append(text)
out.close()

In [312]:
output_file = 'output.txt'
pattern = r'Page\s+([\d]+)'
pages = []
page_num = ""
with open(output_file, 'r', encoding='utf8') as f:
    page = []
    for line in f:
        if line.strip() != "":
            page.append(line.strip())
        matches = re.findall(pattern, line)
        if len(matches) > 0:
            page_num = matches[0]
        if '\f' in line:
            pages.append((page_num, page))
            page = []
            page_num = ""


In [328]:
# page 1 starts at index 7
content = remove_table_of_contents(pages)
# content[4]
# Header is found in first 4 lines
header = content[0][1][:4]
# header

sections = extract_by_section(content)

In [329]:
display_one_section(sections, 2)

Section number: 3.0
Title: HISTORICAL INFORMATION
3.0
HISTORICAL INFORMATION
Partner obtained historical use information about the subject property from a variety of sources.
Information regarding past land use was obtained by a review of historical aerial photographs, historical
Sanborn Fire Insurance maps, city directories, and historical topographic maps of the subject property and
surrounding area obtained from Environmental Data Resources (EDR).  Copies of the historical resources are
included in Appendix B.  EDR reported that Sanborn Fire Insurance Maps were available for the subject
property. A chronological listing of the historical data found is summarized in the table below.
Date
Scale or
Address
Source
Summary
1894, 1896,
1899, 1900,
1902, 1923
15-minute
Topographic
Map
The subject property and surrounding
properties are depicted as vacant land.  A
railroad is shown to the north.
1924/25,
1926/28
15-minute
Topographic
Map
Changes include the south-southwest property
which i

In [348]:
section_map = extract_subsections(sections)

Section_num: 1
Line: 1.0
! SubSection_num: 1.0
['1.0']
SubSection_num: 1.0
Line: INTRODUCTION
> 2Section_Title: INTRODUCTION
Line: Partner Engineering and Science, Inc. (Partner) has performed a Phase I Environmental Site Assessment
Line: (ESA) in conformance with the scope and limitations of ASTM Standard Practice E1527-13 and the
Line: Environmental Protection Agency Standards and Practices for All Appropriate Inquiries (AAI) (40 CFR Part
Line: 312) for the property located at 4444 East 26th Street in Vernon, Los Angeles County, California (the
Line: “subject property”).  Any exceptions to, or deletions from, this scope of work are described in the report.
Line: 1.1
! SubSection_num: 1.1
['1.1']
SubSection_num: 1.1
Line: Purpose
> 2Section_Title: Purpose
Line: The purpose of this ESA is to identify existing or potential Recognized Environmental Conditions (as
Line: defined by ASTM Standard E-1527-13) affecting the subject property that: 1) constitute or result in a
Line: material vio

In [349]:
section_map

{1: [<__main__.Section at 0x185fc70c4f0>,
  <__main__.Section at 0x185fc70ca60>,
  <__main__.Section at 0x185fc70c880>,
  <__main__.Section at 0x185fcd34c40>,
  <__main__.Section at 0x185fcd34a60>,
  <__main__.Section at 0x185fd889760>],
 2: [<__main__.Section at 0x185fd8890d0>,
  <__main__.Section at 0x185fd8894c0>,
  <__main__.Section at 0x185fd889b80>,
  <__main__.Section at 0x185fd889be0>,
  <__main__.Section at 0x185fc54a1c0>,
  <__main__.Section at 0x185fc54a190>,
  <__main__.Section at 0x185fc54af40>,
  <__main__.Section at 0x185fc54adf0>,
  <__main__.Section at 0x185fc54a850>],
 3: [<__main__.Section at 0x185f7999250>],
 4: [<__main__.Section at 0x185f7999100>,
  <__main__.Section at 0x185fefc87f0>,
  <__main__.Section at 0x185ff322cd0>,
  <__main__.Section at 0x185ff322a00>,
  <__main__.Section at 0x185f7f022b0>,
  <__main__.Section at 0x185fd55bf10>,
  <__main__.Section at 0x185fd55bf70>,
  <__main__.Section at 0x185fd55ba60>,
  <__main__.Section at 0x185fd55bc70>,
  <__main_

In [354]:
display_all_subsections(section_map, displayText=True)

> Section number: 1
- Subsection number: 1.0
- Title: INTRODUCTION
Partner Engineering and Science, Inc. (Partner) has performed a Phase I Environmental Site Assessment
(ESA) in conformance with the scope and limitations of ASTM Standard Practice E1527-13 and the
Environmental Protection Agency Standards and Practices for All Appropriate Inquiries (AAI) (40 CFR Part
312) for the property located at 4444 East 26th Street in Vernon, Los Angeles County, California (the
“subject property”).  Any exceptions to, or deletions from, this scope of work are described in the report.
> Section number: 1
- Subsection number: 1.1
- Title: Purpose
The purpose of this ESA is to identify existing or potential Recognized Environmental Conditions (as
defined by ASTM Standard E-1527-13) affecting the subject property that: 1) constitute or result in a
material violation or a potential material violation of any applicable environmental law; 2) impose any
material constraints on the operation of the subject

In [351]:
display_all_sections(sections, displayText=True)

> Section number: Section 1.0 INTRODUCTION:
- Title: INTRODUCTION
> Section number: Section 2.0 SITE DESCRIPTION:
- Title: SITE DESCRIPTION
> Section number: Section 3.0 HISTORICAL INFORMATION:
- Title: HISTORICAL INFORMATION
> Section number: Section 4.0 REGULATORY RECORDS REVIEW:
- Title: REGULATORY RECORDS REVIEW
> Section number: Section 5.0 USER PROVIDED INFORMATION AND INTERVIEWS:
- Title: USER PROVIDED INFORMATION AND INTERVIEWS
> Section number: Section 6.0 SITE RECONNAISSANCE:
- Title: SITE RECONNAISSANCE
> Section number: Section 7.0 FINDINGS AND CONCLUSIONS:
- Title: FINDINGS AND CONCLUSIONS
> Section number: Section 8.0 SIGNATURES OF ENVIRONMENTAL PROFESSIONALS:
- Title: SIGNATURES OF ENVIRONMENTAL PROFESSIONALS
