In [1]:
import fitz
import re
from nltk import tokenize
from nltk.stem import PorterStemmer
from typing import List, Tuple, Dict
import os

import sys
sys.path.append('../src/')
from classes.Section import Section

In [2]:
def extract_by_section(pages: List[Tuple[str, List[str]]]) -> List[Section]:
  """
  Pages is structured as: [('1', ListofLines)]

  Returns: List of Tuples. In each tuple is its section and the lines within the section

  """
  sections = []
  current_section = None
  section_found  = False
  for page, lines in pages:
      for line in lines[4:]:
          if line.endswith(".0"):
              if current_section:
                  sections.append(current_section)
              current_section = Section()
              current_section.section_num = line
              # print(f"> Section_num: {current_section.section_num}")
              section_found = True
          elif line.isupper() and section_found and len(line) > 9:
              current_section.section_title = line
              # print(f"> Section_Title: {current_section.section_title}")
          if section_found:
              # print("Appended: ", line)
              current_section.text.append(line)
  return sections

In [3]:
def extract_subsections(sections: List[Section]) -> Dict[str, List[Section]]:
    section_map = dict()
    current_subsection = None
    nextLineTitle = False
    
    for section in sections:
        section_num = section.section_num[0]
        subsections = []
        print(f"Section_num: {section_num}")
        for line in section.text:
            print(f"Line: {line}")
            if line.startswith(section_num):
                print(f"! SubSection_num: {line}")
                if current_subsection:
                    subsections.append(current_subsection)
                current_subsection = Section()
                subsection_number = line.split()
                current_subsection.section_num = subsection_number[0]
                # Check if the title is in the same line as the section number
                print(subsection_number)
                if len(subsection_number) >= 2:
                    for word in subsection_number[1:]:
                        current_subsection.section_title += word
                    print(f"> 1Section_Title: {current_subsection.section_title}")
                    nextLineTitle = False
                else:
                    current_subsection.section_num = line
                    nextLineTitle = True
                    print(f"SubSection_num: {current_subsection.section_num}")
                    continue

            if nextLineTitle:
                current_subsection.section_title = line
                print(f"> 2Section_Title: {current_subsection.section_title}")
                nextLineTitle = False
            else:
                current_subsection.text.append(line)
        subsections.append(current_subsection)
        section_map[int(section_num)] = subsections
        current_subsection = None

    return section_map


In [4]:
def display_all_sections(sections, displayText):
    for section in sections:
        print("--------------------------------------------------------------------------------------")
        print(f"> Section number: {section}")
        print(f"- Title: {section.section_title}")
        if displayText:
            section.display_text()

In [5]:
def display_all_subsections(section_map, displayText):
    for section in section_map.keys():
        print("--------------------------------------------------------------------------------------")
        for subsection in section_map[section]:
            print(f"> Section number: {section}")
            print(f"- Subsection number: {subsection.section_num}")
            print(f"- Title: {subsection.section_title}")
            if displayText:
                subsection.display_text()

In [6]:
def display_one_section(sections: List[Section], section_num):
    print(f"Section number: {sections[section_num].section_num}")
    print(f"Title: {sections[section_num ].section_title}")
    sections[section_num].display_text()

In [7]:
def display_one_sub_section(subsections: Dict[int, List[Section]], section_num, subsection_index):
    print(f"Section number: {section_num}")
    print(f"Subsection number: {subsections[section_num][subsection_index].section_num}")
    print(f"Title: {subsections[section_num ][subsection_index].section_title}")
    subsections[section_num][subsection_index].display_text()

In [8]:
def remove_table_of_contents(pages):
    return [i for i in pages if i[0] != ""]

In [9]:
pdf_name = "4444 east.pdf"
pdf_path = os.path.join("..", "data","raw", pdf_name)
output_path = os.path.join("..", "data","processed", "output.txt")
doc = fitz.open(pdf_path)
out = open(output_path, "wb")
for page in doc:
    text = page.get_text().encode("utf8")
    out.write(text)
    out.write(bytes((12,)))
out.close()
doc.close()

In [10]:
pattern = r'Page\s+([\d]+)'
pages = []
page_num = ""
with open(output_path, 'r', encoding='utf8') as f:
    page = []
    for line in f:
        if line.strip() != "":
            page.append(line.strip())
        matches = re.findall(pattern, line)
        if len(matches) > 0:
            page_num = matches[0]
        if '\f' in line:
            pages.append((page_num, page))
            page = []
            page_num = ""


In [11]:
content = remove_table_of_contents(pages)
# Header is found in first 4 lines
header = content[0][1][:4]

sections = extract_by_section(content)

In [18]:
display_one_section(sections, 1)

Section number: 2.0
Title: SITE DESCRIPTION
2.0
SITE DESCRIPTION
2.1
Site Location and Legal Description
The subject property at 4444 East 26th Street,
Vernon, California is located on the southwestern
intersection of East 26th Street and Ayers Avenue.
The subject property was inspected by Joseph Kim of
Partner on October 28, 2021.  The weather at the time
of the site visit was sunny and in the mid-70s
(degrees Fahrenheit). According to the Los Angeles
County Assessor, the subject property is legally
described as OM 3-19-27 EX OF R/W AND STS LOT 3 DIV
105 REG 48 and is owned by LBA RVI –
Company VIII, LLC.
Please refer to Figure 1: Site Location Map, Figure
2: Site Plan, Figure 3: Topographic Map, and Appendix
A: Site Photographs for the location and site
characteristics of the subject property.
2.2
Current Property Use
The subject property is currently occupied by
JSource, a sub-tenant of ACCO (HVAC equipment and
installation company).  Onsite operations consist of
the warehousing and

In [13]:
section_map = extract_subsections(sections)

Section_num: 1
Line: 1.0
! SubSection_num: 1.0
['1.0']
SubSection_num: 1.0
Line: INTRODUCTION
> 2Section_Title: INTRODUCTION
Line: Partner Engineering and Science, Inc. (Partner) has
Line: performed a Phase I Environmental Site Assessment
Line: (ESA) in conformance with the scope and limitations
Line: of ASTM Standard Practice E1527-13 and the
Line: Environmental Protection Agency Standards and
Line: Practices for All Appropriate Inquiries (AAI) (40 CFR
Line: Part
Line: 312) for the property located at 4444 East 26th
Line: Street in Vernon, Los Angeles County, California (the
Line: “subject property”).  Any exceptions to, or deletions
Line: from, this scope of work are described in the report.
Line: 1.1
! SubSection_num: 1.1
['1.1']
SubSection_num: 1.1
Line: Purpose
> 2Section_Title: Purpose
Line: The purpose of this ESA is to identify existing or
Line: potential Recognized Environmental Conditions (as
Line: defined by ASTM Standard E-1527-13) affecting the
Line: subject property that:

In [14]:
section_map

{1: [<classes.Section.Section at 0x2041f58dd30>,
  <classes.Section.Section at 0x2041f58d4f0>,
  <classes.Section.Section at 0x2041f58d100>,
  <classes.Section.Section at 0x2041f58d820>,
  <classes.Section.Section at 0x2041f58d2b0>],
 2: [<classes.Section.Section at 0x2041f58d430>,
  <classes.Section.Section at 0x2041f58d670>,
  <classes.Section.Section at 0x2041f58da00>,
  <classes.Section.Section at 0x2041f58dd60>,
  <classes.Section.Section at 0x2041f58dee0>,
  <classes.Section.Section at 0x2041f58dbb0>,
  <classes.Section.Section at 0x2041f58dc70>],
 3: [<classes.Section.Section at 0x2041f58d880>],
 4: [<classes.Section.Section at 0x2041f58dca0>,
  <classes.Section.Section at 0x2041f58d4c0>,
  <classes.Section.Section at 0x2041f58d940>,
  <classes.Section.Section at 0x2041f58d3a0>,
  <classes.Section.Section at 0x2041f58d850>,
  <classes.Section.Section at 0x2041f58dc10>,
  <classes.Section.Section at 0x2041f58dfa0>,
  <classes.Section.Section at 0x2041f58d400>,
  <classes.Section.

In [15]:
display_all_subsections(section_map, displayText=True)

--------------------------------------------------------------------------------------
> Section number: 1
- Subsection number: 1.0
- Title: INTRODUCTION
Partner Engineering and Science, Inc. (Partner) has
performed a Phase I Environmental Site Assessment
(ESA) in conformance with the scope and limitations
of ASTM Standard Practice E1527-13 and the
Environmental Protection Agency Standards and
Practices for All Appropriate Inquiries (AAI) (40 CFR
Part
312) for the property located at 4444 East 26th
Street in Vernon, Los Angeles County, California (the
“subject property”).  Any exceptions to, or deletions
from, this scope of work are described in the report.
> Section number: 1
- Subsection number: 1.1
- Title: Purpose
The purpose of this ESA is to identify existing or
potential Recognized Environmental Conditions (as
defined by ASTM Standard E-1527-13) affecting the
subject property that: 1) constitute or result in a
material violation or a potential material violation
of any applicabl

In [16]:
display_all_sections(sections, displayText=True)

--------------------------------------------------------------------------------------
> Section number: Section 1.0 INTRODUCTION:
- Title: INTRODUCTION
1.0
INTRODUCTION
Partner Engineering and Science, Inc. (Partner) has
performed a Phase I Environmental Site Assessment
(ESA) in conformance with the scope and limitations
of ASTM Standard Practice E1527-13 and the
Environmental Protection Agency Standards and
Practices for All Appropriate Inquiries (AAI) (40 CFR
Part
312) for the property located at 4444 East 26th
Street in Vernon, Los Angeles County, California (the
“subject property”).  Any exceptions to, or deletions
from, this scope of work are described in the report.
1.1
Purpose
The purpose of this ESA is to identify existing or
potential Recognized Environmental Conditions (as
defined by ASTM Standard E-1527-13) affecting the
subject property that: 1) constitute or result in a
material violation or a potential material violation
of any applicable environmental law; 2) impose any