In [None]:
# Install the third-party packages imported by the import cell of this notebook.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
%pip install pdfminer.six
%pip install pdfplumber
%pip install numpy pandas
%pip install textdistance
%pip install regex
%pip install strictyaml


In [9]:
# PyYAML provides the `yaml` module imported by this notebook.
%pip install pyyaml

Collecting pyyaml
  Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl (138 kB)
   ---------------------------------------- 0.0/138.7 kB ? eta -:--:--
   -- ------------------------------------- 10.2/138.7 kB ? eta -:--:--
   ----------- --------------------------- 41.0/138.7 kB 653.6 kB/s eta 0:00:01
   ---------------------------------------- 138.7/138.7 kB 1.4 MB/s eta 0:00:00
Installing collected packages: pyyaml
Successfully installed pyyaml-6.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Optional, Iterable, Dict, Tuple
import regex
from textdistance import hamming
import strictyaml 
import yaml

In [58]:
class pdf_data():
    def __init__(self, filepath: str | Path, config: str | Path) -> None:
        self.filepath = filepath
        self.config = config

    # def get_text_containers(self, pages: Optional[Iterable[int]] = None) -> List[List]:
    #     self.text_containers = list()
    #     for page_idx, page_layout in enumerate(extract_pages(self.filepath)):
    #         page_text = []
    #         ypos      = []
    #         for el_idx, element in enumerate(page_layout):
    #             if isinstance(element, LTTextContainer):
    #                 page_text.append(element)
    #                 ypos.append(element.y0)
            
    #         indexes = sorted(range(len(ypos)), key=ypos.__getitem__)[::-1]
    #         page_text = list(map(page_text.__getitem__, indexes))
    #         self.text_containers.append(page_text)  

    def get_text_containers(self, pages: Optional[Iterable[int]] = None) -> pd.DataFrame:
        text_dict = dict(page=[], y_bottom=[], y_top=[], x_left=[], x_right=[], width=[], text=[],)
        for page_idx, page_layout in enumerate(extract_pages(self.filepath)):
            for el_idx, element in enumerate(page_layout):
                if isinstance(element, LTTextContainer):
                    text_dict['page'].append(page_idx)
                    text_dict['y_bottom'].append(element.y0)
                    text_dict['y_top'].append(element.y1)
                    text_dict['x_left'].append(element.x0)
                    text_dict['x_right'].append(element.x1)
                    text_dict['width'].append(element.width)
                    text_dict['text'].append(element.get_text().lower())
                else:
                    print(f'page = {page_idx}, ypos={element.y0}, type={type(element)}')

        self.text_df = pd.DataFrame(text_dict)
        y_min, y_max = self.text_df['y_bottom'].min(), self.text_df['y_top'].max()
        self.text_df['norm_y_top']    = self.text_df['page'] + ((y_max - self.text_df['y_top'   ]) / (y_max - y_min))
        self.text_df['norm_y_bottom'] = self.text_df['page'] + ((y_max - self.text_df['y_bottom']) / (y_max - y_min))
        # self.text_df['norm_y_top']    = self.text_df['page'] + ((self.text_df['y_bottom'] - y_min) / y_max)

    def get_section_headers(self):
        with open(self.config, 'r') as file:
            self.config_data = yaml.safe_load(file) 
        
        self.sections = dict()
        for header in self.config_data['section_headers']:
            self.sections[header] = dict(bounds = pd.DataFrame())

    def get_section_bounds(self):

        self.get_section_headers()

        # detect start and end of sections
        prev_section_info = dict(header=None, text_container=None, page_idx=None)
        for page_idx, page_text in enumerate(self.text_containers):
            for text_container in page_text:  
                detected_section = [header for header in self.sections.keys() if hamming.normalized_similarity(text_container.get_text().lower(), header) > 0.75]
                if len(detected_section) == 1:
                    if prev_section_info['header'] is not None:   # store end of sections

                        prev_end_page = prev_section_info['page_idx']
                        prev_end_y    = prev_section_info['text_container'].y0
                        prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
                        self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['end_page', 'end_y']] = [prev_end_page, prev_end_y]

                    # store start of sections
                    tmp_df = pd.DataFrame(data    = zip([page_idx], [text_container.y1], [np.nan], [np.nan]),
                                          columns = ['start_page', 'start_y', 'end_page', 'end_y'])
                    header = detected_section[0]
                    self.sections[header]['bounds'] = pd.concat((self.sections[header]['bounds'], tmp_df), ignore_index=True)

                    prev_section_info['header'] = header
                prev_section_info['text_container'] = text_container
                prev_section_info['page_idx'] = page_idx

        # store end of final section
        prev_end_page = prev_section_info['page_idx']
        prev_end_y    = prev_section_info['text_container'].y0
        prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
        self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['end_page', 'end_y']] = [prev_end_page, prev_end_y]

    def identify_section_structure(self, section_header): #TODO
        section_dict = self.sections[section_header]
        for sub_idx, subsection_bounds in section_dict['bounds'].iterrows():
            for page_idx, page_text in enumerate(self.text_containers):
                for text_container in page_text:
                    tmp = []
                    # if (page_idx == subsection_bounds(text_container.y1 <= subsection_bounds. 

    def store_section_data(self, section_header): #TODO
        self.identify_section_structure(section_header)

    def print_text(self):
        max_x = 0
        for page_text in self.text_containers:
            for text_container in page_text:
                if text_container.x1 > max_x:
                    max_x = text_container.x1
        for page_num, page_text in enumerate(self.text_containers):
            print('###############################################')
            print(f'Page {page_num}')
            print('###############################################') 
            for text_container in page_text:  
                text = text_container.get_text()
                # print(f'{text_container.y0} to {text_container.y1}', text)
                print(text_container.y1, text_container.y0, text)


class exhibit_data(pdf_data):
    """Exhibit-specific extraction on top of ``pdf_data``.

    All scanners work on ``self.text_df`` (one row per text container, with
    lower-cased ``text``), read page by page from top to bottom.
    """

    def __init__(self, filepath: str | Path,  config: str | Path) -> None:
        """Store the PDF path and the YAML config path via the base class."""
        super().__init__(filepath, config)

    def _page_lines(self) -> Dict[int, List[str]]:
        """Return ``{page index: [lower-cased container text, ...]}`` in top-to-bottom order.

        Runs ``get_text_containers()`` first when ``self.text_df`` has not
        been built yet. Pages without any text container are absent.
        """
        if not hasattr(self, 'text_df'):
            self.get_text_containers()
        # PDF y grows upwards: the largest y_top is the first line of the page.
        ordered = self.text_df.sort_values(['page', 'y_top'], ascending=[True, False], kind='stable')
        return {int(page): rows['text'].tolist() for page, rows in ordered.groupby('page', sort=True)}

    @staticmethod
    def _starts_new_section(text: str) -> bool:
        """True when any space-separated word of ``text`` resembles 'requirements' or 'specifications'."""
        return any(   hamming.normalized_similarity(word, 'requirements')   > 0.75
                   or hamming.normalized_similarity(word, 'specifications') > 0.75
                   for word in text.split(' '))

    def _section_lines(self, page_lines: List[str], header: str) -> List[str]:
        """Return the lines of one page that follow ``header`` up to the next section header.

        The header line is the first line whose normalised Hamming similarity
        with ``header`` exceeds 0.75. Collection stops at the first later line
        that looks like another section header, or at the end of the page.
        An empty list is returned when the header is not on the page.
        """
        body = []
        inside_section = False
        for text in page_lines:
            if inside_section:
                if self._starts_new_section(text):
                    break
                body.append(text)
            elif hamming.normalized_similarity(text, header) > 0.75:
                inside_section = True
        return body

    def get_exhibit_name(self):
        """Find the exhibit name on the first page and store it in ``self.exhibit``.

        The first line (from the top of page 0) containing 'exhibit' is used:
        the word 'exhibit' and newlines are removed and leading whitespace is
        stripped. ``self.exhibit`` is ``None`` when no such line exists.

        Returns:
            The value stored in ``self.exhibit``.
        """
        self.exhibit = None
        leading_whitespace = regex.compile(r'^\s+')
        for text in self._page_lines().get(0, []):
            if 'exhibit' in text:
                self.exhibit = leading_whitespace.sub('', text.replace('exhibit', '').replace('\n', ''))
                break   # keep the title; later mentions of "exhibit" must not overwrite it
        return self.exhibit

    def get_ground_space_requirements(self, keyphrases: List[str]):
        """Print, for every line of the 'ground space requirements' section, the fuzzy matches per keyphrase.

        For each section line and each entry of ``keyphrases`` the word
        characters directly following the keyphrase are searched with up to
        three errors allowed (``regex`` fuzzy matching, best match), and the
        resulting list is printed.

        Args:
            keyphrases: Phrases interpolated verbatim into the regex pattern.
        """
        for page_lines in self._page_lines().values():
            for text in self._section_lines(page_lines, 'ground space requirements'):
                for key in keyphrases:
                    pattern = fr'((?<={key})\w+)'
                    value = regex.findall(f'{pattern}{{e<=3}}', text, regex.BESTMATCH)
                    print(value)

    def get_equipment_specifications(self):
        """Collect the raw lines of the 'equipment specifications' section of every page.

        Parsing those lines into individual fields is not implemented yet.

        Returns:
            The lower-cased section lines, in page and reading order.
        """
        section_lines = []
        for page_lines in self._page_lines().values():
            section_lines.extend(self._section_lines(page_lines, 'equipment specifications'))
        return section_lines

                
                

In [59]:
# Input PDF (relative path) and the YAML config that lists the section headers.
pdf_path = Path('amendments/Old_Exhibit_Redacted_OCR_new.pdf')
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook.
config_path = Path(r'C:\Users\Dalton\Documents\personal_records\apex_consulting\materials_and_amendments_OCR\configs\atc_config.yaml')

# Build the exhibit object, extract its text containers, then locate the sections.
old_exhibit = exhibit_data(pdf_path, config_path)
old_exhibit.get_text_containers()
old_exhibit.get_section_bounds()


page = 0, ypos=7.38e-07, type=<class 'pdfminer.layout.LTFigure'>
page = 1, ypos=-5.348e-06, type=<class 'pdfminer.layout.LTFigure'>


AttributeError: 'exhibit_data' object has no attribute 'text_containers'

In [56]:
# Inspect the bounds table recorded for the 'equipment specifications' section.
old_exhibit.sections['equipment specifications']['bounds']

In [60]:
# Display the per-container text table built by get_text_containers().
old_exhibit.text_df

Unnamed: 0,page,y_bottom,y_top,x_left,x_right,width,text,norm_y_top,norm_y_bottom
0,0,746.00,752.50,232.5000,278.500005,46.000005,exhibit a-3\n,0.000000,0.010806
1,0,680.50,685.50,37.0000,515.000025,478.000025,total lease area sq.ft:21600° primary contiguo...,0.111388,0.119701
2,0,694.50,701.50,203.0000,348.500160,145.500160,ground space requirements\n,0.084788,0.096426
3,0,666.00,671.00,155.0000,515.000050,360.000050,"concrete pad 1000 16.00"" nia 160.00]\n",0.135495,0.143807
4,0,651.00,656.00,154.5000,515.000000,360.500000,outside primary lease area nia nia nia sq. ft:...,0.160432,0.168745
...,...,...,...,...,...,...,...,...,...
60,1,357.84,362.70,37.6199,488.159999,450.540099,line configuration n/a n/a 2 n/a n/a n/a\n,1.648047,1.656126
61,1,350.46,355.32,245.7000,310.320000,64.620000,"2 - fiber/hybrid; 0.19""\n",1.660316,1.668396
62,1,343.08,347.94,253.8000,302.579992,48.779992,(4.8mm) fiber; 2\n,1.672585,1.680665
63,1,372.78,377.64,252.1800,303.479948,51.299948,2 - control cable;\n,1.623209,1.631288


In [None]:

# Re-extract the text, read the exhibit name, then print the fuzzy matches
# found after each keyphrase in the 'ground space requirements' section.
old_exhibit.get_text_containers()
old_exhibit.get_exhibit_name()
old_exhibit.get_ground_space_requirements(keyphrases=['lease area',  'concrete pad'])

In [10]:
# Dump every text container page by page (y1, y0, text).
old_exhibit.print_text()

###############################################
Page 0
###############################################
752.5 746.0 Exhibit A-3

701.5 694.5 GROUND SPACE REQUIREMENTS

685.5 680.5 Total Lease Area Sq.Ft:21600° Primary Contiguous Lease Area L:1200° W:1800 H:1000 Sq.Ft: 216.00)

671.0 666.0 Concrete Pad 1000 16.00" NIA 160.00]

656.0 651.0 Outside Primary Lease Area NIA NIA NIA Sq. Ft: N/A|

641.5 634.5 BACKUP POWER REQUIREMENTS

625.5 620.5 Generator: NIA Capacity(KW): N/A Fuel Tank Size(gal): N/A Fuel Type: NIA Fuel Tank Setback{radiu:

611.0 604.0 UTILITY REQUIREMENTS

595.0 590.0 Power Provided By: Utiity Company Direct

580.5 575.5 uA

580.5 575.5 Telco/lnterconnect:

565.5 559.0 TRANSMITTER & RECEIVER SPECIFICATIONS

550.0 545.0 Type: N/A Quantity: NIA TX Power(watts): NIA ERP(watts): N/A

535.5 528.5 EQUIPMENT SPECIFICATIONS

525.5 485.0 Type PANEL PANEL TTA RRURRH RRURRH RRURRH
Manufacturer CellMax RFS RFS Ericsson Ericsson Ericsson
Model # CMABIGS21/E06 | APXVAA24_43-U-A20| ATM19

In [171]:
# Scratch check of manual iterator advancement: consume the first two items;
# the second one is the cell's displayed value.
tmp = iter([1, 2, 3])
next(tmp)
next(tmp)

AttributeError: 'list_iterator' object has no attribute 'next'

In [149]:
# Dump the extracted text of every page.
old_exhibit.print_text()

###############################################
Page 0
###############################################
Exhibit A-3

GROUND SPACE REQUIREMENTS

Total Lease Area Sq.Ft:21600° Primary Contiquous Lease Area L:1200 W:1800 H:1000° Sq.Ft:216.00)

Concrete Pad 1000 1600 NA 160.00|

Outside Primary Lease Area NA NA NA Sq. Ft: N/A|

BACKUP POWER REQUIREMENTS

Generator: NIA Capacity(KW): N/A Fuel Tank Size(gal): N/A Fuel Type: NIA Fuel Tank Setback{radiu:

UTILITY REQUIREMENTS

Power Provided By: Utiity Company Direct

uA

Telco/nterconnect:

TRANSMITTER & RECEIVER SPECIFICATIONS

Type: N/A Quantity: NIA TX Power(watts): NIA ERP(watts): N/A

EQUIPMENT SPECIFICATIONS

Type PANEL PANEL TTA RRURRH RRURRH RRURRH
Manufacturer CellMax RFS RFS Ericsson Ericsson Ericsson
Model # CMABIGS21/E06 | APXVAA24_43-U-A20| ATM1900D-1CWA RRUZ2 RRU22 Radio 4478 B71

Dimensions HxXWxD | 81.1'x7.7'x48 | 96'x24'x85 | 86x10x26 | 202 x132x69" | 202 x132°x68" | 16'x 132 x74"

[Weight(ibs.) 350 1014 84 529 529 600
Locatio

In [153]:
# Walk every page of the PDF and call get_text() on each child of every text
# container; the returned strings are discarded.
for page_layout in extract_pages(pdf_path):
    for idx, element in enumerate(page_layout):
        if isinstance(element, LTTextContainer):
            for text_line in element:
                text_line.get_text()

KeyboardInterrupt: 