In [None]:
%pip install pdfminer.six
%pip install pdfplumber
%pip install numpy pandas
%pip install textdistance
%pip install regex
%pip install strictyaml


In [9]:
%pip install pyyaml

Collecting pyyaml
  Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl (138 kB)
   ---------------------------------------- 0.0/138.7 kB ? eta -:--:--
   -- ------------------------------------- 10.2/138.7 kB ? eta -:--:--
   ----------- --------------------------- 41.0/138.7 kB 653.6 kB/s eta 0:00:01
   ---------------------------------------- 138.7/138.7 kB 1.4 MB/s eta 0:00:00
Installing collected packages: pyyaml
Successfully installed pyyaml-6.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Optional, Iterable, Dict, Tuple
import regex
from textdistance import hamming
import strictyaml 
import yaml

In [10]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [8]:
class pdf_data():
    def __init__(self, filepath: str | Path, config: str | Path) -> None:
        self.filepath = filepath
        self.config = config

    def get_text_containers(self, pages: Optional[Iterable[int]] = None) -> pd.DataFrame:
        text_dict = dict(page=[], y_bottom=[], y_top=[], x_left=[], x_right=[], width=[], text=[],)
        for page_idx, page_layout in enumerate(extract_pages(self.filepath)):
            for el_idx, element in enumerate(page_layout):
                if isinstance(element, LTTextContainer):
                    text_dict['page'].append(page_idx)
                    text_dict['y_bottom'].append(element.y0)
                    text_dict['y_top'].append(element.y1)
                    text_dict['x_left'].append(element.x0)
                    text_dict['x_right'].append(element.x1)
                    text_dict['width'].append(element.width)
                    text_dict['text'].append(element.get_text().lower())
                else:
                    print(f'page = {page_idx}, ypos={element.y0}, type={type(element)}')

        self.text_df = pd.DataFrame(text_dict)
        y_min, y_max = self.text_df['y_bottom'].min(), self.text_df['y_top'].max()
        self.text_df['norm_y_top']    = self.text_df['page'] + ((y_max - self.text_df['y_top'   ]) / (y_max - y_min))
        self.text_df['norm_y_bottom'] = self.text_df['page'] + ((y_max - self.text_df['y_bottom']) / (y_max - y_min))
        # self.text_df['norm_y_top']    = self.text_df['page'] + ((self.text_df['y_bottom'] - y_min) / y_max)

    def get_section_headers(self):
        with open(self.config, 'r') as file:
            self.config_data = yaml.safe_load(file) 
        
        self.sections = dict()
        for header in self.config_data['section_headers']:
            self.sections[header] = dict(bounds = pd.DataFrame())

    def get_section_bounds(self):

        self.get_section_headers()

        # detect start and end of sections
        prev_section_info = dict(header=None, df_row=None)
        for row_idx, text_data in self.text_df.iterrows():
            detected_section = None
            for header in self.sections.keys():
                match = regex.search(f'{header}{{s<=3,i<=3,d<=3}}', text_data['text'], regex.BESTMATCH)    
                if match is not None:
                    detected_section = header
                    break

            if detected_section is not None:
                if prev_section_info['header'] is not None:   # store end of sections

                    prev_y_bottom = self.text_df.loc[prev_section_info['df_row'], 'norm_y_bottom']
                    prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
                    self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['y_bottom']] = [prev_y_bottom]

                # store start of sections
                tmp_df = pd.DataFrame(data    = zip([text_data['norm_y_top']], [np.nan]),
                                      columns = ['y_top', 'y_bottom'],
                                      )
                header = detected_section
                self.sections[header]['bounds'] = pd.concat((self.sections[header]['bounds'], tmp_df), ignore_index=True)

                prev_section_info['header'] = header
            prev_section_info['df_row'] = row_idx
        
        # store end of final section
        prev_y_bottom = self.text_df.loc[prev_section_info['df_row'], 'norm_y_bottom']
        prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
        self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['y_bottom']] = [prev_y_bottom]

    def identify_section_structure(self, section_header): #TODO
        section_dict = self.sections[section_header]
        for sub_idx, subsection_bounds in section_dict['bounds'].iterrows():
            mask = (self.text_df['norm_y_top'] > subsection_bounds['y_top']) & \
                   (self.text_df['norm_y_bottom'] < subsection_bounds['y_bottom']) 
            subsect_df = self.text_df.loc[mask, :]
            for l_idx, line in subsect_df.iterrows():
                print(line['text'].split(' '))

    def store_section_data(self, section_header): #TODO
        self.identify_section_structure(section_header)

    def print_text(self):
        max_x = 0
        for page_text in self.text_containers:
            for text_container in page_text:
                if text_container.x1 > max_x:
                    max_x = text_container.x1
        for page_num, page_text in enumerate(self.text_containers):
            print('###############################################')
            print(f'Page {page_num}')
            print('###############################################') 
            for text_container in page_text:  
                text = text_container.get_text()
                # print(f'{text_container.y0} to {text_container.y1}', text)
                print(text_container.y1, text_container.y0, text)


class exhibit_data(pdf_data):
    """Exhibit-specific readers built on top of ``pdf_data``.

    All methods read ``self.text_containers``: a sequence of pages, each an
    iterable of objects exposing ``get_text()``. It must be populated before
    these methods are called.
    """

    # A line/word counts as a fuzzy match above this normalized Hamming similarity.
    _SIMILARITY_THRESHOLD = 0.75
    # A line containing a word resembling one of these ends the current section.
    _SECTION_END_WORDS = ('requirements', 'specifications')

    def __init__(self, filepath: str | Path,  config: str | Path) -> None:
        super().__init__(filepath, config)
        self.exhibit = None  # set by get_exhibit_name()

    def _ends_section(self, text: str) -> bool:
        """True when any space-separated word of ``text`` resembles a section-ending word."""
        return any(hamming.normalized_similarity(word, target) > self._SIMILARITY_THRESHOLD
                   for word in text.split(' ')
                   for target in self._SECTION_END_WORDS)

    def _iter_section_lines(self, section_title: str) -> Iterable[str]:
        """Yield the lower-cased lines that follow ``section_title`` on each page.

        On every page, lines are skipped until one resembles ``section_title``;
        the following lines are yielded until a line that ends the section
        (see ``_ends_section``) or the end of the page, whichever comes first.
        Pages without the title yield nothing.
        """
        for page_text in self.text_containers:
            in_section = False
            for text_container in page_text:
                text = text_container.get_text().lower()
                if not in_section:
                    if hamming.normalized_similarity(text, section_title) > self._SIMILARITY_THRESHOLD:
                        in_section = True
                    continue
                if self._ends_section(text):
                    break
                yield text

    def get_exhibit_name(self):
        """Derive the exhibit name from the first page and store it in ``self.exhibit``.

        For each first-page line containing ``'exhibit'``, the word and newlines
        are removed and leading whitespace is stripped; the last such line wins.
        ``self.exhibit`` is left untouched when there is no page or no match.

        Returns:
            The current value of ``self.exhibit``.
        """
        if not self.text_containers:
            return self.exhibit

        leading_ws = regex.compile(r'^\s+')
        for text_container in self.text_containers[0]:
            text = text_container.get_text().lower()
            if 'exhibit' in text:
                self.exhibit = leading_ws.sub('', text.replace('exhibit', '').replace('\n', ''))
        return self.exhibit

    def get_ground_space_requirements(self, keyphrases: List[str]):
        """Print (and collect) the fuzzy matches following each keyphrase in the ground-space section.

        Args:
            keyphrases: phrases whose directly following word is wanted. Each is
                interpolated into a regex look-behind unescaped.

        Returns:
            ``{keyphrase: [matches...]}`` accumulated over all section lines.
        """
        found: Dict[str, List[str]] = {key: [] for key in keyphrases}
        for text in self._iter_section_lines('ground space requirements'):
            for key in keyphrases:
                pattern = fr'((?<={key})\w+)'
                value = regex.findall(f'{pattern}{{e<=3}}', text, regex.BESTMATCH)
                print(value)
                found[key].extend(value)
        return found

    def get_equipment_specifications(self):
        """Return the lower-cased lines of the 'equipment specifications' section.

        TODO: parse the returned lines into structured equipment records.
        """
        return list(self._iter_section_lines('equipment specifications'))

                
                

In [28]:
# Input locations used by the cells below. The two PDF paths are relative to the
# kernel's working directory.
pdf_path = Path('amendments/Old_Exhibit_Redacted_OCR_new.pdf')
original_pdf_path = Path('amendments/Old_Exhibit_Redacted.pdf')
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# as-is; consider making it relative to a configurable base directory.
config_path = Path(r'C:\Users\Dalton\Documents\personal_records\apex_consulting\materials_and_amendments_OCR\configs\atc_config.yaml')


In [26]:
# Convert every page of the original exhibit to plain text, accumulated in
# `output_string`. `line_margin=0.25` is passed to pdfminer's layout analysis.
output_string = StringIO()
# Uses `original_pdf_path` from the path cell; the previous name `original_pdf`
# is not defined anywhere in the notebook and fails on a fresh kernel.
with open(original_pdf_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams(line_margin=0.25))
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

In [27]:
# Show the plain text accumulated in `output_string` by the extraction cell.
print(output_string.getvalue())

Type

DISH-HP

DISH-HP

Radio/ODU

Radio/ODU

DISH-HP

Radio/ODU

Manufacturer

RFS

Commscope

Model #

SB6-W60BC

USX10-11W

Ceragon

RFU-D

Ceragon

RFS

Ceragon

FibeAir IP-20E

SB4-W60

FibeAir IP-20E

Dimensions HxWxD

6.23' x 6.23' x 2.98'

10.00' x -' x -'

9.1" x 9.2" x 3.9"

9.2" x 9.1" x 3.9"

4.14' x 4.14' x -'

9.2" x 9.1" x 3.9"

Weight(lbs.)

Location

RAD Center AGL

Equipment Tip
Height

Equipment Base
Height

198.0

Tower

155.0'

158.1'

151.9'

579.8

Tower

155.0'

160.0'

150.0'

14.3

Tower

155.0'

155.4'

154.6'

14.3

Tower

155.0'

155.4'

154.6'

77.0

Tower

125.0'

127.1'

122.9'

14.3

Tower

125.0'

125.4'

124.6'

Mount Type

Pole Mount

Pole Mount

Pole Mount

Pole Mount

Pole Mount

Pole Mount

Quantity

1

1

292.15

184.14

Azimuths/Dir. of
Radiation

Quant. Per
Azimuth/Sector

TX/RX Frequency
Units

TX Frequency

RX Frequency

Using Unlicensed
Frequencies?

1

MHz

6400

6400

No

Equipment Gain

35.7/ 36.7/ 37.3

Total # of Lines

Line Quant. Per

In [29]:

# Wrap the original exhibit, extract its text layout, locate the configured
# sections, then print the contents of the 'equipment specifications' section.
old_exhibit = exhibit_data(original_pdf_path, config_path)
old_exhibit.get_text_containers()
old_exhibit.get_section_bounds()
old_exhibit.store_section_data('equipment specifications')

page = 0, ypos=0, type=<class 'pdfminer.layout.LTFigure'>
page = 1, ypos=339.33029632800003, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=732.5789288880001, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=732.5789288880001, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=732.5789288880001, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class 'pdfminer.layout.LTRect'>
page = 1, ypos=717.739357848, type=<class

KeyError: None

In [13]:
# Inspect the y_top / y_bottom bounds recorded for the 'ground space requirements' section.
old_exhibit.sections['ground space requirements']['bounds']

Unnamed: 0,y_top,y_bottom
0,0.084788,0.168745


In [30]:
# Inspect the per-text-container table built by get_text_containers().
old_exhibit.text_df

Unnamed: 0,page,y_bottom,y_top,x_left,x_right,width,text,norm_y_top,norm_y_bottom
0,1,722.260063,728.938064,37.161801,52.251453,15.089652,type\n,1.000000,1.017242
1,1,722.260063,728.938064,126.920802,154.376253,27.455450,dish-hp\n,1.000000,1.017242
2,1,722.260063,728.938064,194.906555,222.362005,27.455450,dish-hp\n,1.000000,1.017242
3,1,722.260063,728.938064,261.084053,295.227286,34.143233,radio/odu\n,1.000000,1.017242
4,1,722.260063,728.938064,330.229552,364.372784,34.143233,radio/odu\n,1.000000,1.017242
...,...,...,...,...,...,...,...,...,...
156,1,432.888453,439.566453,480.641907,484.355892,3.713985,0\n,1.747121,1.764363
157,1,415.822453,422.500453,476.932649,488.064823,11.132174,n/a\n,1.791183,1.808425
158,1,399.498452,406.176452,476.934133,488.066307,11.132174,n/a\n,1.833330,1.850571
159,1,384.658452,391.336452,476.934875,488.067049,11.132174,n/a\n,1.871645,1.888886


In [None]:

# Re-extract the text layout, then read the exhibit name and look for the values
# following the given keyphrases in the ground-space section.
old_exhibit.get_text_containers()
old_exhibit.get_exhibit_name()
old_exhibit.get_ground_space_requirements(keyphrases=['lease area',  'concrete pad'])

In [7]:
# Dump every text container page by page; print_text() reads `old_exhibit.text_containers`.
old_exhibit.print_text()

AttributeError: 'exhibit_data' object has no attribute 'text_containers'

AttributeError: 'list_iterator' object has no attribute 'next'

In [149]:
# Same dump as the previous cell, from a run in which `text_containers` was available.
old_exhibit.print_text()

###############################################
Page 0
###############################################
Exhibit A-3

GROUND SPACE REQUIREMENTS

Total Lease Area Sq.Ft:21600° Primary Contiquous Lease Area L:1200 W:1800 H:1000° Sq.Ft:216.00)

Concrete Pad 1000 1600 NA 160.00|

Outside Primary Lease Area NA NA NA Sq. Ft: N/A|

BACKUP POWER REQUIREMENTS

Generator: NIA Capacity(KW): N/A Fuel Tank Size(gal): N/A Fuel Type: NIA Fuel Tank Setback{radiu:

UTILITY REQUIREMENTS

Power Provided By: Utiity Company Direct

uA

Telco/nterconnect:

TRANSMITTER & RECEIVER SPECIFICATIONS

Type: N/A Quantity: NIA TX Power(watts): NIA ERP(watts): N/A

EQUIPMENT SPECIFICATIONS

Type PANEL PANEL TTA RRURRH RRURRH RRURRH
Manufacturer CellMax RFS RFS Ericsson Ericsson Ericsson
Model # CMABIGS21/E06 | APXVAA24_43-U-A20| ATM1900D-1CWA RRUZ2 RRU22 Radio 4478 B71

Dimensions HxXWxD | 81.1'x7.7'x48 | 96'x24'x85 | 86x10x26 | 202 x132x69" | 202 x132°x68" | 16'x 132 x74"

[Weight(ibs.) 350 1014 84 529 529 600
Locatio

In [153]:
# Collect the text of every line inside every text container of `pdf_path`.
# The lines are kept in `ocr_text_lines` (previously each result was discarded).
ocr_text_lines = []
for page_layout in extract_pages(pdf_path):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                ocr_text_lines.append(text_line.get_text())

KeyboardInterrupt: 