# Next Steps

1. Extract/organize line configuration and conduit configuration data from old_exhibit
2. Write pdf_comparison class
3. Write pdf_markup class


In [None]:
%pip install pdfminer.six
%pip install pdfplumber
%pip install numpy pandas
%pip install textdistance
%pip install regex
%pip install pyyaml

In [1]:
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Optional, Iterable, Dict, Tuple
import regex
from textdistance import hamming, jaro, levenshtein
import yaml
import itertools

In [None]:
# from io import StringIO

# from pdfminer.converter import TextConverter
# from pdfminer.layout import LAParams
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# from pdfminer.pdfpage import PDFPage
# from pdfminer.pdfparser import PDFParser

In [2]:
class pdf_data():
    def __init__(self,   
                 orig_filepath: str | Path, 
                 ocr_filepath:  str | Path, 
                 config:        str | Path,
                 key_val_sep:   str = ':' ,) -> None:
        self.orig_filepath = orig_filepath
        self.ocr_filepath = ocr_filepath
        self.config = config  
        self.text_df = None  
        self.px_col_sep = 8
        self.px_word_sep = 2
        self.col_sep_str = ' | '
        self.key_val_sep = key_val_sep

    def combine_key_value_pairs_in_words_df(self, words_df):
        drop_idxs = list()
        for wIdx, word in words_df.iterrows():
            test = (word['text'][-1] == self.key_val_sep) and \
                    (wIdx != words_df.index[-1]) and \
                    (words_df.loc[wIdx+1, 'top'] == word['top'])  
            if test:
                drop_idxs.append(wIdx+1)
                words_df.loc[wIdx, 'text'] = f"{word['text']}{words_df.loc[wIdx+1, 'text']}"  
                words_df.loc[wIdx, 'right'] = words_df.loc[wIdx+1, 'right']       
        words_df = words_df.drop(drop_idxs).reset_index(drop=True)    

        return words_df

    def fill_implicit_keys(self, section_header, left_mult=2, right_mult=2):
        section_dict = self.sections[section_header]
        for sub_idx, subsection_bounds in section_dict['bounds'].iterrows():    
            mask = (self.text_df['norm_y_top'   ] > subsection_bounds['y_top'   ]) & \
                   (self.text_df['norm_y_bottom'] < subsection_bounds['y_bottom']) 
            subsect_df = self.text_df.loc[mask, :]

            if any(subsect_df['source'] == 'ocr'):
                page = pdfplumber.open(self.ocr_filepath ).pages[subsect_df.loc[subsect_df.index[0], 'page']]
            else:
                page = pdfplumber.open(self.orig_filepath).pages[subsect_df.loc[subsect_df.index[0], 'page']]
            
            words_df = self.get_words_df(page, subsect_df['y_top'].min(), subsect_df['y_bottom'].max())
            words_df.loc[:, 'top'] = np.round(words_df['top']) 
            words_df.sort_values(by=['top', 'left'], ignore_index=True, inplace=True)   

            words_df = self.combine_key_value_pairs_in_words_df(words_df)

            split_lines     = list()
            split_lines_row = list()
            split_lines_col = list()
            drop_idxs = list()
            for rIdx, line in subsect_df.iterrows():
                text_by_col = line['text'].split(self.col_sep_str)
                split_lines.extend(text_by_col)
                split_lines_row.extend([rIdx for k in range(len(text_by_col))])
                split_lines_col.extend(list(range(len(text_by_col))))
                
                # combine words_df to match phrases in split_lines
                for tIdx, token in enumerate(text_by_col):
                    combine_idxs = [idx for idx, word in words_df.iterrows() 
                                    if  word['text'] in token
                                    and word['top']+0.5 >= line['y_top']
                                    and word['bottom'] <= line['y_bottom']]
                    tmp_df = words_df.loc[combine_idxs, :]
                    
                    first_word = token.split(' ')[0]
                    possible_start_idx = tmp_df.index[tmp_df['text'] == first_word]
                    for start_idx in possible_start_idx:
                        phrase_idxs = range(start_idx, start_idx+len(token.split(' ')))
                        if all([True if idx in tmp_df.index else False for idx in phrase_idxs]): 
                            phrase = ' '.join(tmp_df.loc[phrase_idxs, 'text'])
                            if phrase == token:
                                words_df.loc[start_idx, ['text', 'right']] = [phrase, tmp_df.loc[phrase_idxs[-1], 'right']]
                                drop_idxs.extend(phrase_idxs[1:])             
                                break                                  
            words_df = words_df.drop(drop_idxs).reset_index(drop=True)

            assert all([True if phrase==token else False for phrase, token in zip(split_lines, words_df['text'])])

            for token, rIdx, cIdx, (wIdx, word) in zip(split_lines, split_lines_row, split_lines_col, words_df.iterrows()):
                if self.key_val_sep in token:
                    continue
                token_bounds = word[['left', 'right']]
                col_mask = ((words_df['right' ]- token_bounds['left'] > -left_mult*self.px_col_sep) & \
                            (words_df['left' ] - token_bounds['left'] <=  0                )) | \
                           ((words_df['right'] - token_bounds['right'] < right_mult*self.px_col_sep) & \
                            (words_df['right'] - token_bounds['right'] >= 0                ))
                same_column_tokens = words_df.loc[col_mask, 'text']
                
                implicit_key = [item.split(self.key_val_sep)[0] for item in same_column_tokens.values if len(item.split(self.key_val_sep)) == 2]
                if len(implicit_key) >= 1:
                    original_text = self.text_df.loc[rIdx, 'text']
                    text_cols = original_text.split(self.col_sep_str)
                    text_cols[cIdx] = f'{implicit_key[0]}{self.key_val_sep}{text_cols[cIdx]}'
                    self.text_df.loc[rIdx, 'text'] = self.col_sep_str.join(text_cols)  

    def identify_line_merge_sets(self):
        merge_sets = list()
        for idx, line in self.text_df.iterrows():
            merge_set = np.where((self.text_df['norm_y_top']    < line['norm_y_bottom']) &
                                 (self.text_df['norm_y_top']    > line['norm_y_top']   )  )[0]
            merge_set = self.text_df.index[merge_set]
            if len(merge_set) > 0:
                merge_set = [idx] + merge_set.to_list()
                same_merge_set   = any([True if m_set == merge_set else False for m_set in merge_sets])
                overlapping_sets = [set_idx for set_idx, m_set in enumerate(merge_sets) if any(i for i in m_set if i in merge_set)]
                if same_merge_set:
                    continue
                elif len(overlapping_sets) == 1:
                    merge_sets[overlapping_sets[0]] = np.unique(merge_sets[overlapping_sets[0]] + merge_set).tolist() 
                elif len(overlapping_sets) > 1:
                    print('Have not written code to manage more than one overlapping set when combining lines')
                else:
                    merge_sets.append(merge_set)
        return merge_sets
    
    def get_words_df(self, page, y_top, y_bottom):
        page_crop = page.within_bbox((         0, y_top, 
                                        page.width, y_bottom))  

        words = page_crop.extract_words()  
        words_dict = dict(text=[], left=[], right=[], top=[], bottom=[])
        for word in words:
            if word['text'] == '|':
                continue
            word['text'] = word['text'].lower().replace('|','')
            for dict_key, word_key in zip(['text', 'left', 'right', 'top', 'bottom'],
                                            ['text',   'x0',    'x1', 'top', 'bottom']):
                words_dict[dict_key].append(word[word_key])
        
        words_df = pd.DataFrame.from_dict(words_dict)
        words_df.sort_values(by='left', ignore_index=True, inplace=True)

        return words_df

    def identify_columns_from_words_df(self, words_df):
        col_id = []
        col_num = 0
        prev_w_info = None
        for w_idx, w_info in words_df.iterrows():
            if prev_w_info is not None: 
                if (w_info['left'] - prev_w_info['right'] > self.px_col_sep):
                    col_num += 1
                elif (w_info['left'] - prev_w_info['right'] < 0):
                    w_info['right'] = prev_w_info['right'] 
            col_id.append(col_num)
            prev_w_info = w_info.copy()
        words_df['col_id'] = col_id
        words_df.sort_values(by=['col_id', 'top', 'left'], ignore_index=True, inplace=True)

        col_phrases = []
        for col_id in words_df['col_id'].unique():
            col_df = words_df.loc[words_df['col_id'] == col_id, :]
            col_phrases.append(' '.join(col_df['text']))

        return col_phrases 

    def organize_single_lines(self, merge_sets, ocr_pdf, orig_pdf):
        merge_list = list(itertools.chain.from_iterable(merge_sets))
        for idx, line in self.text_df.iterrows():
            if idx not in merge_list:   
                if line['source'] == 'ocr':
                    page = ocr_pdf.pages [line['page']]
                else:
                    page = orig_pdf.pages[line['page']]

                words_df = self.get_words_df(page, line['y_top'], line['y_bottom'])
                words_df.loc[:, 'top'] = np.round(words_df['top']) 

                words_df = self.combine_key_value_pairs_in_words_df(words_df)

                col_phrases = self.identify_columns_from_words_df(words_df)

                self.text_df.loc[idx, 'text'] = self.col_sep_str.join(col_phrases)

    def combine_merge_sets(self, merge_sets, ocr_pdf, orig_pdf):

        drop_idxs = list()
        for merge_set in merge_sets:
            partial_df = self.text_df.loc[merge_set, :]
            y_top    = partial_df['y_top'].min()    
            y_bottom = partial_df['y_bottom'].max()

            if any(partial_df['source'] == 'ocr'):
                page = ocr_pdf.pages [partial_df.loc[merge_set[0], 'page']]
            else:
                page = orig_pdf.pages[partial_df.loc[merge_set[0], 'page']]

            words_df = self.get_words_df(page, y_top, y_bottom)
            
            col_phrases = self.identify_columns_from_words_df(words_df)

            # replace first line in merge set with merged text and position info, then store indices of 
            # remaining merge set to drop at end of combine method
            self.text_df.loc[merge_set[0], 
                             ['text', 'y_bottom', 'x_left', 'x_right', 'norm_y_bottom',]] = [self.col_sep_str.join(col_phrases),
                                                                                             partial_df['y_bottom'].max(),
                                                                                             partial_df['x_left'].min(),
                                                                                             partial_df['x_right'].max(),
                                                                                             partial_df['norm_y_bottom'].max(),] 

            drop_idxs.extend(merge_set[1:])             

        self.text_df = self.text_df.drop(drop_idxs).reset_index(drop=True)

    def organize_text_lines_by_row_and_column(self):
        merge_sets = self.identify_line_merge_sets()
        ocr_pdf  = pdfplumber.open(self.ocr_filepath)
        orig_pdf = pdfplumber.open(self.orig_filepath) 
        self.organize_single_lines(merge_sets, ocr_pdf, orig_pdf)
        self.combine_merge_sets   (merge_sets, ocr_pdf, orig_pdf)
    
    def split_text_lines_with_pdfplumber(self, pdf, element, page_idx, text_dict,):
        did_split = False
        page = pdf.pages[page_idx]
        page_crop = page.within_bbox((         0, page.height - element.y1, 
                                      page.width, page.height - element.y0))  

        text = page_crop.extract_text_lines()
        
        if len(text) > 1:
            did_split = True
            for line in text:

                norm_y_top    = page_idx + (line['top'   ] / page.height)
                norm_y_bottom = page_idx + (line['bottom'] / page.height)
                if np.logical_not(np.any((np.isclose(self.text_df['norm_y_top'   ], norm_y_top   , atol=1e-2)) &
                                         (np.isclose(self.text_df['norm_y_bottom'], norm_y_bottom, atol=1e-2))  )):
                    text_dict['page'         ].append(page_idx)
                    text_dict['y_bottom'     ].append(line['bottom'])
                    text_dict['y_top'        ].append(line['top'])
                    text_dict['x_left'       ].append(line['x0'])
                    text_dict['x_right'      ].append(line['x1'])
                    text_dict['text'         ].append(line['text'].lower().replace('|', ''))
                    text_dict['norm_y_top'   ].append(norm_y_top)
                    text_dict['norm_y_bottom'].append(norm_y_bottom)
                    text_dict['source'       ].append('ocr')

                    print(f'\nstoring text from OCR on page {page_idx}:\n"{line['text'].lower()}"')

        return did_split    

    def get_text_lines_from_original(self):
        pdf = pdfplumber.open(self.orig_filepath)
        text_dict = dict(text=[], source=[], page=[], y_top=[], y_bottom=[], x_left=[], x_right=[], norm_y_top=[], norm_y_bottom=[],)
        for page_idx, page in enumerate(pdf.pages):
            text_lines = page.extract_text_lines()
            for line in text_lines:
                norm_y_top    = page_idx + (line['top'   ] / page.height)
                norm_y_bottom = page_idx + (line['bottom'] / page.height)

                text_dict['page'         ].append(page_idx)
                text_dict['y_bottom'     ].append(line['bottom'])
                text_dict['y_top'        ].append(line['top'])
                text_dict['x_left'       ].append(line['x0'])
                text_dict['x_right'      ].append(line['x1'])
                text_dict['text'         ].append(line['text'].lower().replace('|', ''))
                text_dict['norm_y_top'   ].append(norm_y_top)
                text_dict['norm_y_bottom'].append(norm_y_bottom)
                text_dict['source'       ].append('original')
        
        if self.text_df is None:
            self.text_df = pd.DataFrame(text_dict)
        else:
            tmp_text_df = pd.DataFrame(text_dict)
            self.text_df = pd.concat((self.text_df, tmp_text_df), axis=0, ignore_index=True)

        self.text_df.sort_values(by='norm_y_top', inplace=True, ignore_index=True, ascending=True)

    def get_text_lines_from_ocr(self, pages: Optional[Iterable[int]] = None) -> pd.DataFrame:
        text_dict = dict(text=[], source=[], page=[], y_top=[], y_bottom=[], x_left=[], x_right=[], norm_y_top=[], norm_y_bottom=[],)
        for page_idx, page in enumerate(extract_pages(self.ocr_filepath)):
            pdf = pdfplumber.open(self.ocr_filepath)
            for el_idx, element in enumerate(page):
                if isinstance(element, LTTextContainer):
                    did_split = self.split_text_lines_with_pdfplumber(pdf, element, page_idx, text_dict)
                    if not did_split:
                        y_top = page.height - element.y1
                        y_bottom = page.height - element.y0 
                        norm_y_top    = page_idx + y_top    / page.height
                        norm_y_bottom = page_idx + y_bottom / page.height
                        if np.logical_not(np.any((np.isclose(self.text_df['norm_y_top'   ], norm_y_top   , atol=1e-2)) &
                                                 (np.isclose(self.text_df['norm_y_bottom'], norm_y_bottom, atol=1e-2))  )):
                            text_dict['page'         ].append(page_idx)
                            text_dict['y_bottom'     ].append(y_bottom)
                            text_dict['y_top'        ].append(y_top)
                            text_dict['x_left'       ].append(element.x0)
                            text_dict['x_right'      ].append(element.x1)
                            text_dict['text'         ].append(element.get_text().lower().replace('\n', '').replace('|', ''))
                            text_dict['norm_y_top'   ].append(norm_y_top)
                            text_dict['norm_y_bottom'].append(norm_y_bottom)
                            text_dict['source'       ].append('ocr')
    
                            print(f'\nstoring text from OCR on page {page_idx}:\n"{element.get_text().lower().replace('\n', '')}"')

                elif isinstance(element, LTRect):
                    print(f'page = {page_idx}, ypos={element.y0}, type={type(element)}')
                else:
                    print(f'page = {page_idx}, ypos={element.y0}, type={type(element)}')

        if self.text_df is None:
            self.text_df = pd.DataFrame(text_dict)
        elif len(text_dict['text']) > 0:
            tmp_text_df  = pd.DataFrame(text_dict)
            self.text_df = pd.concat((self.text_df, tmp_text_df), axis=0, ignore_index=True)

        self.text_df.sort_values(by='norm_y_top', inplace=True, ignore_index=True, ascending=True)

    def get_section_headers(self):
        with open(self.config, 'r') as file:
            self.config_data = yaml.safe_load(file) 
        
        self.sections = dict()
        for section in self.config_data['sections']:
            self.sections[section['header']] = dict(bounds = pd.DataFrame(),
                                                    extract = section['extract'])
            for key in section.keys():
                if key in ['header', 'extract']: continue
                self.sections[section['header']][key] = section[key]

    def get_section_bounds(self):

        self.get_section_headers()

        # detect start and end of sections
        prev_section_info = dict(header=None, df_row=None)
        prev_page = 0
        for row_idx, text_data in self.text_df.iterrows():
            detected_section = None
            for header in self.sections.keys():
                match = regex.search(f'{header}{{s<=3,i<=3,d<=3}}', text_data['text'], regex.BESTMATCH)    
                if match is not None:
                    detected_section = header
                    break

            if text_data['page'] > prev_page or detected_section is not None:
                if prev_section_info['header'] is not None:   # store end of sections
                    prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
                    prev_bottom = prev_page+0.9999 if (text_data['page'] > prev_page) else text_data['norm_y_top'] 
                    self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['y_bottom',]] = [prev_bottom]
                # store start of sections
                top = text_data['page'] if (text_data['page'] > prev_page) else text_data['norm_y_top']
                tmp_df = pd.DataFrame(data    = zip([top], [np.nan], [False]),
                                      columns = ['y_top', 'y_bottom', 'data_extracted'],)
                header = detected_section
                self.sections[header]['bounds'] = pd.concat((self.sections[header]['bounds'], tmp_df), ignore_index=True)
                prev_section_info['header'] = header
                if text_data['page'] > prev_page:
                    prev_page+=1
            prev_section_info['df_row'] = row_idx
        
        # store end of final section
        prev_y_bottom = self.text_df.loc[prev_section_info['df_row'], 'norm_y_bottom']
        prev_df_idx = self.sections[prev_section_info['header']]['bounds'].index[-1] 
        self.sections[prev_section_info['header']]['bounds'].loc[prev_df_idx, ['y_bottom',]] = [np.ceil(prev_y_bottom),]
    
    def extract_table(self, subsection_bounds):  
        page_num = int(np.floor(subsection_bounds['y_top'])) 
        pdf = pdfplumber.open(self.orig_filepath)
        table_page = pdf.pages[page_num]
        table_crop = table_page.within_bbox((               0, (subsection_bounds['y_top'   ]-page_num)*table_page.height, 
                                             table_page.width, (subsection_bounds['y_bottom']-page_num)*table_page.height))
        table = table_crop.extract_table()
        # print((subsection_bounds['y_top'   ]-page_num)*table_page.height, (subsection_bounds['y_bottom']-page_num)*table_page.height, table)
        return table
    
    def extract_text_lines_from_original(self, subsection_bounds):
        page_num = int(np.floor(subsection_bounds['y_top'])) 
        pdf = pdfplumber.open(self.orig_filepath)
        page = pdf.pages[page_num]
        page_crop = page.within_bbox((         0, (subsection_bounds['y_top'   ]-page_num)*page.height, 
                                      page.width, (subsection_bounds['y_bottom']-page_num)*page.height))  

        text = page_crop.extract_text_lines()

        for line in text:
            line['norm_y_top']    = page_num + (line['top']    / page.height)
            line['norm_y_bottom'] = page_num + (line['bottom'] / page.height)
            line['page']          = page_num
            line['y_bottom']      = page.height - line['bottom']
            line['y_top']         = page.height - line['top']
            line['text'] = line['text'].lower().replace('|', '')

        return text  

    def table_to_df(self, 
                    table: List[List[str]], 
                    extract_params: str | Dict | List[Dict],
                    ) -> pd.DataFrame:
        if type(extract_params) == str and 'col' in extract_params.lower():
            info_keys = list()
            data = list()
            for row in table:
                iKey = row.pop(0)
                info_keys.append(iKey.replace('\n', ' '))
                data.append(row)
            df = pd.DataFrame(data=np.array(data).transpose(), columns=info_keys)

        elif type(extract_params) == str and 'row' in extract_params.lower():
            info_keys = table[0]
            data = table[1:]
            df = pd.DataFrame(data=data, columns=info_keys)

        else:
            print('There is no method implemented for converting data with this extraction method to a DataFrame')
            df = None
        
        return df 
 
    def extract_table_data(self):
        for section_header, section_dict in self.sections.items():
            for sub_idx, subsection_bounds in section_dict['bounds'].iterrows():
                if subsection_bounds['data_extracted']:
                    continue

                table = self.extract_table(subsection_bounds)
                if table is not None:
                    data_df = self.table_to_df(table, section_dict['extract'])
                    data_df = data_df.loc[:, [col for col in data_df.columns if col.lower() != section_header]]
                    if 'data' in section_dict.keys(): 
                        section_dict['data'] = pd.concat((section_dict['data'], data_df),
                                                         axis=0,
                                                         ignore_index=True)
                    else:
                        section_dict['data'] = data_df
                    
                    section_dict['bounds'].loc[sub_idx, 'data_extracted'] = True

    def align_data_to_existing_df(self,
                                  section_dict: Dict,
                                  subsect_df: pd.DataFrame,
                                 ):
        if type(section_dict['extract']) == str and 'col' in section_dict['extract'].lower():
            info_keys_to_match = list(section_dict['data'].columns)
            original_info_keys = list(section_dict['data'].columns)
            info_keys = list()
            data      = list()
            for text in subsect_df['text']:
                row = text.split(self.col_sep_str)
                iKey = row.pop(0)
                iKey = iKey.replace('\n', ' ')
                match_score = np.array([levenshtein.normalized_similarity(iKey, matchKey) for matchKey in info_keys_to_match])
                sorted_match_score, sorted_info_keys = zip(*sorted(zip(match_score, info_keys_to_match), reverse=True))
                if iKey == 'equipment tip':
                    stop = []
                try:
                    top_key_matches = sorted_info_keys[:3]
                    
                    #correct any instances in which OCR dropped the last word (probably b/c it was on a second line), causing poor matching
                    if (len(iKey.split(' ')) == len(top_key_matches[0].split(' ')) 
                        and any([len(iKey.split(' ')) < len(key.split(' ')) for key in top_key_matches])):
                        match_score = np.array([levenshtein.normalized_similarity(iKey, ' '.join(matchKey.split(' ')[:-1])) for matchKey in info_keys_to_match])
            
                    matched_key = info_keys_to_match.pop(np.where(match_score == match_score.max())[0][0])
                    info_keys.append(matched_key)
                    data.append(row)
                    print(iKey, matched_key)
                except:
                    print(f'\nNo matched key: iKey={iKey}, text={text}')
            
            # Add dummy data for unmatched keys
            for key in info_keys_to_match:
                info_keys.append(key)
                data.append(['data_not_found' for k in range(expected_num_items)])

            expected_num_items = pd.Series([len(d) for d in data]).mode()[0]
            wrong_count_info = [(idx, len(d)) for idx, d in enumerate(data) if len(d) != expected_num_items]
            for idx, num_items in wrong_count_info:
                data[idx] = ['wrong_num_columns' for k in range(expected_num_items)]
                print(f'\n"{info_keys[idx]}" contained the wrong number of columns in the line.')

            correct_order = [np.where(np.array(original_info_keys) == key)[0][0] for key in info_keys]
            _, info_keys = zip(*sorted(zip(correct_order, info_keys)))
            _, data      = zip(*sorted(zip(correct_order, data))) 

            data_df = pd.DataFrame(data=np.array(data).transpose(), columns=info_keys)
            section_dict['data'] = pd.concat((section_dict['data'], data_df),
                                              axis=0,
                                              ignore_index=True)
        return

    def get_multilevel_key_value_pairs(self, section_dict, items):
        filling_subheader = False
        for item in items: 
            split_item = item.split(self.key_val_sep)
            if len(split_item) == 2:
                key, value = split_item
                if len(value) == 0:
                    value = None
                else:
                    value = value[1:]  if value[0]  == ' '  else value
                    value = value[:-1] if value[-1] == '\n' else value
                    
                if filling_subheader:
                    section_dict['data'][stored_key][key] = value
                    print(f'{stored_key} - {key}{self.key_val_sep} {value}')
                else:
                    section_dict['data'][key] = value     
                    print(f'{key}{self.key_val_sep} {value}')
            elif len(split_item) == 1:
                stored_key = split_item[0]
                section_dict['data'][stored_key] = dict()
                filling_subheader = True

    def extract_key_value_pairs(self, 
                                section_dict: Dict, 
                                subsect_df: pd.DataFrame, 
                                ) -> Dict:
        section_dict['data'] = dict()
        for l_idx, line in subsect_df.iterrows(): 
            items = line['text'].split(self.col_sep_str)
            key_value_pairs = [item for item in items if len(item.split(self.key_val_sep)) == 2]
            if len(key_value_pairs) == len(items):
                for item in key_value_pairs:
                    key, value = item.split(self.key_val_sep)
                    if len(value) == 0:
                        value = None
                    else:
                        value = value[1:]  if value[0]  == ' '  else value
                        value = value[:-1] if value[-1] == '\n' else value
                    section_dict['data'][key] = value     
                    print(f'{key}{self.key_val_sep} {value}') 
            else:
                if line['source'] == 'original':
                    pdf = pdfplumber.open(self.orig_filepath)
                elif line['source'] == 'ocr':
                    pdf = pdfplumber.open(self.ocr_filepath)
                self.get_multilevel_key_value_pairs(section_dict, items)

    def extract_text_data(self):
        for section_header, section_dict in self.sections.items():
            for sub_idx, subsection_bounds in section_dict['bounds'].iterrows():
                if subsection_bounds['data_extracted']:
                    continue

                mask = (self.text_df['norm_y_top'   ] > subsection_bounds['y_top'   ]) & \
                       (self.text_df['norm_y_bottom'] < subsection_bounds['y_bottom']) 
                subsect_df = self.text_df.loc[mask, :]
                
                if 'data' in section_dict.keys() and type(section_dict['data']) == pd.DataFrame:
                    self.align_data_to_existing_df(section_dict, subsect_df)
                else:
                    self.extract_key_value_pairs(section_dict, subsect_df)
                
                section_dict['bounds'].loc[sub_idx, 'data_extracted'] = True

    def print_text(self):
        max_x = 0
        for page_text in self.text_containers:
            for text_container in page_text:
                if text_container.x1 > max_x:
                    max_x = text_container.x1
        for page_num, page_text in enumerate(self.text_containers):
            print('###############################################')
            print(f'Page {page_num}')
            print('###############################################') 
            for text_container in page_text:  
                text = text_container.get_text()
                # print(f'{text_container.y0} to {text_container.y1}', text)
                print(text_container.y1, text_container.y0, text)


class ATC_amendment(pdf_data):
    def __init__(self, 
                 orig_filepath: str | Path, 
                 ocr_filepath:  str | Path, 
                 config:        str | Path,
                 key_val_sep:   str = ':' ,) -> None:        
        super().__init__(orig_filepath, ocr_filepath, config, key_val_sep)

    def extract_line_config_data_from_cells_containing_all_key_value_pairs(self, 
                                                                           equipment_df, 
                                                                           configuration_col,
                                                                           data_dict, 
                                                                           data_keys,
                                                                           storage_key):
        line_config_df = equipment_df.loc[:, configuration_col]
        for equipIdx, cell in line_config_df.items():
            for dKey in data_keys:
                num_match = len(regex.findall(f'{dKey}{{e<=1}}', cell))
                
                if num_match > 0:
                    # move thru each match (there may be multiple line configs in single cell)
                    for idx in range(num_match):
                        match = regex.search(f'{dKey}{{e<=1}}', cell, pos=idx)
                        key_span = match.span()
                        
                        # find the next key match in the string to know where the value for this key ends
                        possible_next_keys_pos = list()
                        for next_dKey in data_keys:
                            next_key_match = regex.search(f'{next_dKey}{{e<=1}}', cell[key_span[1]:], pos=0) 
                            if next_key_match is not None:
                                possible_next_keys_pos.append(next_key_match.span()[0])
                        # extract the data token from the string
                        if len(possible_next_keys_pos) == 0:
                            data_token = cell[key_span[0] : ]
                        else:
                            data_token = cell[key_span[0] : key_span[1] + min(possible_next_keys_pos)]
                        data_token = data_token[ :-1] if data_token[-1] == '\n' else data_token
                        data_token = data_token.replace('\n', '')
                        print(data_token)
                        val = data_token.split(self.key_val_sep)[1]
                        val = val[1:] if val[0] == ' ' else val

                        data_dict[str(idx)][dKey][equipIdx] = val

        for line_num, line_data in data_dict.items():
            if any([True if any([True if val is not None else False for val in data_list]) else False 
                    for tmp_key, data_list in line_data.items()]):
                for dKey, values in line_data.items():
                    key = f'{storage_key}_{line_num}_{dKey}'
                    equipment_df[key] = values 
        
    def extract_line_config_data_from_separated_cells(self, 
                                                      equipment_df, 
                                                      configuration_cols,
                                                      data_dict, 
                                                      data_keys,
                                                      storage_key):
        line_config_df = equipment_df.loc[:, configuration_cols]
        for equipIdx, row in line_config_df.iterrows():
            for dKey, col in zip(data_dict['0'].keys(), line_config_df.columns):
                print(line_config_df.loc[equipIdx, :])
                print(f'{dKey}: {row[col]}')        
                

    def align_line_configuration_data(self, group_key, max_line_types=2, type_key=None): # TODO
        
        pdf_data_keys    = self.sections['equipment specifications'][f'{group_key} keys']
        equipment_df = self.sections['equipment specifications']['data']
        
        if 'separated' in group_key:
            storage_key = f"{type_key.split(' ')[0]}_config" 
            storage_data_keys = [self.sections['equipment specifications'][config_group_key] 
                                 for config_group_key in self.sections['equipment specifications'].keys() 
                                 if type_key in config_group_key][0]
            configuration_cols = list()
            for dKey in pdf_data_keys:
                config_col_match_scores = [(col, levenshtein.normalized_similarity(col, dKey)) for col in equipment_df.columns]
                scores = np.array([score for _, score in config_col_match_scores])
                config_col = config_col_match_scores[np.argmax(scores)][0]    
                configuration_cols.append(config_col)
        else:
            storage_key = f"{group_key.split(' ')[0]}_config"
            storage_data_keys = pdf_data_keys
            configuration_cols = [col for col in equipment_df if group_key in col.lower()]
            
        data_dict = dict()
        for idx in range(max_line_types): 
            data_dict[str(idx)] = dict()
            for dKey in storage_data_keys:
                data_dict[str(idx)][dKey] = [None for k in range(equipment_df.shape[0])]

        if 'separated' in group_key:
            self.extract_line_config_data_from_separated_cells(equipment_df,
                                                               configuration_cols,
                                                               data_dict,
                                                               pdf_data_keys,
                                                               storage_key)
        else:
            self.extract_line_config_data_from_cells_containing_all_key_value_pairs(equipment_df, 
                                                                                    configuration_cols[0],
                                                                                    data_dict, 
                                                                                    pdf_data_keys,
                                                                                    storage_key,)
           
    def get_exhibit_name(self):
        for text_container in self.text_containers[0]:
            text = text_container.get_text()
            if 'exhibit' in text.lower():
                pattern = regex.compile(r'^\s+')
                exhibit = pattern.sub('', text.lower().replace('exhibit', '').replace('\n', ''))
                self.exhibit = exhibit

In [3]:
new_ocr_pdf_path  = Path('amendments/New_Exhibit_Redacted_docTR.pdf')
new_orig_pdf_path = Path('amendments/New_Exhibit_Redacted.pdf')
old_ocr_pdf_path  = Path('amendments/Old_Exhibit_Redacted_docTR.pdf')
old_orig_pdf_path = Path('amendments/Old_Exhibit_Redacted.pdf')

config_path = Path(r'C:\Users\Dalton\Documents\personal_records\apex_consulting\materials_and_amendments_OCR\configs\atc_extra_info_config.yaml')

In [4]:
new_exhibit = ATC_amendment(new_orig_pdf_path, new_ocr_pdf_path, config_path, ':')
new_exhibit.get_text_lines_from_original()
new_exhibit.get_text_lines_from_ocr()
new_exhibit.organize_text_lines_by_row_and_column()
new_exhibit.get_section_bounds()
new_exhibit.fill_implicit_keys('ground space requirements', left_mult=2, right_mult=2)
new_exhibit.extract_table_data()
new_exhibit.extract_text_data()



storing text from OCR on page 0:
"exhibit a-4 "

storing text from OCR on page 0:
"- e "

storing text from OCR on page 0:
"total lease area sq.ft: "

storing text from OCR on page 0:
"n/a "

storing text from OCR on page 0:
"requirements "

storing text from OCR on page 0:
"grsopuranecqdeu irements"

storing text from OCR on page 0:
"tolteaaalsr seqe2a. 1f 6tp:. 0ri0cm oanlrtyeiag aursleoe1a: u2 s.0 w0'1 :8 .0h01:'0 .s0q0.2'f 1t6: .00"

storing text from OCR on page 0:
"copnacdre te 10.00'1 6.00n' /a 160.00"

storing text from OCR on page 0:
"ouptrsilimdeaeaa rrseyea n /a n/a n/asq .nf/ta:"

storing text from OCR on page 0:
"bapcokrwueepqr u irements"

storing text from OCR on page 0:
"genne/raat or:f tueasln(i gzknea l/)a: f tueynlp /ea: ftueaslne k(trb aandci/kua s"

storing text from OCR on page 0:
"utrielqituyi rements"

storing text from OCR on page 0:
"popwroebvruy icdt:i eloiddtmy i preacnty"

storing text from OCR on page 0:
"telconlln/ate rconnect:"

storing text from OCR on

ValueError: Bounding box (0, 35.24400000000003, 293.76, 710.5299999999996) is not fully within parent page bounding box (0, 0.0, 293.76, 380.16)

In [21]:
new_exhibit.align_line_configuration_data(group_key = 'line configuration'   , max_line_types=2)
new_exhibit.align_line_configuration_data(group_key = 'conduit configuration', max_line_types=2)

Qty: 3
Type: Coax
Diameter: 1/2" Coax
Azimuth/Sector: 1/1/1
Qty: 6
Type: Coax
Diameter: 1 5/8"Coax
Azimuth/Sector: 2/2/2
Qty: 1
Type: Fiber/Hybrid
Diameter: 1 5/8"(1.63"-41.3mm) Fiber
Azimuth/Sector: 1/0/0
Qty: 3
Type: Coax
Diameter: 1/4" Coax
Azimuth/Sector: 1/1/1
Qty: 3
Type: Hard Line
Diameter: 1/8" HardLine
Azimuth/Sector: 2/1
Qty: 4
Qty: 4
Type: Control Cable
Type: Control Cable
Diameter: 0.31" (7.8mm) Cable
Diameter: 0.31" (7.8mm) Cable
Azimuth/Sector: 4
Azimuth/Sector: 4
Qty: 2
Qty: 2
Type: Control Cable
Type: Control Cable
Diameter: 0.31" (7.8mm) Cable
Diameter: 0.31" (7.8mm) Cable
Azimuth/Sector: 2
Azimuth/Sector: 2
Qty: 1
Qty: 1
Type: 2" conduit
Type: 2" conduit
containing:-;
containing:-;
Azimuth/Sector: 1
Azimuth/Sector: 1


In [22]:
new_exhibit.sections['ground space requirements']['data']
new_exhibit.sections['backup power requirements']['data']
new_exhibit.sections['utility requirements']['data']
new_exhibit.sections['transmitter & receiver specifications']['data']
new_exhibit.sections['equipment specifications']['data']

Unnamed: 0,Type,Manufacturer,Model #,Dimensions HxWxD,Weight (lbs.),Location,RAD Center AGL,Tip Height,Base Height,Mount Type,...,line_config_1_Diameter,line_config_1_Azimuth/Sector,conduit_config_0_Qty,conduit_config_0_Type,conduit_config_0_containing,conduit_config_0_Azimuth/Sector,conduit_config_1_Qty,conduit_config_1_Type,conduit_config_1_containing,conduit_config_1_Azimuth/Sector
0,GPS,Generic,GPS,"12"" x 9"" x 6""",10.0,Ground,,,,,...,,,,,,,,,,
1,PANEL,RFS,APXVAA24_43-U-\nA20,"96"" x 24"" x 8.5""",101.4,Tower,180.0',184.0',176.0',Side Arm,...,,,,,,,,,,
2,PANEL,CellMax,CMA-B/6521/E0-6,"81.1"" x 7.7"" x 4.8""",35.0,Tower,180.0',183.4',176.6',Side Arm,...,,,,,,,,,,
3,TTA,RFS,ATM1900D-1CWA,"8.6"" x 10"" x 2.6""",8.4,Tower,180.0',180.4',179.6',Side Arm,...,,,,,,,,,,
4,RRU/RRH,Ericsson,Radio 4478 B71,"15"" x 13.2"" x 7.4""",60.0,Tower,180.0',180.6',179.4',Side Arm,...,,,,,,,,,,
5,RRU/RRH,Ericsson,RRU22,"20.2"" x 13.2"" x 6.9""",52.9,Tower,180.0',180.8',179.2',Side Arm,...,,,1.0,"2"" conduit",-;,1.0,1.0,"2"" conduit",-;,1.0
6,RRU/RRH,Ericsson,RRU22,"20.2"" x 13.2"" x 6.9""",52.9,Tower,180.0',180.8',179.2',Side Arm,...,,,,,,,,,,
7,DISH-HP,Commscope,USX6-6W,6.23' x 6.23' x 4.32',198.0,Tower,155.0',158.1',151.9',Pole Mount,...,,,,,,,,,,
8,DISH-HP,Commscope,USX10-11W,10' x -' x -',579.8,Tower,155.0',160.0',150.0',Pole Mount,...,,,,,,,,,,
9,Radio/ODU,Ceragon,RFU-D,"9.1"" x 9.2"" x 3.9""",14.3,Tower,155.0',155.4',154.6',Pole Mount,...,"0.31"" (7.8mm) Cable",4.0,,,,,,,,


In [59]:
old_exhibit = ATC_amendment(old_orig_pdf_path, old_ocr_pdf_path, config_path, ':')
old_exhibit.get_text_lines_from_original()
old_exhibit.get_text_lines_from_ocr()
old_exhibit.organize_text_lines_by_row_and_column()
old_exhibit.get_section_bounds()
old_exhibit.fill_implicit_keys('ground space requirements', left_mult=2, right_mult=2)
old_exhibit.extract_table_data()



storing text from OCR on page 0:
"exhibit a-3"

storing text from OCR on page 0:
"total lease area sq.ft:21600° primary contiguous lease area l:1200° w:1800 h:1000 sq.ft: 216.00)"

storing text from OCR on page 0:
"ground space requirements"

storing text from OCR on page 0:
"concrete pad 1000 16.00" nia 160.00]"

storing text from OCR on page 0:
"outside primary lease area nia nia nia sq. ft: n/a|"

storing text from OCR on page 0:
"backup power requirements"

storing text from OCR on page 0:
"generator: nia capacity(kw): n/a fuel tank size(gal): n/a fuel type: nia fuel tank setback{radiu:"

storing text from OCR on page 0:
"utility requirements"

storing text from OCR on page 0:
"power provided by: utiity company direct"

storing text from OCR on page 0:
"telco/lnterconnect:"

storing text from OCR on page 0:
"ua"

storing text from OCR on page 0:
"transmitter & receiver specifications"

storing text from OCR on page 0:
"type: n/a quantity: nia tx power(watts): nia erp(watts): n/a"


In [60]:
old_exhibit.extract_text_data()

total lease area sq.ft: 21600°
primary contiguous lease area - l: 1200°
primary contiguous lease area - w: 1800
primary contiguous lease area - h: 1000
primary contiguous lease area - sq.ft: 216.00)
concrete pad - l: 1000
concrete pad - w: 16.00"
concrete pad - h: nia
concrete pad - sq.ft: 160.00]
outside primary lease area - l: nia
outside primary lease area - w: nia
outside primary lease area - h: nia
outside primary lease area - sq. ft: n/a
generator: nia
capacity(kw): n/a
fuel tank size(gal): n/a
fuel type: nia
fuel tank setback{radiu: None
power provided by: utiity company direct
telco/lnterconnect: ua
telco/lnterconnect: ua
type: n/a
quantity: nia
tx power(watts): nia
erp(watts): n/a
type Line Type
manufacturer Quant. Per Azimuth/Sector
model # Model #
dimensions hxwxd Dimensions HxWxD
[weight(ibs.) Using Unlicensed Frequencies?
location Total # of Lines
rad center agl RAD Center AGL
equipment tip Equipment Tip Height
equipment base eight Equipment Base Height
mount type Mount Ty

In [32]:
old_exhibit.align_line_configuration_data(group_key = 'separated configuration', max_line_types=2, type_key = 'line')
old_exhibit.align_line_configuration_data(group_key = 'separated configuration', max_line_types=2, type_key = 'conduit')

0     1
1     0
2     4
3     0
4     1
5     0
6     6
7     3
8     1
9     2
10    3
11    3
Name: Total # of Lines, dtype: object
0         Coax
1          N/A
2     Multiple
3          N/A
4         Coax
5          N/A
6           na
7           nn
8        11000
9            2
10           n
11          nn
Name: Line Type, dtype: object
0      0.29" (7.2mm) RG-8
1                     N/A
2     See Config. Summary
3                     N/A
4      0.29" (7.2mm) RG-8
5                     N/A
6          18.31 18.7/192
7                 132/136
8                       2
9                      na
10                     na
11                     na
Name: Line Diameter Size, dtype: object
0                       1
1                     N/A
2     See Config. Summary
3                     N/A
4                       1
5                     N/A
6       wrong_num_columns
7       wrong_num_columns
8       wrong_num_columns
9       wrong_num_columns
10      wrong_num_columns
11      wrong_num

In [33]:
old_exhibit.sections['ground space requirements']['data']
old_exhibit.sections['backup power requirements']['data']
old_exhibit.sections['utility requirements']['data']
old_exhibit.sections['transmitter & receiver specifications']['data']
old_exhibit.sections['equipment specifications']['data']

Unnamed: 0,Unnamed: 1,Type,Manufacturer,Model #,Dimensions HxWxD,Weight(lbs.),Location,RAD Center AGL,Equipment Tip Height,Equipment Base Height,...,TX/RX Frequency Units,TX Frequency,RX Frequency,Using Unlicensed Frequencies?,Equipment Gain,Total # of Lines,Line Quant. Per Azimuth/Sector,Line Type,Line Diameter Size,Line Configuration
0,,DISH-HP,RFS,SB6-W60BC,6.23' x 6.23' x 2.98',198.0,Tower,155.0',158.1',151.9',...,MHz,6400,6400,No,35.7/ 36.7/ 37.3,1,1,Coax,"0.29"" (7.2mm) RG-8",
1,,DISH-HP,Commscope,USX10-11W,10.00' x -' x -',579.8,Tower,155.0',160.0',150.0',...,GHz,11,1,No,,0,,,,
2,,Radio/ODU,Ceragon,RFU-D,"9.1"" x 9.2"" x 3.9""",14.3,Tower,155.0',155.4',154.6',...,,,,No,,4,See Config. Summary,Multiple,See Config. Summary,"2 - Control Cable;\n0.31"" (7.8mm) Cable;\n2\n2..."
3,,Radio/ODU,Ceragon,FibeAir IP-20E,"9.2"" x 9.1"" x 3.9""",14.3,Tower,155.0',155.4',154.6',...,,,,No,,0,,,,
4,,DISH-HP,RFS,SB4-W60,4.14' x 4.14' x -',77.0,Tower,125.0',127.1',122.9',...,MHz,6400,6400,No,32.4 / 32.7 / 33.4,1,1,Coax,"0.29"" (7.2mm) RG-8",
5,,Radio/ODU,Ceragon,FibeAir IP-20E,"9.2"" x 9.1"" x 3.9""",14.3,Tower,125.0',125.4',124.6',...,,,,No,,0,,,,
6,na,panel,cellmax,cmabigs21/e06,81.1'x7.7'x48,350.0,"tower,","180.0""",na,1766,...,mhz,1930-1945.2145- 215517351740,"11875505-12816355,21174450-",no,1834,6,wrong_num_columns,na,18.31 18.7/192,coax
7,na,panel,rfs,apxvaa24_43-u-a20,96'x24'x85,1014.0,"tower,","180.0""",111,1760°,...,mhz,668-638,622642,no,1840,3,wrong_num_columns,nn,132/136,coax
8,na,tta,rfs,atm1900d-1cwa,86x10x26,84.0,tower,180.0,nn,1796,...,na,nia,na,no,1804°,1,wrong_num_columns,11000,2,fiberfhybrid
9,na,rrurrh,ericsson,rruz2,"202x132 x69""",529.0,tower,"180.0""",1,1792,...,mhz,"21452155, 1736 1740",74512715450 2135,no,1808,2,wrong_num_columns,2,na,‘conduit
