In [114]:
from src.app.drivers import BaseDriver
from src.pages.loginpage import LoginPage
from src.pages.basepage import BasePage
from src.pages.dataconnectpage import DataConnectPage
from src.utils.pdf_processor import PdfProcessor
from src.utils.extraction_handler import ExtractionHandler
from src.utils.file_handler import FileHandler
from src.utils.mappings import doc_type_patterns
import os
import logging
import pikepdf
import re
from datetime import datetime
# Shared driver wrapper; the same instance is handed to both page objects below.
# NOTE(review): presumably wraps a browser automation driver — confirm in src.app.drivers.
bd = BaseDriver()
lp = LoginPage(bd)
# DataConnectPage is built on a BasePage that wraps the same driver instance.
dcp =DataConnectPage(BasePage(bd))

# Module-level helpers. Only `extraction_handler` and `processor` are used by
# the functions defined later in this notebook; `file_handler` is not
# referenced in the cells shown here.
processor = PdfProcessor()
extraction_handler = ExtractionHandler()
file_handler = FileHandler()


KeyboardInterrupt: 

In [2]:
# Prepare the driver, then apply the credit-card group filter on the
# data-connect page.
# NOTE(review): both calls presumably drive a live browser session — confirm
# in src.app.drivers / src.pages before re-running this cell.
bd.setup_driver()
dcp.set_group_filter_to_credit_card()

True

In [11]:
# dcp.check_all_then_click_print()
# Location of the PDF that the cells below open and process.
# NOTE(review): machine-specific absolute path — adjust per environment.
filepath = '/Users/ekim/downloads/messages.pdf'

In [None]:
def get_pdf(filepath):
    """
    Open the PDF at ``filepath`` with pikepdf.

    :param filepath: Path to the PDF file. ``None`` or an empty string is
        treated the same as a path that does not exist.
    :return: The object returned by ``pikepdf.open``, or None when the path
        is missing or does not exist
    """
    if not filepath or not os.path.exists(filepath):
        # Logged at WARNING rather than INFO: under the default logging
        # configuration INFO records are dropped, which made this failure
        # silent while the success path below printed a message.
        logging.warning(f'File path does not exist: "{filepath}"')
        return None

    print(f'Filepath: "{filepath}" exists. Returning opened PikePdf object')
    return pikepdf.open(filepath)


In [None]:
def update_pdf_data(pdf_file_path):
    """
    Open the PDF at ``pdf_file_path`` by delegating to ``get_pdf``.

    :param pdf_file_path: Path of the PDF to open
    :return: Whatever ``get_pdf`` returns for that path
    """
    return get_pdf(pdf_file_path)


In [28]:
# Open the PDF once; process_multi_page, process_single_page and process_pages
# all read this module-level `pdf_data` rather than taking it as an argument.
pdf_data = update_pdf_data(filepath)
pdf_data

Filepath: "/Users/ekim/downloads/messages.pdf" exists. Returning opened PikePdf object


<pikepdf.Pdf description='/Users/ekim/downloads/messages.pdf'>

In [93]:
# Date stamp (MM-DD-YY) computed once when this cell runs; get_new_file_name
# embeds it in every generated file name.
_cell_run_time = datetime.now()
today_str = _cell_run_time.strftime('%m-%d-%y')
today_str

'10-15-23'

In [None]:
def construct_final_output_filepath(new_file_name, company_dir='/Users/ekim/workspace/personal/dtn-bot/test'):
    """
    Join an output directory and a file name into the final output path.

    :param new_file_name: File name to place inside the output directory
    :param company_dir: Directory the file should live in. Defaults to the
        previously hard-coded test directory, so existing single-argument
        calls behave exactly as before.
    :return: ``os.path.join(company_dir, new_file_name)``
    """
    return os.path.join(company_dir, new_file_name)


In [None]:
def get_new_file_name(doc_type, total_target_amt, date_str=None):
    """
    Build the output file name for a document.

    Naming rules, keyed on how ``doc_type`` starts:

    * ``EFT-<digits>`` with a numeric amount: the amount is appended; an
      amount containing ``-`` has the dash removed and is wrapped in
      parentheses.
    * ``CBK-<digits>`` / ``RTV-<digits>``: ``CHARGEBACK REQUEST`` replaces
      the amount.
    * anything else (including an EFT document whose amount is not a
      numeric string): the amount is appended as-is.

    :param doc_type: Document type string, e.g. ``'EFT-1234'``
    :param total_target_amt: Extracted amount text. Surrounding whitespace is
        stripped; a non-string value (e.g. None) no longer raises and falls
        through to the generic rule.
    :param date_str: Optional date stamp to embed. When omitted, the
        module-level ``today_str`` is used, as before.
    :return: The new file name, ending in ``.pdf``
    """
    stamp = today_str if date_str is None else date_str

    # NOTE(review): the original TODO reported that the negative-amount branch
    # did not fire. re.match is anchored at the start of the string, so
    # leading whitespace on the extracted amount makes it fail; stripping
    # addresses that case — confirm against real extractor output.
    if isinstance(total_target_amt, str):
        total_target_amt = total_target_amt.strip()
        amt_is_numeric = re.match(r'-?[\d,]+\.\d+-?', total_target_amt) is not None
    else:
        amt_is_numeric = False

    if re.match(r'EFT-\s*\d+', doc_type) and amt_is_numeric:
        # Diagnostic trace only; demoted from CRITICAL so it does not drown
        # out real errors.
        logging.debug(f'******************************************* total_target_amt: {total_target_amt} ***********************')
        if "-" in total_target_amt:
            unsigned_amt = total_target_amt.replace("-", "")
            return f'{doc_type}-{stamp}-({unsigned_amt}).pdf'
        return f'{doc_type}-{stamp}-{total_target_amt}.pdf'

    if re.match(r'(?:CBK|RTV)-\s*\d+', doc_type):
        return f'{doc_type}-{stamp}-CHARGEBACK REQUEST.pdf'

    return f'{doc_type}-{stamp}-{total_target_amt}.pdf'


In [None]:
def get_doc_type(cur_page_text, patterns=None):
    """
    Return the first document type regex that matches the page text.

    Patterns are tried in order with a case-insensitive ``re.search``.

    :param cur_page_text: Text from the current page
    :param patterns: Optional iterable of regex patterns to try. Defaults to
        the module-level ``doc_type_patterns`` imported at the top of the file.
    :return: The matching pattern itself (not the matched text), or None when
        nothing matches or the inputs cannot be searched
    """
    candidates = doc_type_patterns if patterns is None else patterns
    try:
        for doc_type_pattern in candidates:
            if re.search(doc_type_pattern, cur_page_text, re.IGNORECASE):
                print(f'Found matching document type using regex pattern: "{doc_type_pattern}" in current page text.')
                return doc_type_pattern
    except (TypeError, re.error) as e:
        # TypeError: the page text or a pattern is not a string (e.g. None).
        # re.error: a pattern is not a valid regular expression.
        print(f'An unexpected error occurred: {e}')
        return None

    # Every pattern was tried and none matched.
    print(f'Could not find matching document type using regex pattern in current page text:\n{cur_page_text}\n')
    return None


In [None]:
def process_multi_page(cur_page_text, page_num, doc_type_pattern):
    """
    Collect the pages of a message that spans several pages and derive its
    output file name and path.

    Starting at ``page_num``, pages of the module-level ``pdf_data`` are read
    until one contains 'END MSG' or the document ends. Their texts are
    concatenated and passed to the extraction handler.

    :param cur_page_text: Text of the first page of the message
    :param page_num: Zero-based index of that first page in ``pdf_data.pages``
    :param doc_type_pattern: Regex pattern identifying the document type
    :return: None. Saving is still disabled, so no status is reported and the
        caller's page cursor is not advanced by this function.
    """
    gathered_pages = []
    gathered_texts = []
    latest_text = cur_page_text
    cursor = page_num

    # The first pass re-reads the page the caller already inspected so that it
    # is part of the gathered set; the loop stops once the text just read
    # carries the end marker.
    while 'END MSG' not in latest_text and cursor < len(pdf_data.pages):
        print(f'ENTERING MULTI PAGE PROCESSING FUNC for page: {cursor + 1}....................')
        page_obj = pdf_data.pages[cursor]
        gathered_pages.append(page_obj)
        latest_text = extraction_handler.extract_text_from_pdf_page(page_obj)
        gathered_texts.append(latest_text)
        cursor += 1

    combined_text = "".join(gathered_texts)

    print('Extracting Document Type and Total Target Amount....')
    doc_type, total_target_amt = extraction_handler.extract_doc_type_and_total_target_amt(
        doc_type_pattern, combined_text
    )
    print(f'Document Type: {doc_type} | Total Target Amount: {total_target_amt}')

    # File name first, then the full destination path.
    new_file_name = get_new_file_name(doc_type, total_target_amt)
    final_output_filepath = construct_final_output_filepath(new_file_name)

    print(f'final_output_filepath: {final_output_filepath}\nnew_file_name: {new_file_name}')

    # TODO: saving is disabled for now. The intended next step was
    # create_and_save_pdf(gathered_pages), returning its result.



In [111]:
def process_single_page(cur_page_text, page_num, doc_type_pattern):
    """
    Derive the output file name and path for a message that fits on one page.

    Nothing is done unless ``cur_page_text`` contains 'END MSG' and
    ``page_num`` is a valid index into the module-level ``pdf_data.pages``.

    :param cur_page_text: Text of the page; used directly for extraction
        because the whole message is on this page
    :param page_num: Zero-based index of the page in ``pdf_data.pages``
    :param doc_type_pattern: Regex pattern identifying the document type
    :return: None. Saving is still disabled, so no status is reported.
        ``page_num`` is a local value: the caller must advance its own cursor.
    """
    if 'END MSG' not in cur_page_text or page_num >= len(pdf_data.pages):
        return None

    # Removed from the original: a second `page_num >= len(...)` check that
    # could never be true after the guard above, a local `page_num += 1` that
    # never reached the caller, and an unused `cur_page` lookup.
    doc_type, total_target_amt = extraction_handler.extract_doc_type_and_total_target_amt(doc_type_pattern, cur_page_text)
    logging.info(f'Document Type: {doc_type} | Total Target Amount: {total_target_amt}')

    new_file_name = get_new_file_name(doc_type, total_target_amt)
    final_output_filepath = construct_final_output_filepath(new_file_name)

    logging.info(f'final_output_filepath: {final_output_filepath}\nnew_file_name: {new_file_name}')

    # TODO: saving is disabled for now. The intended next step was
    # create_and_save_pdf(pdf_data.pages[page_num]), returning its result.
    return None

In [112]:
def _index_after_end_marker(first_page_index):
    """
    Return the index of the page that follows the end of a multi-page message.

    Scans forward from the page after ``first_page_index`` (the caller has
    already seen that the first page has no marker) until a page whose text
    contains 'END MSG'. Returns ``len(pdf_data.pages)`` when no later page
    carries the marker.
    """
    total_pages = len(pdf_data.pages)
    index = first_page_index + 1
    while index < total_pages:
        text = extraction_handler.extract_text_from_pdf_page(pdf_data.pages[index])
        index += 1
        if text and 'END MSG' in text:
            break
    return index


def process_pages():
    """
    Main processing func: walk the module-level ``pdf_data`` page by page and
    hand each recognised message to the single- or multi-page handler.

    A page is skipped when its company name is not found in its text or when
    no document type pattern matches it. Otherwise the page starts a message:
    one containing 'END MSG' goes to ``process_single_page``, any other goes
    to ``process_multi_page``.

    The page cursor is advanced here because the handlers receive ``page_num``
    by value. Only an explicit ``False`` from a handler counts as failure; a
    handler that returns None (no status reported) is not treated as an error.

    :return: True when every page was visited, False if any exception occurred
    """
    print(f'After updating pdf_data instance using setter: {pdf_data}')

    try:
        page_num = 0
        while page_num < len(pdf_data.pages):

            print(f'Processing page number: {page_num + 1}')
            page = pdf_data.pages[page_num]

            cur_page_text = extraction_handler.extract_text_from_pdf_page(page)
            print(f'cur_page_text: \n************************************************************** \n{cur_page_text}\n **************************************************************')

            company_name = processor.get_company_name(cur_page_text)
            print(f'company_name: \n{company_name}\n')

            doc_type_pattern = get_doc_type(cur_page_text)
            print(f'doc_type_pattern: \n{doc_type_pattern}\n')

            # A non-string company name (e.g. None) would make the `in` test
            # raise TypeError; treat it as "not found" instead.
            if not isinstance(company_name, str) or company_name not in cur_page_text:
                print(f'Company name "{company_name}" not found in current page.')
                page_num += 1
                continue

            # get_doc_type returns None when nothing matched; re.search(None, ...)
            # would raise and abort the whole run, so skip the page instead.
            if doc_type_pattern is None or not re.search(doc_type_pattern, cur_page_text, re.IGNORECASE):
                print(f"Pattern '{doc_type_pattern}' not found in current page.")
                page_num += 1
                continue

            if 'END MSG' in cur_page_text:
                if process_single_page(cur_page_text, page_num, doc_type_pattern) is False:
                    raise ValueError(f"Failed processing single-page PDF at page {page_num + 1}.")
                page_num += 1
            else:
                if process_multi_page(cur_page_text, page_num, doc_type_pattern) is False:
                    raise ValueError(f"Failed processing multi-page PDF at page {page_num + 1}.")
                # Jump past every page the message spans. This re-extracts
                # those pages' text, which costs time but keeps the cursor
                # correct without relying on the handler's return value.
                page_num = _index_after_end_marker(page_num)

        print("Completed processing all pages.")
        return True

    except Exception as e:
        # Top-level boundary of the notebook run: report and signal failure.
        print(f"An error occurred: {e}")
        return False


In [None]:
# Run the page-processing loop over the module-level pdf_data; the call
# evaluates to True on completion and False if an error was caught.
process_pages()