In [1]:
from tqdm import tqdm
from utils.ocr import OCRTool
from utils.pdf_splitter import split_pdf, process_pdf
import base64
from app.config import settings
from utils.emailclient import EmailAttachmentExtractor
from datetime import datetime, timedelta
from utils.sheet import SheetsClient
import pandas as pd
import logging
logger = logging.getLogger(__name__)

In [2]:
email_client = EmailAttachmentExtractor(
        email_address=settings.EMAIL_ADDRESS,
        password=settings.EMAIL_PASSWORD,
        imap_server=settings.IMAP_SERVER
        )

In [6]:
if email_client.connect():
    print("Connected to email server")
    today = datetime.now().strftime("%d-%b-%Y")
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
    pdfs = email_client.extract_pdf_attachments(num_emails=200,
                                            subject_contains=settings.WORD_IN_SUBJECT)
                                            # date_from=yesterday,
                                            # date_to=today)

Login failed: command LOGIN illegal in state LOGOUT, only allowed in states NONAUTH


In [5]:
results = []
try:
    for pdf in pdfs:
        result = process_pdf(pdf['binary_data'], OCRTool(), pdf['filename'])
        results.append(result)
except Exception as e:
    logger.error(f"Error processing pdf: {e}")
    email_client.disconnect()

Error processing pdf: 'filename'


AttributeError: 'EmailAttachmentExtractor' object has no attribute 'disconnect'

In [23]:
sheets_client = SheetsClient(credentials_file_path=settings.CREDENTIALS_FILE_PATH)


[{'image': 'iVBORw0KGgoAAAANSUhEUgAAAmQAAAMYCAIAAADq5GzlAAAACXBIWXMAAA7EAAAOxAGVKw4bAANAeUlEQVR4nOydB1gUR/+ATfm+f758STS9atREjTH23mPvvVcUFVvs2HtXRMUKKoqKIgrSRMECqGADFZWmRsGCiIqKFJEi/F9vwn7HoXAqeAfM+/Dw7M3Nzs7uzsw7v7u93UJpEolEIpFIsqSQrisgkUj+ITUjuq6ORCL5H1KWEom+oGjy+fPnUpYSiV4hZSmR6AtSlhKJ3iJlKZHoC1KWEoneImUpkegXaDIlJeW5CqlMiURPkLKUSPQLKUuJRA+RspRI9AspS4lED5GylEj0CylLiUQPkbKUSPQLBJmcnCx8qeu6SCSSf5CylEj0CylLiUQPkbKUSPQLKUuJRA+RspRI9AspS4lED5GylEj0C3lTAolED5GylEj0CylLiUQPkbKUSPQLKUuJRA+RspRI9AspS4lED5GylEj0CDSZokLKUiLRK6QsJRI9QspSItFPpCwlEj1CylIi0U+kLCUSPUKRpfzaUiLRK6QsJRI9QshS3pRAItE3pCwlEj1CylIi0U+kLCUSPULKUiLRT6QsJRJ9IVVFUlJSYmKivMZHItErpCwlEj2CaFLKUiLRQ6QsJRJ9ATviyCQVUpYSiV4hZSmR6Avi4VyElVKWEom+IWUpKSik5gS5/fNHHJkbstTV7kgk+QYpS0lBQVw78/fff5/LyPnz50lXVwgvr169euPGDeGSqKiouLi4p0+fshwdHZ2QkPD48eNsN4ftrl27RvkBAQGxsbGvyvbs2TNRDTKLyJIUKUuJRN+QspQUFLDCkydPRowYUSkjlStXHjp06KVLl4Q5hB0NDQ0NDAwI8ng5Y8aMNm3aeHt7s7xp06b27dsvXLgwiw1RDvJj9Ro1apQtW7ZChQqtWrViReyrkTM8PLx///5lVQwfPlxc3YOMxQU+ISEhRkZGY8eO

Processing BOLs: 100%|██████████| 11/11 [01:07<00:00,  6.12s/it]
Processing BOLs: 100%|██████████| 15/15 [01:12<00:00,  4.85s/it]
100%|██████████| 2/2 [02:22<00:00, 71.05s/it]


In [25]:
df = pd.DataFrame(result)

In [2]:
PDF_PATH = "test_documents/chai6.11.pdf"
with open(PDF_PATH, "rb") as f:
    pdf_data = f.read()
result = process_pdf(pdf_data, OCRTool())

Processing BOLs: 100%|██████████| 15/15 [01:13<00:00,  4.87s/it]


In [None]:
file_path = "AMAZON FREIGHT & CENTRAL FREIGHT (11 ORDERS) 05.30.24 (LIFEPRO)_page-0001.jpg"
image_data = base64.b64encode(open(file_path, "rb").read()).decode("utf-8")

ocr_tool = OCRTool()
result = ocr_tool.run(image_data)

In [16]:
PDF_PATH = "test_documents/chai6.11.pdf"
with open(PDF_PATH, "rb") as f:
    pdf_data = f.read()

bol_info_list = split_pdf(pdf_data)

ocr_tool = OCRTool()
for bol_info in tqdm(bol_info_list, desc="Processing BOLs"):
    img = bol_info["image"]
    pdf_bytes = bol_info["pdf"]
    shipment_info = ocr_tool.run(img)
    if shipment_info is None:
        continue
    order_number = shipment_info.get("customer_order_information", {}).get("order_number", "")
    shipment_id = shipment_info.get("customer_order_information", {}).get("shipment_id", "")
    file_name = f"Order {order_number} - Shipment {shipment_id}.pdf"
    bol_info["shipment_info"] = shipment_info
    bol_info["file_name"] = file_name

Processing BOLs: 100%|██████████| 15/15 [01:09<00:00,  4.61s/it]
