# Analysing Adobe error messages

For issue https://github.com/climatepolicyradar/navigator/issues/348. Keeping while we're figuring out the root cause of some of these errors with the Adobe team.



In [142]:
from pathlib import Path
import glob
import re
import os
import shutil

from tqdm.auto import tqdm
import pandas as pd

pd.set_option('max_colwidth', 400)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
PDF_DIR = Path("../../data/cclw-en-pdf-docs/")
DATA_DIR = Path("../../data/pdf2text/intermediate/")
OUTPUT_DIR = Path("../../data/pdf2text/output/")

how many ran through the pipeline?

In [29]:
num_pdfs_in = len(glob.glob(str(PDF_DIR / "*.pdf")))
num_jsons_out = len(glob.glob(str(OUTPUT_DIR / "*.json")))

print(f"{num_pdfs_in} in; {num_jsons_out} out" )

939 in; 939 out


how many ran through adobe vs the pdf processor?

In [31]:
adobe_successes = []
embedded_text_extractor_successes = []

for file_or_folder in DATA_DIR.iterdir():
    if file_or_folder.is_dir():
        adobe_successes.append(file_or_folder.name)
    elif file_or_folder.suffix == ".xml" and not file_or_folder.stem.endswith("metadata") and not file_or_folder.stem.endswith("outline"):
        embedded_text_extractor_successes.append(file_or_folder)

# check that all adobe folders for a split PDF are populated. otherwise remove from successes
split_pdfs = list(set([i.split("_")[0] for i in adobe_successes if len(i.split("_")) == 2]))

for pdf_name in split_pdfs:
    split_dirs = [_dir for _dir in adobe_successes if _dir.startswith(pdf_name)]
    
    # number of files in each directory created from the split pdfs
    split_dir_num_files = [len(list((DATA_DIR / _dir).iterdir())) for _dir in split_dirs]
    if not all([i > 0 for i in split_dir_num_files]):
        adobe_successes = [i for i in adobe_successes if not i.startswith(pdf_name)]
        
# remove numbering suffixes from successful split PDFs, then add the extension '.pdf' to all of them
adobe_successes = list(set([i.split("_")[0] for i in adobe_successes]))

embedded_text_extractor_successes = [filename.with_suffix(".pdf") for filename in embedded_text_extractor_successes]

len(adobe_successes), len(embedded_text_extractor_successes)
perc_successful_adobe = len(adobe_successes) / num_jsons_out * 100

print(f"{len(adobe_successes)} ({round(perc_successful_adobe, 2)}%) passed through adobe. {len(embedded_text_extractor_successes)} failed in adobe and passed through the embedded text extractor.")

808 (86.05%) passed through adobe. 131 failed in adobe and passed through the embedded text extractor.


what were the reasons for failures?

In [77]:
LOG_PATH = Path("../../data/pdf2text/logs 220314.txt")

with open(LOG_PATH, "r") as f:
    logs = f.read()
    
print(logs)

2022-03-14 12:50:29,351 - __main__ - INFO - Processing PDFs
8it [03:34, 22.95s/it]ERROR:root:Exception encountered while executing operation for /pdf-in/cclw-10012-74d645f813a348aea52d4d1cba9dbc66.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.SSLError'>, SSLError(MaxRetryError("HTTPSConnectionPool(host='cpf-ue1.adobe.io', port=443): Max retries exceeded with url: /ops/id/yF30u4Uu9ihhwQCyAqHWjTEFgYpty3a6 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1131)')))")), <traceback object at 0x7f267316cb80>)
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", line 382, in _make_request
    self._validate_conn(conn)
  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", l

In [80]:
# logs_per_doc = re.split(r"(?:\d+)it \[.*]", logs)
logs_per_doc = re.split(r"ERROR:root:", logs)
len(logs_per_doc)

128

In [200]:
df = pd.DataFrame(logs_per_doc[1:], columns=['msg'])

def clean_logs(logs: str):
    logs = logs.strip().strip("\n")
    
    return logs

df['msg'] = df['msg'].apply(clean_logs)

# df['adobe_error'] = df['msg'].str.startswith('ERROR:root')
df['pdf_filename'] = df['msg'].apply(lambda i: re.findall(r"cclw-(?:\d+)-(?:\w+).pdf", i)).apply(lambda i: i[0] if len(i) == 1 else None)
df['on_split'] = df['msg'].str.contains('_split_')
df['ssl_error'] = df['msg'].str.contains('EOF occurred in violation of protocol')
# TODO: retry subset of write timeout on ec2
df['write_operation_timeout'] = df['msg'].str.contains('The write operation timed out')
df['read_operation_timeout'] = df['msg'].str.contains('requests.exceptions.ReadTimeout|ConnectionError\(ReadTimeoutError')
df['adobe_500'] = df['msg'].str.contains('statusCode=500')
df['disqualified'] = df['msg'].str.contains('DISQUALIFIED')
df['disqualified_exceeds_page_limit'] = df['msg'].str.contains('DISQUALIFIED - File not suitable for content extraction: File exceeds page limit')

df.head(5)

Unnamed: 0,msg,pdf_filename,on_split,ssl_error,write_operation_timeout,read_operation_timeout,adobe_500,disqualified,disqualified_exceeds_page_limit
0,"Exception encountered while executing operation for /pdf-in/cclw-10012-74d645f813a348aea52d4d1cba9dbc66.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.SSLError'>, SSLError(MaxRetryError(""HTTPSConnectionPool(host='cpf-ue1.adobe.io', port=443): Max retries exceeded with url: /ops/id/yF30u4Uu9ihhwQCyAqHWjTEFgYpty3a6 (Caus...",cclw-10012-74d645f813a348aea52d4d1cba9dbc66.pdf,False,True,False,False,False,False,False
1,"Exception encountered while executing operation for /pdf-in/cclw-10059-84dfe3f70fd74fd0b5599b9174a5439e.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f2677758380>)\nTraceback (mos...",cclw-10059-84dfe3f70fd74fd0b5599b9174a5439e.pdf,False,False,True,False,False,False,False
2,"Exception encountered while executing operation for /pdf-in/cclw-10086-357cef7658b8440b823e4c76c0b09745.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f267784c240>)\nTraceback (mos...",cclw-10086-357cef7658b8440b823e4c76c0b09745.pdf,False,False,True,False,False,False,False
3,"Exception encountered while executing operation for /temp/cclw-10060-cfc1953eafa945b2b1060c023003d97f_split_3_maxpages_75.pdf: description =ERROR - Unable to extract content. Internal error: Processing timeout; requestTrackingId=Ymk8xdeIxEUPG7jzWKO5bkah7yX68NMu; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservic...",cclw-10060-cfc1953eafa945b2b1060c023003d97f_split_3_maxpages_75.pdf,True,False,False,False,True,True,True
4,"Exception encountered while executing operation for /temp/cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_split_9_maxpages_75.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.SSLError'>, SSLError(MaxRetryError(""HTTPSConnectionPool(host='cpf-ue1.adobe.io', port=443): Max retries exceeded with url: /ops/id/76TioTACg8glBjdNAivkC...",cclw-1055-bf17ca3b41b943fe83f0bd5c5ff36823_split_9_maxpages_75.pdf,True,True,False,False,False,True,True


In [118]:
df.sum()

msg                                Exception encountered while executing operation for /pdf-in/cclw-10012-74d645f813a348aea52d4d1cba9dbc66.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.SSLError'>, SSLError(MaxRetryError("HTTPSConnectionPool(host='cpf-ue1.adobe.io', port=443): Max retries exceeded with url: /ops/id/yF30u4Uu9ihhwQCyAqHWjTEFgYpty3a6 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1131)')))")), <traceback object at 0x7f267316cb80>)\nTraceback (most recent call last):\n  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen\n    httplib_response = self._make_request(\n  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", line 382, in _make_request\n    self._validate_conn(conn)\n  File "/usr/local/lib/python3.8/site-packages/urllib3/connectionpool.py", line 1010, in _validate_conn\n    conn.connect()\n  F

In [201]:
with pd.option_context('max_colwidth', 10000):
    # display(df[~df['read_operation_timeout'] & ~df['write_operation_timeout'] & ~df['adobe_500'] & ~df['ssl_error'] & ~df['disqualified_exceeds_page_limit']])
    display(df[df['adobe_500']] )

Unnamed: 0,msg,pdf_filename,on_split,ssl_error,write_operation_timeout,read_operation_timeout,adobe_500,disqualified,disqualified_exceeds_page_limit
3,"Exception encountered while executing operation for /temp/cclw-10060-cfc1953eafa945b2b1060c023003d97f_split_3_maxpages_75.pdf: description =ERROR - Unable to extract content. Internal error: Processing timeout; requestTrackingId=Ymk8xdeIxEUPG7jzWKO5bkah7yX68NMu; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =DISQUALIFIED - File not suitable for content extraction: File exceeds page limit; requestTrackingId=nHN9n10ze6KhEGcuuhAm2tHQuechg2uP; statusCode=400; errorCode=UNKNOWN\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error: Processing timeout; requestTrackingId=Ymk8xdeIxEUPG7jzWKO5bkah7yX68NMu; statusCode=500; errorCode=UNKNOWN\n48it [23:38, 16.76s/it]",cclw-10060-cfc1953eafa945b2b1060c023003d97f_split_3_maxpages_75.pdf,True,False,False,False,True,True,True
38,"Exception encountered while executing operation for /pdf-in/cclw-4725-30f05ef84cf54d049263628936ef16c9.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=5EyajhJeeUbpQTREBZ4cU7hkDgFaf3Of; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=5EyajhJeeUbpQTREBZ4cU7hkDgFaf3Of; statusCode=500; errorCode=UNKNOWN\n396it [2:35:39, 23.01s/it]",cclw-4725-30f05ef84cf54d049263628936ef16c9.pdf,False,False,False,False,True,False,False
50,"Exception encountered while executing operation for /pdf-in/cclw-4934-92bc365e725341d2bb254920007c6e3a.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=eDzgJbLkcuQIH4jlJDhPLVCtIGFnjMGJ; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=eDzgJbLkcuQIH4jlJDhPLVCtIGFnjMGJ; statusCode=500; errorCode=UNKNOWN\n455it [2:55:21, 10.55s/it]PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]",cclw-4934-92bc365e725341d2bb254920007c6e3a.pdf,False,False,False,False,True,False,False
57,"Exception encountered while executing operation for /pdf-in/cclw-8119-575ab4f296b94d9faef557fc145fa44c.pdf: description =ERROR - Unable to extract content. Internal error: Processing timeout; requestTrackingId=HEhYW9NVQA0eHaQIa4ZWTkcu2D44chf1; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error: Processing timeout; requestTrackingId=HEhYW9NVQA0eHaQIa4ZWTkcu2D44chf1; statusCode=500; errorCode=UNKNOWN\n483it [3:01:10, 9.69s/it]",cclw-8119-575ab4f296b94d9faef557fc145fa44c.pdf,False,False,False,False,True,False,False
59,"Exception encountered while executing operation for /pdf-in/cclw-8150-18799253bec74b1db4ce7ba0ac34a1fb.pdf: description =Unable to process the message even after retries; requestTrackingId=mRmTzyqHKItMgM0CSyCjqFh0yUNLggSK; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =Unable to process the message even after retries; requestTrackingId=mRmTzyqHKItMgM0CSyCjqFh0yUNLggSK; statusCode=500; errorCode=UNKNOWN\n490it [3:04:12, 24.05s/it]",cclw-8150-18799253bec74b1db4ce7ba0ac34a1fb.pdf,False,False,False,False,True,False,False
71,"Exception encountered while executing operation for /pdf-in/cclw-8511-e3e3d4239cd243ffa0f89bb9c695cc5d.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=ZdtGesUUEQG3ZeRnmDG1ypforyo4opjp; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=ZdtGesUUEQG3ZeRnmDG1ypforyo4opjp; statusCode=500; errorCode=UNKNOWN\n558it [3:28:23, 21.40s/it]",cclw-8511-e3e3d4239cd243ffa0f89bb9c695cc5d.pdf,False,False,False,False,True,False,False
89,"Exception encountered while executing operation for /pdf-in/cclw-8779-f4a8f37ebc5d41999a4b05dc52bfc3cd.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=LfBB52sTAstUeQUp4sdKTbOUb0CwTckd; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=LfBB52sTAstUeQUp4sdKTbOUb0CwTckd; statusCode=500; errorCode=UNKNOWN\n679it [4:01:49, 10.12s/it]",cclw-8779-f4a8f37ebc5d41999a4b05dc52bfc3cd.pdf,False,False,False,False,True,False,False
106,"Exception encountered while executing operation for /temp/cclw-9505-64b2c3cb21334ce498313bf4fb65ee24_split_0_maxpages_75.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=1iaSOCqA5SDKotEmjwK04l3jgZuCspkg; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =DISQUALIFIED - File not suitable for content extraction: File exceeds page limit; requestTrackingId=emdxMLFgIyFdsCGcO8a5vhMoKMGDoZXF; statusCode=400; errorCode=UNKNOWN\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=1iaSOCqA5SDKotEmjwK04l3jgZuCspkg; statusCode=500; errorCode=UNKNOWN\n804it [4:35:13, 87.04s/it]",cclw-9505-64b2c3cb21334ce498313bf4fb65ee24_split_0_maxpages_75.pdf,True,False,False,False,True,True,True
122,"Exception encountered while executing operation for /temp/cclw-9647-0bbb30e57dcd4661925d218df7ef8cd3_split_2_maxpages_75.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=h4VJFBabJYrM9CmxdcY3bnlppXrsWrlQ; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =DISQUALIFIED - File not suitable for content extraction: File exceeds page limit; requestTrackingId=feMbVDfBsgwbgscBYPRuXU7CoBeosFfe; statusCode=400; errorCode=UNKNOWN\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=h4VJFBabJYrM9CmxdcY3bnlppXrsWrlQ; statusCode=500; errorCode=UNKNOWN\nSyntax Warning: Bad annotation destination\nSyntax Warning: Bad annotation destination",cclw-9647-0bbb30e57dcd4661925d218df7ef8cd3_split_2_maxpages_75.pdf,True,False,False,False,True,True,True
123,"Exception encountered while executing operation for /temp/cclw-9691-da6a721391e04486b220b5a90ca7e5ec_split_0_maxpages_75.pdf: description =ERROR - Unable to extract content. Internal error; requestTrackingId=BNpkYAxcuZ3nMkATi3FnHO2eu7LTdj13; statusCode=500; errorCode=UNKNOWN\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =DISQUALIFIED - File not suitable for content extraction: File exceeds page limit; requestTrackingId=FQseNbvSIGzDal9XZMohCffJsSTOaQFU; statusCode=400; errorCode=UNKNOWN\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 134, in execute\n ExtractPDFAPI.download_and_save(location=location, context=execution_context, file_location=file_location)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/service/extract_pdf_api.py"", line 48, in download_and_save\n response = CPFApi.cpf_status_api(location, context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 86, in cpf_status_api\n response = polling2.poll(\n File ""/usr/local/lib/python3.8/site-packages/polling2.py"", line 191, in poll\n val = target(*args, **kwargs)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/api/cpf_api.py"", line 87, in <lambda>\n lambda: http_client.process_request(http_request=http_request,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 42, in process_request\n if _handle_response_and_retry(response, success_status_codes,\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py"", line 105, in _handle_response_and_retry\n raise OperationException(message=""Error response received for request"",\nadobe.pdfservices.operation.internal.exceptions.OperationException\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File ""/app/extract/extract.py"", line 655, in pdf_to_data\n result = self._get_adobe_api_result(pdf_filepath)\n File ""/app/extract/extract.py"", line 621, in _get_adobe_api_result\n result = extract_pdf_operation.execute(_execution_context)\n File ""/usr/local/lib/python3.8/site-packages/adobe/pdfservices/operation/pdfops/extract_pdf_operation.py"", line 138, in execute\n raise ServiceApiException(message=oex.error_message, error_code=oex.error_code,\nadobe.pdfservices.operation.exception.exceptions.ServiceApiException: description =ERROR - Unable to extract content. Internal error; requestTrackingId=BNpkYAxcuZ3nMkATi3FnHO2eu7LTdj13; statusCode=500; errorCode=UNKNOWN\n910it [5:23:33, 19.93s/it]",cclw-9691-da6a721391e04486b220b5a90ca7e5ec_split_0_maxpages_75.pdf,True,False,False,False,True,True,True


In [133]:
DEBUG_FILES_PATH = Path("../../data/pdf2text/debugging/")

def convert_split_pdf_name_into_original(pdf_filename: str) -> str:
    if "split" not in pdf_filename:
        return pdf_filename
    
    return pdf_filename.split("_")[0] + ".pdf"

for error_type in ("ssl_error", "write_operation_timeout", "read_operation_timeout", "adobe_500", "disqualified_exceeds_page_limit"):
    error_pdf_folder = DEBUG_FILES_PATH / error_type
    if not os.path.exists(error_pdf_folder): os.mkdir(error_pdf_folder)
    
    pdf_filenames = df.loc[df[error_type], "pdf_filename"].apply(convert_split_pdf_name_into_original).tolist()
    
    for f_name in pdf_filenames:
        shutil.copy(PDF_DIR/f_name, error_pdf_folder)
        
    print(f"{len(pdf_filenames)} pdfs copied to {error_pdf_folder}")

24 pdfs copied to ../../data/pdf2text/debugging/ssl_error
71 pdfs copied to ../../data/pdf2text/debugging/write_operation_timeout
21 pdfs copied to ../../data/pdf2text/debugging/read_operation_timeout
10 pdfs copied to ../../data/pdf2text/debugging/adobe_500
19 pdfs copied to ../../data/pdf2text/debugging/disqualified_exceeds_page_limit


## get all failed PDFs to run on EC2

In [141]:
all_pdf_stems = [Path(i).stem for i in glob.glob(str(PDF_DIR / "*.pdf"))]

failed_pdf_stems = set(all_pdf_stems) - set(adobe_successes)

len(failed_pdf_stems)

131

In [145]:
for pdf_stem in tqdm(failed_pdf_stems):
    pdf_path = PDF_DIR / f"{pdf_stem}.pdf"
    
    shutil.copy(pdf_path, "../../data/pdf2text/debugging/all_failed/")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 131/131 [00:04<00:00, 30.29it/s]


## analysing results of ec2 run

In [163]:
EC2_INTERMEDIATE_FOLDER = Path("../../data/pdf2text/debugging/all_failed/intermediate/")
EC2_NUM_PDFS = len(glob.glob(str(EC2_INTERMEDIATE_FOLDER.parent / "*.pdf")))

In [164]:
adobe_successes_ec2 = []
embedded_text_extractor_successes_ec2 = []

for file_or_folder in EC2_INTERMEDIATE_FOLDER.iterdir():
    if file_or_folder.is_dir():
        adobe_successes_ec2.append(file_or_folder.name)
    elif file_or_folder.suffix == ".xml" and not file_or_folder.stem.endswith("metadata") and not file_or_folder.stem.endswith("outline"):
        embedded_text_extractor_successes_ec2.append(file_or_folder)

# check that all adobe folders for a split PDF are populated. otherwise remove from successes
split_pdfs = list(set([i.split("_")[0] for i in adobe_successes_ec2 if len(i.split("_")) == 2]))

for pdf_name in split_pdfs:
    split_dirs = [_dir for _dir in adobe_successes_ec2 if _dir.startswith(pdf_name)]
    
    # number of files in each directory created from the split pdfs
    split_dir_num_files = [len(list((EC2_INTERMEDIATE_FOLDER / _dir).iterdir())) for _dir in split_dirs]
    if not all([i > 0 for i in split_dir_num_files]):
        adobe_successes_ec2 = [i for i in adobe_successes_ec2 if not i.startswith(pdf_name)]
        
# remove numbering suffixes from successful split PDFs, then add the extension '.pdf' to all of them
adobe_successes_ec2 = list(set([i.split("_")[0] for i in adobe_successes_ec2]))

embedded_text_extractor_successes_ec2 = [filename.with_suffix(".pdf") for filename in embedded_text_extractor_successes_ec2]

len(adobe_successes_ec2), len(embedded_text_extractor_successes_ec2)
perc_successful_adobe = len(adobe_successes_ec2) / EC2_NUM_PDFS * 100

print(f"{len(adobe_successes_ec2)} ({round(perc_successful_adobe, 2)}%) passed through adobe. {len(embedded_text_extractor_successes_ec2)} failed in adobe and passed through the embedded text extractor.")

112 (85.5%) passed through adobe. 16 failed in adobe and passed through the embedded text extractor.


In [189]:
failed_ec2_filenames = set([Path(_path).name for _path in glob.glob(str(EC2_INTERMEDIATE_FOLDER.parent / "*.pdf"))]) - set([f"{path}.pdf" for path in adobe_successes_ec2])
print(len(failed_ec2_filenames))
df.loc[df['pdf_filename'].isin(failed_ec2_filenames) & ~df['adobe_500'], :]

19


Unnamed: 0,msg,pdf_filename,on_split,ssl_error,write_operation_timeout,read_operation_timeout,adobe_500,disqualified,disqualified_exceeds_page_limit
2,"Exception encountered while executing operation for /pdf-in/cclw-10086-357cef7658b8440b823e4c76c0b09745.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f267784c240>)\nTraceback (mos...",cclw-10086-357cef7658b8440b823e4c76c0b09745.pdf,False,False,True,False,False,False,False
66,"Exception encountered while executing operation for /pdf-in/cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f2674d6d9c0>)\nTraceback (most...",cclw-8482-7a59b4bc5d7841cd9d8a0010215c97ec.pdf,False,False,True,False,False,False,False
67,"Exception encountered while executing operation for /pdf-in/cclw-8489-87e2bd2bd4824467bb47dbff4ab62e8b.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f26766b8100>)\nTraceback (most...",cclw-8489-87e2bd2bd4824467bb47dbff4ab62e8b.pdf,False,False,True,False,False,False,False
82,"Exception encountered while executing operation for /pdf-in/cclw-8633-f5db4522ce57485b9b7ddc8ce639e741.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.SSLError'>, SSLError(MaxRetryError(""HTTPSConnectionPool(host='cpf-ue1.adobe.io', port=443): Max retries exceeded with url: /ops/id/dnGFTu5URfUz9KZp6BX6820BT2cOyQ3D (Cause...",cclw-8633-f5db4522ce57485b9b7ddc8ce639e741.pdf,False,True,False,False,False,False,False
86,"Exception encountered while executing operation for /pdf-in/cclw-8737-67b33aa4117d4f51bb7055f6944a4b4f.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f2676275500>)\nTraceback (most...",cclw-8737-67b33aa4117d4f51bb7055f6944a4b4f.pdf,False,False,True,False,False,False,False
107,"Exception encountered while executing operation for /pdf-in/cclw-9508-78e9d495b9bd405dbed2513f256ee001.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f2676f78cc0>)\nTraceback (most...",cclw-9508-78e9d495b9bd405dbed2513f256ee001.pdf,False,False,True,False,False,False,False
114,"Exception encountered while executing operation for /pdf-in/cclw-9602-999f0a58e7b44922babefe98d1eb4c63.pdf: description =Request could not be completed. Possible cause attached!, requestTrackingId=(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ProtocolError('Connection aborted.', timeout('The write operation timed out'))), <traceback object at 0x7f2675cb7bc0>)\nTraceback (most...",cclw-9602-999f0a58e7b44922babefe98d1eb4c63.pdf,False,False,True,False,False,False,False


In [193]:
failed_ec2_filenames_adobe_500 = df.loc[df['pdf_filename'].isin(failed_ec2_filenames) & df['adobe_500'], 'pdf_filename'].tolist()
filenames_to_retry_ec2 = list(set(failed_ec2_filenames) - set(failed_ec2_filenames_adobe_500))

for pdf_name in tqdm(filenames_to_retry_ec2):
    pdf_path = PDF_DIR / pdf_name
    
    shutil.copy(pdf_path, "../../data/pdf2text/debugging/all_failed_retry/")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:08<00:00,  1.83it/s]


## after ec2 run retry

increase timeout to remove write limit error -> all PDFs passed except for those with `BAD_PDF` or new `500` errors

In [196]:
EC2_RETRY_INTERMEDIATE_FOLDER = Path("../../data/pdf2text/debugging/all_failed_retry/intermediate/")
EC2_RETRY_NUM_PDFS = len(glob.glob(str(EC2_RETRY_INTERMEDIATE_FOLDER.parent / "*.pdf")))

In [199]:
adobe_successes_ec2_retry = []
embedded_text_extractor_successes_ec2_retry = []

for file_or_folder in EC2_RETRY_INTERMEDIATE_FOLDER.iterdir():
    if file_or_folder.is_dir():
        adobe_successes_ec2_retry.append(file_or_folder.name)
    elif file_or_folder.suffix == ".xml" and not file_or_folder.stem.endswith("metadata") and not file_or_folder.stem.endswith("outline"):
        embedded_text_extractor_successes_ec2_retry.append(file_or_folder)

# check that all adobe folders for a split PDF are populated. otherwise remove from successes
split_pdfs = list(set([i.split("_")[0] for i in adobe_successes_ec2_retry if len(i.split("_")) == 2]))

for pdf_name in split_pdfs:
    split_dirs = [_dir for _dir in adobe_successes_ec2_retry if _dir.startswith(pdf_name)]
    
    # number of files in each directory created from the split pdfs
    split_dir_num_files = [len(list((EC2_RETRY_INTERMEDIATE_FOLDER / _dir).iterdir())) for _dir in split_dirs]
    if not all([i > 0 for i in split_dir_num_files]):
        adobe_successes_ec2_retry = [i for i in adobe_successes_ec2_retry if not i.startswith(pdf_name)]
        
# remove numbering suffixes from successful split PDFs, then add the extension '.pdf' to all of them
adobe_successes_ec2_retry = list(set([i.split("_")[0] for i in adobe_successes_ec2_retry]))

embedded_text_extractor_successes_ec2_retry = [filename.with_suffix(".pdf") for filename in embedded_text_extractor_successes_ec2_retry]

len(adobe_successes_ec2_retry), len(embedded_text_extractor_successes_ec2_retry)
perc_successful_adobe = len(adobe_successes_ec2_retry) / EC2_RETRY_NUM_PDFS * 100

print(f"{len(adobe_successes_ec2_retry)} ({round(perc_successful_adobe, 2)}%) passed through adobe. {len(embedded_text_extractor_successes_ec2_retry)} failed in adobe and passed through the embedded text extractor.")

3 (18.75%) passed through adobe. 13 failed in adobe and passed through the embedded text extractor.


## create master dataset

1. 3 adobe successes from ec2 retry (increase read and write timeouts)
2. 112 adobe successes from first ec2 try (moving environment solved SSL error)
3. 808 adobe successes from original run

success rate = `808+112+3/939 = 98.3%`


In [211]:
RETRY_SUCCESSES = adobe_successes_ec2_retry
EC2_SUCCESSES = adobe_successes_ec2
INITIAL_SUCCESSES = adobe_successes

# no overlap
len(set(RETRY_SUCCESSES).intersection(set(EC2_SUCCESES)).intersection(set(INITIAL_SUCCESSES)))

0

In [224]:
MASTER_INTERMEDIATE_FOLDER = Path("../../data/pdf2text/intermediate-final")

for pdf_stem in RETRY_SUCCESSES:
    f_paths = [_path for _path in EC2_RETRY_INTERMEDIATE_FOLDER.iterdir() if str(_path.name).startswith(pdf_stem)]
    
    for path in f_paths:
        if path.is_dir():
            shutil.copytree(path, MASTER_INTERMEDIATE_FOLDER/path.name)
        else:
            shutil.copy(path, MASTER_INTERMEDIATE_FOLDER)

In [225]:
for pdf_stem in EC2_SUCCESSES:
    f_paths = [_path for _path in EC2_INTERMEDIATE_FOLDER.iterdir() if str(_path.name).startswith(pdf_stem)]
    
    for path in f_paths:
        if path.is_dir():
            shutil.copytree(path, MASTER_INTERMEDIATE_FOLDER/path.name)
        else:
            shutil.copy(path, MASTER_INTERMEDIATE_FOLDER)

In [226]:
for pdf_stem in INITIAL_SUCCESSES:
    f_paths = [_path for _path in DATA_DIR.iterdir() if str(_path.name).startswith(pdf_stem)]
    
    for path in f_paths:
        if path.is_dir():
            shutil.copytree(path, MASTER_INTERMEDIATE_FOLDER/path.name)
        else:
            shutil.copy(path, MASTER_INTERMEDIATE_FOLDER)

In [237]:
all_pdf_stems = list([p.stem for p in PDF_DIR.iterdir()])
failed_pdf_stems = set(all_pdf_stems) - set(RETRY_SUCCESSES) - set(EC2_SUCCESSES) - set(INITIAL_SUCCESSES)

for pdf_stem in failed_pdf_stems:
    f_paths = [_path for _path in DATA_DIR.iterdir() if str(_path.name).startswith(pdf_stem)]
    
    for path in f_paths:
        if path.is_dir():
            shutil.copytree(path, MASTER_INTERMEDIATE_FOLDER/path.name)
        else:
            shutil.copy(path, MASTER_INTERMEDIATE_FOLDER)

In [232]:
# tidy up double nested folders
for _path in tqdm(MASTER_INTERMEDIATE_FOLDER.iterdir()):
    if "_" in _path.name:
        for _subpath in _path.iterdir():
            # ignore .DS_Store
            if _subpath.name.startswith("cclw"):
                for _file in _subpath.iterdir(): 
                    if not _file.name.startswith("."):
                        shutil.move(str(_file), str(_subpath.parent))
                shutil.rmtree(str(_subpath))

1221it [00:00, 1523.02it/s]


In [None]:
# create final output folder
