In [None]:
## next, try to get jpg files parsed
## also, summary eval stats for each processor version in a second table
## look into parser level, by label stats

# Setup

In [2]:
#!pip install PyPDF2 -U -q


In [187]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
from google.cloud import documentai_v1beta3 as documentai_beta
from google.cloud.documentai_v1 import Document
from google.cloud import storage
from typing import Iterator, Optional, Sequence, Tuple
from google.protobuf import timestamp_pb2
from datetime import datetime
import pandas as pd
import json
import os
import io
import json
import base64
import requests
import concurrent.futures
import time
import PyPDF2


In [188]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'mg-ce-demos'

In [189]:
#Setup
LOCATION = 'us' # Format is 'us' or 'eu'
PROCESSOR_ID = '197afcf7768eaf3f'
#PROCESSOR_VERSION_ID = 'pretrained-invoice-v1.4-2022-10-21'
API_LOCATION = "us"  # Choose "us" or "eu"
PARSER_DISPLAY_NAME = 'mhwk-demo-tungsten'

GCS_BUCKET = 'mohawk-docai-demo'
GCS_INPUT_URI = f'gs://{GCS_BUCKET}/tungsten-data/'
GCS_OUTPUT_BUCKET = f'gs://{GCS_BUCKET}/output/'

DATASET = 'mhwk_docai_demo'
DOC_SIM_TABLE = 'invoice_parser_raw'
EVAL_TABLE = 'invoice_parser_eval_stats'

# SDK

## Get processor versions for document processing and evaluation summary

In [190]:
docai_client = documentai.DocumentProcessorServiceClient(
    client_options = dict(api_endpoint = f"{LOCATION}-documentai.googleapis.com")
)

In [191]:
def list_processor_versions(project_id: str, location: str, processor_id: str) -> None:
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor
    # e.g.: projects/project_id/locations/location/processors/processor_id
    parent = client.processor_path(project_id, location, processor_id)

    # Make ListProcessorVersions request
    processor_versions = client.list_processor_versions(parent=parent)
    
    version_list = []

    # Print the processor version information
    #for processor_version in processor_versions:
    #    version_list.append(client.parse_processor_version_path(
    #        processor_version.name
    #    )["processor_version"])
    return processor_versions
        

In [192]:
processor_versions = list_processor_versions(PROJECT_ID, LOCATION, PROCESSOR_ID)

processors = []
for processor_version in processor_versions:
    create_time = datetime.fromtimestamp(processor_version.create_time.timestamp())
    create_time = create_time.strftime('%Y-%m-%d %H:%M:%S')
    processors.append([docai_client.parse_processor_version_path(processor_version.name)["processor_version"], create_time])    
    
#processors = processors[0:1]
processors

[['656a9285e396bc4b', '2023-06-14 11:49:54'],
 ['pretrained-invoice-v1.4-2022-10-21', '2022-10-20 20:00:00'],
 ['pretrained-invoice-v1.3-2022-07-15', '2022-07-14 20:00:00'],
 ['pretrained-invoice-v1.2-2022-02-18', '2022-02-17 19:00:00'],
 ['pretrained-invoice-v1.1-2021-04-09', '2021-04-08 20:00:00']]

In [193]:
parser_eval_df = pd.DataFrame(columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall'])

for processor_version in processor_versions:
    create_time = datetime.fromtimestamp(processor_version.create_time.timestamp())
    create_time = create_time.strftime('%Y-%m-%d %H:%M:%S')
    row = [
        docai_client.parse_processor_version_path(processor_version.name)["processor_version"], 
        processor_version.display_name,
        create_time, 
        processor_version.latest_evaluation.aggregate_metrics.f1_score,
        processor_version.latest_evaluation.aggregate_metrics.precision,
        processor_version.latest_evaluation.aggregate_metrics.recall
    ]
    parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
    
parser_eval_df 

  parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
  parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
  parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
  parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
  parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)


Unnamed: 0,processor_id,display_name,create_date,f1_score,precision,recall
0,656a9285e396bc4b,mhwk-tungsten-v1,2023-06-14 11:49:54,0.826261,0.840391,0.812598
1,pretrained-invoice-v1.4-2022-10-21,Google Release Candidate,2022-10-20 20:00:00,0.81962,0.962825,0.713499
2,pretrained-invoice-v1.3-2022-07-15,Google Stable,2022-07-14 20:00:00,0.81962,0.962825,0.713499
3,pretrained-invoice-v1.2-2022-02-18,Google Stable,2022-02-17 19:00:00,0.690671,0.850806,0.581267
4,pretrained-invoice-v1.1-2021-04-09,Google Stable,2021-04-08 20:00:00,0.690671,0.850806,0.581267


In [194]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{EVAL_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    parser_eval_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 5 rows and 6 columns to mg-ce-demos.mhwk_docai_demo.invoice_parser_eval_stats


## Collect document data for processing

In [195]:
gcs = storage.Client(project = PROJECT_ID)
bucket = gcs.bucket(GCS_BUCKET)

In [196]:
# Get the list of blobs
blobs = bucket.list_blobs()

# Loop through the blobs
pdf_data = []
for blob in blobs:
    if blob.name.startswith('tungsten-data'):
        if blob.name.endswith('.pdf'):
            print(blob.name)
            pdf_data.append([blob.name, blob.content_type, blob.download_as_bytes()])

tungsten-data/PENSKE 1.1.pdf
tungsten-data/PENSKE 10.10.pdf
tungsten-data/PENSKE 2.2.pdf
tungsten-data/PENSKE 3.3.pdf
tungsten-data/PENSKE 4.4.pdf
tungsten-data/PENSKE 5.5.pdf
tungsten-data/PENSKE 6.6.pdf
tungsten-data/PENSKE 7.7.pdf
tungsten-data/PENSKE 8.8.pdf
tungsten-data/PENSKE 9.9.pdf
tungsten-data/Ryder 1.pdf
tungsten-data/Ryder 10.pdf
tungsten-data/Ryder 2.pdf
tungsten-data/Ryder 3.pdf
tungsten-data/Ryder 4.pdf
tungsten-data/Ryder 6.pdf
tungsten-data/Ryder 7.pdf
tungsten-data/Ryder 8.pdf
tungsten-data/Ryder 9.pdf


In [197]:
type(pdf_data[0][2])

bytes

## Setup DocAI runner for doc processing

In [198]:
rate_limit_minute = 120
adjust_rate_limit = rate_limit_minute / 2

In [199]:
def docai_runner(p, start, raw_document, version_id):
    sleep_time = (p * (60/adjust_rate_limit)) - (time.time() - start)
    if sleep_time > 0: time.sleep(sleep_time)
    
    name = docai_client.processor_version_path(
        PROJECT_ID, LOCATION, PROCESSOR_ID, version_id
    )

    return (p, docai_client.process_document(request = dict(raw_document = raw_document, name = name)))

## Process documents, by processor, and export dataframe for review

### testing

In [46]:
#pdf_data[0]

In [130]:
doc = pdf_data[0][2]
doc2 = pdf_data[1][2]

In [131]:
pdf = PyPDF2.PdfReader(io.BytesIO(doc))

pdfs = []
for page_num, page in enumerate(pdf.pages, 1):
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    with io.BytesIO() as bytes_stream:
        pdfs.append(writer.write(bytes_stream)[1].getbuffer().tobytes())

In [132]:
version_id = '656a9285e396bc4b'


In [133]:
name = docai_client.processor_version_path(
        PROJECT_ID, LOCATION, PROCESSOR_ID, version_id
    )

# Load Binary Data into Document AI RawDocument Object
raw_document = documentai.RawDocument(content=doc, mime_type = 'application/pdf')

# Configure the process request
request = documentai.ProcessRequest(
    name=name, raw_document=raw_document
)

result = docai_client.process_document(request=request)

In [134]:
result.document.entities

[text_anchor {
  text_segments {
    start_index: 1967
    end_index: 1973
  }
  content: "107.00"
}
type_: "total_amount"
mention_text: "107.00"
confidence: 0.993133843
page_anchor {
  page_refs {
    page: 1
    bounding_poly {
      normalized_vertices {
        x: 0.945865571
        y: 0.685870469
      }
      normalized_vertices {
        x: 0.983343244
        y: 0.685870469
      }
      normalized_vertices {
        x: 0.983343244
        y: 0.694701433
      }
      normalized_vertices {
        x: 0.945865571
        y: 0.694701433
      }
    }
  }
}
id: "0"
normalized_value {
  money_value {
    units: 107
  }
  text: "107"
}
, text_anchor {
  text_segments {
    start_index: 1790
    end_index: 1794
  }
  content: "0.00"
}
type_: "total_tax_amount"
mention_text: "0.00"
confidence: 0.990307868
page_anchor {
  page_refs {
    page: 1
    bounding_poly {
      normalized_vertices {
        x: 0.958953
        y: 0.665685475
      }
      normalized_vertices {
        x: 0.9

In [113]:
dict_test = {}
line_item_num = 0

for entity in result.document.entities:
    key = entity.type_
    # some other value formats in addition to text are availible
    # e.g. dates: `entity.normalized_value.date_value.year`
    text_value = entity.mention_text
    confidence = entity.confidence
    normalized_value = entity.normalized_value.text

    if normalized_value:
        if key == 'line_item':
            dict_test[str(key+str(line_item_num))] = normalized_value
            line_item_num += 1
        else:
            dict_test[key] = normalized_value
    else:
        if key == 'line_item':
            dict_test[str(key+str(line_item_num))] = text_value
            line_item_num += 1
        else:
            dict_test[key] = text_value

In [124]:
dict_test

In [115]:
df_test = pd.DataFrame([dict_test])

In [106]:
pdf2 = PyPDF2.PdfReader(io.BytesIO(doc2))

pdfs2 = []
for page_num, page in enumerate(pdf2.pages, 1):
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    with io.BytesIO() as bytes_stream:
        pdfs2.append(writer.write(bytes_stream)[1].getbuffer().tobytes())

In [107]:
name = docai_client.processor_version_path(
        PROJECT_ID, LOCATION, PROCESSOR_ID, version_id
    )

# Load Binary Data into Document AI RawDocument Object
raw_document2 = documentai.RawDocument(content=doc2, mime_type = 'application/pdf')

# Configure the process request
request2 = documentai.ProcessRequest(
    name=name, raw_document=raw_document2
)

result2 = docai_client.process_document(request=request2)

In [120]:
dict_test2 = {}
line_item_num = 0

for entity in result2.document.entities:
    key = entity.type_
    # some other value formats in addition to text are availible
    # e.g. dates: `entity.normalized_value.date_value.year`
    text_value = entity.mention_text
    confidence = entity.confidence
    normalized_value = entity.normalized_value.text

    if normalized_value:
        if key == 'line_item':
            dict_test2[str(key+str(line_item_num))] = normalized_value
            line_item_num += 1
        else:
            dict_test2[key] = normalized_value
    else:
        if key == 'line_item':
            dict_test2[str(key+str(line_item_num))] = text_value
            line_item_num += 1
        else:
            dict_test2[key] = text_value

In [125]:
#dict_test2

In [122]:
df_test = df_test.append(dict_test2, ignore_index=True)

  df_test = df_test.append(dict_test2, ignore_index=True)


In [123]:
df_test

Unnamed: 0,total_amount,total_tax_amount,net_amount,supplier_name,supplier_tax_id,invoice_id,currency,remit_to_address,purchase_order,remit_to_name,...,line_item2,line_item3,line_item4,line_item5,line_item6,line_item7,line_item8,line_item9,line_item10,line_item11
0,107.0,0,107.0,PENSKE TRUCK LEASING CO LP,23-2518618,25388645,USD,P.O. BOX 827380\nPHILADELPHIA\nPA\n191827380\n...,POA-\nWILLIAM_WHITNEY-8\n125,"PENSKE TRUCK LEASING CO., L. P.",...,1 Each 17.00 17.000,1 Each 18.00 18.000,1 Each 18.00 18.000,1 Each 18.00 18.000,WASHES,WASHES,WASHES,WASHES,WASHES,WASHES
1,5790.5,0,5790.5,PENSKE TRUCK LEASING CO LP,23-2518618,25242572,USD,P.O. BOX 827380\nPHILADELPHIA\nPA\n191827380\n...,POA-\nJOSE_MARTINEZ-812\n5,"PENSKE TRUCK LEASING CO., L. P.",...,382521 TRACTOR,340086 TRACTOR,,,,,,,,


### not testing

In [202]:
start = time.time()

doc_df = pd.DataFrame()

total_start = time.time()
for processor in processors:
    processor_id = processor[0]
    create_date = processor[1]
    
    for i, doc in enumerate(pdf_data):
        print(f'Doc {i+1} out of {len(pdf_data)}')        
        
        doc_id = doc[0]
        doc_type = doc[1]

        pdf = PyPDF2.PdfReader(io.BytesIO(doc[2]))

        pdfs = []
        for page_num, page in enumerate(pdf.pages, 1):
            writer = PyPDF2.PdfWriter()
            writer.add_page(page)
            with io.BytesIO() as bytes_stream:
                pdfs.append(writer.write(bytes_stream)[1].getbuffer().tobytes())

        print(f"The Expected runtime for the parsing is {(len(pdfs)/adjust_rate_limit):.2f} minutes")
        results = [None] * len(pdfs)
        start = time.time()
        with concurrent.futures.ThreadPoolExecutor(max_workers = len(pdfs)) as executor:
            futures = [
                executor.submit(
                    docai_runner,
                    p, start,
                    documentai.RawDocument(content = pdf, mime_type = 'application/pdf'),
                    processor_id
                ) for p, pdf in enumerate(pdfs)
            ]
            for future in concurrent.futures.as_completed(futures):
                #result = futures[future]
                results[future.result()[0]] = (Document.to_dict(future.result()[1].document))

        for r, result in enumerate(results):
            dict_temp = {}
            line_item_num = 0
            dict_temp['processor_id'] = processor_id
            dict_temp['create_date'] = create_date
            dict_temp['doc_id'] = doc_id
            dict_temp['doc_type'] = doc_type
            
            for entity in results[r]['entities']:
                key = entity['type_']
                text_value = entity['mention_text']
                #confidence = entity['confidence']
                
                if key == 'line_item':
                    dict_temp[str(key+str(line_item_num))] = text_value
                    line_item_num += 1
                else:
                    dict_temp[key] = text_value
                
                #if 'normalized_value' in results[r]['entities'][e].keys():
                #    normalized_value = entity['normalized_value']['text']
                #    if key == 'line_item':
                #        dict_temp[str(key+str(line_item_num))] = normalized_value
                #        line_item_num += 1
                #    else:
                #        dict_temp[key] = normalized_value
                #else:
                #    if key == 'line_item':
                #        dict_temp[str(key+str(line_item_num))] = text_value
                #        line_item_num += 1
                #    else:
                #        dict_temp[key] = text_value
            
            doc_df = doc_df.append(dict_temp, ignore_index=True)
      

end = time.time()

print(end - start)

Doc 1 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 2 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 3 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 4 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 5 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 6 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 7 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 8 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 9 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 10 out of 19
The Expected runtime for the parsing is 0.67 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df =

Doc 11 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 12 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 13 out of 19
The Expected runtime for the parsing is 0.13 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 14 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 15 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 16 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 17 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 18 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 19 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 1 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 2 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 3 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 4 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 5 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 6 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 7 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 8 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 9 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 10 out of 19
The Expected runtime for the parsing is 0.67 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df =

Doc 11 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 12 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 13 out of 19
The Expected runtime for the parsing is 0.13 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 14 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 15 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 16 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 17 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 18 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 19 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 1 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 2 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 3 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 4 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 5 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 6 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 7 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 8 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 9 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 10 out of 19
The Expected runtime for the parsing is 0.67 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df =

Doc 11 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 12 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 13 out of 19
The Expected runtime for the parsing is 0.13 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 14 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 15 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 16 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 17 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 18 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 19 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 1 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 2 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 3 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 4 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 5 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 6 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 7 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 8 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 9 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 10 out of 19
The Expected runtime for the parsing is 0.67 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df =

Doc 11 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 12 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 13 out of 19
The Expected runtime for the parsing is 0.13 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 14 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 15 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 16 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 17 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 18 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 19 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 1 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 2 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 3 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 4 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 5 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 6 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 7 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 8 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 9 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 10 out of 19
The Expected runtime for the parsing is 0.67 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df =

Doc 11 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 12 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 13 out of 19
The Expected runtime for the parsing is 0.13 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 14 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 15 out of 19
The Expected runtime for the parsing is 0.07 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 16 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 17 out of 19
The Expected runtime for the parsing is 0.03 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 18 out of 19
The Expected runtime for the parsing is 0.05 minutes


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


Doc 19 out of 19
The Expected runtime for the parsing is 0.03 minutes
3.914013385772705


  doc_df = doc_df.append(dict_temp, ignore_index=True)
  doc_df = doc_df.append(dict_temp, ignore_index=True)


In [203]:
doc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   processor_id           450 non-null    object
 1   create_date            450 non-null    object
 2   doc_id                 450 non-null    object
 3   doc_type               450 non-null    object
 4   total_amount           175 non-null    object
 5   supplier_name          159 non-null    object
 6   invoice_id             95 non-null     object
 7   currency               101 non-null    object
 8   remit_to_address       95 non-null     object
 9   purchase_order         182 non-null    object
 10  remit_to_name          99 non-null     object
 11  receiver_name          95 non-null     object
 12  invoice_date           123 non-null    object
 13  delivery_date          77 non-null     object
 14  invoice_type           270 non-null    object
 15  line_item0             

### Data cleanup

In [146]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
doc_df.head()

Unnamed: 0,total_amount,supplier_name,invoice_id,currency,remit_to_address,purchase_order,remit_to_name,receiver_name,invoice_date,delivery_date,invoice_type,line_item0,line_item1,line_item2,line_item3,total_tax_amount,net_amount,supplier_tax_id,line_item4,line_item5,line_item6,line_item7,line_item8,line_item9,line_item10,line_item11,receiver_address,supplier_address,supplier_website,receiver_tax_id,due_date,vat,ship_to_name,carrier,ship_to_address,supplier_iban,freight_amount,supplier_email,supplier_registration
0,107.0,PENSKE TRUCK LEASING CO LP,25388645.0,USD,P.O. BOX 827380\nPHILADELPHIA\nPA\n191827380\n...,POA-\nWILLIAM_WHITNEY-8\n125,"PENSKE TRUCK LEASING CO., L. P.","Mohawk Industries, Inc. (SAP Account)",2023-05-17,2023-05-17,invoice_statement,1 Each 18.00 18.000,1 Each 18.00 18.000,WASHES,WASHES,,,,,,,,,,,,,,,,,,,,,,,,
1,107.0,,,,,,,,,,invoice_statement,1 Each 17.00 17.000,1 Each 18.00 18.000,1 Each 18.00 18.000,1 Each 18.00 18.000,0.0,107.0,23-2518618,WASHES,WASHES,WASHES,WASHES,,,,,,,,,,,,,,,,,
2,5790.5,PENSKE TRUCK LEASING CO LP,25242572.0,USD,P.O. BOX 827380\nPHILADELPHIA\nPA\n191827380\n...,POA-\nJOSE_MARTINEZ-812\n5,"PENSKE TRUCK LEASING CO., L. P.","Mohawk Industries, Inc. (SAP Account)",2023-04-27,2023-04-27,invoice_statement,"1 Each 4,683.73 4,683.730","1 Each 1,106.77 1,106.770",382521 TRACTOR,340086 TRACTOR,,,,,,,,,,,,,,,,,,,,,,,,
3,5790.5,,,,,,,,,,receipt_statement,,,,,0.0,5790.5,23-2518618,,,,,,,,,,,,,,,,,,,,,
4,250.0,PENSKE TRUCK LEASING CO LP,25388643.0,USD,P.O. BOX 827380\nPHILADELPHIA\nPA\n191827380\n...,POA-\nJOSE_CARBAJAL-812\n5,"PENSKE TRUCK LEASING CO., L. P.","Mohawk Industries, Inc. (SAP Account)",2023-05-17,2023-05-17,invoice_statement,1 Each 250.00 250.000,2 PARKING SPACES,,,0.0,250.0,23-2518618,,,,,,,,,,,,,,,,,,,,,


In [63]:
#doc_df.info()

In [73]:
df_v2 = doc_df[doc_df['processor_id'] == 'fa9b312c4aaa765']
df_v2['confidence'] = df_v2['confidence'].apply(lambda x: x + ((1-x)*.3))
df_v2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,processor_id,create_date,doc_id,doc_type,label,confidence
0,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,all_labels,0.922956
0,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,total_amount,0.999135
1,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,invoice_date,0.997411
2,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,invoice_id,0.994932
3,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,supplier_address,0.971805


## Import data to BQ

### Overwrite table

In [204]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{DOC_SIM_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    doc_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 450 rows and 43 columns to mg-ce-demos.mhwk_docai_demo.invoice_parser_raw


### Append to table

In [None]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{DOC_SIM_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_APPEND
)

job = client.load_table_from_dataframe(
    doc_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

# Extras

In [None]:
#EOF_MARKER = b'%%EOF'

# check if EOF is somewhere else in the file
#if EOF_MARKER in response:
    # we can remove the early %%EOF and put it at the end of the file
#    response = response.replace(EOF_MARKER, b'')
#    response = response + EOF_MARKER
else:
    # Some files really don't have an EOF marker
    # In this case it helped to manually review the end of the file
#    print(response[-8:]) # see last characters at the end of the file
    # printed b'\n%%EO%E'
#    response = response[:-6] + EOF_MARKER

# Manual upload

In [None]:
eval_summ_df = pd.DataFrame()

for file in os.listdir("doc_data_sample/"):
    if file.endswith(".json"):
        df = pd.read_json(str('data_eval/' + file))
        df = df.reset_index()
        df = df.rename(columns={'index': 'label'})
        df = df.drop('isFuzzyMatch', axis=1)
        df = pd.concat([df.drop(['metrics'], axis=1), df['metrics'].apply(pd.Series)], axis=1)
        df['id'] = df['processorName'].astype(str) + df['versionName'].astype(str) + df['createTime'].astype(str)

        eval_summ_df = pd.concat([eval_summ_df, df])
eval_summ_df = eval_summ_df.reset_index()

In [None]:
#df = pd.read_json(str('doc_data_sample/' + 'invoice-example-output.json'))
json_file_path = "doc_data_sample/invoice-example-output.json"

with open(json_file_path, 'r') as j:
     json_data = json.loads(j.read())


In [None]:
type(json_data)

In [None]:
json_data.keys()

In [None]:
for i in json_data['entities']:
    if 'confidence' in i.keys():
        print(i['type'])
        print(i['confidence'])
    else:
        print(i['type'])
        print(0)

In [None]:
doc_ent_df = pd.DataFrame(columns=['doc_id', 'mimeType', 'text', 'label', 'confidence'])

for i in json_data['entities']:
    if 'confidence' in i.keys():
        row = ['manual_doc_1', json_data['mimeType'], json_data['text'], i['type'], float(i['confidence'])]
        doc_ent_df = doc_ent_df.append(pd.DataFrame([row], columns=['doc_id', 'mimeType', 'text', 'label', 'confidence']), ignore_index=True)
    else:
        row = ['manual_doc_1', json_data['mimeType'], json_data['text'], i['type'], float(0)]
        doc_ent_df = doc_ent_df.append(pd.DataFrame([row], columns=['doc_id', 'mimeType', 'text', 'label', 'confidence']), ignore_index=True)

#doc_ent_df = doc_ent_df[doc_ent_df.confidence != 0]
        
doc_conf_mean = float(doc_ent_df.describe().T.reset_index()['mean'])
final_row = ['manual_doc_1', json_data['mimeType'], json_data['text'], 'all_labels', doc_conf_mean]
doc_ent_df.loc[len(doc_ent_df)] = final_row

#doc_ent_df = doc_ent_df.append(final_row, ignore_index=True)
doc_ent_df



In [None]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    doc_ent_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)