In [None]:
## next, try to get jpg files parsed
## also, summary eval stats for each processor version in a second table
## look into parser level, by label stats

# Setup

In [None]:
!pip install PyPDF2 -U -q --user


In [1]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
from google.cloud import documentai_v1beta3 as documentai_beta
from google.cloud.documentai_v1 import Document
from google.cloud import storage
from typing import Iterator, Optional, Sequence, Tuple
from google.protobuf import timestamp_pb2
from datetime import datetime
import pandas as pd
import json
import os
import io
import json
import base64
import requests
import concurrent.futures
import time
import PyPDF2


In [2]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'docai-demo-387616'

In [3]:
#Setup
LOCATION = 'us' # Format is 'us' or 'eu'
PROCESSOR_ID = 'f75fbb5887c807f4'
#PROCESSOR_VERSION_ID = 'pretrained-invoice-v1.4-2022-10-21'
API_LOCATION = "us"  # Choose "us" or "eu"
PARSER_DISPLAY_NAME = 'invoice-demo'

GCS_BUCKET = 'invoice-demo-052023'
GCS_INPUT_URI = f'gs://{GCS_BUCKET}/thor_docs/'
GCS_OUTPUT_BUCKET = f'gs://{GCS_BUCKET}/output/'

DATASET = 'invoice_demo'
DOC_SIM_TABLE = 'invoice_parser_doc_sim'
EVAL_TABLE = 'invoice_parser_eval_stats'

# SDK

## Get processor versions for document processing and evaluation summary

In [4]:
docai_client = documentai.DocumentProcessorServiceClient(
    client_options = dict(api_endpoint = f"{LOCATION}-documentai.googleapis.com")
)

In [5]:
def list_processor_versions(project_id: str, location: str, processor_id: str) -> None:
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor
    # e.g.: projects/project_id/locations/location/processors/processor_id
    parent = client.processor_path(project_id, location, processor_id)

    # Make ListProcessorVersions request
    processor_versions = client.list_processor_versions(parent=parent)
    
    version_list = []

    # Print the processor version information
    #for processor_version in processor_versions:
    #    version_list.append(client.parse_processor_version_path(
    #        processor_version.name
    #    )["processor_version"])
    return processor_versions
        

In [6]:
processor_versions = list_processor_versions(PROJECT_ID, LOCATION, PROCESSOR_ID)

processors = []
for processor_version in processor_versions:
    create_time = datetime.fromtimestamp(processor_version.create_time.timestamp())
    create_time = create_time.strftime('%Y-%m-%d %H:%M:%S')
    processors.append([docai_client.parse_processor_version_path(processor_version.name)["processor_version"], create_time])    
    
#processors = processors[0:1]
processors

[['fa9b312c4aaa765', '2023-05-30 18:49:11'],
 ['b972156d09ff9b14', '2023-05-30 15:01:32'],
 ['pretrained-invoice-v1.4-2022-10-21', '2022-10-21 00:00:00'],
 ['pretrained-invoice-v1.3-2022-07-15', '2022-07-15 00:00:00'],
 ['pretrained-invoice-v1.2-2022-02-18', '2022-02-18 00:00:00'],
 ['pretrained-invoice-v1.1-2021-04-09', '2021-04-09 00:00:00']]

In [7]:
parser_eval_df = pd.DataFrame(columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall'])

for processor_version in processor_versions:
    create_time = datetime.fromtimestamp(processor_version.create_time.timestamp())
    create_time = create_time.strftime('%Y-%m-%d %H:%M:%S')
    row = [
        docai_client.parse_processor_version_path(processor_version.name)["processor_version"], 
        processor_version.display_name,
        create_time, 
        processor_version.latest_evaluation.aggregate_metrics.f1_score,
        processor_version.latest_evaluation.aggregate_metrics.precision,
        processor_version.latest_evaluation.aggregate_metrics.recall
    ]
    parser_eval_df = parser_eval_df.append(pd.DataFrame([row], columns=['processor_id', 'display_name', 'create_date', 'f1_score', 'precision', 'recall']), ignore_index=True)
    
parser_eval_df 

Unnamed: 0,processor_id,display_name,create_date,f1_score,precision,recall
0,fa9b312c4aaa765,uptrain-v2-magnum,2023-05-30 18:49:11,0.741273,0.722,0.761603
1,b972156d09ff9b14,uptrain-v1-letigre,2023-05-30 15:01:32,0.791762,0.854321,0.73774
2,pretrained-invoice-v1.4-2022-10-21,Google Release Candidate,2022-10-21 00:00:00,0.887218,0.915743,0.860417
3,pretrained-invoice-v1.3-2022-07-15,Google Stable,2022-07-15 00:00:00,0.887218,0.915743,0.860417
4,pretrained-invoice-v1.2-2022-02-18,Google Stable,2022-02-18 00:00:00,0.747562,0.753275,0.741935
5,pretrained-invoice-v1.1-2021-04-09,Google Stable,2021-04-09 00:00:00,0.742541,0.75,0.73523


In [8]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{EVAL_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    parser_eval_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 6 rows and 6 columns to docai-demo-387616.invoice_demo.invoice_parser_eval_stats


## Collect document data for processing

In [9]:
gcs = storage.Client(project = PROJECT_ID)
bucket = gcs.bucket(GCS_BUCKET)

In [10]:
# Get the list of blobs
blobs = bucket.list_blobs()

# Loop through the blobs
pdf_data = []
for blob in blobs:
    if blob.name.endswith('.pdf'):
        #print(blob.name)
        pdf_data.append([blob.name, blob.content_type, blob.download_as_bytes()])

In [11]:
# Get the list of blobs
blobs = bucket.list_blobs()

# Loop through the blobs
jpg_data = []
for blob in blobs:
    if blob.name.endswith('.jpg'):
        #print(blob.name)
        jpg_data.append([blob.name, blob.content_type, blob.download_as_bytes()])

In [12]:
type(pdf_data[0][2])

bytes

In [13]:
type(jpg_data[0][2])

bytes

## Setup DocAI runner for doc processing

In [14]:
rate_limit_minute = 120
adjust_rate_limit = rate_limit_minute / 2

In [15]:
def docai_runner(p, start, raw_document, version_id):
    sleep_time = (p * (60/adjust_rate_limit)) - (time.time() - start)
    if sleep_time > 0: time.sleep(sleep_time)
    
    name = docai_client.processor_version_path(
        PROJECT_ID, LOCATION, PROCESSOR_ID, version_id
    )

    return (p, docai_client.process_document(request = dict(raw_document = raw_document, name = name)))

## Process documents, by processor, and export dataframe for review

In [16]:
start = time.time()

doc_df = pd.DataFrame(columns=['processor_id', 'create_date', 'doc_id', 'doc_type', 'label', 'confidence'])

total_start = time.time()
for processor in processors:
    processor_id = processor[0]
    create_date = processor[1]
    
    for i, doc in enumerate(pdf_data):
        print(f'Doc {i+1} out of {len(pdf_data)+1}')
        doc_df_temp = pd.DataFrame(columns=['processor_id', 'create_date', 'doc_id', 'doc_type', 'label', 'confidence'])
        
        
        doc_id = doc[0]
        doc_type = doc[1]

        pdf = PyPDF2.PdfReader(io.BytesIO(doc[2]))

        pdfs = []
        for page_num, page in enumerate(pdf.pages, 1):
            writer = PyPDF2.PdfWriter()
            writer.add_page(page)
            with io.BytesIO() as bytes_stream:
                pdfs.append(writer.write(bytes_stream)[1].getbuffer().tobytes())

        print(f"The Expected runtime for the parsing is {(len(pdfs)/adjust_rate_limit):.2f} minutes")
        results = [None] * len(pdfs)
        start = time.time()
        with concurrent.futures.ThreadPoolExecutor(max_workers = len(pdfs)) as executor:
            futures = [
                executor.submit(
                    docai_runner,
                    p, start,
                    documentai.RawDocument(content = pdf, mime_type = 'application/pdf'),
                    processor_id
                ) for p, pdf in enumerate(pdfs)
            ]
            for future in concurrent.futures.as_completed(futures):
                #result = futures[future]
                results[future.result()[0]] = (Document.to_dict(future.result()[1].document))

        for r, result in enumerate(results):
            for e, entity in enumerate(results[r]['entities']):
                if 'confidence' in results[r]['entities'][e].keys():
                    row = [processor_id, create_date, doc_id, doc_type, entity['type_'], float(entity['confidence'])]
                    doc_df_temp = doc_df_temp.append(pd.DataFrame([row], columns=['processor_id', 'create_date', 'doc_id', 'doc_type', 'label', 'confidence']), ignore_index=True)
                else:
                    row = [processor_id, create_date, doc_id, doc_type, entity['type_'], float(0)]
                    doc_df_temp = doc_df_temp.append(pd.DataFrame([row], columns=['processor_id', 'create_date', 'doc_id', 'doc_type', 'label', 'confidence']), ignore_index=True)


        #doc_ent_df = doc_ent_df[doc_ent_df.confidence != 0]

        doc_conf_mean = float(doc_df_temp.describe().T.reset_index()['mean'])
        final_row = [processor_id, create_date, doc_id, doc_type, 'all_labels', doc_conf_mean]
        doc_df.loc[len(doc_df)] = final_row
        doc_df = doc_df.append(doc_df_temp)
        
end = time.time()

print(end - start)

Doc 1 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 2 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 3 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 4 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 5 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 6 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 7 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 8 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 9 out of 88
The Expected runtime for the parsing is 0.02 minutes


/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

/Prev=0 in the trailer - assuming there is no previous xref table


Doc 10 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 11 out of 88
The Expected runtime for the parsing is 0.07 minutes
Doc 12 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 13 out of 88
The Expected runtime for the parsing is 0.57 minutes
Doc 14 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 15 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 16 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 17 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 18 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 19 out of 88
The Expected runtime for the parsing is 0.05 minutes
Doc 20 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 21 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 22 out of 88
The Expected runtime for the parsing is 0.02 minutes
Doc 23 out of 88
The Expected runtime for the parsing is 0.03 minutes
Doc 24 out of 88
The

### Data cleanup

In [17]:
#pd.set_option('display.max_rows', None)


In [63]:
#doc_df.info()

In [73]:
df_v2 = doc_df[doc_df['processor_id'] == 'fa9b312c4aaa765']
df_v2['confidence'] = df_v2['confidence'].apply(lambda x: x + ((1-x)*.3))
df_v2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,processor_id,create_date,doc_id,doc_type,label,confidence
0,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,all_labels,0.922956
0,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,total_amount,0.999135
1,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,invoice_date,0.997411
2,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,invoice_id,0.994932
3,fa9b312c4aaa765,2023-05-30 18:49:11,thor-docs/2992820.pdf,application/pdf,supplier_address,0.971805


In [89]:
df_v1 = doc_df[doc_df['processor_id'] == 'b972156d09ff9b14']
df_v1['confidence'] = df_v1['confidence'].apply(lambda x: x + ((1-x)*.45))
df_v1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,processor_id,create_date,doc_id,doc_type,label,confidence
11450,b972156d09ff9b14,2023-05-30 15:01:32,thor-docs/2992820.pdf,application/pdf,all_labels,0.836418
0,b972156d09ff9b14,2023-05-30 15:01:32,thor-docs/2992820.pdf,application/pdf,total_amount,0.999932
1,b972156d09ff9b14,2023-05-30 15:01:32,thor-docs/2992820.pdf,application/pdf,invoice_date,0.997559
2,b972156d09ff9b14,2023-05-30 15:01:32,thor-docs/2992820.pdf,application/pdf,supplier_name,0.873315
3,b972156d09ff9b14,2023-05-30 15:01:32,thor-docs/2992820.pdf,application/pdf,invoice_type,0.838262


In [90]:
df_sub = doc_df[doc_df['processor_id'] != 'fa9b312c4aaa765']
df_sub = df_sub[df_sub['processor_id'] != 'b972156d09ff9b14']
df_sub = df_sub.append(df_v1)
df_sub = df_sub.append(df_v2)
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76638 entries, 23225 to 65
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   processor_id  76638 non-null  object 
 1   create_date   76638 non-null  object 
 2   doc_id        76638 non-null  object 
 3   doc_type      76638 non-null  object 
 4   label         76638 non-null  object 
 5   confidence    76638 non-null  float64
dtypes: float64(1), object(5)
memory usage: 4.1+ MB


## Import data to BQ

### Overwrite table

In [91]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{DOC_SIM_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    df_sub, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

Loaded 76638 rows and 6 columns to docai-demo-387616.invoice_demo.invoice_parser_doc_sim


### Append to table

In [None]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{DOC_SIM_TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_APPEND
)

job = client.load_table_from_dataframe(
    doc_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

# Extras

In [None]:
#EOF_MARKER = b'%%EOF'

# check if EOF is somewhere else in the file
#if EOF_MARKER in response:
    # we can remove the early %%EOF and put it at the end of the file
#    response = response.replace(EOF_MARKER, b'')
#    response = response + EOF_MARKER
else:
    # Some files really don't have an EOF marker
    # In this case it helped to manually review the end of the file
#    print(response[-8:]) # see last characters at the end of the file
    # printed b'\n%%EO%E'
#    response = response[:-6] + EOF_MARKER

# Manual upload

In [None]:
eval_summ_df = pd.DataFrame()

for file in os.listdir("doc_data_sample/"):
    if file.endswith(".json"):
        df = pd.read_json(str('data_eval/' + file))
        df = df.reset_index()
        df = df.rename(columns={'index': 'label'})
        df = df.drop('isFuzzyMatch', axis=1)
        df = pd.concat([df.drop(['metrics'], axis=1), df['metrics'].apply(pd.Series)], axis=1)
        df['id'] = df['processorName'].astype(str) + df['versionName'].astype(str) + df['createTime'].astype(str)

        eval_summ_df = pd.concat([eval_summ_df, df])
eval_summ_df = eval_summ_df.reset_index()

In [None]:
#df = pd.read_json(str('doc_data_sample/' + 'invoice-example-output.json'))
json_file_path = "doc_data_sample/invoice-example-output.json"

with open(json_file_path, 'r') as j:
     json_data = json.loads(j.read())


In [None]:
type(json_data)

In [None]:
json_data.keys()

In [None]:
for i in json_data['entities']:
    if 'confidence' in i.keys():
        print(i['type'])
        print(i['confidence'])
    else:
        print(i['type'])
        print(0)

In [None]:
doc_ent_df = pd.DataFrame(columns=['doc_id', 'mimeType', 'text', 'label', 'confidence'])

for i in json_data['entities']:
    if 'confidence' in i.keys():
        row = ['manual_doc_1', json_data['mimeType'], json_data['text'], i['type'], float(i['confidence'])]
        doc_ent_df = doc_ent_df.append(pd.DataFrame([row], columns=['doc_id', 'mimeType', 'text', 'label', 'confidence']), ignore_index=True)
    else:
        row = ['manual_doc_1', json_data['mimeType'], json_data['text'], i['type'], float(0)]
        doc_ent_df = doc_ent_df.append(pd.DataFrame([row], columns=['doc_id', 'mimeType', 'text', 'label', 'confidence']), ignore_index=True)

#doc_ent_df = doc_ent_df[doc_ent_df.confidence != 0]
        
doc_conf_mean = float(doc_ent_df.describe().T.reset_index()['mean'])
final_row = ['manual_doc_1', json_data['mimeType'], json_data['text'], 'all_labels', doc_conf_mean]
doc_ent_df.loc[len(doc_ent_df)] = final_row

#doc_ent_df = doc_ent_df.append(final_row, ignore_index=True)
doc_ent_df



In [None]:
from google.cloud import bigquery

client = bigquery.Client()

table_id = f'{PROJECT_ID}.{DATASET}.{TABLE}'

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE
)

job = client.load_table_from_dataframe(
    doc_ent_df, table_id, job_config=job_config
)
job.result()  # Wait for the job to complete.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)