In [2]:
import json
import os
import json
from fuzzywuzzy import fuzz, process
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
from google.cloud import documentai_v1 as documentai #v0.5.0
from google.cloud import storage
from google.api_core.client_options import ClientOptions



### Process the documents using Document AI

In [18]:
# Google Cloud project used in the processor resource name.
PROJECT_ID = 'bbsm-dev'
# Processor location; used in the processor resource name and to choose the API endpoint.
LOCATION = 'us'
# ID of the Document AI processor the batch request is sent to.
PROCESSOR_ID = "235601fbdcf02caa"

In [19]:
# Remove results left over from a previous run so that the download step only
# ever sees output produced by the current run.
# The trailing slash in the prefix matters: list_blobs() matches raw object-name
# prefixes, so 'document_ai/output' would also match (and delete) siblings such
# as 'document_ai/output_old/...'.
storage_client = storage.Client()
blobs = storage_client.list_blobs(bucket_or_name='bbsm-dev-bucket', prefix='document_ai/output/')
for blob in blobs:
    blob.delete()

In [20]:
# The api_endpoint must be set for every location other than 'us'. Regional
# endpoints follow the pattern "<location>-documentai.googleapis.com", so derive
# it from LOCATION instead of special-casing a single region.
client_options = ClientOptions()
if LOCATION != "us":
    client_options = ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")

docai_client = documentai.DocumentProcessorServiceClient(client_options=client_options)
processor_name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"

# PDFs to process and the GCS location the JSON results are written to.
input_docs = [
    "gs://bbsm-dev-bucket/document_ai/sample_docs/inv_1.pdf",
    "gs://bbsm-dev-bucket/document_ai/sample_docs/inv_2.pdf",
]
destination_uri = "gs://bbsm-dev-bucket/document_ai/output"

input_config = documentai.types.BatchDocumentsInputConfig(
    gcs_documents=documentai.types.GcsDocuments(
        documents=[
            documentai.types.GcsDocument(gcs_uri=doc_uri, mime_type="application/pdf")
            for doc_uri in input_docs
        ]
    )
)

output_config = documentai.types.DocumentOutputConfig(
    gcs_output_config=documentai.types.DocumentOutputConfig.GcsOutputConfig(gcs_uri=destination_uri)
)
request = documentai.types.BatchProcessRequest(
    name=processor_name,
    input_documents=input_config,
    document_output_config=output_config
)

# Batch processing is a long-running operation: wait for it to finish, giving
# up with an exception if it has not completed within 300 seconds.
operation = docai_client.batch_process_documents(request=request)
operation.result(timeout=300)



### Download the JSON output from GCS

In [21]:
# Local file the Document AI JSON result is downloaded to and later read from.
local_file_path = './output/doc_output.json'
# JSON template describing, per field, its 'synonyms' and the 'locations' to look for its value.
templates_path = './templates/invoice_template.json'

In [22]:
# Download the first JSON result found under the output prefix, deleting blobs
# as they are consumed. Blobs are visited in listing order: non-JSON blobs are
# deleted, and the loop stops at the first JSON blob, so any blobs listed after
# it (e.g. results for further input documents) stay in the bucket.
storage_client = storage.Client()
# Trailing slash: list_blobs() matches raw name prefixes, so without it sibling
# prefixes such as 'document_ai/output_old/' would be listed (and deleted) too.
blobs = storage_client.list_blobs(bucket_or_name='bbsm-dev-bucket', prefix='document_ai/output/')

# download_to_filename() opens the target path directly and does not create
# missing parent directories, so make sure the folder exists first.
os.makedirs(os.path.dirname(local_file_path) or '.', exist_ok=True)

for blob in blobs:
    if not blob.name.endswith('.json'):
        blob.delete()
        continue
    print('gs://'+blob.bucket.name+'/'+blob.name)
    blob.download_to_filename(local_file_path)
    blob.delete()
    break

gs://bbsm-dev-bucket/document_ai/output/5652862103474603191/0/inv_1-0.json


In [24]:
# Load the downloaded Document AI result and the extraction template.
# JSON is UTF-8 encoded; naming the encoding explicitly avoids decode errors on
# platforms whose default text encoding is something else.
with open(local_file_path, 'r', encoding='utf-8') as f:
    doc_response = json.load(f)
with open(templates_path, 'r', encoding='utf-8') as f:
    template = json.load(f)

### Extract key-value pairs from the output JSON

In [25]:
def get_page_dimensions(doc_response):
    """Map every page of a Document AI response to its (width, height).

    The size of a page is taken from the vertices of its layout bounding
    polygon: the width is the largest ``x`` and the height the largest ``y``.
    A vertex without an ``x`` or ``y`` key counts as 0 for that axis, and the
    result is never smaller than 0.

    Returns a dict keyed ``"page_1"``, ``"page_2"``, ... in page order.
    """
    dimensions = {}
    for page_number, page in enumerate(doc_response['pages'], start=1):
        corners = page['layout']['boundingPoly']['vertices']
        # The leading 0 is the floor used when a page has no vertices at all.
        page_width = max([0] + [corner.get('x', 0) for corner in corners])
        page_height = max([0] + [corner.get('y', 0) for corner in corners])
        dimensions[f"page_{page_number}"] = (page_width, page_height)
    return dimensions

In [26]:
def get_bbox_boundaries(vertices, normalized=True, width=None, height=None):
    """
    Return the axis-aligned bounding box [x_min, y_min, x_max, y_max] of a polygon.

    vertices: [{"x":val,"y":val},{"x":val,"y":val},{"x":val,"y":val},{"x":val,"y":val}]
        A vertex without an "x" or "y" key is read as 0 on that axis, the same
        convention get_page_dimensions() applies to page vertices.
    normalized: when True the coordinates are treated as fractions of the page
        size and are scaled by ``width`` / ``height``; when False they are
        used as they are.
    width, height: page size; required when ``normalized`` is True and ignored
        otherwise.

    Returns a list of four ints.
    Raises AssertionError if ``normalized`` is True and ``width`` or ``height``
    is None, and ValueError if ``vertices`` is empty.
    """
    if normalized:
        assert width is not None, "Width is None"
        assert height is not None, "Height is None"
        x_scale, y_scale = width, height
    else:
        # Absolute coordinates need no scaling (and width/height may be None).
        x_scale, y_scale = 1, 1

    if not vertices:
        raise ValueError("vertices is empty: cannot compute a bounding box")

    # Every vertex contributes to both axes; duplicates are harmless for min/max.
    xs = [int(vertex.get('x', 0) * x_scale) for vertex in vertices]
    ys = [int(vertex.get('y', 0) * y_scale) for vertex in vertices]
    return [min(xs), min(ys), max(xs), max(ys)]

In [27]:
# Build a table with one row per text line of the first page: the line's text
# and its bounding box scaled to the page dimensions.
doc_text = doc_response['text']
page = doc_response['pages'][0]
page_dimensions = get_page_dimensions(doc_response)
width, height = page_dimensions["page_1"]

# Column-oriented storage; the key order here is the column order of
# line_info_df (line_no, line_text, x_min, x_max, y_min, y_max).
line_info_dict = {'line_no':[],'line_text':[],'x_min':[],'x_max':[],'y_min':[],'y_max':[]}
for idx, line in enumerate(page['lines']):
    layout = line['layout']

    vertices = layout['boundingPoly']['normalizedVertices']
    x_min, y_min, x_max, y_max = get_bbox_boundaries(vertices, width=width, height=height)

    # A text anchor may consist of several segments of the document text; the
    # line's text is their concatenation, not just the first segment. A missing
    # startIndex/endIndex is read as 0.
    text_segments = layout.get('textAnchor', {}).get('textSegments', [])
    line_text = ''.join(
        doc_text[int(segment.get('startIndex', 0)):int(segment.get('endIndex', 0))]
        for segment in text_segments
    )

    line_info_dict['line_no'].append(f"line_{idx+1}")
    line_info_dict['line_text'].append(line_text)
    line_info_dict['x_min'].append(x_min)
    line_info_dict['y_min'].append(y_min)
    line_info_dict['x_max'].append(x_max)
    line_info_dict['y_max'].append(y_max)

line_info_df = pd.DataFrame(data=line_info_dict)

In [28]:
# Preview the first rows of the per-line table (text plus bounding-box columns).
line_info_df.head()

Unnamed: 0,line_no,line_text,x_min,x_max,y_min,y_max
0,line_1,7/9/2021\n,77,158,42,64
1,line_2,12340987-1.jpg\n,852,1005,39,69
2,line_3,Spring ML\n,282,570,144,235
3,line_4,INVOICE\n,1244,1524,170,233
4,line_5,# 12340987\n,1359,1518,253,281


In [29]:
class Point:
    """A 2-D point exposing its coordinates as the attributes ``x`` and ``y``."""

    def __init__(self,x,y):
        # Coordinates are stored exactly as given; no conversion or validation.
        self.x, self.y = x, y

def calculate_euclidean_distance(p1,p2):
    """Return the straight-line distance between two points.

    p1, p2: objects exposing numeric ``x`` and ``y`` attributes.

    Returns the distance as a float.
    """
    # hypot() computes sqrt(dx**2 + dy**2) without squaring the operands
    # itself, so it does not overflow for very large coordinate differences.
    from math import hypot
    return hypot(p1.x - p2.x, p1.y - p2.y)

In [30]:
def add_euclidean_col(row, reference_coords):
    """Distance between the top-centre points of a row's box and a reference box.

    row, reference_coords: mappings providing 'x_min', 'x_max' and 'y_min'.
    The anchor of a box is the midpoint of its horizontal extent at the
    height of 'y_min'.

    Returns the Euclidean distance between the two anchors.
    """
    def top_centre(box):
        return Point((box['x_max']+box['x_min'])/2, box['y_min'])

    row_anchor = top_centre(row)
    reference_anchor = top_centre(reference_coords)
    return calculate_euclidean_distance(reference_anchor, row_anchor)
    


def get_nearest_value_for_key(df, key_text, key_coordinates, val_lookup_loc='right', y_tolerance=3):
    """Find the text line most likely to hold the value of a matched key.

    df: table of text lines with at least the columns 'line_text', 'x_min',
        'x_max' and 'y_min'.
    key_text: text of the key line. Lines whose text equals it, ignoring case
        and newlines, are never returned as the value.
    key_coordinates: bounding box of the key line as
        (x_min, y_min, x_max, y_max).
    val_lookup_loc:
        'right'  - among the lines whose 'y_min' is within ``y_tolerance`` of
                   the key's and whose 'x_min' is greater than the key's,
                   pick the one with the smallest 'x_min'.
        'bottom' - among the lines whose 'y_min' is greater than the key's,
                   pick the one whose top-centre point is closest to the
                   key's top-centre point.
        Any other value yields ''.
    y_tolerance: maximum difference in 'y_min' for a line to count as being on
        the same row as the key (used by 'right' only).

    Returns the chosen line's text with newlines removed, or '' when no line
    qualifies. Ties are resolved in favour of the line that comes first in df.
    """
    x_min, y_min, x_max, y_max = key_coordinates[0], key_coordinates[1], key_coordinates[2], key_coordinates[3]

    # Compare texts in a normalised form: callers typically pass the key
    # lower-cased and without its trailing newline, while df holds the raw text.
    normalised_key = str(key_text).replace('\n', '').lower()
    normalised_lines = df['line_text'].astype(str).str.replace('\n', '', regex=False).str.lower()
    not_the_key = normalised_lines != normalised_key

    if val_lookup_loc == 'right':
        same_row = df['y_min'].between(y_min - y_tolerance, y_min + y_tolerance)
        candidates = df[not_the_key & same_row & (df['x_min'] > x_min)]
        if candidates.empty:
            return ''
        # argmin() returns the first minimum, i.e. the left-most candidate.
        nearest_pos = candidates['x_min'].to_numpy().argmin()
    elif val_lookup_loc == 'bottom':
        candidates = df[not_the_key & (df['y_min'] > y_min)]
        if candidates.empty:
            # Nothing below the key (e.g. the key is the last line on the page).
            return ''
        key_centre_x = (x_max + x_min) / 2
        candidate_centre_x = (candidates['x_max'] + candidates['x_min']) / 2
        distances = np.hypot(
            (candidate_centre_x - key_centre_x).to_numpy(dtype=float),
            (candidates['y_min'] - y_min).to_numpy(dtype=float),
        )
        nearest_pos = distances.argmin()
    else:
        return ''

    return candidates['line_text'].iloc[nearest_pos].replace('\n', '')

In [31]:
# Walk over the text lines and, for every line that matches a template key,
# look up the key's value next to it. The first value found for a key wins.
MATCH_THRESHOLD = 95  # minimum fuzz.ratio score for a line to count as a key

results = {}

for index, row in line_info_df.iterrows():
    curr_line_text = row['line_text'].lower().replace('\n','')
    curr_line_coordinates = [row['x_min'], row['y_min'], row['x_max'], row['y_max']]
    for key in template.keys():
        synonyms = template[key]['synonyms']
        is_match = any(
            fuzz.ratio(curr_line_text, synonym.lower()) >= MATCH_THRESHOLD
            for synonym in synonyms
        )
        if not is_match:
            continue
        if key in results:
            # This key already has a value; the line needs no further checks.
            break

        # Try the configured locations in order of preference ('right' before
        # 'bottom'). The value starts out empty for every key, so a location
        # that is not configured can never contribute a value left over from
        # an earlier key or line.
        value = ''
        for location in ('right', 'bottom'):
            if location in template[key]['locations']:
                value = get_nearest_value_for_key(
                    line_info_df, curr_line_text, curr_line_coordinates, val_lookup_loc=location
                )
                if len(value) > 0:
                    break
        if len(value) > 0:
            results[key] = value
            break
        # No value found for this key: keep checking the remaining keys.

In [32]:
# Display the extracted mapping of template key -> value text.
results

{'Invoice Number': '# 12340987',
 'From': 'Kumar Sharma',
 'Bill To': 'Samantha Adele',
 'Invoice Date': 'Jul 11, 2019',
 'Due Date': 'Jul 15, 2019',
 'Due Amount': '$1,909.22',
 'Total Amount': '$6,909.22'}