# Notebook for Training and Running Inference

In [149]:
%%bash
export GOOGLE_APPLICATION_CREDENTIALS="mwpmltr-f1668dcc0858.json"

In [158]:
# read training_data
import os
document_classes=os.listdir("training_document_samples")

#get current working directory
current_path=os.getcwd()

# get list of files and labels
list_files=[]
labels=[]
for classes in document_classes:
    class_path=str(current_path)+'/'+"training_document_samples"+'/'+str(classes)
    files=os.listdir(class_path)
    for file in files:
        file_path=class_path+'/' +file
        list_files.append(file_path)
        labels.append(classes)

In [159]:
# USE DOCUMENT AI for ocr
from google.cloud import documentai_v1 as documentai

def process_document(project_id: str, location: str,
                     processor_id: str, file_path: str,
                     mime_type: str) -> documentai.Document:
    """
    Processes a document using the Document AI API.
    """

    # Instantiates a client
    documentai_client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processor/processor-id
    # You must create new processors in the Cloud Console first
    resource_name = documentai_client.processor_path(
        project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

        # Load Binary Data into Document AI RawDocument Object
        raw_document = documentai.RawDocument(
            content=image_content, mime_type=mime_type)

        # Configure the process request
        request = documentai.ProcessRequest(
            name=resource_name, raw_document=raw_document)

        # Use the Document AI client to process the sample form
        result = documentai_client.process_document(request=request)

        return result.document


In [160]:
# set parameters
project_id= 'mwpmltr'
location = 'us' # Format is 'us' or 'eu'
processor_id = '5146e2e343bf6d70' # document ocr #  Create processor in Cloud Console
mime_type = 'application/pdf'

# extract the text from the files
# all text contains the text from all the files
all_text=[]
for file in list_files:
    try:
        document = process_document(project_id=project_id, location=location,
                            processor_id=processor_id, file_path=file,
                            mime_type=mime_type)
        text=" ".join(document.text.splitlines()).strip()
        all_text.append(text)
    except:
        print("the file not processed is...",file)


In [161]:
# Training a classifier using tf-idf for feature generation
from sklearn.feature_extraction.text import TfidfVectorizer

training_data=all_text
training_labels=labels

#Train classification model using random forest
from sklearn.pipeline import make_pipeline

vectorizer = TfidfVectorizer()
model = RandomForestClassifier(n_estimators = 100)
pipeline = make_pipeline(vectorizer, model)

pipeline.fit(training_data, training_labels)

# save the model in a pickle file
import pickle
pickle.dump(pipeline, open('classification_pipeline.pickle', 'wb'))



PREDICTION SERVICE

In [163]:
# classify document using randomforest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import pickle


file_path=list_files[23]
document = process_document(project_id=project_id, location=location,
                            processor_id=processor_id, file_path=file_path,
                            mime_type=mime_type)
text=[" ".join(document.text.splitlines()).strip()]

# read the pipeline from pickle
pickle.load(open('classification_pipeline.pickle', 'rb'))
prediction = str(list(pipeline.predict(text))[0])
print(prediction)



invoice


In [121]:
# perform Key Value pair
from google.cloud import documentai_v1 as documentai
from PIL import Image, ImageDraw

import os
import pandas as pd

#Set up processor variables
PROJECT_ID = project_id
LOCATION = "us"  # Format is 'us' or 'eu'
PDF_PATH = file_path # Update to path of target document

if prediction=='w-9':
    PROCESSOR_ID ='ece6aa307cb0d855'
    
def process_document_sample():
    # Instantiates a client
    client_options = {"api_endpoint": "{}-documentai.googleapis.com".format(LOCATION)}
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processor/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"

    with open(PDF_PATH, "rb") as image:
        image_content = image.read()

    # Read the file into memory
    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)
    document = result.document
    entities = document.entities
    print("Document processing complete.\n\n")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document  
    types = []
    values = []
    confidence = []
    
    # Grab each key/value pair and their corresponding confidence scores.
    for entity in entities:
        types.append(entity.type_)
        values.append(entity.mention_text)
        confidence.append(round(entity.confidence,4))
        
    # Create a Pandas Dataframe to print the values in tabular format. 
    df = pd.DataFrame({'Type': types, 'Value': values, 'Confidence': confidence})
    display(df)
    
    #if result.human_review_operation:
    #    print ("Triggered HITL long running operation: {}".format(result.human_review_operation))

    return document


def get_text(doc_element: dict, document: dict):
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.
    """
    response = ""
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in doc_element.text_anchor.text_segments:
        start_index = (
            int(segment.start_index)
            if segment in doc_element.text_anchor.text_segments
            else 0
        )
        end_index = int(segment.end_index)
        response += document.text[start_index:end_index]
    return response    

In [122]:
doc = process_document_sample()

Document processing complete.




Unnamed: 0,Type,Value,Confidence
0,FormRevisionDate,October 2018,0.9597
1,HasSignature,YES,0.9
2,HasSignatureDate,YES,0.9
3,SSN,721074732,0.9872
4,Name,Rose Jeppson,0.9853
5,FederalTaxClassification,Individual/sole proprietor or single-member LLC,0.9387
6,Address,101 South Orange Ave,0.979
7,CityStateZip,"Pasadena, CA 91001",0.9889
