In [None]:
# Install (or upgrade to the latest release of) the third-party packages for this notebook.
# No versions are pinned, so every run of this cell may pull a newer release.
# NOTE(review): amazon-textract-caller is installed here but is not imported by any
# cell visible in this notebook — confirm whether it is still needed.
!python -m pip install amazon-textract-caller --upgrade
!python -m pip install amazon-textract-response-parser --upgrade 
!python -m pip install tabulate --upgrade
!python -m pip install PyPDF4 --upgrade

In [None]:
import boto3
import time
import json
import tabulate

In [None]:
# Current AWS Region. Use this to choose corresponding S3 bucket with sample content

# Default boto3 session; its region_name is whatever region the session resolved.
mySession = boto3.session.Session()
awsRegion = mySession.region_name

# Amazon Textract client (read as a global by startJob / isJobComplete / getJobResults)
textract = boto3.client('textract')
# Amazon S3 client
s3 = boto3.client("s3")

In [None]:
import os
import re
import boto3
from PyPDF4 import PdfFileReader, PdfFileWriter

# Markers that identify the pages to extract: a page is kept when its extracted
# text contains either string.
search_string1 = "E-FILE"
search_string2 = "EFILE"

bucket_name = "somename"
prefix = "uploads/"

s3 = boto3.client("s3")

# --- Step 1: download every PDF stored under `prefix` into the working directory ---
# list_objects_v2 returns at most 1,000 keys per call, so iterate all result pages
# through a paginator. "Contents" is absent when nothing matches the prefix, hence
# the .get() with an empty default instead of direct indexing.
paginator = s3.get_paginator("list_objects_v2")
for listing in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for obj in listing.get("Contents", []):
        if obj["Key"].endswith(".pdf"):
            local_name = os.path.basename(obj["Key"])
            s3.download_file(bucket_name, obj["Key"], local_name)
            print(local_name)

# --- Step 2: write each matching page to its own PDF and upload it to the bucket ---
# Escape the markers so they are always matched literally, and compile the
# pattern once instead of rebuilding it for every page.
search_pattern = re.compile(
    "|".join(re.escape(marker) for marker in (search_string1, search_string2))
)

# os.listdir() is evaluated once up front, so the temporary single-page files
# created below are never picked up by this loop.
for filename in os.listdir(os.getcwd()):
    if not filename.endswith(".pdf"):
        continue
    with open(filename, "rb") as pdf_file:
        # Parse the document once; the reader is reused for both the search and
        # the extraction (it needs pdf_file to stay open while pages are read).
        pdf_reader = PdfFileReader(pdf_file)
        found_pages = [
            page_num
            for page_num in range(pdf_reader.getNumPages())
            if search_pattern.search(pdf_reader.getPage(page_num).extractText())
        ]
        if not found_pages:
            print(f"No instances of '{search_string1}' or '{search_string2}' found in {filename}")
            continue

        for page_num in found_pages:
            # A fresh writer per match: reusing one writer would make each
            # "<name>_pageN.pdf" also contain every previously matched page.
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf_reader.getPage(page_num))
            # page_num is 0-based; the file name uses the 1-based page number.
            output_filename = f"{filename[:-4]}_page{page_num+1}.pdf"
            with open(output_filename, "wb") as output_file:
                pdf_writer.write(output_file)
            try:
                # Uploaded to the bucket root, using the local file name as the key.
                s3.upload_file(output_filename, bucket_name, output_filename)
            finally:
                # Remove the temporary file even when the upload fails.
                os.remove(output_filename)

In [None]:
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract QUERIES analysis for one S3 object.

    Args:
        s3BucketName: Name of the S3 bucket holding the document.
        objectName: Key of the document inside that bucket.

    Returns:
        The "JobId" value from the start_document_analysis response.
    """
    # (question text, alias) pairs; every query is applied to all pages ("*").
    query_specs = [
        ("what is in 'F Partner's name, address, city, state, and ZIP code'??", "NAME_ADDRESS"),
        ("What is the year?", "YEAR"),
        ("What is the partnership name?", "PARTNERSHIP_NAME"),
        ("What is the partnership address?", "PARTNERSHIP_ADDRESS"),
    ]
    queries = [
        {"Text": question, "Pages": ["*"], "Alias": alias}
        for question, alias in query_specs
    ]
    document_location = {"S3Object": {"Bucket": s3BucketName, "Name": objectName}}

    job = textract.start_document_analysis(
        DocumentLocation=document_location,
        FeatureTypes=["QUERIES"],
        QueriesConfig={"Queries": queries},
    )
    return job["JobId"]

def isJobComplete(jobId):
    """Block until the Textract job leaves the IN_PROGRESS state.

    Polls get_document_analysis, printing the status after every poll and
    sleeping 5 seconds between polls.

    Args:
        jobId: Identifier of the job to wait for.

    Returns:
        The final "JobStatus" string. This is a status value, not a boolean,
        so callers must compare it explicitly rather than test its truthiness.
    """
    while True:
        status = textract.get_document_analysis(JobId=jobId)["JobStatus"]
        print(f"Job status: {status}")
        if status != "IN_PROGRESS":
            return status
        time.sleep(5)

def getJobResults(jobId):
    """Fetch the complete result set of a Textract analysis job.

    get_document_analysis returns results in parts linked by "NextToken".
    Every part is fetched and the "Blocks" lists are concatenated, in order,
    into a single response-shaped dict, so no part of the result is dropped.
    The merged result is also written to 'OutputResponse1.json'.

    Args:
        jobId: Identifier of the job whose results should be retrieved.

    Returns:
        A dict shaped like one get_document_analysis response: the first
        part's fields, with "Blocks" holding the blocks of all parts and
        "NextToken" removed.
    """
    pages = []
    nextToken = None

    while True:
        request = {"JobId": jobId}
        if nextToken:
            request["NextToken"] = nextToken
        response = textract.get_document_analysis(**request)

        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))

        # A missing NextToken means this was the last part.
        nextToken = response.get("NextToken")
        if not nextToken:
            break

    # Start from the first part so the non-block fields are preserved, then
    # replace its Blocks with the concatenation of every part's Blocks.
    merged = dict(pages[0])
    merged["Blocks"] = [
        block for page in pages for block in page.get("Blocks", [])
    ]
    # The merged result is complete, so a continuation token would be misleading.
    merged.pop("NextToken", None)

    with open('OutputResponse1.json', 'w') as json_file:
        json.dump(merged, json_file)

    return merged

In [None]:
import trp.trp2 as t2 
from tabulate import tabulate
from io import StringIO
import csv


# Amazon S3 resource (note: this rebinds `s3`, which earlier cells bind to an S3 client)
s3 = boto3.resource('s3')

bucket_name = "somename"
s3BucketName = "somename"

my_bucket = s3.Bucket(bucket_name)

# Job statuses for which Textract results are fetched. isJobComplete() returns
# the status *string*, which is always truthy, so it must be compared explicitly;
# otherwise FAILED jobs would be processed as well.
statuses_with_results = ("SUCCEEDED", "PARTIAL_SUCCESS")

# NOTE(review): objects.filter() without arguments walks the whole bucket, so the
# original PDFs under the upload prefix are analysed as well as the extracted
# single-page PDFs — confirm that this is intended.
for object_summary in my_bucket.objects.filter():
    if not object_summary.key.endswith('.pdf'):
        continue

    print(object_summary.key)
    documentName = object_summary.key
    jobId = startJob(s3BucketName, documentName)
    print("Started job with id: {}".format(jobId))

    jobStatus = isJobComplete(jobId)
    if jobStatus not in statuses_with_results:
        print("Skipping {}: Textract job ended with status {}".format(documentName, jobStatus))
        continue

    response = getJobResults(jobId)
    d = t2.TDocumentSchema().load(response)

    # Open the CSV once per document (append mode keeps rows from earlier
    # documents and runs). newline='' is what the csv module requires so that
    # it controls line endings itself; without it, extra blank rows appear on
    # platforms that translate "\n".
    with open('outputfile.csv', 'a', newline='') as csv_file:
        csv_writer_file = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for page in d.pages:
            query_answers = d.get_query_answers(page=page)
            print(tabulate(query_answers, tablefmt="github"))

            # Presumably each entry is [query text, alias, answer] — TODO confirm
            # against the trp2 documentation. Rows with an empty third element
            # are skipped.
            for qa in query_answers:
                if len(qa[2]) > 0:
                    csv_writer_file.writerow([documentName, qa[1], qa[2]])

print("Process completed")