In [None]:
import json
import boto3
import os

def lambda_handler(event, context):
    """Start a Step Functions execution for the object named in an S3 event.

    Only the first entry of ``event['Records']`` is used. Its bucket name and
    object key are passed to the state machine as the execution input, under
    the keys ``s3Bucket`` and ``s3ObjectKey``.

    Args:
        event: S3 event notification payload; must contain
            ``Records[0].s3.bucket.name`` and ``Records[0].s3.object.key``.
        context: Lambda context object (unused).

    Returns:
        The ``start_execution`` response serialized as a JSON string. Values
        that are not natively JSON-serializable are rendered with ``str``.

    Raises:
        KeyError: if the ``STATEMACHINEARN`` environment variable is missing
            or the event does not have the expected structure.
    """
    # Imported locally so the handler does not depend on an extra
    # module-level import being present in the deployment package.
    from urllib.parse import unquote_plus

    step_function_client = boto3.client('stepfunctions')
    state_machine_arn = os.environ['STATEMACHINEARN']

    # Extract relevant information from the S3 event
    s3_record = event['Records'][0]['s3']
    s3_bucket = s3_record['bucket']['name']
    # S3 event notifications deliver the object key URL-encoded (a space
    # arrives as '+', '(' as '%28', ...). Decode it so downstream steps can
    # address the real object.
    s3_object_key = unquote_plus(s3_record['object']['key'])

    # Prepare the input for the state machine
    step_state = {
        "s3Bucket": s3_bucket,
        "s3ObjectKey": s3_object_key
    }

    # Start the execution of the state machine
    response = step_function_client.start_execution(
        stateMachineArn=state_machine_arn,
        input=json.dumps(step_state)
    )

    # default=str keeps json.dumps from failing on values it cannot encode.
    return json.dumps(response, default=str)

In [None]:
##### doc-to-json-textract LAMBDA ######

##### USING CALL_TEXTRACT FUNCTION #####
##### REQUESTING FORMS AND TABLES FEATURES #####


import boto3
import json

from textractcaller.t_call import call_textract, Textract_Features

def lambda_handler(event, context):
    """Run Textract on the uploaded document and store the result as JSON.

    Reads ``s3Bucket`` and ``s3ObjectKey`` from ``event['Input']``, calls
    ``call_textract`` on that object with the FORMS and TABLES features, and
    writes the JSON-serialized response to the same bucket under the
    document key with ``_textract.json`` appended.

    Args:
        event: state machine payload; the step state lives under ``'Input'``.
        context: Lambda context object (unused).

    Returns:
        The same step-state dict that was received, updated in place with
        ``s3TextractJsonKey`` pointing at the stored JSON.
    """
    textract = boto3.client('textract', region_name='us-east-1')
    s3 = boto3.client('s3')

    # State handed over by the state machine: where the upload lives.
    state = event['Input']
    bucket, document_key = state['s3Bucket'], state['s3ObjectKey']

    extraction = call_textract(
        input_document=f's3://{bucket}/{document_key}',
        features=[Textract_Features.FORMS, Textract_Features.TABLES],
        boto3_textract_client=textract,
    )

    # Persist the raw Textract response next to the source document.
    # NOTE(review): this writes into the upload bucket itself; if that
    # bucket's event notification is not filtered by prefix/suffix this
    # object may trigger the pipeline again -- confirm the trigger config.
    output_key = document_key + "_textract.json"
    s3.put_object(Body=json.dumps(extraction), Bucket=bucket, Key=output_key)

    # Hand the location of the stored JSON to the next step.
    state['s3TextractJsonKey'] = output_key
    return state


In [None]:
##### THIS IS PARSE_TEXTRACT_JSON_OBJ_FOR_RDS LAMBDA #####


import pandas as pd
import json
from trp import Document
import boto3

def textract_to_dataframes(textract_json):
    """Turn every table in a Textract response into a pandas DataFrame.

    The response is wrapped in a ``Document``; its pages are walked in order
    and each table found becomes one DataFrame whose cells hold the
    ``text`` of the corresponding table cell. No row is treated as a header,
    so the resulting frames carry positional integer column labels.

    Args:
        textract_json: Textract response passed straight to ``Document``.

    Returns:
        dict mapping ``'df1'``, ``'df2'``, ... (numbered in page/table order)
        to the DataFrame built from that table.
    """
    parsed = Document(textract_json)

    # Flatten page -> table nesting into one ordered stream of tables.
    all_tables = (tbl for pg in parsed.pages for tbl in pg.tables)

    frames = {}
    for position, tbl in enumerate(all_tables, start=1):
        grid = []
        for row in tbl.rows:
            grid.append([cell.text for cell in row.cells])
        frames[f'df{position}'] = pd.DataFrame(grid)

    return frames



def _normalized_cells(values):
    """Return the set of whitespace-stripped string forms of ``values``."""
    return {str(value).strip() for value in values}


def _contains_all_values(df, wanted):
    """Return True when every entry of ``wanted`` appears in some cell of ``df``."""
    return set(wanted) <= _normalized_cells(df.to_numpy().ravel())


def _table_with_labels(df, labels):
    """Return ``df`` with ``labels`` among its column names, or None if absent.

    A frame whose columns already contain the labels is returned unchanged.
    Otherwise the first row is examined: if it contains the labels, that row
    is promoted to the header and removed from the data.
    """
    required = set(labels)
    if required <= _normalized_cells(df.columns):
        return df
    if df.empty:
        return None
    header = [str(value).strip() for value in df.iloc[0]]
    if required <= set(header):
        promoted = df.iloc[1:].reset_index(drop=True)
        promoted.columns = header
        return promoted
    return None


def _put_csv(s3_client, bucket, key, df):
    """Upload ``df`` as CSV (without the index) to ``bucket``/``key``."""
    s3_client.put_object(Body=df.to_csv(index=False), Bucket=bucket, Key=key)


def lambda_handler(event, context):
    """Split the stored Textract JSON into menu/food/choices CSV files.

    Loads the Textract JSON referenced by the step state, converts its tables
    to DataFrames with ``textract_to_dataframes`` and uploads up to three CSV
    files to the validation bucket:

    * ``menu.csv``    - first table containing all five weekday names in
      any of its cells (written as-is);
    * ``food.csv``    - first table whose header holds "Entrees",
      "Vegetables", "Fruits" and "Whole Grain";
    * ``choices.csv`` - first table whose header holds "# of" and
      "choices daily".

    The frames built by ``textract_to_dataframes`` have positional integer
    column labels, so looking for header text in ``df.columns`` alone can
    never succeed. The header is therefore also looked for in the first row,
    which is then promoted to the column names before the CSV is written.
    Comparisons ignore surrounding whitespace, since extracted cell text may
    carry trailing spaces (presumably added by the parser -- TODO confirm).

    Args:
        event: state machine payload; the step state lives under ``'Input'``
            and must provide ``s3Bucket`` and ``s3TextractJsonKey``.
        context: Lambda context object (unused).

    Returns:
        The same step-state dict, updated in place with ``validationBucket``
        and, for each CSV actually written, ``menuCsvKey`` / ``foodCsvKey`` /
        ``choicesCsvKey``.
    """
    s3_client = boto3.client('s3')

    # Retrieve input from the state machine
    step_state = event['Input']
    s3_bucket = step_state['s3Bucket']
    s3_textract_json_key = step_state['s3TextractJsonKey']

    # Get the Textract JSON from S3
    textract_json_object = s3_client.get_object(Bucket=s3_bucket, Key=s3_textract_json_key)
    textract_json = json.load(textract_json_object['Body'])

    # Convert the JSON to DataFrames
    df_dict = textract_to_dataframes(textract_json)

    # Define validation bucket
    validation_bucket = "validation-bucket---doc-parser"

    # Write the weekly menu table to "menu.csv" in validation bucket
    days_of_week = ["MONDAY", "TUESDAY", "WEDNESDAY", "THURSDAY", "FRIDAY"]
    for df in df_dict.values():
        if _contains_all_values(df, days_of_week):
            _put_csv(s3_client, validation_bucket, "menu.csv", df)
            step_state["menuCsvKey"] = "menu.csv"
            break

    # Header-identified tables: (required header labels, CSV key, state field)
    header_exports = (
        (["Entrees", "Vegetables", "Fruits", "Whole Grain"], "food.csv", "foodCsvKey"),
        (["# of", "choices daily"], "choices.csv", "choicesCsvKey"),
    )
    for labels, csv_key, state_field in header_exports:
        for df in df_dict.values():
            table = _table_with_labels(df, labels)
            if table is not None:
                _put_csv(s3_client, validation_bucket, csv_key, table)
                step_state[state_field] = csv_key
                break  # only the first matching table is exported

    # Add validation bucket to step_state
    step_state["validationBucket"] = validation_bucket

    # This returns the updated step_state back to the state machine.
    return step_state