In [None]:
import json
import os
from urllib.parse import unquote_plus

import boto3

def lambda_handler(event, context):
    """Start a Step Functions execution for the object named in an S3 event.

    Reads the bucket name and object key from the first record of ``event``,
    starts an execution of the state machine whose ARN is stored in the
    ``STATEMACHINEARN`` environment variable, and returns the execution input.

    Args:
        event: S3 event notification; only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        dict with ``s3Bucket`` and ``s3ObjectKey`` -- the same payload that is
        JSON-encoded and passed as the state machine input.

    Raises:
        KeyError: if ``STATEMACHINEARN`` is not set or the event does not have
            the expected ``Records[0].s3`` structure.
    """
    step_function_client = boto3.client('stepfunctions')
    state_machine_arn = os.environ['STATEMACHINEARN']

    # Extract relevant information from the S3 event
    s3_entry = event['Records'][0]['s3']
    s3_bucket = s3_entry['bucket']['name']
    # S3 delivers the key URL-encoded (spaces become '+', other characters
    # '%XX'); decode it so later steps can address the real object.
    s3_object_key = unquote_plus(s3_entry['object']['key'])

    # Prepare the input for the state machine
    step_state = {
        "s3Bucket": s3_bucket,
        "s3ObjectKey": s3_object_key
    }
    serialized_state = json.dumps(step_state)

    # Start the execution of the state machine
    step_function_client.start_execution(
        stateMachineArn=state_machine_arn,
        input=serialized_state
    )
    print(serialized_state)
    return step_state

In [None]:
##### doc-to-json-textract LAMBDA ######
#### "Start extraction" step in step function #####
##### USING CALL_TEXTRACT FUNCTION #####
##### REQUESTS FORMS + TABLES FEATURES (ONLY TABLES ARE PARSED BY THE NEXT STEP) #####


import boto3
import json

from textractcaller.t_call import call_textract, Textract_Features

def lambda_handler(event, context):
    """Run Textract on the uploaded document and store the result next to it.

    The bucket and key of the document are read from ``event['Input']``. The
    Textract response (FORMS and TABLES features requested) is serialized to
    JSON and written to the same bucket under ``<document key>_textract.json``.

    Args:
        event: state machine payload; ``event['Input']`` must carry
            ``s3Bucket`` and ``s3ObjectKey``.
        context: Lambda context object (unused).

    Returns:
        dict with ``s3Bucket``, ``s3ObjectKey`` and ``s3TextractJsonKey``.
    """
    print(json.dumps(event))
    textract = boto3.client('textract', region_name='us-east-1')
    s3 = boto3.client('s3')

    # Location of the original upload, as handed over by the state machine.
    incoming = event['Input']
    bucket, document_key = incoming['s3Bucket'], incoming['s3ObjectKey']

    analysis = call_textract(
        input_document=f's3://{bucket}/{document_key}',
        features=[Textract_Features.FORMS, Textract_Features.TABLES],
        boto3_textract_client=textract,
    )

    # Persist the raw Textract response alongside the source document.
    json_key = document_key + "_textract.json"
    s3.put_object(Body=json.dumps(analysis), Bucket=bucket, Key=json_key)

    # Fresh state for the next step: original location plus the JSON key.
    return {
        "s3Bucket": bucket,
        "s3ObjectKey": document_key,
        "s3TextractJsonKey": json_key,
    }


In [None]:
##### THIS IS PARSE_TEXTRACT_JSON_OBJ_FOR_RDS LAMBDA #####
###### "Parse Textract JSON" step in step function #####

import pandas as pd
import numpy as np
import json
from trp import Document
import boto3
import datetime
import uuid

def textract_to_dataframes(textract_json):
    """Turn every table in a Textract response into a DataFrame.

    Args:
        textract_json: Textract response, passed straight to ``Document``.

    Returns:
        dict mapping ``'df1'``, ``'df2'``, ... (numbered in page order, then
        table order within a page) to a DataFrame holding the cell text of
        each table row.
    """
    parsed = Document(textract_json)

    # Flatten page -> table nesting so tables can be numbered in one pass.
    all_tables = (tbl for pg in parsed.pages for tbl in pg.tables)

    frames = {
        f'df{position}': pd.DataFrame(
            [[cell.text for cell in row.cells] for row in tbl.rows]
        )
        for position, tbl in enumerate(all_tables, start=1)
    }
    print(frames)
    return frames




# Column labels applied, left to right, to the cleaned menu table.
_MENU_COLUMNS = ['WEEK_NUM', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

# Marker text identifying the choices table. The second spelling is the
# mis-decoded (UTF-8 bytes read as Latin-1) form of the first; it is kept so
# any input that matched the earlier literal still matches.
_CHOICES_MARKERS = (
    "# of entr\u00e9e choices daily",
    "# of entr\u00c3\u00a9e choices daily",
)


def _first_column_contains(df, markers):
    """Return True if any first-column cell contains one of ``markers``."""
    if df.empty or 0 not in df.columns:
        # A table with no cells has no column 0 to inspect.
        return False
    # Non-string cells (e.g. None padding from ragged rows) cannot match.
    first_column = df[0].map(lambda value: value if isinstance(value, str) else "")
    return any(
        first_column.str.contains(marker, regex=False).any() for marker in markers
    )


def _strip_cells(df):
    """Return a copy of ``df`` with whitespace stripped from every string cell."""
    if df.empty:
        return df.copy()
    return df.apply(
        lambda column: column.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    )


def _build_menu_table(df):
    """Clean the raw menu table: drop the first two rows, label the columns, fill weeks."""
    # The first two rows are discarded (presumably a title row and a weekday
    # header row -- confirm against a sample document).
    menu = _strip_cells(df.iloc[2:])

    # Label up to six columns; any further columns keep their position as name
    # so an unexpectedly wide table reaches validation instead of raising here.
    extra_labels = [str(label) for label in menu.columns[len(_MENU_COLUMNS):]]
    menu.columns = (_MENU_COLUMNS + extra_labels)[:menu.shape[1]]

    # Replace empty strings with NaN so blank week cells can be forward-filled.
    menu = menu.replace("", np.nan)
    if 'WEEK_NUM' in menu.columns:
        # Assign the result back: a chained ``ffill(inplace=True)`` on a column
        # selection is not guaranteed to update the frame.
        menu['WEEK_NUM'] = menu['WEEK_NUM'].ffill()
    return menu


def _promote_header_row(df):
    """Strip all cells, then use the first row as column names for the remaining rows."""
    cleaned = _strip_cells(df)
    body = cleaned.iloc[1:].copy()
    body.columns = cleaned.iloc[0].tolist()
    return body


# (state key, key suffix, first-column markers, builder), in processing order.
_TABLE_SPECS = (
    ("menuCsvKey", "_menu.csv", ('WEEK 1',), _build_menu_table),
    ("foodCsvKey", "_food.csv", ('Entrees',), _promote_header_row),
    ("choicesCsvKey", "_choices.csv", _CHOICES_MARKERS, _promote_header_row),
)


def lambda_handler(event, context):
    """Split a stored Textract response into menu, food and choices CSV files.

    Loads the Textract JSON referenced by the incoming state, converts its
    tables to DataFrames with ``textract_to_dataframes`` (defined elsewhere in
    this file), and for each table kind uploads the first table whose first
    column contains that kind's marker text to the validation bucket.

    Args:
        event: state machine payload; ``event['Input']['Payload']`` must carry
            ``s3Bucket``, ``s3TextractJsonKey`` and ``s3ObjectKey``.
        context: Lambda context object (unused).

    Returns:
        The incoming state dict, updated in place with ``validationBucket`` and
        with ``menuCsvKey`` / ``foodCsvKey`` / ``choicesCsvKey`` for each table
        kind that was found. A key is absent when no table matched.
    """
    print(json.dumps(event))
    s3_client = boto3.client('s3')

    # Retrieve input from the state machine
    step_state = event['Input']['Payload']
    s3_bucket = step_state['s3Bucket']
    s3_textract_json_key = step_state['s3TextractJsonKey']
    s3_object_key = step_state['s3ObjectKey']

    # Define validation bucket
    validation_bucket = "validation-bucket---doc-parser"

    # Get the Textract JSON from S3
    textract_json_object = s3_client.get_object(Bucket=s3_bucket, Key=s3_textract_json_key)
    textract_json = json.load(textract_json_object['Body'])

    # Convert the JSON to DataFrames
    tables = list(textract_to_dataframes(textract_json).values())

    for state_key, key_suffix, markers, build_table in _TABLE_SPECS:
        for raw_table in tables:
            if not _first_column_contains(raw_table, markers):
                continue
            csv_string = build_table(raw_table).to_csv(index=False)
            csv_key = f"{s3_object_key}{key_suffix}"
            s3_client.put_object(Body=csv_string, Bucket=validation_bucket, Key=csv_key)
            step_state[state_key] = csv_key
            print(csv_string)
            break  # only the first matching table of each kind is used

    # Add validation bucket to step_state
    step_state["validationBucket"] = validation_bucket

    print(json.dumps(step_state, default=str))
    # This returns the updated step_state back to the state machine.
    return step_state

In [None]:
####### "Validate table" step in step function #####
####### THIS IS table_validator LAMBDA #####


##### (not inserted in lambda function yet) #####

import pandas as pd
import json
import boto3
import datetime
from io import StringIO
import re

def _menu_is_valid(menu_df):
    """A menu table needs at least one row and exactly six columns."""
    return menu_df.shape[0] >= 1 and menu_df.shape[1] == 6


def _food_is_valid(food_df):
    """A food table needs >= 1 row, exactly four columns, and a letter somewhere in its first row."""
    if food_df.shape[0] < 1 or food_df.shape[1] != 4:
        return False
    return bool(food_df.iloc[0].astype(str).str.contains(r'[a-zA-Z]').any())


def _choices_is_valid(choices_df):
    """A choices table needs exactly one row and exactly three columns."""
    return choices_df.shape[0] == 1 and choices_df.shape[1] == 3


def _validate_table(s3_client, bucket, step_state, key_field, label, is_valid):
    """Load the CSV named by ``step_state[key_field]`` and apply ``is_valid`` to it.

    Returns a plain ``bool``. A missing state key, a missing S3 object, or a
    CSV with no parsable columns all count as invalid rather than raising.
    """
    try:
        csv_key = step_state[key_field]
        csv_object = s3_client.get_object(Bucket=bucket, Key=csv_key)
        table = pd.read_csv(csv_object['Body'])
        # Coerce to a built-in bool: pandas reductions can yield numpy.bool_,
        # which the json module cannot serialize when the state is returned.
        return bool(is_valid(table))
    except KeyError:
        print(f"{label}CsvKeyError")
        return False
    except (s3_client.exceptions.NoSuchKey, pd.errors.EmptyDataError) as error:
        print(f"{label}CsvUnreadable: {error}")
        return False


def lambda_handler(event, context):
    """Validate the menu, food and choices CSVs and append the outcome to a log.

    For each table kind, the CSV referenced in the incoming state is loaded
    from the validation bucket and checked against its expected shape. The
    three boolean results are written into the state and appended, with a
    timestamp, as one row to ``validation.csv`` in the same bucket.

    Args:
        event: state machine payload; ``event['Input']['Payload']`` must carry
            ``validationBucket`` and may carry ``menuCsvKey``, ``foodCsvKey``
            and ``choicesCsvKey``.
        context: Lambda context object (unused).

    Returns:
        The incoming state dict, updated in place with ``menuIsValid``,
        ``foodIsValid`` and ``choicesIsValid`` (all built-in ``bool``).
    """
    step_state = event['Input']['Payload']
    s3_client = boto3.client('s3')

    # Retrieve input from the state machine
    validation_bucket = step_state['validationBucket']
    validation_key = 'validation.csv'  # Specify your validation csv key

    try:
        # Try to load existing validation data from S3
        validation_object = s3_client.get_object(Bucket=validation_bucket, Key=validation_key)
        validation_df = pd.read_csv(validation_object['Body'])
    except (s3_client.exceptions.NoSuchKey, pd.errors.EmptyDataError):
        # If validation.csv does not exist yet (or is empty), start a new log
        validation_df = pd.DataFrame(columns=["Timestamp", "MenuIsValid", "FoodIsValid", "ChoicesIsValid"])

    # (state key holding the CSV key, state key for the result, log label, check)
    checks = (
        ("menuCsvKey", "menuIsValid", "Menu", _menu_is_valid),
        ("foodCsvKey", "foodIsValid", "Food", _food_is_valid),
        ("choicesCsvKey", "choicesIsValid", "Choices", _choices_is_valid),
    )
    for key_field, result_field, label, is_valid in checks:
        step_state[result_field] = _validate_table(
            s3_client, validation_bucket, step_state, key_field, label, is_valid
        )

    # Add new validation results
    new_results = {
        "Timestamp": datetime.datetime.now(),
        "MenuIsValid": step_state["menuIsValid"],
        "FoodIsValid": step_state["foodIsValid"],
        "ChoicesIsValid": step_state["choicesIsValid"]
    }
    new_results_df = pd.DataFrame([new_results])
    validation_df = pd.concat([validation_df, new_results_df], ignore_index=True)

    # Save the updated validation_df back to S3
    csv_buffer = StringIO()
    validation_df.to_csv(csv_buffer, index=False)
    s3_client.put_object(Body=csv_buffer.getvalue(), Bucket=validation_bucket, Key=validation_key)

    return step_state