In [1]:
import boto3
import pandas as pd

# Target environment; it selects both the AWS profile and the bucket names.
env = "prod"

# Bucket names, plus the key fragment used to select the objects to replay.
input_bucket = f"f14-datalake-landing-{env}"
output_bucket = f"f14-datalake-raw-{env}"
base_prefix = "amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/"

# A single session built from the named profile backs every client/resource.
aws_session = boto3.session.Session(profile_name=f"data-maintainer-{env}")
s3 = aws_session.resource("s3")
s3_client = aws_session.client("s3")
lambda_client = aws_session.client("lambda")

# Resource handle for the landing (input) bucket.
landing_bucket = s3.Bucket(input_bucket)

### In order to be able to run the lambda, we have to apply a few changes to it:
1. In lambda_function.py, replace the ENVIRONMENT line with: `ENV = os.getenv("ENVIRONMENT", "prod")`
2. In lambda_function.py, replace the existing S3 client with one created from a profile-based session: `aws_session = boto3.session.Session(profile_name=f'data-maintainer-prod')`
`s3_client = aws_session.client('s3')`
3. Once we've got the aws_session, we must pass it to the json generator function, like this: wr.s3.to_json(df, raw_path, orient="records", lines=True, boto3_session=aws_session).
4. Remember to reload the lambda module before continuing with the rest of the cells.

In [2]:
import logging

# Notebook-level logger. Its threshold is ERROR, so the info record emitted
# at the end of this cell is filtered out and produces no output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Default configuration for the root logger.
logging.basicConfig()

logger.info(f"Loading Lambda Function {__name__}")

In [3]:
import os
import sys

# Directory that is put on sys.path so the Lambda source can be imported.
# The LANDING_FILE_PROCESSOR_PATH environment variable overrides the default,
# which lets the notebook run on machines with a different checkout location.
path_to_module = os.environ.get(
    "LANDING_FILE_PROCESSOR_PATH",
    "/Users/emif/Documents/Factory14/landingFileProcessor",
)

# Re-running this cell must not pile up duplicate sys.path entries.
if path_to_module not in sys.path:
    sys.path.append(path_to_module)

In [None]:
import importlib
import src.main.lambda_function as lf

# Re-execute the module so that edits made to lambda_function.py after the
# first import are picked up by the running kernel.
importlib.reload(lf)

In [5]:
def get_files_to_proccess(valid_prefix):
    """List the landing-bucket objects that should be replayed.

    Every object of the module-level ``landing_bucket`` is inspected. An
    object is kept when its key contains ``valid_prefix`` and also contains
    one of ``.csv``, ``.tsv`` or ``.jsonl``. Both checks are substring
    matches, not anchored to the start or end of the key.

    Args:
        valid_prefix: Text that must occur in the object key.

    Returns:
        A list of ``[bucket_name, object_key, last_modified]`` lists, in the
        order the bucket listing yields them.
    """
    accepted_markers = (".csv", ".tsv", ".jsonl")
    selected = []

    for entry in landing_bucket.objects.all():
        key = entry.key

        if valid_prefix not in key:
            continue
        if not any(marker in key for marker in accepted_markers):
            continue

        selected.append([entry.bucket_name, key, entry.last_modified])

    return selected

In [6]:
def get_lambda_payload(bucket_name, object_prefix):
    """Build a synthetic S3 ``ObjectCreated:Put`` event for one object.

    Only the bucket name, the bucket ARN and the object key vary; every other
    field is a fixed placeholder value.

    Args:
        bucket_name: Name placed in the bucket section and in its ARN.
        object_prefix: Value placed in the event as the object key.

    Returns:
        A dict with a single-element ``"Records"`` list.
    """
    bucket_section = {
        "name": bucket_name,
        "ownerIdentity": {"principalId": "EXAMPLE"},
        "arn": f"arn:aws:s3:::{bucket_name}",
    }
    object_section = {
        "key": object_prefix,
        "size": 1024,
        "eTag": "0123456789abcdef0123456789abcdef",
        "sequencer": "0A1B2C3D4E5F678901",
    }
    s3_section = {
        "s3SchemaVersion": "1.0",
        "configurationId": "testConfigRule",
        "bucket": bucket_section,
        "object": object_section,
    }
    response_elements = {
        "x-amz-request-id": "EXAMPLE123456789",
        "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH",
    }
    record = {
        "eventVersion": "2.0",
        "eventSource": "aws:s3",
        "awsRegion": "us-east-1",
        "eventTime": "1970-01-01T00:00:00.000Z",
        "eventName": "ObjectCreated:Put",
        "userIdentity": {"principalId": "EXAMPLE"},
        "requestParameters": {"sourceIPAddress": "127.0.0.1"},
        "responseElements": response_elements,
        "s3": s3_section,
    }
    return {"Records": [record]}

In [None]:
# Collect every landing-bucket object whose key contains base_prefix and one
# of the supported extensions; the cell output is the number of files found.
files_to_process_all = get_files_to_proccess(base_prefix)
len(files_to_process_all)

In [None]:
# Preview the first five [bucket_name, object_key, last_modified] entries.
files_to_process_all[:5]

In [9]:
# Work list for the replay loop; currently the full listing. Slice or filter
# files_to_process_all here to replay only a subset.
files_to_process = files_to_process_all

In [None]:
# Display the final work list for a visual check before the handler is invoked.
files_to_process

In [None]:
import progressbar

# Replay each selected object through the locally imported Lambda handler,
# advancing the progress bar after every call that returns.
i = 0
with progressbar.ProgressBar(max_value=len(files_to_process)) as bar:
    for position, entry in enumerate(files_to_process, start=1):
        bucket_name, object_key = entry[0], entry[1]
        # INVOKE
        lf.lambda_handler(get_lambda_payload(bucket_name, object_key), None)
        # The counter moves only once the handler has returned, so after a
        # failure it still holds the number of files that completed.
        i = position
        bar.update(i)

print("Done!")