In [1]:
import boto3

# Target environment; it selects both the AWS profile and the bucket names below.
env = 'prod'

# A single session for the chosen profile backs every client/resource created here.
aws_session = boto3.session.Session(profile_name=f'data-maintainer-{env}')
s3 = aws_session.resource('s3')
s3_client = aws_session.client('s3')
lambda_client = aws_session.client('lambda')

# Bucket names and the key prefix of the report files handled by this notebook.
input_bucket = f'f14-datalake-landing-{env}'
output_bucket = f'f14-datalake-raw-{env}'
base_prefix = 'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/'

# Resource handle for the landing bucket (the same bucket that input_bucket names).
landing_bucket = s3.Bucket(input_bucket)

In [2]:
import json 
from datetime import datetime
import tempfile
import progressbar
from datetime import datetime
import pathlib
import os


def get_files_to_proccess(valid_prefix, bucket=None):
    """List the data files stored under a key prefix in the landing bucket.

    Only objects whose key starts with ``valid_prefix`` are requested from S3
    (server-side ``Prefix`` filter), so the rest of the bucket is never
    enumerated. Of those, an object is kept when its key contains one of the
    markers ``.csv``, ``.tsv`` or ``.jsonl``.

    Args:
        valid_prefix: Key prefix the objects must start with.
        bucket: Optional bucket resource exposing ``objects.filter(Prefix=...)``.
            When omitted, the module-level ``landing_bucket`` is used.

    Returns:
        A list of ``[bucket_name, object_key, last_modified]`` lists, in the
        order the bucket listing yields them.
    """
    if bucket is None:
        # Fall back to the notebook-level bucket so existing calls keep working.
        bucket = landing_bucket

    # NOTE(review): these are matched anywhere in the key (as before), not only
    # as a suffix -- confirm whether keys such as "x.csv.gz" should qualify.
    extension_markers = ('.csv', '.tsv', '.jsonl')

    files_to_process = []
    for landing_bucket_object in bucket.objects.filter(Prefix=valid_prefix):
        object_key = landing_bucket_object.key
        if any(marker in object_key for marker in extension_markers):
            files_to_process.append([
                landing_bucket_object.bucket_name,
                object_key,
                landing_bucket_object.last_modified,
            ])

    return files_to_process


def get_lambda_payload(bucket_name, object_prefix, file_last_update_ts):
    """Build a payload shaped like an S3 ``ObjectCreated:Put`` event.

    The result holds exactly one record that points at ``object_prefix``
    inside ``bucket_name``; every other field is a fixed placeholder value.

    Args:
        bucket_name: Bucket name placed in the record (and in its ARN).
        object_prefix: Object key placed in the record.
        file_last_update_ts: Accepted but not used; ``eventTime`` is a fixed
            placeholder timestamp.

    Returns:
        A dict of the form ``{"Records": [record]}``.
    """
    bucket_section = {
        "name": bucket_name,
        "ownerIdentity": {"principalId": "EXAMPLE"},
        "arn": f"arn:aws:s3:::{bucket_name}",
    }
    object_section = {
        "key": object_prefix,
        "size": 1024,
        "eTag": "0123456789abcdef0123456789abcdef",
        "sequencer": "0A1B2C3D4E5F678901",
    }
    s3_section = {
        "s3SchemaVersion": "1.0",
        "configurationId": "testConfigRule",
        "bucket": bucket_section,
        "object": object_section,
    }
    record = {
        "eventVersion": "2.0",
        "eventSource": "aws:s3",
        "awsRegion": "us-east-1",
        "eventTime": "1970-01-01T00:00:00.000Z",
        "eventName": "ObjectCreated:Put",
        "userIdentity": {"principalId": "EXAMPLE"},
        "requestParameters": {"sourceIPAddress": "127.0.0.1"},
        "responseElements": {
            "x-amz-request-id": "EXAMPLE123456789",
            "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH",
        },
        "s3": s3_section,
    }
    return {"Records": [record]}

In [3]:
import time
import progressbar
import io

# Collect every matching data file for base_prefix from the landing bucket as
# [bucket_name, object_key, last_modified] entries, then display how many were found.
files_to_process_all = get_files_to_proccess(base_prefix)
len(files_to_process_all)

2038

In [8]:
# Show the first five entries to sanity-check the listing.
files_to_process_all[:5]

[['f14-datalake-landing-prod',
  'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/p_creation_dt=2022-03-03/p_brand_id=BARVIVO/p_region_id=EU/842052018985.tsv',
  datetime.datetime(2022, 3, 3, 16, 48, 4, tzinfo=tzutc())],
 ['f14-datalake-landing-prod',
  'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/p_creation_dt=2022-03-03/p_brand_id=BARVIVO/p_region_id=EU/842151018985.tsv',
  datetime.datetime(2022, 3, 3, 16, 46, 5, tzinfo=tzutc())],
 ['f14-datalake-landing-prod',
  'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/p_creation_dt=2022-03-03/p_brand_id=BARVIVO/p_region_id=EU/842238018985.tsv',
  datetime.datetime(2022, 3, 3, 16, 44, 5, tzinfo=tzutc())],
 ['f14-datalake-landing-prod',
  'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/p_creation_dt=2022-03-03/p_brand_id=BARVIVO/p_region_id=EU/842310018985.tsv',
  datetime.datetime(2022, 3, 3, 16, 42, 4, tzinfo=tzutc())],
 ['f14-datalake-landing-prod',
  'amazon_sp_api/GET_FBA_MYI_UNSUPPRESSED_INVENTORY_DATA/p_cr

In [6]:
import time
import progressbar
import io
from datetime import timedelta

# Take the current date once so both strings derive from the same instant; two
# separate now() calls could straddle midnight and silently skip a day.
# NOTE(review): this is the machine's local date -- confirm it matches the
# timezone used for the dates embedded in the object keys.
today = datetime.now().date()
previous_day_str = (today - timedelta(days=1)).strftime('%Y-%m-%d')
current_day_str = today.strftime('%Y-%m-%d')

# Keep only the entries whose object key (index 1) mentions yesterday's or
# today's date, then display how many remain.
files_to_process = [
    file_entry
    for file_entry in files_to_process_all
    if previous_day_str in file_entry[1] or current_day_str in file_entry[1]
]
len(files_to_process)

0

In [7]:
# Bypass the date filter and process every listed file instead. Running this
# cell replaces whatever selection files_to_process held before.
files_to_process = files_to_process_all

In [9]:
import json
from time import sleep

# Pause between invocations, in seconds, to avoid flooding the function.
invoke_pause_seconds = 1.5

# Object keys whose invocation request was not accepted by Lambda.
failed_invocations = []

with progressbar.ProgressBar(max_value=len(files_to_process)) as bar:
    for i, file_to_process in enumerate(files_to_process, start=1):
        call_payload = get_lambda_payload(file_to_process[0], file_to_process[1], file_to_process[2])

        # `invoke_async` is deprecated; `invoke` with InvocationType='Event'
        # is the supported way to queue an asynchronous invocation.
        response = lambda_client.invoke(
            FunctionName='landingFileProcessor',
            InvocationType='Event',
            Payload=json.dumps(call_payload).encode('utf-8'),
        )
        # An accepted asynchronous request is answered with HTTP 202; remember
        # anything else instead of silently dropping it.
        if response.get('StatusCode') != 202:
            failed_invocations.append(file_to_process[1])

        sleep(invoke_pause_seconds)
        bar.update(i)

if failed_invocations:
    print(f"{len(failed_invocations)} invocation(s) were not accepted: {failed_invocations}")
print("Done!")

100% (2038 of 2038) |####################| Elapsed Time: 0:55:27 Time:  0:55:27


Done!
