## Functions

In [149]:
def input_key_cleaner(input_list):
    """Reduce raw-bucket keys to ``(creation_date, file_stem)`` pairs.

    Every key is split on "/". The third segment has its
    ``p_creation_dt=`` marker stripped, and the fourth segment is cut at
    its first "." to drop the extension.

    Args:
        input_list: iterable of key strings holding at least four
            "/"-separated segments.

    Returns:
        list of ``(date, stem)`` tuples, in the same order as the input.
    """

    def _parse(key):
        segments = key.split("/")
        creation_dt = segments[2].replace("p_creation_dt=", "")
        stem = segments[3].split(".")[0]
        return creation_dt, stem

    return [_parse(key) for key in input_list]

In [159]:
def s3_items_from_table(bucket, prefix):
    """List the key of every object stored under ``prefix`` in ``bucket``.

    Relies on the notebook-level ``s3`` client and walks all pages of its
    ``list_objects_v2`` paginator.

    Args:
        bucket: name of the bucket to list.
        prefix: key prefix used to filter the listing.

    Returns:
        list of object keys in listing order; an empty list when nothing
        matches the prefix.
    """
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    # A page has no 'Contents' entry when it holds no objects, so fall back
    # to an empty list instead of failing with KeyError.
    return [item['Key'] for page in pages for item in page.get('Contents', [])]

In [182]:
def get_selected_files(items_to_look_for, files):
    """Pick the entries of ``files`` that mention both parts of a wanted pair.

    Matching is plain substring containment: a file is selected for a pair
    when the pair's first and second elements both occur somewhere in the
    file string.

    Args:
        items_to_look_for: sequence of two-element items (indexed with
            ``[0]`` and ``[1]``).
        files: sequence of strings to search.

    Returns:
        list of matching file strings, grouped by wanted pair in the order
        given, then in ``files`` order. A file appears once per pair it
        matches.
    """
    return [
        candidate
        for wanted in items_to_look_for
        for candidate in files
        if wanted[0] in candidate and wanted[1] in candidate
    ]


In [82]:
def read_from_s3(
    s3_client, bucket_name: str, file_prefix: str, encoding: str = "utf-8"
) -> str:
    """Download one object and return its content as text.

    Args:
        s3_client: object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``"Body"`` entry.
        bucket_name: bucket holding the object.
        file_prefix: full key of the object to fetch.
        encoding: codec used to decode the downloaded bytes.

    Returns:
        The decoded object body.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    raw_bytes = response["Body"].read()
    return raw_bytes.decode(encoding)

In [166]:
def fix_item(body):
    """Convert a double-quoted, comma-separated body into tab-separated text.

    Anything before the first double quote is discarded, every ``","``
    field boundary becomes a tab, and all remaining double quotes are
    removed.

    Args:
        body: text of the report.

    Returns:
        The converted text. A body without any double quote is returned
        unchanged.
    """
    first_quote = body.find('"')
    # str.find returns -1 when there is no quote; trimming with that index
    # would discard everything except the last character, so only trim when
    # a quote was found past the start.
    if first_quote > 0:
        body = body[first_quote:]

    return body.replace('","', "\t").replace('"', "")

In [120]:
def df_from_str(
    body: str, quotechar: str = '"', sep: str = ",", delim_whitespace: bool = False
) -> pd.DataFrame:
    """Parse delimited text held in a string into a DataFrame.

    Args:
        body: the delimited text, header row first.
        quotechar: character used to quote fields.
        sep: field separator; ignored when ``delim_whitespace`` is true.
        delim_whitespace: when true, fields are split on runs of
            whitespace instead of ``sep``.

    Returns:
        The parsed DataFrame.
    """
    # pandas deprecates read_csv's delim_whitespace in favour of sep=r"\s+"
    # and rejects it when a sep is also supplied, which this function always
    # did. Translate the flag into the equivalent separator instead.
    effective_sep = r"\s+" if delim_whitespace else sep

    return pd.read_csv(
        io.StringIO(body),
        quotechar=quotechar,
        sep=effective_sep,
    )

In [123]:
from datetime import datetime
from typing import Optional


def spapi_df_transform(
    df: pd.DataFrame,
    brand_id: str,
    region_id: str,
    aud_process_ts: Optional[datetime] = None,
) -> pd.DataFrame:
    """Stamp a report frame with brand, region and processing-time columns.

    The frame is modified in place and also returned.

    Args:
        df: frame to annotate.
        brand_id: brand identifier; stored upper-cased in ``brand_id``.
        region_id: region identifier; stored upper-cased in ``region_id``.
        aud_process_ts: timestamp stored in ``aud_process_ts`` formatted as
            ``YYYY-MM-DD HH:MM:SS``. Defaults to the time of the call.

    Returns:
        The same ``df`` object with the three columns set.
    """
    # A `datetime.now()` default in the signature is evaluated once, when the
    # function is defined, so every later call would reuse that stale
    # timestamp. Resolve the default per call instead.
    if aud_process_ts is None:
        aud_process_ts = datetime.now()

    df["brand_id"] = brand_id.upper()
    df["region_id"] = region_id.upper()
    df["aud_process_ts"] = aud_process_ts.strftime("%Y-%m-%d %H:%M:%S")

    return df

In [124]:
def fix_col_titles(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column titles are normalised.

    Each title is lower-cased, "-" and " " become "_", a run of three
    underscores collapses to one, and parentheses are removed. Column order
    and data are preserved.

    Args:
        df: frame with string column titles.

    Returns:
        A new DataFrame with the normalised titles; ``df`` itself is left
        untouched.
    """

    def _normalize(title: str) -> str:
        return (
            title.lower()
            .replace("-", "_")
            .replace(" ", "_")
            .replace("___", "_")
            .replace("(", "")
            .replace(")", "")
        )

    # Rename on a copy: assigning the renamed columns back onto `df` would
    # add a duplicate of every column to the caller's frame.
    renamed = df.copy()
    renamed.columns = [_normalize(col) for col in df.columns]

    return renamed

In [125]:
def replace_with_raw_extension(input_file_name: str) -> str:
    """Swap the extension of a file name or key for ``.jsonl``.

    Only the last "/"-separated segment is inspected. Everything from its
    first "." onward is replaced, so ``x.tar.gz`` becomes ``x.jsonl``. A
    name without any extension simply gets ``.jsonl`` appended.

    Args:
        input_file_name: file name or "/"-separated key.

    Returns:
        The same path with a ``.jsonl`` extension.
    """
    # Searching the whole string with find(".") would return -1 for names
    # without a dot (slicing off the last character) and would cut the path
    # at a dot in a directory segment. Work on the final segment only.
    directory, slash, file_name = input_file_name.rpartition("/")
    stem = file_name.partition(".")[0]
    return f"{directory}{slash}{stem}.jsonl"

In [127]:
def get_destination_prefix_spapi(file_prefix: str):
    """Derive the destination key, brand and region from a landing key.

    The key is split on "/". Segments 0-2 (with segment 1 lower-cased) and
    the last segment form the destination key. Segments 3 and 4 yield the
    brand and region once their ``p_brand_id=`` / ``p_region_id=`` markers
    are removed.

    Args:
        file_prefix: "/"-separated key with at least five segments.

    Returns:
        Tuple ``(destination_prefix, brand, region)``.
    """
    segments = file_prefix.split("/")

    root, report, dt = segments[0], segments[1].lower(), segments[2]
    brand = segments[3].replace("p_brand_id=", "")
    region = segments[4].replace("p_region_id=", "")
    file_name = segments[-1]

    return f"{root}/{report}/{dt}/{file_name}", brand, region

In [168]:
def df_to_s3_as_jsonl(s3, df, bucket, prefix):
    """Upload ``df`` to a bucket as line-delimited JSON, one record per line.

    Args:
        s3: object exposing ``Bucket(name)`` whose result offers
            ``put_object(Key=..., Body=...)``.
        df: DataFrame to serialise.
        bucket: name of the target bucket.
        prefix: full key under which the object is stored.
    """
    # With no target buffer, to_json returns the serialised text directly.
    jsonl_payload = df.to_json(orient="records", lines=True)
    s3.Bucket(bucket).put_object(Key=prefix, Body=jsonl_payload)


## Main Execution

In [190]:
import pandas as pd
import progressbar
import io
import boto3

In [158]:
import os

# Credentials come from the standard AWS environment variables so that no
# secret is ever written into (or committed with) the notebook. A variable
# that is not set yields None.
# NOTE(review): boto3 is documented to fall back to its default credential
# chain when these arguments are None — confirm for the deployed version.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

In [167]:
# S3 client and S3 resource, both created with the same credential set.
_aws_credentials = dict(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

s3 = boto3.client('s3', **_aws_credentials)

s3_res = boto3.resource('s3', **_aws_credentials)


In [196]:
# Bucket that is listed and read from.
bucket = "f14-datalake-landing-dev"
# Bucket that the regenerated JSONL objects are written to.
destination_bucket = "f14-datalake-raw-dev"
# Key prefix of the report inside the landing bucket.
prefix = "amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/"

In [213]:
# Keys of the objects to regenerate, written in the raw-bucket layout
# (<root>/<report>/p_creation_dt=<date>/<file>.jsonl). Only the date and the
# file stem of each key are used further down.
defective_raw_keys = ["amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/357488018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/648067018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/648130018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809464018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809465018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809468018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809574018953.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-24/358541018955.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-24/358542018955.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-24/649570018955.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-24/689919018955.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-24/812229018955.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-25/691423018956.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-01/516697018962.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-01/532139018962.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-03/655708018964.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-03/822433018964.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-03/822442018964.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-04/534306018965.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-07/708338018968.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-10/93094018971.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-12/249889018973.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-16/524066018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-16/543187018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-16/543188018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-17/524657018978.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-17/543187018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-17/543188018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-18/524066018977.jsonl",
                    "amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-12-18/524657018978.jsonl",]
# Sanity check: number of keys listed above.
print(len(defective_raw_keys))

30


In [214]:
# Reduce each raw key to a (creation date, file stem) pair.
keys_data_cleaned = input_key_cleaner(defective_raw_keys)

In [215]:
# Show the parsed pairs and confirm none were lost.
print(keys_data_cleaned)
print(len(keys_data_cleaned))

[('2021-11-22', '357488018953'), ('2021-11-22', '648067018953'), ('2021-11-22', '648130018953'), ('2021-11-22', '809464018953'), ('2021-11-22', '809465018953'), ('2021-11-22', '809468018953'), ('2021-11-22', '809574018953'), ('2021-11-24', '358541018955'), ('2021-11-24', '358542018955'), ('2021-11-24', '649570018955'), ('2021-11-24', '689919018955'), ('2021-11-24', '812229018955'), ('2021-11-25', '691423018956'), ('2021-12-01', '516697018962'), ('2021-12-01', '532139018962'), ('2021-12-03', '655708018964'), ('2021-12-03', '822433018964'), ('2021-12-03', '822442018964'), ('2021-12-04', '534306018965'), ('2021-12-07', '708338018968'), ('2021-12-10', '93094018971'), ('2021-12-12', '249889018973'), ('2021-12-16', '524066018977'), ('2021-12-16', '543187018977'), ('2021-12-16', '543188018977'), ('2021-12-17', '524657018978'), ('2021-12-17', '543187018977'), ('2021-12-17', '543188018977'), ('2021-12-18', '524066018977'), ('2021-12-18', '524657018978')]
30


In [219]:
# List every object key under the report prefix in the landing bucket.
files = s3_items_from_table(bucket=bucket, prefix=prefix)
print(f'{len(files)} items in total')

3623 items in total


In [220]:
# Preview the first five listed keys.
files[:5]

['amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768408018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768411018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768415018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768419018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768428018927.tsv']

In [221]:
# Keep the landing keys that contain both the date and the file stem of a
# defective raw key, then print how many were found.
defective_landing_files = get_selected_files(keys_data_cleaned, files)
print(len(defective_landing_files))

30


In [222]:
# Preview the first five matched landing keys.
defective_landing_files[:5]

['amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-11-22/p_brand_id=WWO/p_region_id=US/357488018953.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-11-22/p_brand_id=BARVIVO/p_region_id=US/648067018953.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-11-22/p_brand_id=BARVIVO/p_region_id=US/648130018953.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-11-22/p_brand_id=BARVIVO/p_region_id=EU/809464018953.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-11-22/p_brand_id=BARVIVO/p_region_id=EU/809465018953.tsv']

In [223]:
# Re-read every matched landing file, convert quoted bodies to TSV, add the
# brand/region/timestamp columns, normalise the titles and upload the result
# as JSONL to the destination bucket.
processed_file_names = []

# max_value is the exact number of files, so the bar reaches 100% on the
# last one.
with progressbar.ProgressBar(max_value=len(defective_landing_files)) as bar:

    for i, def_land_file in enumerate(defective_landing_files, start=1):

        body = read_from_s3(s3, bucket, def_land_file)

        if '"' in body:
            body = fix_item(body)
            df = df_from_str(body, sep="\t")

            output_prefix, brand_id, region_id = get_destination_prefix_spapi(def_land_file)
            output_prefix = replace_with_raw_extension(output_prefix)

            df = spapi_df_transform(df, brand_id, region_id)
            df = fix_col_titles(df)

            df_to_s3_as_jsonl(s3_res, df, destination_bucket, output_prefix)

            processed_file_names.append(f"s3://{destination_bucket}/{output_prefix}")
        else:
            # Name the landing key being skipped: output_prefix is assigned
            # only in the branch above, so here it would be undefined on the
            # first file or left over from a previous file.
            print(f"file {def_land_file} won't be processed")

        # Advance once per file examined, whether converted or skipped.
        bar.update(i)

print(f"Done!!!, processed files:{len(processed_file_names)}")
for x in processed_file_names:
    print(x)


100% (31 of 31) |########################| Elapsed Time: 0:06:54 Time:  0:06:54


Done!!!, processed files:30
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/357488018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/648067018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/648130018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809464018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809465018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809468018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/p_creation_dt=2021-11-22/809574018953.jsonl
s3://f14-datalake-raw-dev/amazon_sp_api/get_fba_fulfillment_current_inventory_data/