## Functions

In [17]:
# List every object key stored under an S3 prefix
def s3_items_from_table(bucket, prefix, client=None):
    """Return the keys of all objects in ``bucket`` whose key starts with ``prefix``.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix used to restrict the listing.
        client: Optional S3 client exposing ``get_paginator``. When omitted,
            the notebook-level ``s3_client`` is used, as before.

    Returns:
        A list of object keys in the order the paginator yields them; an
        empty list when nothing matches the prefix.
    """
    if client is None:
        # Fall back to the client created in the notebook's configuration cell.
        client = s3_client

    paginator = client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)

    # A page carries no 'Contents' entry when no object matched, so default
    # to an empty list instead of raising KeyError.
    return [item['Key'] for page in pages for item in page.get('Contents', [])]

In [2]:
def get_selected_files(items_to_look_for, files):
    """Pick the entries of ``files`` that contain both parts of a wanted item.

    Args:
        items_to_look_for: Sequence of indexable items; elements ``[0]`` and
            ``[1]`` of each item are the two substrings to search for.
        files: Iterable of strings (object keys) to search in.

    Returns:
        A list with one entry per (item, file) match, grouped by item in the
        order of ``items_to_look_for`` and then in the order of ``files``.
        A file matched by several items appears once per matching item.
    """
    return [
        candidate
        for wanted in items_to_look_for
        for candidate in files
        if wanted[0] in candidate and wanted[1] in candidate
    ]

In [3]:
def input_key_cleaner(input_list):
    """Reduce each key to a ``(date, file stem)`` tuple.

    Every key is split on ``/``. The third segment has the literal
    ``p_creation_dt=`` removed, and the fourth segment is cut at its first
    ``.``.

    Args:
        input_list: Iterable of ``/``-separated key strings with at least
            four segments.

    Returns:
        A list of ``(dt, filename)`` tuples, one per input key, in input order.

    Raises:
        IndexError: If a key has fewer than four ``/``-separated segments.
    """

    def _date_and_stem(key):
        # Segment 2 holds the partition date, segment 3 the file name.
        segments = key.split("/")
        creation_dt = segments[2].replace("p_creation_dt=", "")
        stem = segments[3].split(".")[0]
        return (creation_dt, stem)

    return [_date_and_stem(key) for key in input_list]

In [4]:
def fix_item(body):
    """Convert a double-quoted, comma-separated text body into tab-separated text.

    Steps, in order:
      1. Everything before the first ``"`` is dropped. A body without any
         ``"`` is kept whole (previously the -1 returned by ``find`` was used
         as a slice bound, which discarded all but the last character).
      2. If a ``,`` occurs within the last three characters, the last ``,``
         of the text is removed.
      3. Every ``","`` sequence becomes a tab character.
      4. All remaining ``"`` characters are removed.

    Args:
        body: The text to repair.

    Returns:
        The repaired, tab-separated text.
    """
    first_quote = body.find('"')
    # find() returns -1 when there is no quote; only trim on a real match.
    trimmed = body[first_quote:] if first_quote != -1 else body

    if "," in trimmed[-3:]:
        # The comma found by rfind is necessarily the one in the 3-char tail.
        last_comma = trimmed.rfind(",")
        trimmed = trimmed[:last_comma] + trimmed[last_comma + 1:]

    return trimmed.replace('","', "\t").replace('"', "")

In [5]:
def read_from_s3(
    s3_client, bucket_name: str, file_prefix: str, encoding: str = "utf-8"
) -> str:
    """Download one S3 object and return its content as text.

    Args:
        s3_client: Client object exposing ``get_object(Bucket=..., Key=...)``.
        bucket_name: Bucket that holds the object.
        file_prefix: Full key of the object (passed as ``Key``).
        encoding: Codec used to decode the downloaded bytes.

    Returns:
        The decoded object body.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    raw_bytes = response["Body"].read()
    return raw_bytes.decode(encoding)

In [8]:
def df_from_str(
    body: str, quotechar: str = '"', sep: str = ",", delim_whitespace: bool = False
) -> pd.DataFrame:
    """Parse delimited text held in a string into a DataFrame.

    Args:
        body: The delimited text to parse.
        quotechar: Character used to quote fields.
        sep: Field separator handed to ``pd.read_csv``.
        delim_whitespace: When True, fields are split on runs of whitespace.
            ``sep`` must then be left at its default.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If ``delim_whitespace`` is True and a non-default ``sep``
            is given as well.
    """
    if delim_whitespace:
        if sep != ",":
            raise ValueError(
                "Specified a delimiter with both sep and delim_whitespace; "
                "you can only specify one."
            )
        # pandas rejects an explicit sep combined with delim_whitespace and has
        # deprecated that keyword; the whitespace regex is the documented
        # replacement, so the flag is translated rather than forwarded.
        sep = r"\s+"

    return pd.read_csv(io.StringIO(body), quotechar=quotechar, sep=sep)

In [9]:
def df_to_s3_as_tsv(s3, df, bucket, prefix):
    """Serialise ``df`` as tab-separated text and upload it to S3.

    Args:
        s3: Resource-style object exposing ``Object(bucket, key)`` whose
            result has a ``put(Body=...)`` method.
        df: DataFrame to write; the index is not included in the output.
        bucket: Destination bucket name.
        prefix: Full destination key of the object.

    Returns:
        None.
    """
    # Without a target buffer, to_csv hands back the rendered text directly.
    tsv_text = df.to_csv(sep="\t", index=False)
    target = s3.Object(bucket, prefix)
    target.put(Body=tsv_text)

## Main Execution

In [37]:
import pandas as pd
import progressbar
import io
import boto3

In [40]:
# Target environment; it selects both the AWS profile and the bucket names below.
env = 'prod'
# The low-level client and the resource API are both created from one profile-based session.
aws_session = boto3.session.Session(profile_name=f'data-maintainer-{env}')
s3_client = aws_session.client('s3')
s3_resource = aws_session.resource('s3')

# Source and destination resolve to the same bucket, and the processing loop
# writes each object back under its original key, so files are overwritten in place.
bucket = f'f14-datalake-landing-{env}'
destination_bucket = f'f14-datalake-landing-{env}'
# Key prefix that is listed to find candidate files.
prefix = 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/'

In [41]:
# Keys of the files reported as defective. Only the p_creation_dt date and the
# file stem are extracted from them (see input_key_cleaner) and matched as
# substrings, so the lower-case path and .jsonl extension need not match the
# keys actually stored in the bucket.
defective_raw_items = ["amazon_sp_api/get_restock_inventory_recommendations_report/p_creation_dt=2022-01-19/231540019011.jsonl",
                       "amazon_sp_api/get_restock_inventory_recommendations_report/p_creation_dt=2022-01-11/747329019003.jsonl",
]
print(len(defective_raw_items))

2


In [42]:
# Reduce each reported key to a (creation date, file stem) pair used for matching.
keys_data_cleaned = input_key_cleaner(defective_raw_items)

In [43]:
# Sanity check: show the extracted pairs and confirm one pair per reported key.
print(keys_data_cleaned)
print(len(keys_data_cleaned))

[('2022-01-19', '231540019011'), ('2022-01-11', '747329019003')]
2


In [44]:
# List the keys of all S3 objects under the configured prefix.
files = s3_items_from_table(bucket=bucket, prefix=prefix)
print(f'{len(files)} items in total')

2381 items in total


In [45]:
# Peek at the first few listed keys to see the actual key layout.
files[:5]

['amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2021-12-10/p_brand_id=BARVIVO/p_region_id=EU/819466018961.tsv',
 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2021-12-10/p_brand_id=BARVIVO/p_region_id=EU/819549018962.tsv',
 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2021-12-10/p_brand_id=BARVIVO/p_region_id=EU/819664018962.tsv',
 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2021-12-10/p_brand_id=BARVIVO/p_region_id=EU/819760018962.tsv',
 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2021-12-10/p_brand_id=BARVIVO/p_region_id=EU/819849018962.tsv']

In [46]:
# Keep only the listed keys that contain both the date and the file stem of a reported item.
files_to_process = get_selected_files(keys_data_cleaned, files)
print(len(files_to_process))

2


In [47]:
# Display the matched keys before they are rewritten.
files_to_process

['amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2022-01-19/p_brand_id=WWO/p_region_id=EU/231540019011.tsv',
 'amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2022-01-11/p_brand_id=TOOLZILLA/p_region_id=EU/747329019003.tsv']

In [48]:
# Repair every selected file and write it to destination_bucket under the same key.
processed_file_names = []
i = 0  # progress counter; stays 0 when there is nothing to process

with progressbar.ProgressBar(max_value=len(files_to_process)) as bar:
    for i, file in enumerate(files_to_process, start=1):
        # download -> repair text -> parse as TSV -> upload as TSV
        body = fix_item(read_from_s3(s3_client, bucket, file))
        df = df_from_str(body, sep="\t")
        df_to_s3_as_tsv(s3_resource, df, destination_bucket, file)

        processed_file_names.append(f"s3://{destination_bucket}/{file}")
        bar.update(i)

# Summary of everything that was rewritten.
print(f"Done!!!, processed files:{len(processed_file_names)}")
for processed_name in processed_file_names:
    print(processed_name)

100% (2 of 2) |##########################| Elapsed Time: 0:00:01 Time:  0:00:01


Done!!!, processed files:2
s3://f14-datalake-landing-prod/amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2022-01-19/p_brand_id=WWO/p_region_id=EU/231540019011.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_RESTOCK_INVENTORY_RECOMMENDATIONS_REPORT/p_creation_dt=2022-01-11/p_brand_id=TOOLZILLA/p_region_id=EU/747329019003.tsv
