## Functions

In [2]:
def s3_items_from_table(bucket, prefix, s3_client=None):
    """Return the keys of every S3 object stored under ``prefix`` in ``bucket``.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix that restricts the listing.
        s3_client: Optional client exposing ``get_paginator``. When omitted,
            the notebook-level ``s3`` client is used, as before.

    Returns:
        A list of object keys in the order the paginator yields them; an
        empty list when nothing matches the prefix.
    """
    client = s3 if s3_client is None else s3_client
    paginator = client.get_paginator('list_objects_v2')

    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page without matches carries no 'Contents' entry at all, so a
        # plain page['Contents'] lookup would raise KeyError on empty listings.
        keys.extend(item['Key'] for item in page.get('Contents', []))

    return keys

In [3]:
def get_csv_files(input_lst, ext=".csv"):
    """Select the names from ``input_lst`` that end with ``ext``.

    The match is anchored to the end of the name. A substring test would also
    pick up names that merely contain ``ext`` somewhere in the middle, such as
    a directory called ``export.csv/`` or a file named ``data.csv.bak``.

    Args:
        input_lst: Iterable of file names or object keys (strings).
        ext: Suffix to look for, including the leading dot. Case-sensitive.

    Returns:
        A new list with the matching names, in their original order.
    """
    return [name for name in input_lst if name.endswith(ext)]

In [4]:
def read_from_s3(
    s3_client, bucket_name: str, file_prefix: str, encoding: str = "utf-8"
) -> str:
    """Fetch one S3 object and return its content as text.

    Args:
        s3_client: Client object exposing ``get_object(Bucket=..., Key=...)``.
        bucket_name: Bucket that holds the object.
        file_prefix: Key of the object to download; it is passed to
            ``get_object`` as ``Key``.
        encoding: Codec used to decode the downloaded bytes.

    Returns:
        The whole object body decoded with ``encoding``.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    raw_bytes = response["Body"].read()
    return raw_bytes.decode(encoding)

In [7]:
def df_from_str(
    body: str, quotechar: str = '"', sep: str = ",", delim_whitespace: bool = False
) -> pd.DataFrame:
    """Parse delimited text held in memory into a DataFrame.

    Args:
        body: Full text of the delimited file, header row included.
        quotechar: Character used to quote fields.
        sep: Field separator. Ignored when ``delim_whitespace`` is true.
        delim_whitespace: When true, fields are split on runs of whitespace.

    Returns:
        The parsed DataFrame.
    """
    # pandas rejects a call that supplies both an explicit ``sep`` and
    # ``delim_whitespace=True``, and this function always supplies ``sep``, so
    # forwarding the flag could never work. The keyword is also deprecated in
    # recent pandas releases. ``sep=r"\s+"`` is the documented equivalent.
    if delim_whitespace:
        sep = r"\s+"

    return pd.read_csv(io.StringIO(body), quotechar=quotechar, sep=sep)

In [8]:
def replace_with_tsv_extension(input_file_name: str) -> str:
    """Return ``input_file_name`` with its final extension replaced by ``.tsv``.

    Only the extension of the last path component is replaced. Cutting at the
    first ``.`` of the whole string would truncate keys whose directories
    contain a dot (``v1.2/file.csv``), and a name without any dot would lose
    its last character.

    Args:
        input_file_name: File name or "/"-separated object key.

    Returns:
        The same name ending in ``.tsv``; ``.tsv`` is appended when the last
        component has no extension.
    """
    # S3 keys use "/" on every platform, so use posixpath rather than os.path.
    import posixpath

    stem, _old_extension = posixpath.splitext(input_file_name)
    return stem + ".tsv"

In [9]:
def df_to_s3_as_tsv(s3, df, bucket, prefix):
    """Serialize ``df`` as tab-separated text and upload it to S3.

    Args:
        s3: Object exposing ``Object(bucket, key).put(Body=...)``. Note that
            this parameter shadows the notebook-level ``s3`` name inside the
            function.
        df: DataFrame to write. The index is not written.
        bucket: Destination bucket name.
        prefix: Destination object key.
    """
    tsv_buffer = io.StringIO()
    # Use the "\t" escape rather than a literal tab character, which is
    # invisible in the source and easily turned into spaces by an editor.
    df.to_csv(tsv_buffer, sep="\t", index=False)

    # Encode explicitly so the uploaded bytes do not depend on how the client
    # library converts a str body.
    payload = tsv_buffer.getvalue().encode("utf-8")
    s3.Object(bucket, prefix).put(Body=payload)

## Main Execution

In [6]:
import pandas as pd
import progressbar
import io
import boto3

In [17]:
import os

# Read credentials from the standard AWS environment variables instead of
# pasting them into the notebook, where they would be saved with the file and
# could be committed or shared by accident. A variable that is not set yields
# None.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

In [18]:
# S3 client: used for listing keys and downloading objects.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

# S3 resource built from the same credentials: used for uploading objects.
s3_res = boto3.resource(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)


In [19]:
# Source location: bucket and key prefix that are scanned for input files.
bucket = "f14-datalake-landing-prod"
prefix = "amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/"

# Destination bucket for the converted files. It is the same bucket as the
# source, so each output lands next to the object it was produced from.
destination_bucket = "f14-datalake-landing-prod"

In [13]:
# Collect the key of every object stored under the configured prefix.
files = s3_items_from_table(bucket, prefix)
print(f"{len(files)} items in total")

3747 items in total


In [14]:
# Peek at the first five keys to sanity-check the listing.
files[:5]

['amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768408018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768411018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768415018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768419018927.tsv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768428018927.tsv']

In [15]:
# Narrow the listing down to the ".csv" keys; these are the ones to convert.
files_to_process = get_csv_files(files, ext=".csv")
print(len(files_to_process))

9


In [16]:
# Display the keys selected for conversion.
files_to_process

['amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834834018977.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834876018977.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834997018978.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835024018978.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835152018978.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835168018978.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835302018978.csv',
 'amazon_sp_api/GET_FBA_FULFILLMENT_CURRE

In [20]:
# Convert every selected object: download it, parse it as tab-separated text,
# and upload the result under the same key with a ".tsv" extension.
processed_file_names = []
i = 0  # number of files handled so far; drives the progress bar

with progressbar.ProgressBar(max_value=len(files_to_process)) as bar:
    for i, file in enumerate(files_to_process, start=1):
        body = read_from_s3(s3, bucket, file)
        df = df_from_str(body, sep="\t")

        output_prefix = replace_with_tsv_extension(file)
        df_to_s3_as_tsv(s3_res, df, destination_bucket, output_prefix)

        # Record the full URI of what was written for the summary below.
        processed_file_names.append(f"s3://{destination_bucket}/{output_prefix}")
        bar.update(i)

# Summary: how many files were written, followed by one URI per line.
print(f"Done!!!, processed files:{len(processed_file_names)}")
for x in processed_file_names:
    print(x)


100% (9 of 9) |##########################| Elapsed Time: 0:00:25 Time:  0:00:25


Done!!!, processed files:9
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834834018977.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834876018977.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/834997018978.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835024018978.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835152018978.tsv
s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-12-17/p_brand_id=BARVIVO/p_region_id=EU/835168