In [9]:
import boto3
import pandas as pd

# Bucket names and the key prefix this notebook operates on.
input_bucket = "f14-datalake-landing-prod"
output_bucket = "f14-datalake-raw-prod"
base_prefix = "amazon_sp_api/"

# A single boto3 session (named local profile) backs both the low-level
# client and the resource API used further down.
aws_session = boto3.session.Session(profile_name="data-maintainer-prod")
s3 = aws_session.resource("s3")
s3_client = aws_session.client("s3")

# Resource handle for the landing bucket; used to list its objects.
landing_bucket = s3.Bucket(input_bucket)

### General Functions

In [10]:
import json
from datetime import datetime
import tempfile
import progressbar
from datetime import datetime
import pathlib
import os


def read_from_s3(s3_client, bucket_name, file_prefix):
    """Fetch one S3 object and return its content as text.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)``
            (a boto3 S3 client in this notebook).
        bucket_name: Name of the bucket that holds the object.
        file_prefix: Full key of the object to read.

    Returns:
        The object body decoded as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    raw_bytes = response["Body"].read()
    return raw_bytes.decode("utf-8")


def upload_to_s3(s3_client, input_file_path, output_bucket, output_prefix):
    """Send a local file to S3 as a single object.

    The file is read fully into memory and written with one ``put_object``
    call.

    Args:
        s3_client: Object exposing ``put_object(Body=..., Bucket=..., Key=...)``
            (a boto3 S3 client in this notebook).
        input_file_path: Path of the local file to upload.
        output_bucket: Name of the destination bucket.
        output_prefix: Full destination key.
    """
    payload = pathlib.Path(input_file_path).read_bytes()
    s3_client.put_object(Bucket=output_bucket, Key=output_prefix, Body=payload)


def replace_with_raw_extension(input_file_name):
    """Return ``input_file_name`` with its extension replaced by ``.jsonl``.

    Only the last path segment (after the final ``/``) is inspected, so dots
    in directory names are left untouched. Everything from the first dot of
    that segment onwards is dropped (``a.tar.gz`` becomes ``a.jsonl``). A name
    without any dot simply gets ``.jsonl`` appended.

    Args:
        input_file_name: File name or ``/``-separated key.

    Returns:
        The same path with a ``.jsonl`` extension.
    """
    directory, slash, base_name = input_file_name.rpartition("/")
    dot_index = base_name.find(".")
    # find() returns -1 when there is no dot; slicing with -1 would silently
    # chop the last character of the name, so handle that case explicitly.
    stem = base_name if dot_index == -1 else base_name[:dot_index]
    return f"{directory}{slash}{stem}.jsonl"


def get_files_to_proccess(valid_prefix):
    """List landing-bucket objects under ``valid_prefix`` that can be converted.

    The prefix is applied server-side through the bucket's ``objects.filter``
    collection, so only keys that start with ``valid_prefix`` are listed
    instead of walking the whole bucket. A key is kept when it ends with
    ``.csv``, ``.tsv`` or ``.jsonl``.

    Args:
        valid_prefix: Key prefix to list, e.g. ``"amazon_sp_api/"``.

    Returns:
        A list of ``[bucket_name, key, last_modified]`` entries, one per
        matching object, in listing order.
    """
    accepted_extensions = (".csv", ".tsv", ".jsonl")
    return [
        [summary.bucket_name, summary.key, summary.last_modified]
        for summary in landing_bucket.objects.filter(Prefix=valid_prefix)
        # endswith() avoids matching keys that merely contain the extension
        # somewhere in the middle (e.g. "report.csv.bak").
        if summary.key.endswith(accepted_extensions)
    ]

## Scraper custom functions

### Parse input prefix to get output prefix
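Example landing object: s3://f14-datalake-landing-prod/amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768408018927.tsv — the function takes only the object key, i.e. the part after the bucket name, starting at `amazon_sp_api/`.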

In [11]:
def get_destination_prefix(file_prefix, last_modified_dt=None):
    """Derive the raw-zone key, brand and region from a landing key.

    The key is split on ``/`` and read positionally: segment 1 is the entity
    name, segments 2-4 are the ``p_creation_dt=``, ``p_brand_id=`` and
    ``p_region_id=`` partitions, and the last segment is the file name.

    Args:
        file_prefix: Landing object key (without bucket), e.g.
            ``amazon_sp_api/<ENTITY>/p_creation_dt=.../p_brand_id=.../p_region_id=.../<file>``.
        last_modified_dt: Accepted for call compatibility; not used. The
            default is ``None`` rather than ``datetime.now()``, which would be
            evaluated only once, when the function is defined.

    Returns:
        A tuple ``(destination_key, brand, region)``. ``destination_key`` is
        ``amazon_sp_api/<entity lower-cased>/p_creation_dt=<date>/<file>`` with
        the extension swapped by ``replace_with_raw_extension``. Brand and
        region are not part of the destination key.

    Raises:
        IndexError: If the key has fewer than five ``/``-separated segments.
    """
    key_parts = file_prefix.split("/")
    entity_name = key_parts[1].lower()
    creation_dt = key_parts[2].replace("p_creation_dt=", "")
    brand = key_parts[3].replace("p_brand_id=", "")
    region = key_parts[4].replace("p_region_id=", "")
    file_name = key_parts[-1]

    destination_prefix = (
        f"amazon_sp_api/{entity_name}/p_creation_dt={creation_dt}/{file_name}"
    )
    return replace_with_raw_extension(destination_prefix), brand, region


def fix_cols_names(df):
    """Return a copy of ``df`` with normalized column names.

    Each name is lower-cased, ``-`` and spaces become ``_``, a run of three
    underscores (as produced by ``" - "``) collapses to one, and parentheses
    are removed. Column order is preserved and the input frame is left
    untouched (no extra columns are added to it).

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame with the same data under the normalized names.
    """

    def _normalize(col):
        # The replacement order matters: "___" can only appear after the
        # "-" and " " substitutions have run.
        return (
            col.lower()
            .replace("-", "_")
            .replace(" ", "_")
            .replace("___", "_")
            .replace("(", "")
            .replace(")", "")
        )

    return df.rename(columns=_normalize)

### Convert file content to JSON Lines and save it under `temp/`

In [12]:
def convert_and_upload_parquet(file_content, brand_id, region_id, output_prefix):
    """Convert tab-separated text into a JSON Lines file under ``temp/``.

    Despite its name, this function neither writes Parquet nor uploads
    anything: it parses ``file_content``, adds audit columns, normalizes the
    column names and writes one JSON record per line to
    ``temp/<output_prefix>``.

    Args:
        file_content: Text of the source file, tab-separated with a header row.
        brand_id: Brand identifier; stored upper-cased in ``brand_id``.
        region_id: Region identifier; stored upper-cased in ``region_id``.
        output_prefix: Destination key; also used as the path below ``temp/``.

    Returns:
        The path of the written file, or ``None`` when the content has no
        data rows (including a completely empty file).
    """
    # NOTE(review): the separator is always a tab, while the listing step also
    # accepts .csv and .jsonl keys -- confirm those are really tab-separated.
    try:
        df = pd.read_csv(
            io.StringIO(file_content),
            quotechar='"',
            sep="\t",
            header=0,
        )
    except pd.errors.EmptyDataError:
        # A zero-byte or whitespace-only object has no header to parse;
        # treat it like a header-only file instead of aborting the run.
        return None
    if df.empty:
        return None

    df["brand_id"] = brand_id.upper()
    df["region_id"] = region_id.upper()
    df["aud_process_ts"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")

    df = fix_cols_names(df)

    temp_file_path = f"temp/{output_prefix}"
    # exist_ok=True already covers the "directory is there" case.
    pathlib.Path(temp_file_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_json(temp_file_path, orient="records", lines=True)
    return temp_file_path

### Process File

In [13]:
def process_file(s3_client, file_to_process):
    """Convert one landing object and upload the result to the raw bucket.

    Steps: download the object as text, derive the destination key plus brand
    and region from the source key, write the converted JSON Lines file
    locally, upload it to ``output_bucket`` and delete the local copy.
    Nothing is uploaded when the conversion yields no file (no data rows).

    Args:
        s3_client: boto3 S3 client used for both download and upload.
        file_to_process: Sequence whose first two items are the source bucket
            name and the source key (entries produced by
            ``get_files_to_proccess`` also carry the last-modified timestamp,
            which is not needed here).
    """
    source_bucket = file_to_process[0]
    source_key = file_to_process[1]

    file_content = read_from_s3(s3_client, source_bucket, source_key)
    output_prefix, brand_id, region_id = get_destination_prefix(source_key)
    temp_file_path = convert_and_upload_parquet(
        file_content, brand_id, region_id, output_prefix
    )
    if not temp_file_path:
        return

    upload_to_s3(s3_client, temp_file_path, output_bucket, output_prefix)
    # Remove the local copy only after a successful upload so that temp/ does
    # not grow without bound; a failed upload leaves the file for inspection.
    os.remove(temp_file_path)

## Main Execution

In [14]:
import time
import progressbar
import io

# One listing of every convertible object under the base prefix; later cells
# narrow this list down instead of querying S3 again.
files_to_process_all = get_files_to_proccess(valid_prefix=base_prefix)

In [15]:
from datetime import timedelta

# Read the clock once so that "today" and "yesterday" are always consecutive
# days, even when this cell happens to run across midnight.
run_ts = datetime.now()
current_day_str = run_ts.strftime("%Y-%m-%d")
previous_day_str = (run_ts - timedelta(days=1)).strftime("%Y-%m-%d")

# Keep the entries whose object key (item 1) mentions either date.
files_to_process = [
    file_entry
    for file_entry in files_to_process_all
    if previous_day_str in file_entry[1] or current_day_str in file_entry[1]
]
len(files_to_process)

328

In [None]:
# NOTE(review): max_value is one more than the number of files, so inside the
# loop the bar tops out at "N of N+1" -- confirm this is intentional.
i = 0  # stays 0 when there is nothing to process
with progressbar.ProgressBar(max_value=len(files_to_process) + 1) as bar:
    for i, file_to_process in enumerate(files_to_process, start=1):
        process_file(s3_client, file_to_process)
        bar.update(i)

print("Done!")

 13% (43 of 329) |###                    | Elapsed Time: 0:01:54 ETA:   2:25:45

## Functions Tests

In [None]:
# Smoke check with a realistic landing key; the cell displays the tuple
# (destination key, brand, region) returned by the parser.
test_input_prefix = (
    "amazon_sp_api/GET_FBA_FULFILLMENT_CURRENT_INVENTORY_DATA/"
    "p_creation_dt=2021-10-27/p_brand_id=BARVIVO/p_region_id=EU/768408018927.tsv"
)
get_destination_prefix(test_input_prefix)

In [None]:
# Use a dedicated name here: rebinding `files_to_process` would replace the
# date-filtered list with the full listing, and re-running the processing
# loop afterwards would then convert every object under the prefix.
listing_preview = get_files_to_proccess(base_prefix)
listing_preview[0:2]