In [72]:
import io

import boto3
import pandas as pd

# Every AWS handle is created from the same named-profile session so that
# listing, reading and writing all use the same credentials.
aws_session = boto3.session.Session(profile_name='data-dev')
s3_client = aws_session.client('s3')
s3 = aws_session.resource('s3')

# Source (landing) and destination (raw) buckets of the conversion.
input_bucket = 'f14-datalake-landing-dev'
output_bucket = 'f14-datalake-raw-dev'
landing_bucket = s3.Bucket(input_bucket)

# Only object keys containing this path are selected by the main execution cell.
base_prefix = 'scrapers/seller_central/detail_page_sales_and_traffic_by_child_item_by_asin'

### General Functions

In [73]:
import json 
from datetime import datetime
import tempfile
import progressbar
from datetime import datetime
import pathlib
import os

def read_from_s3(s3_client, bucket_name, file_prefix):
    """Fetch one S3 object and return its content as text.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``.
        bucket_name: Name of the bucket that holds the object.
        file_prefix: Full key of the object to read.

    Returns:
        The object's body decoded as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    payload = response['Body'].read()
    return payload.decode('utf-8')


def upload_to_s3(s3_client, input_file_path, output_bucket, output_prefix):
    """Upload the bytes of a local file to S3 as a single object.

    Args:
        s3_client: Client exposing ``put_object(Body=..., Bucket=..., Key=...)``.
        input_file_path: Path of the local file to send.
        output_bucket: Destination bucket name.
        output_prefix: Destination object key.
    """
    # The whole file is read into memory before being sent.
    with open(input_file_path, "rb") as local_file:
        payload = local_file.read()

    s3_client.put_object(Bucket=output_bucket, Key=output_prefix, Body=payload)
        
def replace_with_raw_extension(input_file_name):
    """Return ``input_file_name`` with its extension replaced by ``.jsonl``.

    Only the last path component is inspected, so dots inside directory names
    are preserved. Everything from the first dot of the file name onwards is
    dropped; a file name without any dot simply gets ``.jsonl`` appended.

    Args:
        input_file_name: File name or ``/``-separated key.

    Returns:
        The same path with a ``.jsonl`` extension.
    """
    # rpartition yields ('', '', name) when there is no directory part.
    directory, separator, base_name = input_file_name.rpartition('/')
    stem = base_name.split('.', 1)[0]
    return f'{directory}{separator}{stem}.jsonl'


def get_files_to_proccess(valid_prefix):
    """List the landing-bucket objects that should be converted.

    Iterates over every object of the notebook-level ``landing_bucket`` and keeps
    those whose key contains ``valid_prefix`` and one of ``.csv``, ``.tsv`` or
    ``.jsonl`` (substring matches, anywhere in the key).

    Args:
        valid_prefix: Text that must appear in the object key.

    Returns:
        A list of ``[bucket_name, object_key, last_modified]`` entries.
    """
    accepted_extensions = ('.csv', '.tsv', '.jsonl')
    return [
        [summary.bucket_name, summary.key, summary.last_modified]
        for summary in landing_bucket.objects.all()
        if valid_prefix in summary.key
        and any(extension in summary.key for extension in accepted_extensions)
    ]

## Scraper custom functions

### Parse input prefix to get output prefix
Input prefix example: scrapers/seller_central/detail_page_sales_and_traffic_by_asin/p_creation_dt=2019-11-16/p_brand_id=BARVIVO/p_region_id=NA/p_country_id=US/barvivo-na-us-detailsalestrafficbysku-2019-11-16.csv

In [74]:
def get_destination_prefix(file_prefix, last_modified_dt=None):
    """Derive the raw-zone key and partition values from a landing key.

    The landing key is read positionally as
    ``integration/platform/entity/<part 3>/p_brand_id=.../p_region_id=.../p_country_id=.../file``.

    Args:
        file_prefix: Landing object key with at least seven ``/``-separated parts.
        last_modified_dt: Datetime used for the ``p_creation_dt`` partition of the
            destination key. Defaults to the current time at call time.

    Returns:
        A tuple ``(destination_prefix, brand, country, region)`` where
        ``destination_prefix`` carries a ``.jsonl`` extension.

    Raises:
        IndexError: If the key has fewer parts than expected.
    """
    # A `datetime.now()` default in the signature is evaluated only once, when
    # the defining cell runs, so the default is resolved here instead.
    if last_modified_dt is None:
        last_modified_dt = datetime.now()

    key_parts = file_prefix.split('/')
    integration = key_parts[0]
    platform = key_parts[1]
    entity_name = key_parts[2]
    brand = key_parts[4].replace('p_brand_id=', '')
    region = key_parts[5].replace('p_region_id=', '')
    country = key_parts[6].replace('p_country_id=', '')
    file_name = key_parts[-1]

    creation_dt = last_modified_dt.strftime("%Y-%m-%d")
    destination_prefix = f'{integration}/{platform}/{entity_name}/p_creation_dt={creation_dt}/{file_name}'
    return replace_with_raw_extension(destination_prefix), brand, country, region

In [25]:
# Sample landing key used to exercise get_destination_prefix.
test_file_prefix = (
    'scrapers/seller_central/detail_page_sales_and_traffic_by_asin/p_creation_dt=2019-11-16/p_brand_id=BARVIVO/'
    'p_region_id=NA/p_country_id=US/barvivo-na-us-detailsalestrafficbysku-2019-11-16.csv'
)

# Last expression of the cell: displays (destination key, brand, country, region).
get_destination_prefix(test_file_prefix)

('scrapers/seller_central/detail_page_sales_and_traffic_by_asin/p_creation_dt=2021-11-22/barvivo-na-us-detailsalestrafficbysku-2019-11-16.jsonl',
 'BARVIVO',
 'US',
 'NA')

In [101]:
def fix_cols_names(df):
    """Return a copy of ``df`` with normalised column names.

    Each name is lower-cased, ``-`` and spaces become ``_``, a run of three
    underscores (what ``" - "`` turns into) collapses to a single one, and
    parentheses are removed. Column order is preserved and the input frame is
    left untouched.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame with the same data and the renamed columns.
    """
    def _normalise(name):
        return (
            name.lower()
            .replace('-', '_')
            .replace(' ', '_')
            .replace('___', '_')
            .replace('(', '')
            .replace(')', '')
        )

    # Relabel a copy positionally instead of adding one column per rename to the
    # caller's frame, which mutated the input and could overwrite existing columns.
    renamed = df.copy()
    renamed.columns = [_normalise(col) for col in df.columns]
    return renamed

### Convert file content to JSON Lines and save it under the local `temp/` folder

In [102]:
def convert_and_upload_parquet(file_content, brand_id, country_id, region_id, output_prefix):
    """Convert CSV text into a JSON Lines file under ``temp/`` and return its path.

    Despite its name, this function neither writes Parquet nor uploads anything:
    it parses ``file_content`` as comma-separated text, adds the partition and
    audit columns, normalises the column names and writes one JSON record per line.

    Args:
        file_content: CSV text to convert.
        brand_id: Brand identifier; stored upper-cased in ``brand_id``.
        country_id: Country identifier; stored upper-cased in ``country_id``.
        region_id: Region identifier; stored upper-cased in ``region_id``.
        output_prefix: Destination key; the file is written to ``temp/<output_prefix>``.

    Returns:
        The path of the written file, or ``None`` when there are no rows to write.
    """
    # Blank content would make pandas raise EmptyDataError; treat it as "no rows".
    if not file_content.strip():
        return None

    # Parse the content handed in by the caller rather than a notebook-level
    # variable. The deprecated `delim_whitespace` argument is dropped; it was
    # set to its default value anyway.
    df = pd.read_csv(io.StringIO(file_content), quotechar='"')
    if df.empty:
        return None

    df['brand_id'] = brand_id.upper()
    df['country_id'] = country_id.upper()
    df['region_id'] = region_id.upper()
    df['aud_process_ts'] = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

    df = fix_cols_names(df)

    temp_file_path = f"temp/{output_prefix}"
    # mkdir(exist_ok=True) already tolerates an existing folder.
    pathlib.Path(temp_file_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_json(temp_file_path, orient="records", lines=True)
    return temp_file_path

In [None]:
# Sample landing key used to exercise the read -> convert steps end to end.
test_file_prefix = (
    'scrapers/seller_central/detail_page_sales_and_traffic_by_asin/p_creation_dt=2019-11-16/p_brand_id=BARVIVO/'
    'p_region_id=NA/p_country_id=US/barvivo-na-us-detailsalestrafficbysku-2019-11-16.csv'
)

# Download the sample object and derive its destination key and partition values.
test_file_content = read_from_s3(s3_client, input_bucket, test_file_prefix)
test_output_prefix, brand_id, country_id, region_id = get_destination_prefix(test_file_prefix)

# NOTE: `test_df` receives whatever convert_and_upload_parquet returns, which is
# the path of the written file (or None), not a DataFrame.
test_df = convert_and_upload_parquet(
    test_file_content,
    brand_id,
    country_id,
    region_id,
    test_output_prefix,
)

### Process File

In [103]:
def process_file(s3_client, file_to_process):
    """Convert one landing object and upload the result to the raw bucket.

    Args:
        s3_client: S3 client used for both the download and the upload.
        file_to_process: ``[bucket_name, object_key, last_modified]`` entry, as
            built by ``get_files_to_proccess``.

    The converted file is uploaded to the notebook-level ``output_bucket``.
    Objects that yield no rows are skipped.
    """
    source_bucket = file_to_process[0]
    source_key = file_to_process[1]
    last_modified_dt = file_to_process[2]

    file_content = read_from_s3(s3_client, source_bucket, source_key)

    # Pass the object's own timestamp on, so the destination partition follows
    # the file rather than the time the notebook happens to run.
    output_prefix, brand_id, country_id, region_id = get_destination_prefix(source_key, last_modified_dt)

    temp_file_path = convert_and_upload_parquet(file_content, brand_id, country_id, region_id, output_prefix)
    if temp_file_path is None:
        # Nothing was written (the source had no rows), so there is nothing to upload.
        return

    upload_to_s3(s3_client, temp_file_path, output_bucket, output_prefix)

In [None]:
# Hand-built [bucket_name, object_key, last_modified] entry for a single sample file.
test_file_to_process = [
    'f14-datalake-landing-dev',
    'scrapers/seller_central/detail_page_sales_and_traffic_by_asin/p_creation_dt=2019-11-16/p_brand_id=BARVIVO/p_region_id=NA/p_country_id=US/barvivo-na-us-detailsalestrafficbysku-2019-11-16.csv',
    datetime(2021, 11, 22, 16, 57, 26),
]

# Runs the full read -> convert -> upload path for that one entry.
process_file(s3_client, test_file_to_process)


## Main Execution

In [105]:
import time
import progressbar

# Collect every landing object matching the configured prefix, then convert them one by one.
files_to_process = get_files_to_proccess(base_prefix)

i = 0
# NOTE(review): max_value is one more than the number of files, so the bar only
# reaches 100% when the context manager closes — confirm the +1 is intentional.
with progressbar.ProgressBar(max_value=len(files_to_process)+1) as bar:
    for file_to_process in files_to_process:
        try:
            process_file(s3_client, file_to_process)
        except Exception as error:
            # Best effort: one failing file must not stop the batch. Catching
            # Exception (not a bare except) keeps Ctrl-C working, and the cause
            # is reported alongside the file that failed.
            print(f"An exception occurred trying to process file {file_to_process}: {error!r}")

        i = i + 1
        bar.update(i)

print("Done!")

100% (1041 of 1041) |####################| Elapsed Time: 0:04:44 Time:  0:04:44


Done!
