In [1]:
import boto3
import pandas as pd

# Every AWS handle is derived from the same named profile, so listing
# (resource) and reading/writing (client) use the same credentials.
aws_session = boto3.session.Session(profile_name='data-dev')
s3_client = aws_session.client('s3')
s3 = aws_session.resource('s3')

# Source (landing) and destination (raw) buckets, plus the key fragment
# that identifies the objects handled by this notebook.
input_bucket = 'f14-datalake-landing-dev'
output_bucket = 'f14-datalake-raw-dev'
base_prefix = 'airbyte/shopify'

# Resource handle used to enumerate the objects of the landing bucket.
landing_bucket = s3.Bucket(input_bucket)

### General Functions

In [2]:
import json 
from datetime import datetime
import tempfile
import progressbar
from datetime import datetime
import pathlib
import os

def read_from_s3(s3_client, bucket_name, file_prefix):
    """Download one S3 object and return its content as text.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``.
        bucket_name: Name of the bucket that holds the object.
        file_prefix: Full key of the object to download.

    Returns:
        The complete object body decoded as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=file_prefix)
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')


def upload_to_s3(s3_client, input_file_path, output_bucket, output_prefix):
    """Upload the bytes of a local file as a single S3 object.

    Args:
        s3_client: Client exposing ``put_object(Body=..., Bucket=..., Key=...)``.
        input_file_path: Path of the local file to send.
        output_bucket: Name of the destination bucket.
        output_prefix: Full key of the object to create.
    """
    with open(input_file_path, "rb") as source_file:
        payload = source_file.read()

    # The whole file is held in memory and sent in one request.
    s3_client.put_object(Bucket=output_bucket, Key=output_prefix, Body=payload)
        
def replace_with_raw_extension(input_file_name):
    """Return ``input_file_name`` with its extension replaced by ``.jsonl``.

    Only the last path component (the part after the final ``/``) is
    inspected. Everything from the first dot of that component onwards is
    replaced, so ``a/b/file.csv.gz`` becomes ``a/b/file.jsonl``. A name
    without any dot simply gets ``.jsonl`` appended, and dots that appear in
    directory components are left alone.

    Args:
        input_file_name: File name or ``/``-separated key.

    Returns:
        The same name/key ending in ``.jsonl``.
    """
    directory, separator, base_name = input_file_name.rpartition('/')
    # split() returns the whole name when there is no dot, unlike find(),
    # whose -1 result would slice off the last character.
    stem = base_name.split('.', 1)[0]
    return f'{directory}{separator}{stem}.jsonl'


def get_files_to_proccess(valid_prefix):
    """List the landing-bucket objects that look like delimited/JSONL files.

    Every object of the module-level ``landing_bucket`` is inspected; an
    object is kept when its key contains ``valid_prefix`` and contains one of
    ``.csv``, ``.tsv`` or ``.jsonl`` (substring checks, not prefix/suffix
    checks).

    Note: a later definition with the same name in this file replaces this
    one once its cell has been executed.

    Args:
        valid_prefix: Text that must appear in the object key.

    Returns:
        A list of ``[bucket_name, key, last_modified]`` entries.
    """
    accepted_extensions = ('.csv', '.tsv', '.jsonl')
    return [
        [entry.bucket_name, entry.key, entry.last_modified]
        for entry in landing_bucket.objects.all()
        if valid_prefix in entry.key
        and any(extension in entry.key for extension in accepted_extensions)
    ]

## Airbyte Custom Functions

In [3]:
def get_files_to_proccess(valid_prefix):
    """List the landing-bucket objects that belong to the handled entities.

    Every object of the module-level ``landing_bucket`` is inspected. An
    object is kept when its key contains ``valid_prefix``, contains one of the
    entity markers and contains one of the accepted extensions. All checks
    are substring checks, so ``'orders'`` also matches any key that merely
    includes that text.

    Args:
        valid_prefix: Text that must appear in the object key.

    Returns:
        A list of ``[bucket_name, key, last_modified]`` entries.
    """
    entity_markers = ('fulfillments', 'orders_refunds', 'orders')
    accepted_extensions = ('.csv', '.tsv', '.jsonl')

    def _is_candidate(object_key):
        # Same three conditions, evaluated in the same order.
        return (
            valid_prefix in object_key
            and any(marker in object_key for marker in entity_markers)
            and any(extension in object_key for extension in accepted_extensions)
        )

    return [
        [summary.bucket_name, summary.key, summary.last_modified]
        for summary in landing_bucket.objects.all()
        if _is_candidate(summary.key)
    ]

In [4]:
def get_destination_prefix(file_prefix, last_modified_dt=None):
    """Build the partitioned destination key for a landing object.

    The landing key is expected to have at least five ``/``-separated
    segments: ``integration/platform/brand_folder/entity/file_name``.

    Args:
        file_prefix: Key of the landing object.
        last_modified_dt: Datetime used for the ``p_creation_dt`` partition.
            Defaults to the current local time at call time.

    Returns:
        A tuple ``(destination_key, brand)`` where ``destination_key`` ends in
        ``.jsonl`` and ``brand`` is the brand folder without its last
        character.

    Raises:
        IndexError: If ``file_prefix`` has fewer than five segments.
    """
    if last_modified_dt is None:
        # Resolved on every call; a ``datetime.now()`` default in the
        # signature would be evaluated only once, when the function is defined.
        last_modified_dt = datetime.now()

    segments = file_prefix.split('/')
    integration = segments[0]
    platform = segments[1]
    # Drops the final character of the brand folder; the sample key used in
    # this notebook has the form 'barvivo_' (trailing underscore).
    brand = segments[2][:-1]
    entity_name = segments[3]
    file_name = segments[4]

    partition_date = last_modified_dt.strftime("%Y-%m-%d")
    destination_prefix = (
        f'{integration}/p_integration_id={platform}/p_entity_id={entity_name}'
        f'/p_creation_dt={partition_date}/{file_name}'
    )
    return replace_with_raw_extension(destination_prefix), brand

In [5]:
def convert_and_upload_parquet(file_content, brand_name, output_prefix):
    """Write an enriched JSON Lines copy of ``file_content`` to local disk.

    Despite its name, this function neither produces Parquet nor uploads
    anything: it parses newline-delimited JSON, adds the audit columns and
    writes the result under ``temp/<output_prefix>``.

    Args:
        file_content: Newline-delimited JSON text, one record per line.
        brand_name: Brand identifier; stored upper-cased in ``brand_id``.
        output_prefix: Destination key, reused as the relative path below
            ``temp/``.

    Returns:
        The path of the file that was written, or ``None`` when the content
        holds no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If the records lack one of the ``_airbyte_*`` columns.
    """
    # Blank lines are skipped instead of unconditionally dropping the last
    # element, so the final record survives when the text has no trailing
    # newline. The split is on '\n' only (not splitlines()) because other
    # line-boundary characters may legally occur inside JSON string values.
    records = [json.loads(line) for line in file_content.split('\n') if line.strip()]
    df = pd.DataFrame(records)
    if df.empty:
        return None

    df['brand_id'] = brand_name.upper()
    df['aud_process_ts'] = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    df = df[['_airbyte_data', 'brand_id', 'aud_process_ts', '_airbyte_emitted_at', '_airbyte_ab_id']]

    temp_file_path = f"temp/{output_prefix}"
    # exist_ok=True already covers the "directory exists" case.
    pathlib.Path(temp_file_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_json(temp_file_path, orient="records", lines=True)
    return temp_file_path

In [6]:
def process_file(s3_client, file_to_process):
    """Copy one landing object to the raw bucket as enriched JSON Lines.

    Args:
        s3_client: S3 client used for both the download and the upload.
        file_to_process: ``[bucket_name, key, last_modified]`` entry as
            produced by ``get_files_to_proccess``.

    The object is downloaded, converted into a local temp file and, when the
    conversion produced a file, uploaded to the module-level ``output_bucket``.
    """
    source_bucket = file_to_process[0]
    source_key = file_to_process[1]
    last_modified_dt = file_to_process[2]

    file_content = read_from_s3(s3_client, source_bucket, source_key)
    # Partition by the object's own modification date rather than by the
    # helper's "now" default.
    output_prefix, brand_id = get_destination_prefix(
        source_key, last_modified_dt=last_modified_dt
    )
    temp_file_path = convert_and_upload_parquet(file_content, brand_id, output_prefix)
    if temp_file_path:
        upload_to_s3(s3_client, temp_file_path, output_bucket, output_prefix)

In [7]:
import time
import progressbar

# Collect the landing objects to handle, then process them one by one while
# reporting progress.
files_to_process = get_files_to_proccess(base_prefix)

# Skip the bar entirely when nothing matched, so it is never created with a
# zero total.
if files_to_process:
    # The total equals the number of files, so the bar reaches 100% exactly
    # when the last file has been processed.
    with progressbar.ProgressBar(max_value=len(files_to_process)) as bar:
        for processed_count, file_to_process in enumerate(files_to_process, start=1):
            process_file(s3_client, file_to_process)
            bar.update(processed_count)

print("Done!")

100% (1207 of 1207) |####################| Elapsed Time: 0:22:12 Time:  0:22:12


Done!


In [None]:
 50% (462 of 919) |###########           | Elapsed Time: 0:04:16 ETA:   0:26:23

# Manual Testing

In [None]:
from dateutil.tz import tzutc

# Sample landing object used to exercise the helpers by hand.
test_bucket_name = 'f14-datalake-landing-dev'
test_file_prefix = 'airbyte/shopify/barvivo_/orders/2021_11_16_1637063315292_0.jsonl'
test_last_mod_dt = datetime(2021, 11, 16, 11, 51, 42, tzinfo=tzutc())

destination_prefix, brand_name = get_destination_prefix(test_file_prefix, last_modified_dt=test_last_mod_dt)
# Download the sample object with the notebook's S3 read helper.
file_content = read_from_s3(s3_client, test_bucket_name, test_file_prefix)

In [None]:
# file_content is a single string: parse it line by line (iterating over the
# string itself would yield single characters) and skip blank lines.
df = pd.DataFrame([json.loads(line) for line in file_content.split('\n') if line.strip()])

In [None]:
# Earlier experiments, kept for reference only:
# pd.set_option('display.max_colwidth', 10)
# df['_airbyte_data'] = df['_airbyte_data'].apply(json.dumps).str.replace('{}','null').apply(json.loads)
# The active statement assigns the column to itself, so the frame's content
# is left unchanged.
df['_airbyte_data'] = df['_airbyte_data']

In [None]:
# Tag every row with the upper-cased brand and the current timestamp, then
# preview the first rows.
df = df.assign(brand_id=brand_name.upper(), process_ts=datetime.now())
df.head()

In [None]:
# Parquet variant, kept for reference only:
# df.to_parquet('temp/temp.parquet')
# Write the frame to a local file with one JSON object per row.
# NOTE(review): lines=True is not passed here, so per the pandas docs the
# output is a single JSON array rather than newline-delimited records, even
# though the file is named .jsonl. Confirm this is intended before changing
# it; the read-back cell would need the matching option.
df.to_json('temp/temp.jsonl', orient="records")

In [None]:
# Parquet variant, kept for reference only:
# readed_df = pd.read_parquet('temp/temp.parquet')
# Read the local file back and preview the first rows.
# NOTE(review): read_json is called without lines=True, i.e. it expects the
# file to hold one JSON document rather than one record per line.
readed_df = pd.read_json('temp/temp.jsonl')
readed_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the frame read back.
readed_df.info()