In [13]:
import pandas as pd
import s3fs
import boto3
import numpy as np
from io import StringIO

In [5]:
# Location of the input files: one S3 "folder" per product group.
_BUCKET_NAME = 'bsa-correlation-one'
_PREFIX = 'electronics/'
# The product group is the prefix without its slash(es), e.g. 'electronics'.
product_group = _PREFIX.replace('/','')

# Credentials are deliberately NOT written in the notebook. boto3 resolves them
# through its default chain (environment variables, shared credentials/config
# files, instance/role credentials). Passing blank strings, as before, is not
# the same as passing nothing: they are used as (invalid) explicit credentials.
client = boto3.client('s3')

def ListFiles(client):
    """Yield the key of every object stored under the configured S3 prefix.

    The bucket and prefix are read from the module-level ``_BUCKET_NAME`` and
    ``_PREFIX`` constants.

    A single ``list_objects`` call returns at most 1000 keys, so the listing is
    paginated to make sure larger prefixes are not silently truncated.

    Args:
        client: a boto3 S3 client (anything exposing ``get_paginator``).

    Yields:
        The ``Key`` of each listed object, in listing order.
    """
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=_BUCKET_NAME, Prefix=_PREFIX):
        # Pages of an empty listing carry no 'Contents' entry at all.
        for content in page.get('Contents', []):
            yield content.get('Key')

# Derive the product types from the object keys found under the prefix.
# Only names made of exactly three '_'-separated parts contribute a type; the
# type is the middle part (e.g. 'electronics_<type>_orders.csv').
file_list = ListFiles(client)
product_types = []
for file in file_list:
    f = file.split(_PREFIX)[-1]
    name_parts = f.split('_')
    if len(name_parts) == 3:
        # The previous version appended to an undefined name (`types`) and hid
        # the resulting NameError behind a bare `except: pass`.
        product_types.append(name_parts[1])

# De-duplicate while keeping first-seen order (dict keys preserve insertion order).
product_types = list(dict.fromkeys(product_types))

### Process Type functions

In [35]:
def read_orders(client, product_group, product_type):
    """Load one product type's orders from S3 and aggregate them per day.

    The CSV ``<product_group>/<product_group>_<product_type>_orders.csv`` is
    read from the ``bsa-correlation-one`` bucket, summed per
    (purchase day, GTIN, item name, subsidy flag) and the monetary item-level
    columns are converted from daily totals to per-unit values by dividing by
    the summed quantity.

    The aggregated columns are selected and renamed explicitly by name. The
    previous version summed every selected column and renamed the result by
    position, which only worked while pandas silently dropped non-numeric
    columns from ``groupby().sum()``.

    Args:
        client: object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'`` (a boto3 S3 client).
        product_group: product group name, used as folder and file prefix.
        product_type: product type name, used in the file name.

    Returns:
        DataFrame sorted by GTIN and day with columns ``purchase_timestamp``,
        ``gtin``, ``item_name``, ``enable_subsidy``, ``gmv``, ``revenue``,
        ``cogs``, ``gross_profit``, ``total_discount``, ``orders``,
        ``olist_price``, ``freight_value``, ``commission_freight``,
        ``commission_product``, ``price_freight_shift``, ``price_discount``.

    Raises:
        KeyError: if the CSV lacks one of the required columns.
    """
    bucket = 'bsa-correlation-one'
    object_key = product_group + "/" + product_group + "_" + product_type + "_orders.csv"

    csv_obj = client.get_object(Bucket=bucket, Key=object_key)
    csv_string = csv_obj['Body'].read().decode('utf-8')

    # Source column -> output column, in output order.
    key_columns = {
        'olist_order_purchase_timestamp': 'purchase_timestamp',
        'seller_item_gtin': 'gtin',
        'olist_item_full_name': 'item_name',
        'seller_item_enable_subsidy': 'enable_subsidy',
    }
    value_columns = {
        'gmv': 'gmv',
        'revenue': 'revenue',
        'cogs': 'cogs',
        'gross_profit': 'gross_profit',
        'olist_order_total_discount': 'total_discount',
        'olist_item_quantity': 'orders',
        'olist_item_price': 'olist_price',
        'olist_item_freight_value': 'freight_value',
        'seller_item_commission_freight_olist': 'commission_freight',
        'seller_item_commission_product_olist': 'commission_product',
        'seller_item_price_freight_shift': 'price_freight_shift',
        'seller_item_price_discount': 'price_discount',
    }
    # Columns reported per unit sold rather than as a daily total.
    per_unit_columns = [
        'olist_order_total_discount', 'olist_item_price', 'olist_item_freight_value',
        'seller_item_commission_freight_olist',
        'seller_item_commission_product_olist',
        'seller_item_price_freight_shift', 'seller_item_price_discount',
    ]

    # GTINs are read as strings so leading zeros survive.
    orders = pd.read_csv(StringIO(csv_string), dtype={'seller_item_gtin': 'str'})
    orders = orders[list(key_columns) + list(value_columns)].copy()
    orders['olist_order_purchase_timestamp'] = pd.to_datetime(orders['olist_order_purchase_timestamp'])

    daily = (
        orders.groupby([pd.Grouper(key='olist_order_purchase_timestamp', freq='D'),
                        'seller_item_gtin', 'olist_item_full_name',
                        'seller_item_enable_subsidy'])[list(value_columns)]
        .sum()
        .reset_index()
        .sort_values(by=['seller_item_gtin', 'olist_order_purchase_timestamp'])
        .reset_index(drop=True)
    )

    # Daily totals -> per-unit values (row-wise division by the summed quantity).
    daily[per_unit_columns] = daily[per_unit_columns].div(daily['olist_item_quantity'], axis=0)

    renames = dict(key_columns)
    renames.update(value_columns)
    return daily.rename(columns=renames)


def read_competition(client, product_group, product_type):
    """Fetch the competitor price file of a product group from S3.

    The object key is built from ``product_group`` only; ``product_type`` is
    accepted but not used by this function.

    Args:
        client: object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'``.
        product_group: product group name (folder and file suffix).
        product_type: unused.

    Returns:
        DataFrame of the CSV content with ``gtin`` read as string,
        ``updated_at`` parsed to datetimes and the ``'Unnamed: 0'`` column
        removed.
    """
    key = ''.join([product_group, '/competition_', product_group, '.csv'])
    response = client.get_object(Bucket='bsa-correlation-one', Key=key)
    raw_text = response['Body'].read().decode('utf-8')

    # GTINs are kept as text so leading zeros are not lost.
    frame = pd.read_csv(StringIO(raw_text), dtype={'gtin': 'str'})
    frame['updated_at'] = pd.to_datetime(frame['updated_at'])

    # 'Unnamed: 0' is the name pandas gives to a header-less first column.
    return frame.drop(columns=['Unnamed: 0'])


def read_history(client, product_group, product_type):
    """Fetch the history file of one product type from S3.

    Reads ``<product_group>/<product_group>_<product_type>_history.csv`` from
    the ``bsa-correlation-one`` bucket.

    Args:
        client: object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'``.
        product_group: product group name (folder and file prefix).
        product_type: product type name (middle part of the file name).

    Returns:
        DataFrame of the CSV content with ``gtin`` read as string and
        ``updated_at`` parsed to datetimes.
    """
    key = ''.join([product_group, "/", product_group, "_", product_type, "_history.csv"])
    response = client.get_object(Bucket='bsa-correlation-one', Key=key)
    raw_text = response['Body'].read().decode('utf-8')

    # Rows are split on '\n' only; GTINs stay textual to keep leading zeros.
    frame = pd.read_csv(StringIO(raw_text), lineterminator='\n', dtype={'gtin': 'str'})
    frame['updated_at'] = pd.to_datetime(frame['updated_at'])
    return frame


def process_gtin(gtin, orders, competition, history, end_date='2020-04-20'):
    """Slice the inputs down to one GTIN and build its daily calendar frame.

    Args:
        gtin: GTIN to select (compared with ``==`` against the ``gtin`` columns).
        orders: orders frame with a ``gtin`` column.
        competition: competition frame with a ``gtin`` column.
        history: history frame with ``gtin``, ``status`` and ``updated_at``
            columns; only rows whose status is ``'published'`` are kept.
        end_date: last day of the calendar (inclusive). Defaults to the
            previously hard-coded ``'2020-04-20'``.

    Returns:
        Tuple ``(temp_orders, temp_competition, temp_history, df)`` where the
        first three are independent copies of the matching rows and ``df`` has
        a single ``date`` column with one row per day from the date of the
        earliest published history row up to ``end_date``.

    Raises:
        ValueError: if the GTIN has no published history row, so no start date
            can be determined.
    """
    temp_orders = orders[orders['gtin'] == gtin].copy()
    temp_competition = competition[competition['gtin'] == gtin].copy()
    is_published = (history['gtin'] == gtin) & (history['status'] == 'published')
    temp_history = history[is_published].copy()

    # The calendar starts on the day of the first published history record.
    created_at = temp_history['updated_at'].min()
    if pd.isna(created_at):
        # Fail with an explicit message instead of an obscure NaT error from date_range.
        raise ValueError(
            "GTIN {!r} has no 'published' history rows; cannot build its date range".format(gtin))

    df = pd.DataFrame({'date': pd.date_range(start=created_at.date(), end=end_date, freq='D')})

    return temp_orders, temp_competition, temp_history, df


def process_competition(temp_competition, df):
    """Attach the daily minimum competitor price to the calendar frame.

    Steps: drop competitor prices further than 3 standard deviations from the
    mean, take the minimum ``value`` per day, left-join it onto ``df`` by date
    and forward-fill the days without an observation.

    Args:
        temp_competition: competition rows of a single GTIN with ``value``,
            ``updated_at`` (datetime) and ``gtin`` columns.
        df: calendar frame with a ``date`` column.

    Returns:
        A new frame: ``df`` plus a ``competition_price`` column. Days before
        the first observation stay NaN.
    """
    # Filter out extremely low and high prices. The filter is skipped when the
    # standard deviation is undefined (fewer than two observations) or zero:
    # comparing against NaN bounds would otherwise discard every row.
    std = temp_competition['value'].std()
    mean = temp_competition['value'].mean()
    if pd.notna(std) and std > 0:
        within_bounds = ((temp_competition['value'] >= mean - 3 * std)
                         & (temp_competition['value'] <= mean + 3 * std))
        temp_competition = temp_competition[within_bounds]

    # Lowest competitor price per day; only `value` is aggregated.
    daily_min = (
        temp_competition
        .groupby([pd.Grouper(key='updated_at', freq='D'), 'gtin'])['value']
        .min()
        .reset_index()
        .rename(columns={'value': 'competition_price'})
    )
    # Reduce the bin label to a plain calendar date so it matches df['date'].
    daily_min['updated_at'] = pd.to_datetime(daily_min['updated_at'].dt.date)

    df = pd.merge(left=df, right=daily_min,
                  left_on='date', right_on='updated_at', how='left')
    df = df.drop(columns=['updated_at', 'gtin'])

    # Carry the last known price forward (fillna(method='pad') is deprecated).
    df['competition_price'] = df['competition_price'].ffill()

    return df


def process_orders(temp_orders, df):
    """Left-join the daily orders of one GTIN onto the calendar frame.

    Args:
        temp_orders: orders rows with a ``purchase_timestamp`` column.
        df: calendar frame with a ``date`` column.

    Returns:
        A frame with every row of ``df`` plus the order columns (missing where
        no order matches the date), without ``purchase_timestamp``.
    """
    joined = df.merge(temp_orders, how='left',
                      left_on='date', right_on='purchase_timestamp')
    # The join key from the orders side duplicates `date`, so it is removed.
    return joined.drop(columns=['purchase_timestamp'])


def process_history(temp_history, df):
    """Attach daily stock / price / offer statistics to the calendar frame.

    Steps: keep rows with a positive ``offer``, drop offers further than
    3 standard deviations from the mean, compute max / min / mean of ``stock``,
    ``price`` and ``offer`` per day, fill days without records from the
    neighbouring days, then join onto ``df`` by ``date``.

    Args:
        temp_history: history rows of a single GTIN with ``updated_at``
            (datetime), ``stock``, ``price`` and ``offer`` columns.
        df: frame with a ``date`` column.

    Returns:
        The inner join of ``df`` with the daily statistics (columns
        ``stock_max``, ``stock_min``, ``stock_avg``, ``price_max``,
        ``price_min``, ``price_avg``, ``offer_max``, ``offer_min``,
        ``offer_avg``).
    """
    # Filter zero and outlier values. The outlier filter is skipped when the
    # standard deviation is undefined (a single row) or zero: comparing against
    # NaN bounds would otherwise discard every row.
    temp_history = temp_history[temp_history['offer'] > 0].copy()
    std = temp_history['offer'].std()
    mean = temp_history['offer'].mean()
    if pd.notna(std) and std > 0:
        within_bounds = ((temp_history['offer'] >= mean - 3 * std)
                         & (temp_history['offer'] <= mean + 3 * std))
        temp_history = temp_history[within_bounds]

    statistics = ['max', 'min', 'mean']
    daily = (
        temp_history
        .groupby([pd.Grouper(key='updated_at', freq='D')])
        .agg({'stock': statistics, 'price': statistics, 'offer': statistics})
        .reset_index()
    )
    # Flatten the two-level (column, statistic) header into plain names.
    daily.columns = ['date',
                     'stock_max', 'stock_min', 'stock_avg',
                     'price_max', 'price_min', 'price_avg',
                     'offer_max', 'offer_min', 'offer_avg']

    # Days without records take the previous day's values (then the next
    # day's); fillna(method=...) is deprecated in favour of ffill()/bfill().
    daily = daily.ffill().bfill()

    # NOTE(review): this is an inner join, so dates of `df` outside the span
    # covered by the history rows are dropped - confirm this is intended.
    df = pd.merge(left=df, right=daily, left_on='date', right_on='date')

    return df


def adjustments(df, gtin=None):
    """Fill the gaps left by the joins for one GTIN's daily frame.

    ``df`` is modified in place and also returned.

    Args:
        df: frame with ``item_name``, ``gmv``, ``revenue``, ``cogs``,
            ``gross_profit``, ``orders``, ``olist_price`` and ``offer_avg``
            columns.
        gtin: value written to the ``gtin`` column. When omitted, the
            module-level variable ``gtin`` is used, which is what this
            function always relied on; passing it explicitly removes that
            hidden dependency on notebook state.

    Returns:
        The same ``df`` with: a ``gtin`` column; ``item_name`` filled forward
        then backward; missing sales figures set to 0; missing ``olist_price``
        replaced by ``offer_avg``.

    Raises:
        NameError: if ``gtin`` is omitted and no module-level ``gtin`` exists.
    """
    if gtin is None:
        # Backward compatibility with callers that only pass `df`.
        try:
            gtin = globals()['gtin']
        except KeyError:
            raise NameError("name 'gtin' is not defined") from None

    df['gtin'] = gtin
    # fillna(method=...) is deprecated; ffill()/bfill() are the equivalents.
    df['item_name'] = df['item_name'].ffill().bfill()

    # Days without orders have no sales: report zeros instead of NaN.
    sales_columns = ['gmv', 'revenue', 'cogs', 'gross_profit', 'orders']
    df[sales_columns] = df[sales_columns].fillna(value=0)

    # Without a sale there is no observed price; use the day's average offer.
    df['olist_price'] = df['olist_price'].fillna(df['offer_avg'])

    return df

def save_type(master, product_group, product_type, fs=None):
    """Write one product type's frame as CSV to the project bucket.

    The target is
    ``s3://bsa-correlation-one/<product_group>/<product_type>_data.csv``.

    Args:
        master: DataFrame to store (its index is written too, as before).
        product_group: product group name, used as folder.
        product_type: product type name, used as file prefix.
        fs: optional filesystem object exposing ``open(path, mode)``. When
            omitted an ``s3fs.S3FileSystem`` is created. No key/secret is
            hardcoded any more: s3fs then falls back to the standard AWS
            credential sources (environment, shared config, role).
    """
    if fs is None:
        fs = s3fs.S3FileSystem(anon=False)
    bytes_to_write = master.to_csv(None).encode()
    with fs.open('s3://bsa-correlation-one/{}/{}_data.csv'.format(product_group, product_type), 'wb') as f:
        f.write(bytes_to_write)
        
        
def save_group(full_df, product_group, fs=None):
    """Write the combined frame of a product group as CSV to the project bucket.

    The target is ``s3://bsa-correlation-one/<product_group>_data.csv``.

    Args:
        full_df: DataFrame to store (its index is written too, as before).
        product_group: product group name, used as file prefix.
        fs: optional filesystem object exposing ``open(path, mode)``. When
            omitted an ``s3fs.S3FileSystem`` is created. No key/secret is
            hardcoded any more: s3fs then falls back to the standard AWS
            credential sources (environment, shared config, role).
    """
    if fs is None:
        fs = s3fs.S3FileSystem(anon=False)
    bytes_to_write = full_df.to_csv(None).encode()
    with fs.open('s3://bsa-correlation-one/{}_data.csv'.format(product_group), 'wb') as f:
        f.write(bytes_to_write)
    

In [7]:
# Sanity check: show the first five product types parsed from the S3 listing.
product_types[:5]

['Adaptador eletrônico',
 'Alto-falante',
 'Amplificador de linha',
 'Amplificador de potência',
 'Amplificador']

In [11]:
# Sanity check: show the product group derived from _PREFIX.
product_group

'electronics'

In [36]:
# Build one daily dataset per product type, save each of them, then save the
# concatenation of all of them for the whole product group.
total_types = len(product_types)
processed_types = 0   # product types that were built and saved successfully
masters = []
# The competition file depends on the product group only, so it is downloaded
# once and reused instead of being fetched again for every product type.
competition = None
progress_line = "Success: {}\tFailed: {}\tProcessed: {}\tTotal: {}"

for j, product_type in enumerate(product_types):
    print('-'*72)
    print("PRODUCT TYPE: {}\t {}/{}".format(product_type,j+1, total_types))

    try:
        orders = read_orders(client, product_group, product_type)
        if competition is None:
            competition = read_competition(client, product_group, product_type)
        history = read_history(client, product_group, product_type)
    except Exception as e:
        # Say why a product type is skipped instead of dropping it silently.
        print("Skipped: could not load input files ({}: {})".format(type(e).__name__, e))
        continue

    if orders.shape[0] == 0:
        # Without this guard the frames and counters of the PREVIOUS product
        # type were reused below and saved again under the current type.
        print("Skipped: no orders")
        continue

    gtins = list(set(orders.gtin))

    dfs = []
    success = 0
    failed = 0
    total = len(gtins)
    for i, gtin in enumerate(gtins):
        try:
            temp_orders, temp_competition, temp_history, df = process_gtin(gtin, orders, competition, history)
            try:
                df = process_competition(temp_competition, df)
            except Exception:
                # Competition data is optional: keep the GTIN without prices.
                df['competition_price'] = np.nan
            df = process_orders(temp_orders, df)
            df = process_history(temp_history, df)
            df = adjustments(df)

            dfs.append(df)
            success += 1
        except Exception:
            # Best effort: a GTIN that cannot be processed is only counted.
            failed += 1

        if (success+failed) % 50 == 0:
            print(progress_line.format(success, failed, success+failed, total))

    print(progress_line.format(success, failed, success+failed, total))

    if not dfs:
        print("Skipped: no GTIN could be processed")
        continue

    try:
        master = pd.concat(dfs)
        master['product_type'] = product_type

        masters.append(master)

        save_type(master, product_group, product_type)
        processed_types += 1
    except Exception as e:
        print(e)

if not masters:
    raise ValueError("No product type could be processed; nothing to save")

full_df = pd.concat(masters)
full_df['product_group'] = product_group

save_group(full_df,product_group)



------------------------------------------------------------------------
PRODUCT TYPE: Adaptador eletrônico	 1/107
Success: 50	Failed: 0	Processed: 50	Total: 324
Success: 100	Failed: 0	Processed: 100	Total: 324
Success: 149	Failed: 1	Processed: 150	Total: 324
Success: 199	Failed: 1	Processed: 200	Total: 324
Success: 249	Failed: 1	Processed: 250	Total: 324
Success: 298	Failed: 2	Processed: 300	Total: 324
Success: 322	Failed: 2	Processed: 324	Total: 324
------------------------------------------------------------------------
PRODUCT TYPE: Alto-falante	 2/107
Success: 50	Failed: 0	Processed: 50	Total: 245
Success: 100	Failed: 0	Processed: 100	Total: 245
Success: 150	Failed: 0	Processed: 150	Total: 245
Success: 199	Failed: 1	Processed: 200	Total: 245
Success: 243	Failed: 2	Processed: 245	Total: 245
------------------------------------------------------------------------
PRODUCT TYPE: Amplificador de linha	 3/107
Success: 1	Failed: 0	Processed: 1	Total: 1
-----------------------------------

In [37]:
# Dimensions (rows, columns) of the combined frame for the product group.
full_df.shape

(3079658, 28)

In [39]:
# Number of distinct product types that made it into the combined frame.
# NOTE(review): a value below len(product_types) means some product types were
# skipped by the processing loop - worth checking which ones and why.
full_df.product_type.nunique()

101