In [29]:
import pandas as pd
import boto3
import numpy as np
import s3fs
from io import StringIO

In [2]:
pd.set_option('display.max_columns', None)

In [6]:
def credentials():
    """Create the S3 client used by the loader functions.

    No access keys are written in the notebook. boto3 resolves them through
    its standard credential chain (environment variables such as
    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``, the shared credentials
    file, or an attached IAM role).

    Returns:
        A boto3 S3 client.
    """
    return boto3.client('s3')


def read_orders(client, product_group, product_type):
    """Load the orders CSV from S3 and aggregate it per day and product.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'`` (e.g. a boto3 S3 client).
        product_group: Currently unused; the object key is fixed.
        product_type: Currently unused; the object key is fixed.

    Returns:
        DataFrame with one row per (day, gtin, item_name, enable_subsidy).
        ``gmv``, ``revenue``, ``cogs``, ``gross_profit`` and ``orders`` (the
        summed item quantity) are daily totals; the remaining value columns
        are the daily sum divided by the daily quantity.
    """
    bucket = 'bsa-correlation-one'
    object_key = 'electronics/cellphone_orders.csv'

    csv_obj = client.get_object(Bucket=bucket, Key=object_key)
    csv_string = csv_obj['Body'].read().decode('utf-8')

    # GTINs are read as text so they are never reinterpreted as numbers.
    raw = pd.read_csv(StringIO(csv_string), dtype={'seller_item_gtin': 'str'})

    # Source column -> output column, for the grouping keys ...
    key_names = {
        'olist_order_purchase_timestamp': 'purchase_timestamp',
        'seller_item_gtin': 'gtin',
        'olist_item_full_name': 'item_name',
        'seller_item_enable_subsidy': 'enable_subsidy',
    }
    # ... and for the numeric columns that are summed per group.
    value_names = {
        'gmv': 'gmv',
        'revenue': 'revenue',
        'cogs': 'cogs',
        'gross_profit': 'gross_profit',
        'olist_order_total_discount': 'total_discount',
        'olist_item_quantity': 'orders',
        'olist_item_price': 'olist_price',
        'olist_item_freight_value': 'freight_value',
        'seller_item_commission_freight_olist': 'commission_freight',
        'seller_item_commission_product_olist': 'commission_product',
        'seller_item_price_freight_shift': 'price_freight_shift',
        'seller_item_price_discount': 'price_discount',
    }
    # Columns reported per unit sold rather than as a daily total.
    per_unit_columns = [
        'olist_order_total_discount', 'olist_item_price', 'olist_item_freight_value',
        'seller_item_commission_freight_olist',
        'seller_item_commission_product_olist',
        'seller_item_price_freight_shift', 'seller_item_price_discount',
    ]

    raw['olist_order_purchase_timestamp'] = pd.to_datetime(raw['olist_order_purchase_timestamp'])

    # Only the listed numeric columns are summed. Summing the whole frame
    # depends on pandas dropping text columns on its own, which recent
    # versions no longer do.
    daily = (
        raw.groupby([pd.Grouper(key='olist_order_purchase_timestamp', freq='D'),
                     'seller_item_gtin', 'olist_item_full_name',
                     'seller_item_enable_subsidy'])[list(value_names)]
        .sum()
        .sort_values(by=['seller_item_gtin', 'olist_order_purchase_timestamp'])
        .reset_index()
    )

    quantity = daily['olist_item_quantity']
    for column in per_unit_columns:
        daily[column] = daily[column] / quantity

    # Rename by name, not by position, so the mapping cannot silently shift.
    return daily.rename(columns={**key_names, **value_names})


def read_competition(client, product_group, product_type):
    """Load the competitor price CSV from S3.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'`` (e.g. a boto3 S3 client).
        product_group: Currently unused; the object key is fixed.
        product_type: Currently unused; the object key is fixed.

    Returns:
        DataFrame with ``gtin`` read as text and ``updated_at`` parsed to
        datetimes. A leftover ``'Unnamed: 0'`` index column is removed when
        the file has one.
    """
    bucket = 'bsa-correlation-one'
    object_key = 'electronics/competition_electronics.csv'

    csv_obj = client.get_object(Bucket=bucket, Key=object_key)
    csv_string = csv_obj['Body'].read().decode('utf-8')

    competition = pd.read_csv(StringIO(csv_string), dtype={'gtin': 'str'})
    competition['updated_at'] = pd.to_datetime(competition['updated_at'])

    # errors='ignore': an export written without its index column must not
    # make the loader fail with a KeyError.
    return competition.drop(columns=['Unnamed: 0'], errors='ignore')


def read_history(client, product_group, product_type):
    """Load the listing history CSV from S3.

    ``product_group`` and ``product_type`` are accepted but not used; the
    object key is fixed. ``gtin`` is read as text and ``updated_at`` is
    parsed to datetimes.
    """
    response = client.get_object(
        Bucket='bsa-correlation-one',
        Key='electronics/cellphone_history.csv',
    )
    text = response['Body'].read().decode('utf-8')

    # Rows are split on '\n' only, as requested through `lineterminator`.
    history = pd.read_csv(StringIO(text), lineterminator='\n', dtype={'gtin':'str'})
    history['updated_at'] = pd.to_datetime(history['updated_at'])
    return history


In [15]:
def process_gtin(gtin, orders, competition, history):
    temp_orders = orders[orders['gtin']==gtin].copy()
    temp_competition = competition[competition['gtin']==gtin].copy()
    temp_history = history[(history['gtin']==gtin) & (history['status']=='published')].copy()
    # Get first date and create range dataframe
    created_at = temp_history.updated_at.min()
    df = pd.DataFrame(data=pd.date_range(start=created_at.date(), end='2020-04-20', freq='D'),
                      columns=['date'])
    
    return temp_orders, temp_competition, temp_history, df


def process_competition(temp_competition, df):
    """Attach the daily minimum competitor price to the product calendar.

    Args:
        temp_competition: Competitor observations for one product, with
            ``updated_at`` (datetime), ``gtin`` and ``value`` columns.
        df: Calendar frame with a ``date`` column.

    Returns:
        ``df`` left-joined with a ``competition_price`` column: the lowest
        ``value`` seen each day, carried forward over days without
        observations. Days before the first observation stay NaN.
    """
    prices = temp_competition['value']
    std = prices.std()
    mean = prices.mean()
    # Outlier filter: keep prices within three standard deviations of the
    # mean. It is skipped when the deviation is undefined (one observation)
    # or zero, because NaN bounds would discard every row.
    if pd.notna(std) and std > 0:
        temp_competition = temp_competition[prices.between(mean - 3 * std, mean + 3 * std)]

    # Lowest observed price per day. Only `value` is aggregated, so other
    # columns cannot interfere with the reduction.
    daily = (
        temp_competition
        .groupby([pd.Grouper(key='updated_at', freq='D'), 'gtin'])['value']
        .min()
        .reset_index()
        .rename(columns={'value': 'competition_price'})
    )
    # Replace each timestamp with its calendar date (dropping any timezone)
    # so it can be joined against the calendar's `date` column.
    daily['updated_at'] = pd.to_datetime(daily['updated_at'].dt.date)

    df = pd.merge(left=df, right=daily,
                  left_on='date', right_on='updated_at', how='left')
    df = df.drop(columns=['updated_at', 'gtin'])

    # Carry the last known competitor price forward over gaps.
    df['competition_price'] = df['competition_price'].ffill()

    return df


def process_orders(temp_orders, df):
    """Left-join one product's daily orders onto the calendar frame.

    Rows of ``df`` are matched on ``date`` against ``purchase_timestamp``;
    the join key from the orders side is removed from the result.
    """
    merged = df.merge(temp_orders, how='left',
                      left_on='date', right_on='purchase_timestamp')
    return merged.drop(columns=['purchase_timestamp'])


def process_history(temp_history, df):
    """Attach daily stock / price / offer statistics to the calendar frame.

    Args:
        temp_history: History rows for one product, with ``updated_at``
            (datetime), ``stock``, ``price`` and ``offer`` columns.
        df: Calendar frame with a ``date`` column.

    Returns:
        Inner join of ``df`` with the daily max / min / mean of ``stock``,
        ``price`` and ``offer``. Days without history inside the observed
        range take the previous day's values (then the next day's, for
        leading gaps). Calendar days outside the observed range are dropped
        by the inner join.
    """
    # Offers of zero or below are discarded before computing statistics.
    temp_history = temp_history[temp_history['offer'] > 0]

    offers = temp_history['offer']
    std = offers.std()
    mean = offers.mean()
    # Outlier filter: keep offers within three standard deviations of the
    # mean. It is skipped when the deviation is undefined (one record) or
    # zero, because NaN bounds would discard every row.
    if pd.notna(std) and std > 0:
        temp_history = temp_history[offers.between(mean - 3 * std, mean + 3 * std)]

    daily = temp_history.groupby([pd.Grouper(key='updated_at', freq='D')])\
                        .agg({'stock': ['max', 'min', 'mean'],
                              'price': ['max', 'min', 'mean'],
                              'offer': ['max', 'min', 'mean']}).reset_index()

    # Flatten the two-level (column, statistic) header into plain names.
    daily.columns = ['date',
                     'stock_max', 'stock_min', 'stock_avg',
                     'price_max', 'price_min', 'price_avg',
                     'offer_max', 'offer_min', 'offer_avg']

    # Days with no records come out of the daily grouping as NaN rows.
    daily = daily.ffill().bfill()

    return pd.merge(left=df, right=daily, left_on='date', right_on='date')


def adjustments(df, gtin=None):
    """Fill the gaps left by the joins for one product's daily frame.

    Args:
        df: Frame with ``item_name``, ``gmv``, ``revenue``, ``cogs``,
            ``gross_profit``, ``orders``, ``olist_price`` and ``offer_avg``
            columns. It is modified in place and also returned.
        gtin: Product identifier written to the ``gtin`` column. When
            omitted, the module-level ``gtin`` variable is used, which is
            how existing call sites behave.

    Returns:
        The same frame with ``gtin`` set on every row, ``item_name`` filled
        forward then backward, missing totals set to 0 and missing
        ``olist_price`` taken from ``offer_avg``.

    Raises:
        NameError: If ``gtin`` is not passed and no module-level ``gtin``
            exists.
    """
    if gtin is None:
        # Legacy path: callers that pass only `df` rely on a global `gtin`.
        try:
            gtin = globals()['gtin']
        except KeyError:
            raise NameError("name 'gtin' is not defined; pass gtin explicitly")

    df['gtin'] = gtin
    df['item_name'] = df['item_name'].ffill().bfill()

    # Days without an order have no totals; treat them as zero activity.
    totals = ['gmv', 'revenue', 'cogs', 'gross_profit', 'orders']
    df[totals] = df[totals].fillna(value=0)

    # Without a sale there is no sold price; fall back to the average offer.
    df['olist_price'] = df['olist_price'].fillna(df['offer_avg'])

    return df

In [31]:
def save_results(master):
    """Write ``master`` as CSV to ``s3://bsa-correlation-one/cellphone_data.csv``.

    No key or secret is written in the notebook; s3fs is left to resolve
    credentials from the environment / AWS configuration.

    Args:
        master: DataFrame to serialise. Its index is included in the file.
    """
    fs = s3fs.S3FileSystem(anon=False)
    bytes_to_write = master.to_csv(None).encode()
    with fs.open('s3://bsa-correlation-one/cellphone_data.csv', 'wb') as f:
        f.write(bytes_to_write)
    

#### Main Loop

In [32]:
client = credentials()

orders = read_orders(client, None, None)
competition = read_competition(client, None, None)
history = read_history(client, None, None)

# unique() keeps first-appearance order, so products are processed (and
# written out) in the same order on every run, unlike iterating a set.
gtins = orders['gtin'].unique().tolist()

progress_template = "Success: {}\tFailed: {}\tProcessed: {}\tTotal: {}"

dfs = []
success = 0
failed = 0
total = len(gtins)
# NOTE: the loop variable must stay named `gtin`: adjustments(df) reads it
# as a module-level name.
for gtin in gtins:
    try:
        temp_orders, temp_competition, temp_history, df = process_gtin(gtin, orders, competition, history)
        try:
            df = process_competition(temp_competition, df)
        except Exception:
            # Best effort: a product whose competitor data cannot be
            # processed is kept, with no competitor price.
            df['competition_price'] = np.nan
        df = process_orders(temp_orders, df)
        df = process_history(temp_history, df)
        df = adjustments(df)

        dfs.append(df)
        success += 1
    except Exception as e:
        print("error ", e)
        failed += 1

    # Progress line every 50 processed products, whatever the outcome.
    if (success+failed) % 50 == 0:
        print(progress_template.format(success, failed, success+failed, total))

print(progress_template.format(success, failed, success+failed, total))

if not dfs:
    raise RuntimeError("no product was processed successfully; nothing to save")

master = pd.concat(dfs)

save_results(master)

In [33]:
# Preview the first rows of the concatenated per-product frame.
master.head()

Unnamed: 0,date,competition_price,gtin,item_name,enable_subsidy,gmv,revenue,cogs,gross_profit,total_discount,orders,olist_price,freight_value,commission_freight,commission_product,price_freight_shift,price_discount,stock_max,stock_min,stock_avg,price_max,price_min,price_avg,offer_max,offer_min,offer_avg
0,2018-12-03,,7893299910753,Smartphone Lg K11+ 32gb 5.3 Octa Core Câmera...,,0.0,0.0,0.0,0.0,,0.0,879.99,,,,,,50.0,50.0,50.0,999.0,999.0,999.0,879.99,879.99,879.99
1,2018-12-04,,7893299910753,Smartphone Lg K11+ 32gb 5.3 Octa Core Câmera...,,0.0,0.0,0.0,0.0,,0.0,879.99,,,,,,49.0,45.0,47.0,999.0,999.0,999.0,879.99,879.99,879.99
2,2018-12-05,,7893299910753,Smartphone Lg K11+ 32gb 5.3 Octa Core Câmera...,,0.0,0.0,0.0,0.0,,0.0,879.99,,,,,,50.0,46.0,48.0,999.0,999.0,999.0,879.99,879.99,879.99
3,2018-12-06,,7893299910753,Smartphone Lg K11+ 32gb 5.3 Octa Core Câmera...,,0.0,0.0,0.0,0.0,,0.0,879.99,,,,,,50.0,46.0,48.0,999.0,999.0,999.0,879.99,879.99,879.99
4,2018-12-07,,7893299910753,Smartphone Lg K11+ 32gb 5.3 Octa Core Câmera...,,0.0,0.0,0.0,0.0,,0.0,879.99,,,,,,50.0,46.0,48.0,999.0,999.0,999.0,879.99,879.99,879.99


### Draft

In [3]:
# Credentials are resolved by boto3's default chain (environment variables,
# shared credentials file, IAM role) instead of literals in the notebook.
client = boto3.client('s3')

In [57]:
# Download the raw orders export and parse it. GTINs are read as text.
bucket, object_key = 'bsa-correlation-one', 'electronics/cellphone_orders.csv'

csv_obj = client.get_object(Key=object_key, Bucket=bucket)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

orders = pd.read_csv(
    StringIO(csv_string),
    dtype={'seller_item_gtin':'str'},
)

In [58]:
# Grouping keys: purchase day, product, listing name and subsidy flag.
key_columns = ['olist_order_purchase_timestamp', 'seller_item_gtin',
               'olist_item_full_name', 'seller_item_enable_subsidy']

# Numeric columns to sum per group. They are listed explicitly so the result
# does not depend on pandas dropping text columns during the sum.
summed_columns = ['gmv', 'revenue', 'cogs', 'gross_profit',
                  'olist_order_total_discount', 'olist_item_quantity',
                  'olist_item_price', 'olist_item_freight_value',
                  'seller_item_price', 'seller_item_freight_value',
                  'seller_item_commission_freight_olist',
                  'seller_item_commission_product_olist',
                  'seller_item_price_freight_shift', 'seller_item_price_discount']

orders = orders[key_columns + summed_columns].copy()

orders['olist_order_purchase_timestamp'] = pd.to_datetime(orders.olist_order_purchase_timestamp)

orders = orders.groupby([pd.Grouper(key='olist_order_purchase_timestamp', freq='D'),
                         'seller_item_gtin', 'olist_item_full_name',
                         'seller_item_enable_subsidy'])[summed_columns].sum()\
               .sort_values(by=['seller_item_gtin', 'olist_order_purchase_timestamp']).reset_index()

# Turn the summed monetary columns into per-unit values.
for column in ['olist_order_total_discount', 'olist_item_price', 'olist_item_freight_value',
               'seller_item_price', 'seller_item_freight_value',
               'seller_item_commission_freight_olist',
               'seller_item_commission_product_olist',
               'seller_item_price_freight_shift', 'seller_item_price_discount']:

    orders[column] = orders[column].values / orders['olist_item_quantity'].values

In [59]:
# Rename by column name rather than by position. The mapping cannot shift if
# the column set changes, and re-running the cell is harmless.
orders = orders.rename(columns={
    'olist_order_purchase_timestamp': 'purchase_timestamp',
    'seller_item_gtin': 'gtin',
    'olist_item_full_name': 'item_name',
    'seller_item_enable_subsidy': 'enable_subsidy',
    'olist_order_total_discount': 'total_discount',
    'olist_item_quantity': 'orders',
    'olist_item_price': 'olist_price',
    'olist_item_freight_value': 'freight_value',
    'seller_item_commission_freight_olist': 'commission_freight',
    'seller_item_commission_product_olist': 'commission_product',
    'seller_item_price_freight_shift': 'price_freight_shift',
    'seller_item_price_discount': 'price_discount',
})

# errors='ignore' keeps the cell re-runnable once the columns are gone.
orders = orders.drop(columns=['seller_item_price', 'seller_item_freight_value'], errors='ignore')

In [60]:
# (rows, columns) of the orders frame after aggregation and renaming.
orders.shape

(7981, 16)

### Competition

In [10]:
# Download the competitor price export and parse it. GTINs are read as text.
bucket = 'bsa-correlation-one'
object_key = 'electronics/competition_electronics.csv'

csv_obj = client.get_object(Bucket=bucket, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

competition = pd.read_csv(StringIO(csv_string), dtype={'gtin':'str'})

competition['updated_at'] = pd.to_datetime(competition.updated_at)

# Drop the leftover index column when present. errors='ignore' avoids a
# KeyError if the export was written without it.
competition = competition.drop(columns=['Unnamed: 0'], errors='ignore')

In [11]:
# Peek at the first two competitor price rows.
competition.head(2)

Unnamed: 0,updated_at,gtin,value,slug
0,2019-04-01 16:53:31.448673+00:00,5173,60.0,olist-b2w
1,2018-09-01 04:03:30.531583+00:00,7774096,58.89,olist-b2w


### History

In [14]:
# Download the listing history export and parse it. GTINs are read as text
# and rows are split on '\n' only.
bucket, object_key = 'bsa-correlation-one', 'electronics/cellphone_history.csv'

csv_obj = client.get_object(Key=object_key, Bucket=bucket)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

history = pd.read_csv(
    StringIO(csv_string),
    lineterminator='\n',
    dtype={'gtin':'str'},
)

history['updated_at'] = pd.to_datetime(history['updated_at'])

In [15]:
# Column names, non-null counts and dtypes of the history frame.
history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434589 entries, 0 to 434588
Data columns (total 31 columns):
availability_days      434589 non-null int64
brand                  434589 non-null object
branded_store_slug     434589 non-null object
canonical_sku          434589 non-null object
catalog_feed_date      11084 non-null object
catalog_feed_id        11084 non-null float64
category               430796 non-null object
category_info          31296 non-null object
channel_slug           434589 non-null object
commission_plan        434589 non-null object
created_at             434589 non-null object
currency               434589 non-null object
description            434589 non-null object
external_id            312808 non-null object
group_id               418060 non-null object
gtin                   434589 non-null object
id                     434589 non-null object
name                   434589 non-null object
offer                  434589 non-null float64
offer_discount   

### Single product

In [79]:
gtin = '7893299910340'

# Slice every source frame down to the product under study. History is
# limited to records whose status is 'published'.
temp_orders = orders.loc[orders['gtin'] == gtin].copy()
temp_competition = competition.loc[competition['gtin'] == gtin].copy()
temp_history = history.loc[(history['gtin'] == gtin) & (history['status'] == 'published')].copy()

# The daily calendar starts on the day of the earliest published record.
created_at = temp_history['updated_at'].min()

df = pd.DataFrame({'date': pd.date_range(start=created_at.date(), end='2020-04-20', freq='D')})

In [80]:
### Competition

# Outlier filter: keep prices within three standard deviations of the mean.
# It is skipped when the deviation is undefined (one observation) or zero,
# because NaN bounds would discard every row.
std = temp_competition['value'].std()
mean = temp_competition['value'].mean()
if pd.notna(std) and std > 0:
    temp_competition = temp_competition[temp_competition['value'].between(mean - 3*std, mean + 3*std)]

# Lowest observed price per day (only `value` is aggregated).
temp_competition = temp_competition.groupby([pd.Grouper(key='updated_at', freq='D'), 'gtin'])['value']\
                                   .min().reset_index()
# Replace each timestamp with its calendar date (dropping any timezone) so it
# can be joined against the calendar's `date` column.
temp_competition['updated_at'] = pd.to_datetime(temp_competition['updated_at'].dt.date)
temp_competition = temp_competition.rename(columns={'value':'competition_price'})


### Clean df competition
df = pd.merge(left=df, right=temp_competition,
              left_on='date', right_on='updated_at', how='left')

df = df.drop(columns=['updated_at','gtin'])

# Carry the last known competitor price forward over gaps.
df['competition_price'] = df['competition_price'].ffill()

In [82]:
# Orders: left-join the product's daily orders onto the calendar, then drop
# the join key that came from the orders side.
df = df.merge(temp_orders, how='left',
              left_on='date', right_on='purchase_timestamp')\
       .drop(columns=['purchase_timestamp'])

In [84]:
# Check the first two calendar rows after the competition and orders joins.
df.head(2)

Unnamed: 0,date,competition_price,gtin,item_name,enable_subsidy,gmv,revenue,cogs,gross_profit,total_discount,quantity,olist_price,freight_value,commission_freight,commission_product,price_freight_shift,price_discount
0,2018-05-16,,,,,,,,,,,,,,,,
1,2018-05-17,,,,,,,,,,,,,,,,


In [88]:
### History

# Discard offers of zero or below, then keep offers within three standard
# deviations of the mean. The second filter is skipped when the deviation is
# undefined (one record) or zero, because NaN bounds would discard every row.
temp_history = temp_history[temp_history['offer'] > 0].copy()
std = temp_history['offer'].std()
mean = temp_history['offer'].mean()

if pd.notna(std) and std > 0:
    temp_history = temp_history[temp_history['offer'].between(mean - 3*std, mean + 3*std)]

temp_history = temp_history.groupby([pd.Grouper(key='updated_at', freq='D')])\
                           .agg({'stock':['max', 'min', 'mean'], 
                                 'price':['max', 'min', 'mean'],
                                 'offer':['max', 'min', 'mean']}).reset_index()

# Flatten the two-level (column, statistic) header into plain names.
temp_history.columns = ['date',
                        'stock_max', 'stock_min', 'stock_avg',
                        'price_max','price_min','price_avg',
                        'offer_max','offer_min','offer_avg']

# Days with no records come out of the daily grouping as NaN rows: fill them
# from the previous day, then from the next day for any leading gap.
temp_history = temp_history.ffill().bfill()

temp_history.head()

In [90]:
# Inner join: only calendar days covered by the daily history remain.
df = df.merge(temp_history, on='date')

In [96]:
df['gtin'] = gtin
df['item_name'] = df['item_name'].ffill().bfill()

# Days without an order have no totals; treat them as zero activity. The
# quantity column is called 'orders' after the rename cell above.
totals = ['gmv','revenue','cogs','gross_profit','orders']
df[totals] = df[totals].fillna(value=0)

# Without a sale there is no sold price; fall back to the average offer.
df['olist_price'] = df['olist_price'].fillna(df['offer_avg'])

In [97]:
# Preview the finished single-product frame.
df.head()

Unnamed: 0,date,competition_price,gtin,item_name,enable_subsidy,gmv,revenue,cogs,gross_profit,total_discount,quantity,olist_price,freight_value,commission_freight,commission_product,price_freight_shift,price_discount,stock_max,stock_min,stock_avg,price_max,price_min,price_avg,offer_max,offer_min,offer_avg
0,2018-09-19,649.99,7893299910340,Celular Lg K9 Tv 4g Dual Chip Tv Digital 16gb ...,,0.0,0.0,0.0,0.0,,0.0,649.99,,,,,,5.0,5.0,5.0,649.99,649.99,649.99,649.99,649.99,649.99
1,2018-09-20,649.99,7893299910340,Celular Lg K9 Tv 4g Dual Chip Tv Digital 16gb ...,,0.0,0.0,0.0,0.0,,0.0,649.99,,,,,,5.0,5.0,5.0,649.99,649.99,649.99,649.99,649.99,649.99
2,2018-09-21,649.99,7893299910340,Celular Lg K9 Tv 4g Dual Chip Tv Digital 16gb ...,,0.0,0.0,0.0,0.0,,0.0,649.99,,,,,,5.0,5.0,5.0,649.99,649.99,649.99,649.99,649.99,649.99
3,2018-09-22,649.99,7893299910340,Celular Lg K9 Tv 4g Dual Chip Tv Digital 16gb ...,,0.0,0.0,0.0,0.0,,0.0,649.99,,,,,,5.0,5.0,5.0,649.99,649.99,649.99,649.99,649.99,649.99
4,2018-09-23,649.99,7893299910340,Celular Lg K9 Tv 4g Dual Chip Tv Digital 16gb ...,,0.0,0.0,0.0,0.0,,0.0,649.99,,,,,,5.0,5.0,5.0,649.99,649.99,649.99,649.99,649.99,649.99
