In [1]:
from pyathena import connect
from pyathena.util import as_pandas
import s3fs

In [2]:
import pandas as pd

In [3]:
# Product catalogue used for every query below. The `value` column holds the
# product identifier (a GTIN in the rows previewed in the next cell) and is
# read as a string so the digits are kept exactly as stored.
# Previous input, kept for reference: 'data/products.csv' with a string 'gtin' column.
products = pd.read_csv('../data/electronics_products.csv', dtype={'value':'str'})

In [4]:
products.head()

Unnamed: 0,odin,product_type,product_type_group,identifier_type,value
0,O18AD42D9617AE40FBB5E72A174F98DE5B,Celular,eletronicos,gtin,7899838829838
1,O1E75679BD569C48E384C38838A701E301,Celular,eletronicos,gtin,6941059621027
2,O1DDFC947FF78A45F793AF84151F9F4785,Celular,eletronicos,gtin,848958032991
3,O1A5F8A4C0DADE4C179ABD49FE3117D7B3,Celular,eletronicos,gtin,6941377760507
4,O12829428330F5485CA7D16A6503B7FA28,Placa circuito,eletronicos,gtin,7896637630306


In [5]:
# Every identifier value in the catalogue, as a tuple.
# NOTE: the batch loops further down rebind the global name `gtins` once per
# product type, so this full tuple is only valid until one of them runs.
gtins = tuple(products.value)

In [6]:
gtins[0:2]

('7899838829838', '6941059621027')

In [7]:
def get_data(products):
    """
    Fetch channel product history rows from Athena for a set of GTINs.

    The same fixed column list is selected from the three
    ``channels_api_products_channelproducthistory`` tables (the base table and
    its ``_2019`` and ``_2020`` variants) in ``olist-datalake-athena``. Rows
    are kept when their ``gtin`` is one of ``products`` and the three results
    are combined with ``UNION``.

    Parameters
    ----------
    products : iterable of str, or str
        GTIN values to look up. A single string is treated as one GTIN.
        Values are converted with ``str()`` before being bound to the query.

    Returns
    -------
    pandas.DataFrame
        The query result as produced by ``pyathena.util.as_pandas``.

    Raises
    ------
    ValueError
        If ``products`` is empty (an empty ``IN ()`` list is not valid SQL,
        so the query is not sent at all).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(products, str):
        products = [products]
    gtin_values = [str(value) for value in products]
    if not gtin_values:
        raise ValueError('get_data() needs at least one gtin to query')

    columns = (
        'availability_days, brand, branded_store_slug, canonical_sku, '
        'catalog_feed_date, catalog_feed_id, category, category_info, '
        'channel_slug, commission_plan, created_at, currency, description, '
        'external_id, group_id, gtin, id, name, offer, offer_discount, '
        'parent_id, part_number, partition_0, price, price_freight_shift, '
        'reject_reason, seller_product_sku, sent_error_reason, status, stock, '
        'updated_at'
    )
    tables = (
        'channels_api_products_channelproducthistory',
        'channels_api_products_channelproducthistory_2019',
        'channels_api_products_channelproducthistory_2020',
    )
    # %(gtins)s is a pyformat placeholder: the driver renders the list as a
    # quoted, escaped SQL sequence. Formatting a Python tuple into the text
    # instead breaks for a single element, whose repr is ('x',).
    select_template = (
        'SELECT {columns}\n'
        'FROM "olist-datalake-athena".{table}\n'
        'where gtin in %(gtins)s'
    )
    query = '\nUNION\n'.join(
        select_template.format(columns=columns, table=table) for table in tables
    )

    # Connection settings are blank placeholders in this notebook.
    connection = connect(aws_access_key_id='',
                         aws_secret_access_key='',
                         s3_staging_dir='',
                         region_name='')
    try:
        cursor = connection.cursor()
        cursor.execute(query, {'gtins': gtin_values})
        return as_pandas(cursor)
    finally:
        # Release the connection even when the query or the download fails.
        connection.close()

In [8]:
def get_orders(gtin):
    """
    Fetch all order rows from Athena whose item GTIN is in ``gtin``.

    Runs ``SELECT *`` against ``"olist-dw-athena".orders`` filtered by
    ``seller_item_gtin IN (...)``.

    Parameters
    ----------
    gtin : iterable of str, or str
        GTIN values to look up. A single string is treated as one GTIN.
        Values are converted with ``str()`` before being bound to the query.

    Returns
    -------
    pandas.DataFrame
        The query result as produced by ``pyathena.util.as_pandas``.

    Raises
    ------
    ValueError
        If ``gtin`` is empty (an empty ``IN ()`` list is not valid SQL, so
        the query is not sent at all).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(gtin, str):
        gtin = [gtin]
    gtin_values = [str(value) for value in gtin]
    if not gtin_values:
        raise ValueError('get_orders() needs at least one gtin to query')

    # %(gtins)s is a pyformat placeholder: the driver renders the list as a
    # quoted, escaped SQL sequence. Formatting a Python tuple into the text
    # instead breaks for a single element, whose repr is ('x',).
    query = """SELECT
*
FROM "olist-dw-athena".orders
WHERE
    seller_item_gtin IN %(gtins)s;
"""

    # Connection settings are blank placeholders in this notebook.
    connection = connect(aws_access_key_id='',
                         aws_secret_access_key='',
                         s3_staging_dir='',
                         region_name='')
    try:
        cursor = connection.cursor()
        cursor.execute(query, {'gtins': gtin_values})
        return as_pandas(cursor)
    finally:
        # Release the connection even when the query or the download fails.
        connection.close()
    

In [40]:
def batch_execution(product_type, gtins):
    """
    Export the product history and the orders for one product type to S3.

    Steps, in order:
      1. query the product history with ``get_data(gtins)`` and write it as
         CSV under ``s3://bsa-correlation-one/electronics/``;
      2. query the orders with ``get_orders(gtins)``, remove the
         customer/payer/shipment columns listed below, and write the
         remainder as CSV next to the history file.

    Progress is reported with ``print``. Nothing is returned.

    Parameters
    ----------
    product_type : str
        Label interpolated into log messages and into both S3 file names.
    gtins : tuple of str
        Identifiers forwarded unchanged to ``get_data`` and ``get_orders``.
    """
    s3 = s3fs.S3FileSystem(anon=False)

    # ---- product history -------------------------------------------------
    print('EXECUTING HISTORY: {}'.format(product_type))
    history = get_data(gtins)
    print('SAVING HISTORY')
    history_path = 's3://bsa-correlation-one/electronics/electronics_{}_history.csv'.format(product_type)
    with s3.open(history_path, 'w') as history_file:
        history.to_csv(history_file)

    # ---- orders ------------------------------------------------------------
    print('EXECUTING ORDERS: {}'.format(product_type))
    raw_orders = get_orders(gtins)

    # Columns removed before the export (customer, payer and shipment fields).
    to_drop = [
        'customer_id',
        'payer_id',
        'shipment_id',
        'shipping_id',
        'olist_shipment_id',
        'olist_shipment_tracking_protocol',
        'olist_shipment_tracking_url',
        'olist_shipping_id',
        'olist_shipping_shipping_estimate_id',
        'olist_shipping_shipping_method_id',
        'olist_shipping_shipping_method_name',
        'olist_payer_id',
        'olist_payer_name',
        'olist_payer_document_number',
        'olist_payer_email',
        'olist_payer_birth_date',
        'olist_payer_address_id',
        'olist_customer_id',
        'olist_customer_name',
        'olist_customer_document_number',
        'olist_customer_email',
        'olist_customer_address_id',
        'olist_order_raw_order',
    ]
    exported_orders = raw_orders.drop(columns=to_drop)

    print('SAVING {} ORDERS'.format(len(exported_orders)))
    orders_path = 's3://bsa-correlation-one/electronics/electronics_{}_orders.csv'.format(product_type)
    with s3.open(orders_path, 'w') as orders_file:
        exported_orders.to_csv(orders_file)

    print('FINISHED {}'.format(product_type))
    print('')
    

In [44]:
types = products.product_type.unique().tolist()

# Index into `types` at which this run resumes; everything before it is skipped.
RESUME_FROM = 120

# Enumerate with the absolute index so the progress line reads
# "<position in types>/<len(types)>" instead of restarting at 0 against an
# unrelated total.
for i, product_type in enumerate(types[RESUME_FROM:], start=RESUME_FROM):
    current = products[products.product_type == product_type]
    gtins = tuple(current.value)
    print('Executing... {}/{}'.format(i, len(types)))
    print('Total gtins: ', len(gtins))
    try:
        batch_execution(product_type, gtins)
    except Exception as e:
        # Best effort: report the failure and carry on with the next type.
        print('Failed {}'.format(product_type))
        print(e)
        print()

In [None]:
# cellphones = products[products.product_type == 'Celular']

In [None]:
# cells = tuple(cellphones.value)

In [None]:
# len(cells)

In [None]:
prods = tuple(products.value)

In [None]:
# `cells` is only assigned in a commented-out cell; `prods` is the tuple that
# is actually defined above and passed to get_data below.
len(prods)

In [None]:
# Two scopes were tried for the history query; only the second one is active.

## CELLPHONES (TYPE) ~ 1879 gtins
# df = get_data(cells)

## ELECTRONICS (GROUP) ~ 29k gtins, every identifier sent in a single query
df = get_data(prods)


In [None]:
# Run the export for every product type except the first entry of `types`.
for i, product_type in enumerate(types[1:]):
    current = products[products.product_type == product_type]
    gtins = tuple(current.value)
    # Three placeholders for three arguments, so the product type is printed.
    print('Executing... {}/{}, type {}'.format(i, len(types[1:]), product_type))
    print('Total gtins: ', len(gtins))
    try:
        batch_execution(product_type, gtins)
    except Exception as e:
        # Catch Exception rather than everything, so a kernel interrupt
        # (KeyboardInterrupt) can still stop a long batch; report the cause.
        print('Failed {}'.format(product_type))
        print(e)
        print()
    

In [None]:
df.shape

In [None]:
df.to_csv('../data/electro_history.csv', index=False)

### Orders information

In [None]:
df.columns

In [None]:
# Build the cellphone GTIN tuple here so this cell works on a fresh kernel:
# the cells that used to define `cells` are commented out.
cells = tuple(products.loc[products.product_type == 'Celular', 'value'])
orders_cellphone = get_orders(cells)

In [None]:
orders_cellphone.head()

In [None]:
orders_cellphone.shape

In [None]:
# Columns removed from the orders export: customer, payer and shipment fields.
to_drop = [
    # generic ids
    'customer_id',
    'payer_id',
    'shipment_id',
    'shipping_id',
    # shipment / shipping
    'olist_shipment_id',
    'olist_shipment_tracking_protocol',
    'olist_shipment_tracking_url',
    'olist_shipping_id',
    'olist_shipping_shipping_estimate_id',
    'olist_shipping_shipping_method_id',
    'olist_shipping_shipping_method_name',
    # payer
    'olist_payer_id',
    'olist_payer_name',
    'olist_payer_document_number',
    'olist_payer_email',
    'olist_payer_birth_date',
    'olist_payer_address_id',
    # customer
    'olist_customer_id',
    'olist_customer_name',
    'olist_customer_document_number',
    'olist_customer_email',
    'olist_customer_address_id',
    # raw payload
    'olist_order_raw_order',
]

In [None]:
# Remove the columns listed in `to_drop` from the cellphone orders.
# Despite the variable name, nothing is hashed here: the columns are dropped outright.
hashed = orders_cellphone.drop(columns=to_drop)

In [None]:
hashed.shape

In [None]:
hashed.to_csv('../data/cellphone_orders.csv', index=False)