In [41]:
import s3fs
import pandas as pd
import sqlalchemy
import numpy as np
import math
from datetime import datetime, timedelta
from pyathena import connect
from pyathena.util import as_pandas

In [2]:
def read_athena(sql):
    """
    Queries Athena and gets daily GMV
    """
    cursor = connect(aws_access_key_id='',
                     aws_secret_access_key='',
                     s3_staging_dir='',
                     region_name='').cursor()
    

    cursor.execute(sql)

    df = as_pandas(cursor)

    return df

In [3]:
### Cellphones
cellphones = """
select
distinct(pi."value") as gtin
from
	"olist-datalake-athena".catalogs_api_products_catalogproduct as cp
join 
	"olist-datalake-athena".catalogs_api_products_identifier as pi
	on pi.catalog_product_id = cp.id
where cp.product_type = 'Celular'
and pi.identifier_type = 'gtin'
"""

In [4]:
cellphone_gtins = read_athena(cellphones)

In [5]:
### Cellphone orders
sql = """
select
	so.purchase_timestamp,
	soi.price,
	soi.freight_value,
	cp.product_type,
	cp.product_type_group,
	pi."value" as gtin
from
	"olist-datalake-athena".catalogs_api_products_catalogproduct as cp
join 
	"olist-datalake-athena".catalogs_api_products_identifier as pi
	on pi.catalog_product_id = cp.id
join 
	"olist-datalake-athena".orders_api_seller_orders_sellerorderitem as soi
	on soi.product_gtin = pi."value"
join
	"olist-datalake-athena".orders_api_seller_orders_sellerorder as so
	on so.id = soi.seller_order_id
where cp.product_type = 'Celular'
and pi.identifier_type = 'gtin'
"""

In [6]:
orders = read_athena(sql)

In [7]:
orders['price'] = pd.to_numeric(orders['price'])
orders['freight_value'] = pd.to_numeric(orders['freight_value'])

In [8]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37158 entries, 0 to 37157
Data columns (total 6 columns):
purchase_timestamp    37158 non-null datetime64[ns]
price                 37158 non-null float64
freight_value         37158 non-null float64
product_type          37158 non-null object
product_type_group    37158 non-null object
gtin                  37158 non-null object
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 1.7+ MB


In [9]:
gtins = tuple(cellphone_gtins.gtin.values)

In [10]:
# Competition prices
competition = """
select
    ph.updated_at,
    ph.gtin,
    ph.value as competition_price,
    po.slug
from
    "olist-datalake-athena".products_api_price_analytics_origin as po
join
    "olist-datalake-athena".products_api_price_analytics_pricinghistory as ph
    on ph.origin_id = po.id
where
    gtin in {}
""".format(gtins)

# po.slug like '%b2w%'
# and ph.active = true
# and gtin in {}

In [11]:
competition = read_athena(competition)

In [12]:
competition['competition_price'] = pd.to_numeric(competition['competition_price'])

In [13]:
# history prices and stocks
history = """
SELECT availability_days, brand, branded_store_slug, canonical_sku, catalog_feed_date, catalog_feed_id, category, category_info, channel_slug, commission_plan, created_at, currency, description, external_id, group_id, gtin, id, name, offer, offer_discount, parent_id, part_number, partition_0, price, price_freight_shift, reject_reason, seller_product_sku, sent_error_reason, status, stock, updated_at
FROM "olist-datalake-athena".channels_api_products_channelproducthistory
where gtin in {}
UNION
SELECT availability_days, brand, branded_store_slug, canonical_sku, catalog_feed_date, catalog_feed_id, category, category_info, channel_slug, commission_plan, created_at, currency, description, external_id, group_id, gtin, id, name, offer, offer_discount, parent_id, part_number, partition_0, price, price_freight_shift, reject_reason, seller_product_sku, sent_error_reason, status, stock, updated_at
FROM "olist-datalake-athena".channels_api_products_channelproducthistory_2019
where gtin in {}
UNION
SELECT availability_days, brand, branded_store_slug, canonical_sku, catalog_feed_date, catalog_feed_id, category, category_info, channel_slug, commission_plan, created_at, currency, description, external_id, group_id, gtin, id, name, offer, offer_discount, parent_id, part_number, partition_0, price, price_freight_shift, reject_reason, seller_product_sku, sent_error_reason, status, stock, updated_at
FROM "olist-datalake-athena".channels_api_products_channelproducthistory_2020
where gtin in {}
""".format(gtins,gtins,gtins)

In [14]:
history = read_athena(history)

In [15]:
history.head()

Unnamed: 0,availability_days,brand,branded_store_slug,canonical_sku,catalog_feed_date,catalog_feed_id,category,category_info,channel_slug,commission_plan,...,part_number,partition_0,price,price_freight_shift,reject_reason,seller_product_sku,sent_error_reason,status,stock,updated_at
0,0,Motorola,olist,PRDGEHJ09TRE4FZN,NaT,,b3167ed2-ca85-4ddb-a2b8-c2a81b9e8307,,cnova,default,...,,3,699.99,0.0,,PRDIK0EC4OWQ7CKE,,published,29,2019-03-12 08:02:55
1,0,Multilaser,olist,PRDKXM53CPMHJGY6,NaT,,4a4233cc-c30d-4282-ac56-95a855f17d9e,,mercadolivre,default,...,,3,399.99,0.0,,PRDZKG0N3WYQJW9G,,published,26,2019-03-11 22:22:13
2,0,Multilaser,olist,PRDU93XWKFNJJ1X3,NaT,,6b8a078a-3667-4876-946d-20d4218122a3,,b2w,default,...,,12,186.36,0.0,,PRDRYK0XPYYD11IX,,published,1,2020-01-02 10:16:24
3,0,Lg,girolar,GIRVF9UOPFVFT0YK,NaT,,e1524560-d23b-4dbd-8f16-e89a9134cfae,,mercadolivre,default,...,,12,649.99,0.0,,PRD0M9SJ99LGBED6,,published,240,2020-01-02 11:05:21
4,0,LG,olist,PRDHSA10OG43ZGNR,NaT,,d34b2a7b-5329-4dde-baf4-f2274881ec8b,,zoom,default,...,,12,449.9,0.0,,PRDBWVB9HWS3LD61,,published,77,2020-01-02 13:12:10


In [16]:
history['offer'] = pd.to_numeric(history.offer)
history['stock'] = pd.to_numeric(history.stock)

In [17]:
dfs = []
for gtin in gtins:
    try:
        t_hist = history[history['gtin']==gtin][['created_at','updated_at', 'gtin', 'offer','stock','status']].copy().reset_index(drop=True)

        creation_date = min(t_hist[t_hist['status']=='published']['updated_at']).date()
        end_date = '2020-05-12'

        df = pd.DataFrame(data=pd.date_range(start=creation_date, end=end_date, freq='D'), columns=['date'])
        df['gtin'] = gtin

        ### offer and stocks cleaning
        ### clean outlier prices
        std = t_hist['offer'].std()
        mean = t_hist['offer'].mean()
        t_hist = t_hist[(t_hist.offer <= mean + 3*std) & (t_hist.offer >= mean - 3*std)]
        t_hist = t_hist.groupby(pd.Grouper(key='updated_at',freq='D')).agg({'offer':'mean', 'stock':'min'}).reset_index()
        t_hist.fillna(method='ffill', inplace=True)
        df = pd.merge(left=df, right=t_hist, left_on=['date'], right_on=['updated_at'], how='left')
        df.drop(columns=['updated_at'],inplace=True)

        ### orders cleaning
        t_orders = orders[orders['gtin']==gtin][['purchase_timestamp','price','freight_value']].copy().reset_index(drop=True)
        t_orders['orders'] = 1

        t_orders = t_orders.groupby(pd.Grouper(key='purchase_timestamp',freq='D')).agg({'price':'mean', 
                                                                                        'freight_value':'mean',
                                                                                        'orders':'sum'}).reset_index()


        df = pd.merge(left=df, right=t_orders, left_on=['date'], right_on=['purchase_timestamp'], how='left')
        df.drop(columns=['purchase_timestamp'],inplace=True)
        df['price'].fillna(df['offer'], inplace=True)
        df['orders'].fillna(0, inplace=True)
        try:
            freight_mean = df['freight_value'].mean()
            df['freight_value'].fillna(freight_mean, inplace=True)
        except:
            pass

        ### Competition history
        t_competition = competition[competition['gtin']==gtin].copy()

        ### Clean outlier values
        std = t_competition['competition_price'].std()
        mean = t_competition['competition_price'].mean()
        t_competition = t_competition[(t_competition.competition_price <= mean + 3*std)
                                      & (t_competition.competition_price >= mean - 3*std)]
        t_competition = t_competition.groupby(pd.Grouper(key='updated_at', freq='D'))[['competition_price']].mean().reset_index()

        df = pd.merge(left=df, right=t_competition, left_on=['date'], right_on=['updated_at'], how='left')
        df.drop(columns=['updated_at'],inplace=True)

        df['competition_price'].fillna(method='ffill', inplace=True)
        df['competition_price'].fillna(df['price'], inplace=True)

        if (t_hist.shape[0] > 0): # and (t_orders.shape[0] > 0):
            dfs.append(df)
    except:
        pass

In [18]:
final = pd.concat(dfs)

In [19]:
final.shape

(1142367, 8)

In [20]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1142367 entries, 0 to 735
Data columns (total 8 columns):
date                 1142367 non-null datetime64[ns]
gtin                 1142367 non-null object
offer                803585 non-null float64
stock                803585 non-null float64
price                804760 non-null float64
freight_value        529962 non-null float64
orders               1142367 non-null float64
competition_price    1113624 non-null float64
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 78.4+ MB


In [21]:
final['price'] = final['price'].fillna(final['offer'])

In [22]:
final.dropna(subset=['offer'], inplace=True)

In [23]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 803585 entries, 0 to 449
Data columns (total 8 columns):
date                 803585 non-null datetime64[ns]
gtin                 803585 non-null object
offer                803585 non-null float64
stock                803585 non-null float64
price                803585 non-null float64
freight_value        380520 non-null float64
orders               803585 non-null float64
competition_price    803585 non-null float64
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 55.2+ MB


In [24]:
price_category = {}
price_category['celular'] = np.quantile(final.price.values, np.arange(0.25,1,0.25))

In [25]:
def price_range(prod_category, price):
    prices = price_category[prod_category]
    if price < prices[0]:
        return 'A'
    elif price < prices[1]:
        return 'B'
    elif price < prices[2]:
        return 'C'
    else:
        return 'D'

In [26]:
final['price_category'] = final.apply(lambda row: price_range('celular', row['price']), axis=1)

In [27]:
final.head()

Unnamed: 0,date,gtin,offer,stock,price,freight_value,orders,competition_price,price_category
0,2019-10-23,1572984320591,1259.28,0.0,1259.28,,0.0,1259.28,C
1,2019-10-24,1572984320591,1259.28,5.0,1259.28,,0.0,1259.28,C
2,2019-10-25,1572984320591,1259.28,5.0,1259.28,,0.0,1259.28,C
3,2019-10-26,1572984320591,1259.28,5.0,1259.28,,0.0,1259.28,C
4,2019-10-27,1572984320591,1259.28,5.0,1259.28,,0.0,1259.28,C


In [28]:
# Freights mean

a = round(final[final.price_category == "A"]['freight_value'].mean(),2)
b = round(final[final.price_category == "B"]['freight_value'].mean(),2)
c = round(final[final.price_category == "C"]['freight_value'].mean(),2)
d = round(final[final.price_category == "D"]['freight_value'].mean(),2)

In [29]:
final.reset_index(drop=True, inplace=True)

In [30]:
def fill_freight(category, value):
    if math.isnan(value) == True:
        if category == 'A':
            #x = a
            x = 20
        elif category == 'B':
            #x = b
            x = 24
        elif category == 'C':
            #x = c
            x = 31
        else:
            #x = d
            x = 39
        return x

In [31]:
math.isnan(final.iloc[0].freight_value)

True

In [32]:
final['freight_value'] = final.apply(lambda row: fill_freight(row['price_category'], row['freight_value']), axis=1)

In [33]:
final.head()

Unnamed: 0,date,gtin,offer,stock,price,freight_value,orders,competition_price,price_category
0,2019-10-23,1572984320591,1259.28,0.0,1259.28,31.0,0.0,1259.28,C
1,2019-10-24,1572984320591,1259.28,5.0,1259.28,31.0,0.0,1259.28,C
2,2019-10-25,1572984320591,1259.28,5.0,1259.28,31.0,0.0,1259.28,C
3,2019-10-26,1572984320591,1259.28,5.0,1259.28,31.0,0.0,1259.28,C
4,2019-10-27,1572984320591,1259.28,5.0,1259.28,31.0,0.0,1259.28,C


In [34]:
grouped = final.groupby([pd.Grouper(key='date',freq='D'), 'price_category']).agg({'offer':'mean',
                                                                                  'price':'mean',
                                                                                  'freight_value':'mean',
                                                                                  'competition_price':'mean',
                                                                                  'stock':'sum',
                                                                                  'orders':'sum'}).reset_index()

In [35]:
#grouped.to_csv('../models/cellphonedata.csv', index=False)

In [36]:
grouped.head()

Unnamed: 0,date,price_category,offer,price,freight_value,competition_price,stock,orders
0,2017-06-30,A,165.9,165.9,20.0,165.9,0.0,0.0
1,2017-06-30,B,455.015,455.015,24.0,455.015,0.0,0.0
2,2017-06-30,C,931.184286,931.184286,31.0,931.184286,0.0,0.0
3,2017-06-30,D,3522.147143,3522.147143,39.0,3522.147143,0.0,0.0
4,2017-07-01,A,165.9,165.9,20.0,165.9,30.0,0.0


In [38]:
# get date parameters
grouped['year']      = grouped['date'].apply(lambda x: x.year)
grouped['month']     = grouped['date'].apply(lambda x: x.month)
grouped['dayofweek'] = grouped['date'].apply(lambda x: x.dayofweek)
grouped['day']       = grouped['date'].apply(lambda x: x.day)

### holidays

In [42]:
def get_holidays():
    """
    Get Black Friday dates
    """
    black_friday = pd.DataFrame({
                  'holiday': 'black_friday',
                  'ds': pd.to_datetime(['2017-11-24', '2018-11-23', '2019-11-29',
                                        '2020-11-27', '2021-11-26', '2022-11-25',
                                        '2023-11-24', '2024-11-29', '2025-11-28']),
                  'lower_window': -3,
                  'upper_window': 3,
                })

    xmas = pd.DataFrame({
                  'holiday': 'christmas',
                  'ds': pd.to_datetime(['2017-12-25', '2018-12-25', '2019-12-25',
                                        '2020-12-25', '2021-12-25', '2022-12-25',
                                        '2023-12-25', '2024-12-25', '2025-12-25']),
                  'lower_window': -1,
                  'upper_window': 0,
                })

    new_year = pd.DataFrame({
                  'holiday': 'new_year',
                  'ds': pd.to_datetime(['2017-01-01', '2018-01-01', '2019-01-01',
                                        '2020-01-01', '2021-01-01', '2022-01-01',
                                        '2023-01-01', '2024-01-01', '2025-01-01']),
                  'lower_window': -1,
                  'upper_window': 0,
                })

    carnival = pd.DataFrame({
                  'holiday': 'carnival',
                  'ds': pd.to_datetime(['2017-02-28', '2018-02-13', '2019-03-05',
                                        '2020-02-25', '2021-02-16', '2022-03-01',
                                        '2023-02-21', '2024-02-13', '2025-03-04']),
                  'lower_window': -1,
                  'upper_window': 0,
                })

    good_friday = pd.DataFrame({
                  'holiday': 'good_friday',
                  'ds': pd.to_datetime(['2018-03-18','2019-04-19','2020-04-10',
                                        '2021-04-02','2022-04-15','2023-04-07',
                                        '2024-03-29']),
                  'lower_window': 0,
                  'upper_window': 2,
                })

    valentines = pd.DataFrame({
                  'holiday': 'valentines',
                  'ds': pd.to_datetime(['2016-06-12','2017-06-12','2018-06-12',
                                        '2019-06-12','2020-06-12','2021-06-12']),
                  'lower_window': -7,
                  'upper_window': 0,
                })
    
    mothers_day = pd.DataFrame({
                  'holiday': 'mothers_day',
                  'ds': pd.to_datetime(['2018-05-13','2019-05-12']),
                  'lower_window': -7,
                  'upper_window': 0,
                })
    # long holidays

    monday = pd.DataFrame({
                  'holiday': 'monday',
                  'ds': pd.to_datetime(['2020-09-07','2020-10-12','2020-11-02',
                                        '2021-11-15']),
                  'lower_window': -2,
                  'upper_window': 0,
                })

    friday = pd.DataFrame({
                  'holiday': 'friday',
                  'ds': pd.to_datetime(['2018-09-07','2018-10-12','2018-11-02',
                                        '2019-11-15','2020-05-01']),
                  'lower_window': 0,
                  'upper_window': 2,
                })

    # Different cases
    # 2019-12-30 (christmas and New year's eve from tuesday to wednesday. Monday was also affected)
    others = pd.DataFrame({
                  'holiday': 'others',
                  'ds': pd.to_datetime(['2019-12-23','2019-12-30']),
                  'lower_window': 0,
                  'upper_window': 0,
                })

    holidays = pd.concat((black_friday, xmas, new_year, carnival, valentines, mothers_day,
                         monday, friday, others))

    return holidays

# get holidays dataframe
holidays = get_holidays()

# transform holidays
hday=[]
hdate=[]
for row in holidays.itertuples():
    lower = row.lower_window
    upper = row.upper_window
    holiday  = row.holiday
    day = row.ds
    
    for i in range(lower, upper+1):
        hday.append(holiday)
        hdate.append(day + timedelta(days=i))
        
hdays = pd.DataFrame(zip(hday,hdate), columns=['holiday','date'])

In [43]:
grouped = pd.merge(grouped, hdays, left_on='date', right_on='date', how='left')

# one-hot encode holidays
one_hot = pd.get_dummies(data=grouped['holiday'], dummy_na=True)
# drop nan col (will serve to avoid multicollinearity)
one_hot.drop(columns=[np.nan], inplace=True)

# drop holiday columns
grouped.drop(columns=['holiday'], inplace=True)

# Join df with one-hot
grouped = grouped.join(one_hot)

In [45]:
grouped.rename(columns={'date':'ds', 'orders':'y', 'price':'olist_price'}, inplace=True)

In [46]:
grouped.columns

Index(['ds', 'price_category', 'offer', 'olist_price', 'freight_value',
       'competition_price', 'stock', 'y', 'year', 'month', 'dayofweek', 'day',
       'black_friday', 'carnival', 'christmas', 'friday', 'mothers_day',
       'new_year', 'others', 'valentines'],
      dtype='object')

In [47]:
save = grouped[['ds', 'year', 'month', 'dayofweek', 'day',
                'price_category', 'offer', 'olist_price', 'freight_value', 'competition_price', 'stock',
                'black_friday', 'carnival', 'christmas', 'friday', 'mothers_day', 'new_year', 'others', 'valentines',
                'y']].copy()

In [49]:
save.to_csv('../models/cellphones/cellphonedata.csv', index=False)

In [50]:
holidays.to_csv('../models/cellphones/holidays.csv', index=False)