## 4 - ETL of the Celular dataset for later use

In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# SettingWithCopy warnings raised by the cells below - consider narrowing it
# to specific categories once the notebook runs cleanly.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

### Loading the full celular dataset

In [2]:
# Read the raw extract with gtin forced to string, then convert the 'date'
# column to datetime in the same expression.
df_all = pd.read_csv('../data/celular_data.csv', dtype={'gtin':'str'}).assign(
    date=lambda raw: pd.to_datetime(raw['date'])
)

### Basic ETL: keep only in-stock rows, fill missing freight values with the per-gtin median, and fill missing competition prices with olist_price

In [3]:
# Keep only the rows where the product was in stock (stock_avg > 0).
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of df_all.
df = df_all[df_all['stock_avg'] > 0].copy()

# Median freight_value per gtin. A gtin with no freight observation at all has
# no median, so it falls back to 0.
gtin_freight = df.groupby('gtin')['freight_value'].median().fillna(0)

# Fill the missing freight values in place with the median of the same gtin.
# This replaces the former split / merge / DataFrame.append sequence:
# DataFrame.append was removed in pandas 2.0, and the round trip re-ordered
# the rows and mixed original index labels with reset ones.
# Rows that still have no value afterwards (gtin itself missing) also fall
# back to 0 instead of being dropped by the merge.
df['freight_value'] = (
    df['freight_value']
    .fillna(df['gtin'].map(gtin_freight))
    .fillna(0)
)

# Where there is no competition price, use olist_price instead.
df['competition_price'] = df['competition_price'].fillna(df['olist_price'])

### Selecting the columns to keep in the dataset

In [4]:
# Columns kept for the modelling dataset.
sel_col = ['date', 'gtin', 'item_name', 'competition_price', 'olist_price', 'freight_value', 'orders']

# intersection() silently skips any name in sel_col that df does not have.
# .copy() detaches the subset from the wider frame, so the feature columns
# added afterwards are written to an independent frame rather than to a
# derived selection (which raises SettingWithCopyWarning).
df = df[df.columns.intersection(sel_col)].copy()

### Calculating price_ratio, freight_ratio and dummies

In [5]:
# Price relative to the competition, and freight relative to the item price.
df['price_ratio'] = df['olist_price'] / df['competition_price']
df['freight_ratio'] = df['freight_value'] / df['olist_price']

# Calendar features, read once through the .dt accessor instead of rebuilding
# a DatetimeIndex for every column.
order_date = df['date']
df['month'] = order_date.dt.month
df['monthday'] = order_date.dt.day
df['weekday'] = order_date.dt.weekday
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so cast back to a plain integer dtype (assumes 'date' has no missing values).
df['week'] = order_date.dt.isocalendar().week.astype('int64')

In [6]:
# Calendar columns that are treated as categorical variables.
vars_cat = ['weekday', 'week', 'month', 'monthday']

# One-hot encode them; drop_first=True leaves out the first level of each
# variable.
df_dummies = pd.get_dummies(
    data=df,
    columns=vars_cat,
    drop_first=True,
)

### Calculating the total orders per gtin, to later select only those with total orders > 50

In [7]:
# Per-gtin aggregates, each kept as a one-column frame indexed by gtin.
per_gtin = df.groupby('gtin')
mean_price = per_gtin['olist_price'].mean().to_frame()
total_orders = per_gtin['orders'].sum().to_frame()

# Put both aggregates side by side and rank the products by total orders,
# best sellers first.
df_summary = mean_price.join(total_orders).sort_values('orders', ascending=False)

In [8]:
# Products whose summed orders exceed 50.
df_50 = df_summary.loc[df_summary['orders'].gt(50)]
print(df_50.shape[0])

# Keep only the rows of the one-hot encoded frame whose gtin appears in the
# index of df_50.
selected_gtins = df_50.index
df_clean = df_dummies.loc[df_dummies['gtin'].isin(selected_gtins)]

55


Of the 537 cellphone gtins, only 55 had total orders > 50. We keep only those gtins for modelling, since there does not seem to be enough data to model all of them.

### Saving the dataset post-ETL to use on predictions

In [10]:
# Write the filtered, encoded frame to disk without the index column.
df_clean.to_csv(path_or_buf='../data/celular_over50.csv', index=False)