# ETL - Store database

## Goal:
1. Crie o processo de ETL a partir das tabelas fornecidas; modele os dados e explique a metodologia utilizada.


2. Crie dois dashboards usando o Tableau Public a partir do seu modelo de dados, apresentando:

- Análise por lojas, unidades de negócios, canais, produtos, tipos de cliente e período, considerando as medidas: meta, venda, margem; e possíveis métricas como ticket médio por cliente, qtd de items por cupom, etc.

- Análises relevantes que apontem oportunidades e permitam a tomada de decisões estratégicas para o negocio (explique as análises e as conclusões em um storytelling).

## 1. Extraction

Load the local files ina dataframe to be manipulated.

In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# define style for charts
plt.style.use('ggplot')

# expand number of columns to better viz
pd.set_option('display.max_columns', 50)
  
# use glob to get all the csv files 
# in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join(path + '/files/', "*.csv"))

for f in csv_files:
    # get the file name from file and remove 'bd_' and '.csv' to use the name in the dataframe
    file_name = f.split("\\")[-1].replace('bd_', '').replace('.csv', '')

    # create dinamicaly a dataframe with the name of the file
    # use ; as separator
    # change the encoding type to accept files with portuguese (brazilian) words
    globals()[f"df_{file_name}"] = pd.read_csv(f, sep=';', encoding='iso-8859-1')

## 2. Transforming
Performe some adjustments on imported data.

### 2.1 Normalizing the column names

In [2]:
df_channel.columns = ['id_channel', 'channel']

df_address_customers.columns = ['id_address_sale', 'customer_state']

df_customer_type.columns = ['id_customer_type', 'customer_type'] 

df_stores.columns = ['id_store', 'store_code', 'start_date', 'branch', 'district', 'city', 'state']

df_business_goal.columns = ['date', 'id_store', 'id_business_unit', 'id_channel', 'sales_goal']

df_products.columns = ['id_product', 'supplier', 'product_name', 'category', 'sub-category']

df_business_unit.columns = ['id_business_unit', 'business_unit']

df_sales.columns = ['date', 'id_store', 'id_business_unit', 'id_channel', 
                        'id_product', 'id_coupon', 'id_customer', 'id_address_sale', 
                        'id_customer_type', 'items', 'gross_revenue', 'tax_value', 'costs']

### 2.2 Change datatypes


#### 2.2.1 Change the columns related to date, originally as string

In [3]:
def change_to_date(df, cols):
    for c in cols:
        df[c] = pd.to_datetime(df[c], format="%Y/%m/%d")

change_to_date(df_sales, ['date'])

change_to_date(df_business_goal, ['date' ])


#### 2.2.2 Change the columns related to numbers to float, originally as string.

In [4]:
def change_to_float(df, cols):
    for c in cols:
        df[c] = df[c].str.replace(',', '.').astype(float)

change_to_float(df_sales, ['items', 'gross_revenue', 'tax_value', 'costs'])

change_to_float(df_business_goal, ['sales_goal' ])

#### 2.2.3 Removing null or columns with 0 items of Sales dataframe

In [5]:
print("Before: {}".format(df_sales[df_sales['items'] == 0]['items'].count()))

df_sales = df_sales[df_sales['items'] > 0]

print("After: {}".format(df_sales[df_sales['items'] == 0]['items'].count()))

Before: 11397
After: 0


### 2.3 Creating new columns to be analized/ plotted.

#### 2.3.1 Creating the tax rate column to understand the percentagem of taxes of each sale.


In [6]:
df_sales['tax_rate'] = round(df_sales['tax_value'] / df_sales['gross_revenue'], 4)

df_sales[['gross_revenue', 'tax_value', 'tax_rate']].sample(5)

Unnamed: 0,gross_revenue,tax_value,tax_rate
138281,2.394,0.0,0.0
592548,9.444,2.574,0.2726
1674051,13.794,2.484,0.1801
1032961,1.68,0.456,0.2714
1722246,7.734,2.106,0.2723


#### 2.3.2 Create a net revenue column

In [7]:
df_sales['net_revenue'] = df_sales['gross_revenue'] - (df_sales['tax_value'] + df_sales['costs'])

df_sales[['gross_revenue', 'net_revenue', 'tax_value' , 'costs']].sample(5)

Unnamed: 0,gross_revenue,net_revenue,tax_value,costs
1820072,2.988,1.494,0.276,1.218
2608069,31.188,10.092,8.502,12.594
1081806,5.994,3.168,1.08,1.746
1054518,10.494,3.99,0.972,5.532
1504108,1.194,0.258,0.114,0.822


### 2.3.3 New column with cumulative sales for each customer

In [8]:
df_sales['cumulative_sales'] = df_sales.assign(temp=~df_sales.duplicated(subset=['id_customer','date'])).groupby('id_customer')['temp'].cumsum()

### 2.4 Create new dataframe 'Customers' to be used in analysis and data visualization. 

#### 2.4.1 Create the new dataframe using information of other tables

In [9]:
# get the customer information in the sales dataframe
df_customers = df_sales.groupby(['id_customer', 'id_customer_type'])\
                                        .agg({'id_customer': 'count', 'items': 'sum','date': ['min', 'max'], 'gross_revenue': 'sum'})\
                                        .reset_index()

# rename columns
df_customers.columns = ['id_customer', 'id_customer_type', 'purchase_count', 'items_purchased', 'first_purchase', 'last_purchase', 'total_spent']

# get the customer type information
df_customers = df_customers.merge(df_customer_type, on='id_customer_type', how='left')

# create a column to identify the customer type
df_customers['customer_type_code'] = df_customers.apply(lambda x: 1 if x['customer_type'] == 'Identificado' else 0, axis=1)

df_customers.sample(5)

Unnamed: 0,id_customer,id_customer_type,purchase_count,items_purchased,first_purchase,last_purchase,total_spent,customer_type,customer_type_code
182306,"O\%$*%O61U&H,5M&6Y$_H(","N3ZH'W$AE#+&45Z8N8""S*#",2,1.2,2021-03-10,2021-03-10,52.26,Não Identificado,0
150449,M0'&50E0^+MU;F_P>QV!?),"N3ZH'W$AE#+&45Z8N8""S*#",3,4.2,2022-10-18,2022-10-18,110.658,Não Identificado,0
84630,"GW.?6/8^""[7:QB6V^=G(&-",F+9C/:YY=_[^&$L90;9D_%,3,1.8,2022-05-05,2022-05-05,118.128,Identificado,1
37962,"D+],4*PW9+7L:[ZAP(:GD+",F+9C/:YY=_[^&$L90;9D_%,1,0.6,2022-04-16,2022-04-16,8.88,Identificado,1
1100,"A&FQC/U9R\Y^""5D%=L5T90",F+9C/:YY=_[^&$L90;9D_%,1,0.6,2022-12-26,2022-12-26,62.994,Identificado,1


#### 2.4.2 Agreggate information by customer.

In [10]:
df_customers = df_customers.groupby('id_customer')\
                                        .agg({'purchase_count': 'sum', 
                                              'items_purchased': 'sum',
                                              'first_purchase': 'min', 
                                              'last_purchase': 'max', 
                                              'total_spent': 'sum', 'customer_type_code': 'max'})\
                                        .reset_index()
df_customers.sample(5)

Unnamed: 0,id_customer,purchase_count,items_purchased,first_purchase,last_purchase,total_spent,customer_type_code
84918,H7YK4L402TQ!)[EA<AJ9L),9,5.4,2021-08-14,2022-07-03,295.47,0
57069,"E]ESOOG:9F?;;B/BVO4/7,",3,9.0,2022-06-10,2022-06-10,30.474,0
156418,NG8:)[^AFQ&/%8A^';6T6),3,3.0,2021-12-09,2021-12-09,86.37,1
22077,B[YR6JA(*?\-NKE'PDCN)-,38,40.2,2021-01-14,2022-06-30,1087.14,0
51701,"EA""8JABR2SZX*M21""7Y#2-",4,2.4,2021-01-21,2022-12-14,26.928,0


#### 2.4.3 Get the Customer state from address customers table

In [11]:
temp_state_customer = df_sales.merge(df_address_customers, on='id_address_sale', how='left')

temp_state_customer[['id_customer', 'customer_state']]

df_customers = df_customers.merge(temp_state_customer[['id_customer', 'customer_state']], on='id_customer', how='left')

df_customers.sample(5)

Unnamed: 0,id_customer,purchase_count,items_purchased,first_purchase,last_purchase,total_spent,customer_type_code,customer_state
2741897,M]0DCLXK>DKR%MJ'WEXFS.,111,76.8,2021-01-02,2022-12-30,2544.564,0,SP
3240400,"M]X;GRX3YFUIIRU_,;:AB+",678206,527065.8,2021-01-02,2022-12-31,12646910.0,0,
3185460,"M]X;GRX3YFUIIRU_,;:AB+",678206,527065.8,2021-01-02,2022-12-31,12646910.0,0,
1761074,"I>%F`N&GN%/?S,-(D[9[U*",41,30.6,2021-01-06,2022-09-17,610.272,0,SP
3677281,"O03_JW@QRLH'0D,V=TM`--",135,215.4,2021-01-28,2022-11-22,1667.712,0,SP


### 2.5 Aggregate data to Sales dataframe

New: df_sales
- Sales information from "df_sales"
- Business unit information from "df_business_unit"
- Channel information from "df_channel"

In [12]:
df_sales = df_sales.merge(df_business_unit, on='id_business_unit', how='left')

df_sales = df_sales.merge(df_channel, on='id_channel', how='left')

df_sales = df_sales.merge(df_customer_type, on='id_customer_type', how='left')

In [13]:
df_sales = df_sales[['date', 
                     'id_store', 'business_unit', 
                     'channel', 'id_product',
                     'id_customer', 'customer_type',
                     'id_coupon', 'id_address_sale', 
                     'items', 'gross_revenue', 'tax_value', 'costs', 'tax_rate','net_revenue', 
                     'cumulative_sales']]

df_sales

Unnamed: 0,date,id_store,business_unit,channel,id_product,id_customer,customer_type,id_coupon,id_address_sale,items,gross_revenue,tax_value,costs,tax_rate,net_revenue,cumulative_sales
0,2022-09-21,F)T`P;^+F]5F7YX^S\=+?&,Produtos,Loja,"AR^$EA+5@,Q][""V`\\VQC,",N$P5WZFC9VKQM(XS1DBJZ*,Não Identificado,"N_N,M-K1I34E(DW*-FHTX.","H=ZO(L""MR+7D](@#\""/NG)",0.6,16.200,4.410,5.736,0.2722,6.054,1
1,2022-08-06,F!25!6;D=./F%2(E)D;]P0,Produtos,Loja,FC^22=\(:F=0J=F6TNPD.&,"P0K'8UWIADS?T""+9:-W@6*",Não Identificado,"OEUPW7V[BY>]:>T;Y3""KM(",F'^..@O;\5E;#O4(^_'$0+,0.6,11.994,2.154,3.084,0.1796,6.756,1
2,2022-08-08,"FO1G""YC0G6I&C(,H&(MT3-",Produtos,Loja,C2O9ATWBXT.B)L-4@Y-FI$,E\^N9TRHKU5ABQ1=?;J./',Não Identificado,"B7,9,^VTQPPN)M\$G""/I,,","E0(*AW^9CG6ACQ2*,&@LL-",0.6,11.172,3.048,3.192,0.2728,4.932,1
3,2022-07-28,"FO)5JW59TP?&C:?,ZG$$L*",Produtos,Loja,"CIJ6#@,X$@9;"",SB)891P$","K*""!]6VAE;=C*BS-]/@_*%",Não Identificado,JT=[J-@0OCG!X.YRK&Q&3!,"H:E2P/CNLQ@T$!(,0BJ,G+",0.6,2.394,0.222,0.468,0.0927,1.704,1
4,2022-10-26,F!25!6;D=./F%2(E)D;]P0,Produtos,Loja,"NLPPIQIA=`-_>0I1)P[$5""",EO2UJ\6RWGUP-OSON/+Y)%,Não Identificado,"E+VB+$M>3GM`:$V#W?V,P*","A$ER-0#JA]ENP.WRSZ)#""+",1.8,4.482,1.224,1.296,0.2731,1.962,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4048285,2022-11-26,"FO1G""YC0G6I&C(,H&(MT3-",Produtos,Loja,"C+]_@QHN^Z""=13$34#KGF#",GC9:='0DCELH82JYG)CE(!,Não Identificado,"DUUZ-0-*N=1SDL-JN#[>R,","OBDILQJOCNA?0*%*,8Q:F0",0.6,64.794,5.994,34.380,0.0925,24.420,193
4048286,2022-08-25,"FO1G""YC0G6I&C(,H&(MT3-",Produtos,Loja,"CK&\:`,K_NP=E*Y<]:\GM%",AZ^6'B)X#8HHSQ^9M5PJP*,Não Identificado,"KK\WXP)(JCG_H'8N$\*#B""","D&K,JBR?""-U2O&$.SR<;@$",0.6,53.964,4.986,30.816,0.0924,18.162,24
4048287,2022-11-02,F!25!6;D=./F%2(E)D;]P0,Produtos,Loja,M90DRT:D\`0DC!C8'J:-M#,"DS"";/NU6Y#[GLME[D<I/7&",Não Identificado,"CR:""Y2$*\ZQ:ND49WDPA$+","P`4O-RQI48+`O%\OK?YZ(,",0.6,36.114,6.498,18.498,0.1799,11.118,2
4048288,2022-10-28,"FO1G""YC0G6I&C(,H&(MT3-",Produtos,Loja,C3VP.=?FF8`!%X^4TLZG-0,"C!LPVVC?4RXH?GP_?OL5Q""",Não Identificado,"MP<Q*-GE8""R^DH?JF$G;W""","I>--[,+0/3X)N`'TLHBG+-",0.6,6.714,1.830,2.406,0.2726,2.478,36


### 2.6 Agregate to Business Goals dataframe more information.

#### 2.6.1 Get the sales of each day, store and branch

In [14]:
sales_period_store = df_sales.groupby(['date', 'id_store'])\
                .agg({
                    'items': 'sum',
                    'gross_revenue': 'sum'}).reset_index()

sales_period_store

Unnamed: 0,date,id_store,items,gross_revenue
0,2021-01-02,F!25!6;D=./F%2(E)D;]P0,1119.6,25898.100
1,2021-01-02,"F%#+YX,X!FRF<FHD):`=9+",907.8,23662.638
2,2021-01-02,F)T`P;^+F]5F7YX^S\=+?&,1154.4,30992.874
3,2021-01-02,"FO)5JW59TP?&C:?,ZG$$L*",916.2,19486.866
4,2021-01-03,F!25!6;D=./F%2(E)D;]P0,878.4,23619.408
...,...,...,...,...
3569,2022-12-31,F!25!6;D=./F%2(E)D;]P0,706.8,19767.834
3570,2022-12-31,"F%#+YX,X!FRF<FHD):`=9+",699.0,18471.312
3571,2022-12-31,F)T`P;^+F]5F7YX^S\=+?&,1224.6,35505.378
3572,2022-12-31,"FO)5JW59TP?&C:?,ZG$$L*",844.2,18085.818


### 2.6.2 Get the goal for each day, store

In [15]:
goal_period_store = df_business_goal.groupby(['date', 'id_store'])\
                .agg({
                    'sales_goal': 'sum' }).reset_index()

goal_period_store

Unnamed: 0,date,id_store,sales_goal
0,2021-01-01,F!25!6;D=./F%2(E)D;]P0,0.000
1,2021-01-01,"F%#+YX,X!FRF<FHD):`=9+",0.000
2,2021-01-01,F)T`P;^+F]5F7YX^S\=+?&,0.000
3,2021-01-01,"FO)5JW59TP?&C:?,ZG$$L*",0.000
4,2021-01-02,F!25!6;D=./F%2(E)D;]P0,22445.046
...,...,...,...
3614,2022-12-31,F!25!6;D=./F%2(E)D;]P0,18323.364
3615,2022-12-31,"F%#+YX,X!FRF<FHD):`=9+",16336.596
3616,2022-12-31,F)T`P;^+F]5F7YX^S\=+?&,33015.048
3617,2022-12-31,"FO)5JW59TP?&C:?,ZG$$L*",19925.220


#### 2.6.3 Merge the Goal per day and store with Sales per day and store.

In [16]:
df_goal_store_period = goal_period_store.merge(sales_period_store, on=['id_store', 'date'], how='left')

# drop the days without goal
df_goal_store_period = df_goal_store_period.dropna(subset=['sales_goal', 'gross_revenue'])

df_goal_store_period

Unnamed: 0,date,id_store,sales_goal,items,gross_revenue
4,2021-01-02,F!25!6;D=./F%2(E)D;]P0,22445.046,1119.6,25898.100
5,2021-01-02,"F%#+YX,X!FRF<FHD):`=9+",19573.326,907.8,23662.638
6,2021-01-02,F)T`P;^+F]5F7YX^S\=+?&,25188.930,1154.4,30992.874
7,2021-01-02,"FO)5JW59TP?&C:?,ZG$$L*",17902.728,916.2,19486.866
8,2021-01-03,F!25!6;D=./F%2(E)D;]P0,22445.046,878.4,23619.408
...,...,...,...,...,...
3614,2022-12-31,F!25!6;D=./F%2(E)D;]P0,18323.364,706.8,19767.834
3615,2022-12-31,"F%#+YX,X!FRF<FHD):`=9+",16336.596,699.0,18471.312
3616,2022-12-31,F)T`P;^+F]5F7YX^S\=+?&,33015.048,1224.6,35505.378
3617,2022-12-31,"FO)5JW59TP?&C:?,ZG$$L*",19925.220,844.2,18085.818


#### 2.6.4 Create new columns to be used futher.

In [17]:
df_goal_store_period['goal_%'] = round((df_goal_store_period['gross_revenue'] / df_goal_store_period['sales_goal']), 2)

df_goal_store_period['goal_result'] = df_goal_store_period.apply(lambda x: 1 if x['goal_%'] >= 1 else 0, axis=1)

df_goal_store_period

Unnamed: 0,date,id_store,sales_goal,items,gross_revenue,goal_%,goal_result
4,2021-01-02,F!25!6;D=./F%2(E)D;]P0,22445.046,1119.6,25898.100,1.15,1
5,2021-01-02,"F%#+YX,X!FRF<FHD):`=9+",19573.326,907.8,23662.638,1.21,1
6,2021-01-02,F)T`P;^+F]5F7YX^S\=+?&,25188.930,1154.4,30992.874,1.23,1
7,2021-01-02,"FO)5JW59TP?&C:?,ZG$$L*",17902.728,916.2,19486.866,1.09,1
8,2021-01-03,F!25!6;D=./F%2(E)D;]P0,22445.046,878.4,23619.408,1.05,1
...,...,...,...,...,...,...,...
3614,2022-12-31,F!25!6;D=./F%2(E)D;]P0,18323.364,706.8,19767.834,1.08,1
3615,2022-12-31,"F%#+YX,X!FRF<FHD):`=9+",16336.596,699.0,18471.312,1.13,1
3616,2022-12-31,F)T`P;^+F]5F7YX^S\=+?&,33015.048,1224.6,35505.378,1.08,1
3617,2022-12-31,"FO)5JW59TP?&C:?,ZG$$L*",19925.220,844.2,18085.818,0.91,0


## 3. Load data

After manipulate the data we are ready to save it locally to be used in Tableau to analysis and data visualization.

In [19]:
df_sales.to_csv('data_load/sales.csv', index=None)

df_customers.to_csv('data_load/customers.csv', index=None)

df_goal_store_period.to_csv('data_load/business_goals.csv', index=None)

df_stores.to_csv('data_load/stores.csv', index=None)

df_products.to_csv('data_load/products.csv', index=None)

## Next Steps

1. Automate data extraction
2. Fine tune to performe faster
3. Run it periodically