In [0]:
%run ../../_utils

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from pyspark.sql.functions import (
    col,
    concat,
    date_format,
    dayofweek,
    hour,
    lit,
    minute,
    second,
    when,
)
from pyspark.sql.types import StringType


# Camada Silver

Na camada silver, limpezas e ajustes em dados devem ser aplicados

Caso seja possível, enriquecer os dados e extrair dados também deve acontecer nessa camada

In [0]:
# Target table name and Delta storage path for the silver layer.
# The path is the same one assigned again in the "Saving data" section, which is
# the value actually in effect when the data is written.
tb_name = "silver.supermarket_sales"
target_location = "/FileStore/delta/supermarket_sales/silver"

## 1 - Data ingestion

In [0]:
# Load the bronze-layer Delta table that feeds this silver transformation.
bronze_table_name = "bronze.supermarket_sales"
df = spark.read.table(bronze_table_name)

In [0]:
# Print the column names, types and nullability read from the bronze table.
df.printSchema()

root
 |-- invoice_id: string (nullable = true)
 |-- branch: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- product_line: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_5: double (nullable = true)
 |-- total: double (nullable = true)
 |-- date: date (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross_margin_percentage: double (nullable = true)
 |-- gross_income: double (nullable = true)
 |-- rating: double (nullable = true)



In [0]:
# Show the first 10 rows of the bronze data before any transformation.
display(df.take(10))

invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5,total,date,time,payment,cogs,gross_margin_percentage,gross_income,rating
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,2024-05-13T13:08:00Z,Ewallet,522.83,4.761904762,26.1415,9.1
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,2024-05-13T10:29:00Z,Cash,76.4,4.761904762,3.82,9.6
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,2024-05-13T13:23:00Z,Credit card,324.31,4.761904762,16.2155,7.4
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,2024-05-13T20:33:00Z,Ewallet,465.76,4.761904762,23.288,8.4
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,2024-05-13T10:37:00Z,Ewallet,604.17,4.761904762,30.2085,5.3
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,2024-05-13T18:30:00Z,Ewallet,597.73,4.761904762,29.8865,4.1
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,2024-05-13T14:36:00Z,Ewallet,413.04,4.761904762,20.652,5.8
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,2024-05-13T11:38:00Z,Ewallet,735.6,4.761904762,36.78,8.0
665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,2019-01-10,2024-05-13T17:15:00Z,Credit card,72.52,4.761904762,3.626,7.2
692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2019-02-20,2024-05-13T13:27:00Z,Credit card,164.52,4.761904762,8.226,5.9



## 2 - Data Munging

Processo de limpeza e normalizações necessárias

Como esses dados já estão bem tratados, vamos apenas corrigir o campo "time" para que contenha a data correta da venda e avaliar a criação de outros campos (como um campo timestamp que una "date" e "time").


### timestamp da venda

Extrair o horário do campo "time" e uni-lo à data do campo "date", formando o timestamp da venda.

In [0]:

# Build the sale timestamp from the calendar date in "date" and the time of day
# carried by "time" (the date part of "time" is not the sale date).
# date_format renders the time as zero-padded HH:mm:ss, so no intermediate
# hour/minute/second columns are needed.
sale_time_text = concat(
    col("date").cast(StringType()), lit("T"),
    date_format(col("time"), "HH:mm:ss"), lit("Z"),
)

# Cast the resulting string (e.g. 2019-01-05T13:08:00Z) to a timestamp.
df = df.withColumn("sale_time", sale_time_text.cast("timestamp"))

# "time" is now redundant: its time of day lives in "sale_time".
df = df.drop("time")


### mes/ano da venda

Um campo desse tipo ajuda a calcular vendas mensais.

In [0]:
# Month and year of the sale formatted as "MM-yyyy" (e.g. "01-2019"), useful for
# monthly aggregations. The four-letter year pattern zero-pads the year to at
# least four digits; the previous "yyy" pattern was a typo.
df = df.withColumn("month_year", date_format("date", "MM-yyyy"))


### taxa por unidade

Um campo criado apenas para exercitar a manipulação de dados:
é o total da taxa (5%) dividido pela quantidade vendida, obtendo assim a taxa unitária.

In [0]:
# Tax per unit sold: the invoice's total tax (tax_5) divided by its quantity.
tax_per_unit = col("tax_5") / col("quantity")
df = df.withColumn("total_tax_unity", tax_per_unit)

### Dia da semana (numero e descrição)

In [0]:
# Day-of-week number of the sale date; in the displayed data 1 is Sunday and 7 is Saturday.
df = df.withColumn("number_day_of_week", dayofweek(col("date")))

# Portuguese label for each day number, in the order the conditions are chained.
weekday_labels = {
    1: "Domingo",
    2: "Segunda-feira",
    3: "Terça-feira",
    4: "Quarta-feira",
    5: "Quinta-feira",
    6: "Sexta-feira",
    7: "Sábado",
}

# Chain one when() per day number; there is no otherwise(), so a day number
# that matches none of the keys yields null.
day_number = df["number_day_of_week"]
day_label = None
for number, label in weekday_labels.items():
    matches = day_number == number
    day_label = when(matches, label) if day_label is None else day_label.when(matches, label)

df = df.withColumn("day_of_week", day_label)


### weekend?

criar um campo para informar se a data é um fim de semana (não faz muito sentido, mas vamos supor que foi uma regra de negócio)

In [0]:
# Flag sales made on Saturday or Sunday, based on the Portuguese day label.
# Rows that do not match (including a null day_of_week) fall into otherwise() and get False.
weekend_days = ["Sábado", "Domingo"]
is_weekend = col("day_of_week").isin(weekend_days)
df = df.withColumn("weekend", when(is_weekend, True).otherwise(False))

In [0]:
# Show the first 10 rows after the transformations to check the new columns.
display(df.take(10))

invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5,total,date,payment,cogs,gross_margin_percentage,gross_income,rating,sale_time,month_year,total_tax_unity,number_day_of_week,day_of_week,weekend
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,Ewallet,522.83,4.761904762,26.1415,9.1,2019-01-05T13:08:00Z,01-2019,3.7345,7,Sábado,True
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,Cash,76.4,4.761904762,3.82,9.6,2019-03-08T10:29:00Z,03-2019,0.764,6,Sexta-feira,False
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,Credit card,324.31,4.761904762,16.2155,7.4,2019-03-03T13:23:00Z,03-2019,2.3165,1,Domingo,True
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,Ewallet,465.76,4.761904762,23.288,8.4,2019-01-27T20:33:00Z,01-2019,2.911,1,Domingo,True
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,Ewallet,604.17,4.761904762,30.2085,5.3,2019-02-08T10:37:00Z,02-2019,4.3155,6,Sexta-feira,False
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,Ewallet,597.73,4.761904762,29.8865,4.1,2019-03-25T18:30:00Z,03-2019,4.2695,2,Segunda-feira,False
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,Ewallet,413.04,4.761904762,20.652,5.8,2019-02-25T14:36:00Z,02-2019,3.442,2,Segunda-feira,False
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,Ewallet,735.6,4.761904762,36.78,8.0,2019-02-24T11:38:00Z,02-2019,3.678,1,Domingo,True
665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,2019-01-10,Credit card,72.52,4.761904762,3.626,7.2,2019-01-10T17:15:00Z,01-2019,1.813,5,Quinta-feira,False
692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2019-02-20,Credit card,164.52,4.761904762,8.226,5.9,2019-02-20T13:27:00Z,02-2019,2.7420000000000004,4,Quarta-feira,False



belezura!

Limpamos um pouco, enriquecemos e transformamos algumas coisas, e agora temos nosso dataset preparado para análises (até preditivas e prescritivas — mas, nesse caso, ainda seria preciso um pouco mais de limpeza).


## Saving data

In [0]:
# Table name and storage path used by the save_dataframe and create_table calls below.
# NOTE(review): both names are also assigned near the top of the notebook; the
# values set here are the ones in effect when the data is written.
tb_name = 'silver.supermarket_sales'
target_location = '/FileStore/delta/supermarket_sales/silver'

In [0]:
# Write the transformed DataFrame as "delta" to target_location through the
# save_dataframe helper (not defined in this notebook; presumably loaded by %run ../../_utils).
# NOTE(review): the log prints "Saving None delta", so the helper presumably accepts
# an optional name that is not being passed here — confirm against its definition.
save_dataframe(df, "delta", target_location=target_location)

[LOG] Saving None delta on /FileStore/delta/supermarket_sales/silver... OK!



## create delta table

TODO: implementar UPSERT

O upsert evita reescrever todos os dados: aproveita o MERGE do Delta para fazer UPDATE quando um registro antigo tem uma nova versão e INSERT para os dados que são novos.

In [0]:
# Create the Delta table tb_name on top of the files saved at target_location, using
# the create_table helper (not defined in this notebook; presumably loaded by %run ../../_utils).
create_table(tb_name, target_location)

[LOG] Creating delta table silver.supermarket_sales on /FileStore/delta/supermarket_sales/silver... OK!


In [0]:
# End the notebook run here, returning "OK" as the exit value.
# NOTE(review): the %sql preview cell below presumably does not run in a full
# run because of this exit — confirm that this is intended.
dbutils.notebook.exit("OK")

In [0]:
%sql
-- Preview ten rows of the silver table to check the saved result.
select
  *
from
  silver.supermarket_sales
limit
  10

invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5,total,date,payment,cogs,gross_margin_percentage,gross_income,rating,sale_time,month_year,total_tax_unity,number_day_of_week,day_of_week,weekend
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,Ewallet,522.83,4.761904762,26.1415,9.1,2019-01-05T13:08:00Z,01-2019,3.7345,7,Sábado,True
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,Cash,76.4,4.761904762,3.82,9.6,2019-03-08T10:29:00Z,03-2019,0.764,6,Sexta-feira,False
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,Credit card,324.31,4.761904762,16.2155,7.4,2019-03-03T13:23:00Z,03-2019,2.3165,1,Domingo,True
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,Ewallet,465.76,4.761904762,23.288,8.4,2019-01-27T20:33:00Z,01-2019,2.911,1,Domingo,True
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,Ewallet,604.17,4.761904762,30.2085,5.3,2019-02-08T10:37:00Z,02-2019,4.3155,6,Sexta-feira,False
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,Ewallet,597.73,4.761904762,29.8865,4.1,2019-03-25T18:30:00Z,03-2019,4.2695,2,Segunda-feira,False
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,Ewallet,413.04,4.761904762,20.652,5.8,2019-02-25T14:36:00Z,02-2019,3.442,2,Segunda-feira,False
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,Ewallet,735.6,4.761904762,36.78,8.0,2019-02-24T11:38:00Z,02-2019,3.678,1,Domingo,True
665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,2019-01-10,Credit card,72.52,4.761904762,3.626,7.2,2019-01-10T17:15:00Z,01-2019,1.813,5,Quinta-feira,False
692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2019-02-20,Credit card,164.52,4.761904762,8.226,5.9,2019-02-20T13:27:00Z,02-2019,2.7420000000000004,4,Quarta-feira,False
