In [1]:
from pyspark.sql import SparkSession, dataframe
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import functions as f
import os
import re

In [2]:
spark = SparkSession.builder.master("local[*]")\
    .enableHiveSupport()\
    .getOrCreate()

In [3]:
df_regiao = spark.sql("select * from desafio_curso.tbl_regiao")

In [4]:
df_regiao.show()

+-----------+-------------+--------+
|region_code|  region_name| dt_foto|
+-----------+-------------+--------+
|          0|       Canada|20230624|
|          1|      Western|20230624|
|          2|     Southern|20230624|
|          3|    Northeast|20230624|
|          4|      Central|20230624|
|          5|International|20230624|
+-----------+-------------+--------+



In [5]:
df_divisao = spark.sql("select * from desafio_curso.tbl_divisao")

In [6]:
df_divisao.show()

+--------+-------------+--------+
|division|division_name| dt_foto|
+--------+-------------+--------+
|       1|International|20230624|
|       2|     Domestic|20230624|
+--------+-------------+--------+



In [7]:
#Campos com espaço "Não Informado"
query_clientes = '''
select address_number,business_family,business_unit,customer,customerkey,customer_type,division,
case when length(trim(line_of_business)) = 0 then 'Não Informado' else line_of_business end as line_of_business,
phone,region_code,regional_sales_mgr,search_type,dt_foto
from desafio_curso.tbl_clientes'''

df_clientes = spark.sql(query_clientes)

In [8]:
df_clientes.show()

+--------------+---------------+-------------+--------------------+-----------+-------------+--------+----------------+------------+-----------+------------------+-----------+--------+
|address_number|business_family|business_unit|            customer|customerkey|customer_type|division|line_of_business|       phone|region_code|regional_sales_mgr|search_type| dt_foto|
+--------------+---------------+-------------+--------------------+-----------+-------------+--------+----------------+------------+-----------+------------------+-----------+--------+
|      10000000|             R3|            1|    City Supermarket|   10000000|           G2|       2|   Não Informado|816-455-8733|          4|               S16|          C|20230624|
|      10000453|             R3|            1|       A Supermarket|   10000453|           G1|       1|   Não Informado|816-455-8733|          5|               S19|          C|20230624|
|      10000455|             R3|            1|Caribian Supermarket|   10000

In [9]:
#Campos com espaço "Não Informado"
query_endereco = '''
select address_number,
case when length(trim(city)) = 0 then 'Não Informado' else city end as city,country,
case when length(trim(customer_address_1)) = 0 then 'Não Informado' else customer_address_1 end as customer_address_1,
case when length(trim(customer_address_2)) = 0 then 'Não Informado' else customer_address_2 end as customer_address_2,
case when length(trim(customer_address_3)) = 0 then 'Não Informado' else customer_address_3 end as customer_address_3,
case when length(trim(customer_address_4)) = 0 then 'Não Informado' else customer_address_4 end as customer_address_4,
case when length(trim(state)) = 0 then 'Não Informado' else state end as state,
case when length(trim(zip_code)) = 0 then 'Não Informado' else zip_code end as zip_code,dt_foto
from desafio_curso.tbl_endereco'''

df_endereco = spark.sql(query_endereco)

In [10]:
df_endereco.show()

+--------------+----------------+-------+--------------------+--------------------+------------------+------------------+-------------+-------------+--------+
|address_number|            city|country|  customer_address_1|  customer_address_2|customer_address_3|customer_address_4|        state|     zip_code| dt_foto|
+--------------+----------------+-------+--------------------+--------------------+------------------+------------------+-------------+-------------+--------+
|      10000000|           Akron|     US|         PO Box 6258|       Não Informado|     Não Informado|     Não Informado|           OH|        44312|20230624|
|      10000453|   Não Informado|     UK|       Não Informado|       Não Informado|     Não Informado|     Não Informado|Não Informado|Não Informado|20230624|
|      10000455|Huntington Beach|     US|   7392 Count Circle|       Não Informado|     Não Informado|     Não Informado|           CA|        92647|20230624|
|      10000456|        Edmonton|     CA|    8

In [11]:
query_vendas = '''
select actual_delivery_date,customerkey,datekey,
nvl(replace(discount_amount,',','.'),0) as discount_amount,
invoice_date,invoice_number,
nvl(item_class,'Não Informado') as item_class,
nvl(item_number,0) as item_number,
item,line_number,
replace(list_price,',','.') as list_price,
order_number,promise_delivery_date,
replace(sales_amount,',','.') as sales_amount,
replace(sales_amount_based_on_list_price,',','.') as sales_amount_based_on_list_price,
replace(sales_cost_amount,',','.') as sales_cost_amount,
replace(sales_margin_amount,',','.') as sales_margin_amount,
nvl(replace(sales_price,',','.'),0) as sales_price,
sales_quantity,sales_rep,u_m
from desafio_curso.tbl_vendas'''

df_vendas = spark.sql(query_vendas)

In [12]:
df_vendas.show()

+--------------------+-----------+----------+---------------+------------+--------------+-------------+-----------+--------------------+-----------+----------+------------+---------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+
|actual_delivery_date|customerkey|   datekey|discount_amount|invoice_date|invoice_number|   item_class|item_number|                item|line_number|list_price|order_number|promise_delivery_date|sales_amount|sales_amount_based_on_list_price|sales_cost_amount|sales_margin_amount|sales_price|sales_quantity|sales_rep|u_m|
+--------------------+-----------+----------+---------------+------------+--------------+-------------+-----------+--------------------+-----------+----------+------------+---------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+
|          28/04/2019|   10000481|28/04/

In [13]:
df_clientes.createOrReplaceTempView('clientes')
df_regiao.createOrReplaceTempView('regiao')
df_divisao.createOrReplaceTempView('divisao')
df_endereco.createOrReplaceTempView('endereco')
df_vendas.createOrReplaceTempView('vendas')

In [14]:
spark.sql("select sum(sales_amount) as total_sales from vendas").show()

+------------------+
|       total_sales|
+------------------+
|1.86186769050001E8|
+------------------+



In [26]:
query_stage = '''
select
    v.actual_delivery_date,
    v.customerkey,
    v.datekey,
    v.discount_amount,
    v.invoice_date,
    v.invoice_number,
    v.item_class,
    v.item_number,
    v.item,
    v.line_number,
    v.list_price,
    v.order_number,
    v.promise_delivery_date,
    v.sales_amount,
    v.sales_amount_based_on_list_price,
    v.sales_cost_amount,
    v.sales_margin_amount,
    v.sales_price,
    v.sales_quantity,
    v.sales_rep,
    v.u_m,
    c.address_number,
    c.business_family,
    c.business_unit,
    c.customer,
    c.customer_type,
    c.division,
    c.line_of_business,
    c.phone,
    c.region_code,
    c.regional_sales_mgr,
    c.search_type,
    d.division_name,
    r.region_name,
    e.city,
    e.country,
    e.customer_address_1,
    e.customer_address_2,
    e.customer_address_3,
    e.customer_address_4,
    e.state,
    e.zip_code
from vendas v
left join clientes c on v.customerkey = c.customerkey
left join endereco e on c.address_number = e.address_number
inner join regiao r on c.region_code = r.region_code
inner join divisao d on c.division = d.division    
'''

df_stage = spark.sql(query_stage)

In [27]:
df_stage.show(5)

+--------------------+-----------+----------+---------------+------------+--------------+----------+-----------+--------------------+-----------+----------+------------+---------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+--------------+---------------+-------------+--------------+-------------+--------+----------------+------------+-----------+------------------+-----------+-------------+-----------+-------+-------+--------------------+------------------+------------------+------------------+-----+--------+
|actual_delivery_date|customerkey|   datekey|discount_amount|invoice_date|invoice_number|item_class|item_number|                item|line_number|list_price|order_number|promise_delivery_date|sales_amount|sales_amount_based_on_list_price|sales_cost_amount|sales_margin_amount|sales_price|sales_quantity|sales_rep|u_m|address_number|business_family|business_unit|      customer|customer_type|di

In [28]:
df_stage = (df_stage
            .withColumn('Ano',f.year(f.to_timestamp('invoice_date','dd/MM/yyyy')))
            .withColumn('Mes',f.month(f.to_timestamp('invoice_date','dd/MM/yyyy')))
            .withColumn('Dia',f.dayofmonth(f.to_timestamp('invoice_date','dd/MM/yyyy')))
            .withColumn('Trimestre',f.quarter(f.to_timestamp('invoice_date','dd/MM/yyyy')))
           )

In [29]:
df_stage.show()

+--------------------+-----------+----------+---------------+------------+--------------+----------+-----------+--------------------+-----------+----------+------------+---------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+--------------+---------------+-------------+--------------------+-------------+--------+----------------+------------+-----------+------------------+-----------+-------------+-----------+--------+-------+--------------------+------------------+------------------+------------------+-----+--------+----+---+---+---------+
|actual_delivery_date|customerkey|   datekey|discount_amount|invoice_date|invoice_number|item_class|item_number|                item|line_number|list_price|order_number|promise_delivery_date|sales_amount|sales_amount_based_on_list_price|sales_cost_amount|sales_margin_amount|sales_price|sales_quantity|sales_rep|u_m|address_number|business_family|business_unit| 

In [21]:
query_customerkey = '''select * from df_stage where customerkey = 10000481'''
spark.sql(query_customerkey).show()

AnalysisException: 'Table or view not found: df_stage; line 1 pos 14'

In [31]:
df_stage.createOrReplaceTempView('stage')
query_customerkey = '''select * from stage where customerkey = 10000481'''
spark.sql(query_customerkey).show()

+--------------------+-----------+----------+---------------+------------+--------------+----------+-----------+--------------------+-----------+----------+------------+---------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+--------------+---------------+-------------+-----------+-------------+--------+----------------+------------+-----------+------------------+-----------+-------------+-----------+-------+-------+------------------+------------------+------------------+------------------+-----+--------+
|actual_delivery_date|customerkey|   datekey|discount_amount|invoice_date|invoice_number|item_class|item_number|                item|line_number|list_price|order_number|promise_delivery_date|sales_amount|sales_amount_based_on_list_price|sales_cost_amount|sales_margin_amount|sales_price|sales_quantity|sales_rep|u_m|address_number|business_family|business_unit|   customer|customer_type|division|l

In [32]:
df_stage.count()

66872