In [0]:
import pandas
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName("data-ingestion").getOrCreate()

# paths
pathStage = "dbfs:/FileStore/tables/rpe/stage"
pathBronze = "dbfs:/FileStore/tables/rpe/bronze"
pathSilver = "dbfs:/FileStore/tables/rpe/silver"
pathGold = "dbfs:/FileStore/tables/rpe/gold"



In [0]:
customers = (
    spark.createDataFrame(
        pandas.read_csv("https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/customer.csv")
    )
)

transactions =  (
    spark.createDataFrame(
        pandas.read_csv("https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/transaction.csv")
    )
)

new_customers = (
    spark.createDataFrame(
        pandas.read_csv("https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/new_customer.csv")
    )
)

new_transactions = (
    spark.createDataFrame(
        pandas.read_csv("https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/new_transaction.csv")
    )
)

In [0]:
import datetime

customers.write.format("csv").mode("overwrite").save(f"{pathStage}/customers/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/customers.csv")

transactions.write.format("csv").mode("overwrite").save(f"{pathStage}/transactions/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/transactions.csv")

new_customers.write.format("csv").mode("overwrite").save(f"{pathStage}/new_customers/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/new_customers.csv")

new_transactions.write.format("csv").mode("overwrite").save(f"{pathStage}/new_transactions/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/new_transactions.csv")

In [0]:
customers_schema = (
    StructType([
		StructField("customer_id", StringType(), True),
		StructField("name", StringType(), True),
		StructField("email", StringType(), True),
		StructField("signup_date", StringType(), True)
    ])
)

transactions_schema = (
    StructType([
		StructField("transaction_id", StringType(), True),
		StructField("customer_id", StringType(), True),
		StructField("amount", StringType(), True),
		StructField("currency", StringType(), True),
        StructField("transaction_date", StringType(), True)
    ])
)

new_customers_schema = (
    StructType([
		StructField("customer_id", StringType(), True),
		StructField("name", StringType(), True),
		StructField("email", StringType(), True),
		StructField("signup_date", StringType(), True)
    ])
)

new_transactions_schema = (
    StructType([
		StructField("transaction_id", StringType(), True),
		StructField("customer_id", StringType(), True),
		StructField("amount", StringType(), True),
		StructField("currency", StringType(), True),
        StructField("transaction_date", StringType(), True)
    ])
)

In [0]:
customersBronze = (
    spark
    .read
    .format("csv")
    .schema(customers_schema)
    .load(f"{pathStage}/customers/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/customers.csv")
    .withColumn("dt_loading_stage", current_timestamp())
)

customersBronze.write.format("delta").mode("append").save(f"{pathBronze}/customers")

transactionsBronze = (
    spark
    .read
    .format("csv")
    .schema(transactions_schema)
    .load(f"{pathStage}/transactions/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/transactions.csv")
    .withColumn("dt_loading_stage", current_timestamp())
)

transactionsBronze.write.format("delta").mode("append").save(f"{pathBronze}/transactions")

newCustomersBronze = (
    spark
    .read
    .format("csv")
    .schema(customers_schema)
    .load(f"{pathStage}/new_customers/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/new_customers.csv")
    .withColumn("dt_loading_stage", current_timestamp())
)

customersBronze.write.format("delta").mode("append").save(f"{pathBronze}/new_customers")

newTransactionsBronze = (
    spark
    .read
    .format("csv")
    .schema(transactions_schema)
    .load(f"{pathStage}/new_transactions/{datetime.datetime.now().year}/{datetime.datetime.now().month}/{datetime.datetime.now().day}/new_transactions.csv")
    .withColumn("dt_loading_stage", current_timestamp())
)

transactionsBronze.write.format("delta").mode("append").save(f"{pathBronze}/new_transactions")


In [0]:
customersSilver = (
    spark.sql(f'''
       SELECT
            CAST(customer_id AS INTEGER) AS customer_id,
            CAST(name AS STRING) name,
            CAST(email AS STRING) AS email,
            CAST(signup_date AS DATE) AS signup_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data
       FROM
          (
            SELECT 
                DENSE_RANK() OVER(ORDER BY dt_loading_stage DESC) AS rank, * 
            FROM delta.`{pathBronze}/customers`
          ) AS T
       WHERE
            T.rank = 1
       ''')
)

customersSilver.write.format("delta").mode("overwrite").option("mergeSchema", "false").save(f"{pathSilver}/customers")


transactionsSilver = (
    spark.sql(f'''
       SELECT
            CAST(transaction_id AS INTEGER) AS transaction_id,
            CAST(customer_id AS INTEGER) customer_id,
            CAST(amount AS FLOAT) AS amount,
            CAST(currency AS STRING) AS currency,
            CAST(transaction_date as DATE) as transaction_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data
       FROM
          (
            SELECT 
                DENSE_RANK() OVER(ORDER BY dt_loading_stage DESC) AS rank, * 
            FROM delta.`{pathBronze}/transactions`
          ) AS T
       WHERE
            T.rank = 1
       ''')
)

transactionsSilver.write.format("delta").mode("overwrite").option("mergeSchema", "false").save(f"{pathSilver}/transactions")

In [0]:
newCustomersSilver = (
    spark.sql(f'''
       SELECT
            CAST(customer_id AS INTEGER) AS customer_id,
            CAST(name AS STRING) name,
            CAST(email AS STRING) AS email,
            CAST(signup_date AS DATE) AS signup_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data
       FROM
          (
            SELECT 
                DENSE_RANK() OVER(ORDER BY dt_loading_stage DESC) AS rank, * 
            FROM delta.`{pathBronze}/new_customers`
          ) AS T
       WHERE
            T.rank = 1
       ''')
)

newCustomersSilver.write.format("delta").mode("overwrite").option("mergeSchema", "false").save(f"{pathSilver}/new_customers")


newTransactionsSilver = (
    spark.sql(f'''
       SELECT
            CAST(transaction_id AS INTEGER) AS transaction_id,
            CAST(customer_id AS INTEGER) customer_id,
            CAST(amount AS FLOAT) AS amount,
            CAST(currency AS STRING) AS currency,
            CAST(transaction_date as DATE) as transaction_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data
       FROM
          (
            SELECT 
                DENSE_RANK() OVER(ORDER BY dt_loading_stage DESC) AS rank, * 
            FROM delta.`{pathBronze}/new_transactions`
          ) AS T
       WHERE
            T.rank = 1
       ''')
)

newTransactionsSilver.write.format("delta").mode("overwrite").option("mergeSchema", "false").save(f"{pathSilver}/new_transactions")

####Disclaimer

1- Para fins didáticos, a criação das tabelas na camada **gold** foi feita em tempo de execução junto com suas respectivas cargas. Em um sistema de produção, somente seria executado 1x vez a criação e em cada execução só seria executada a inserção de registros novos e/ou atualização de registros existentes (método upsert).  



In [0]:
#spark.sql(f''' 
    #DROP TABLE IF EXISTS delta.`{pathGold}/dim_customer`
#''')

#spark.sql(f'''
    ##CREATE OR REPLACE TABLE delta.`{pathGold}/dim_customer`
    ##(
    ##    customer_sk long GENERATED ALWAYS AS IDENTITY,
    ##    customer_id integer,
    ##    name string,
    ##    email string,
    ##    signup_date date
    ##) 
#''')

spark.sql(f''' 
     MERGE INTO delta.`{pathGold}/dim_customer` dest
        USING (
             select 
                c.customer_id as customer_id,
                c.name,
                c.email,
                c.signup_date
                from delta.`{pathSilver}/customers` c
        ) orig
        ON orig.customer_id = dest.customer_id
        WHEN MATCHED THEN
        UPDATE SET
            dest.customer_id = orig.customer_id,
            dest.name        = orig.name,
            dest.email       = orig.email,
            dest.signup_date = orig.signup_date
        WHEN NOT MATCHED
        THEN INSERT (
            customer_id,
            name,
            email,
            signup_date
        )
        VALUES (
            orig.customer_id,
            orig.name,
            orig.email,
            orig.signup_date
        )         
''')

Out[12]: DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
##spark.sql(f''' 
    ##DROP TABLE IF EXISTS delta.`{pathGold}/fat_sales`
##''')

##spark.sql(f'''
    ##CREATE OR REPLACE TABLE delta.`{pathGold}/fat_sales`
    ##(
        ##transaction_sk long GENERATED ALWAYS AS IDENTITY,
        ##customer_sk integer,
        ##transaction_id integer,
        ##amount_sale string,
        ##transaction_date date
    ##) 
    ##USING DELTA PARTITIONED BY (transaction_date) 
##''')

spark.sql(f'''   
        insert into delta.`{pathGold}/fat_sales`
        (
            customer_sk,
            transaction_id,
            amount_sale,
            transaction_date
        )
        select 
            dc.customer_sk,
            t.transaction_id,
            case t.currency
            when 'USD' then concat('$ ', cast(t.amount as string))
            when 'BRL' then concat('R$ ', cast(t.amount as string))
            when 'EUR' then concat('Ç', cast(t.amount as string))
            else 'NA'
            end as amount_sale,
            t.transaction_date
        from delta.`{pathSilver}/transactions` t 
        join delta.`{pathSilver}/customers` c on c.customer_id = t.customer_id
        join delta.`{pathGold}/dim_customer` dc on dc.customer_id = c.customer_id
''')


Out[13]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]