##### Disclaimer 

A estratégia de carga FULL é a mais simples na maioria dos cenários, pois o procedimento padrão é replicar os dados da origem através do processo de ETL, sem a necessidade de gerar um histórico de alterações de dados. O método padrão é TRUNCATE INSERT.

Como boa prática, adicionamos uma coluna de timestamp para guardarmos o dia/hora da última carga na tabela.

Refs. 

https://www.linkedin.com/pulse/data-load-strategies-full-vs-incremental-janardhan-reddy-kasireddy/

https://mesum.medium.com/data-warehousing-historical-load-full-load-incremental-load-cb46f0d0c4f5

In [0]:
import pandas
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType

# Spark session for this notebook, registered under the "data-ingestion"
# application name.
spark = (
    SparkSession.builder
    .appName("data-ingestion")
    .getOrCreate()
)

# DBFS root folder of each storage layer used below.
pathStage, pathBronze, pathSilver, pathGold = (
    "dbfs:/FileStore/tables/rpe/stage",
    "dbfs:/FileStore/tables/rpe/bronze",
    "dbfs:/FileStore/tables/rpe/silver",
    "dbfs:/FileStore/tables/rpe/gold",
)


## External data source

# The load timestamp is captured exactly once. Every path component
# (year/month/day) and both the stage write and the bronze read below are
# derived from it, so a run that crosses midnight can neither produce a mixed
# date nor read from a folder other than the one it has just written.
load_time = datetime.datetime.now()
load_date_folder = f"{load_time.year}/{load_time.month}/{load_time.day}"
customers_stage_path = f"{pathStage}/customers/{load_date_folder}/customers.csv"
transactions_stage_path = f"{pathStage}/transactions/{load_date_folder}/transactions.csv"

# Download the raw CSV files with pandas and turn them into Spark DataFrames.
customers = spark.createDataFrame(
    pandas.read_csv(
        "https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/customer.csv"
    )
)

transactions = spark.createDataFrame(
    pandas.read_csv(
        "https://raw.githubusercontent.com/gabrielnascimentost/databricks_rpe/main/database/raw/transaction.csv"
    )
)

# Land the raw data in the stage area, replacing any previous load of the day.
customers.write.format("csv").mode("overwrite").save(customers_stage_path)

transactions.write.format("csv").mode("overwrite").save(transactions_stage_path)

## define schema
# Every stage column is declared as a nullable string; typing is applied by
# the CAST expressions of the silver layer.
customers_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("customer_id", "name", "email", "signup_date")
])

transactions_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "transaction_id",
        "customer_id",
        "amount",
        "currency",
        "transaction_date",
    )
])

## copy "as is" - landing to bronze
# Read back exactly the files written above and stamp each row with the
# loading timestamp before persisting the bronze Delta tables.
customersBronze = (
    spark
    .read
    .format("csv")
    .schema(customers_schema)
    .load(customers_stage_path)
    .withColumn("dt_loading_stage", current_timestamp())
)

customersBronze.write.format("delta").mode("overwrite").save(f"{pathBronze}/customers")

transactionsBronze = (
    spark
    .read
    .format("csv")
    .schema(transactions_schema)
    .load(transactions_stage_path)
    .withColumn("dt_loading_stage", current_timestamp())
)

transactionsBronze.write.format("delta").mode("overwrite").save(f"{pathBronze}/transactions")

## copy "as is" - bronze to silver

# Empty the silver customers table before the full reload.
truncate_customers_silver = f'''
    TRUNCATE TABLE delta.`{pathSilver}/customers`
'''
spark.sql(truncate_customers_silver)

# Cast each bronze column to its silver type; dt_loading_stage is exposed as
# dt_insert_data.
customers_silver_query = f'''
       SELECT
            CAST(customer_id AS INTEGER) AS customer_id,
            CAST(name AS STRING) name,
            CAST(email AS STRING) AS email,
            CAST(signup_date AS DATE) AS signup_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data 
        FROM delta.`{pathBronze}/customers`
    '''
customersSilver = spark.sql(customers_silver_query)

# Overwrite the silver Delta table with the freshly typed rows.
customers_silver_writer = (
    customersSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
customers_silver_writer.save(f"{pathSilver}/customers")

# Empty the silver transactions table before the full reload.
truncate_transactions_silver = f'''
    TRUNCATE TABLE delta.`{pathSilver}/transactions`
'''
spark.sql(truncate_transactions_silver)

# Cast each bronze column to its silver type; dt_loading_stage is exposed as
# dt_insert_data.
transactions_silver_query = f'''
      SELECT
            CAST(transaction_id AS INTEGER) AS transaction_id,
            CAST(customer_id AS INTEGER) customer_id,
            CAST(amount AS FLOAT) AS amount,
            CAST(currency AS STRING) AS currency,
            CAST(transaction_date as DATE) as transaction_date,
            CAST(dt_loading_stage AS TIMESTAMP) AS dt_insert_data 
        FROM delta.`{pathBronze}/transactions`
'''
transactionsSilver = spark.sql(transactions_silver_query)

# Overwrite the silver Delta table with the freshly typed rows.
transactions_silver_writer = (
    transactionsSilver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
transactions_silver_writer.save(f"{pathSilver}/transactions")


## Copy "as is" - silver to gold

# Empty the customer dimension before the full reload.
truncate_dim_customer = f'''
    TRUNCATE TABLE delta.`{pathGold}/dim_customer`
'''
spark.sql(truncate_dim_customer)

# Project the silver customers columns, re-applying the same casts.
dim_customer_query = f'''
       SELECT
            CAST(customer_id AS INTEGER) AS customer_id,
            CAST(name AS STRING) name,
            CAST(email AS STRING) AS email,
            CAST(signup_date AS DATE) AS signup_date,
            CAST(dt_insert_data AS TIMESTAMP) AS dt_insert_data 
        FROM delta.`{pathSilver}/customers`
    '''
dim_customer = spark.sql(dim_customer_query)

# Overwrite the gold Delta table with the selected rows.
dim_customer_writer = (
    dim_customer.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
dim_customer_writer.save(f"{pathGold}/dim_customer")

# Empty the sales fact table before the full reload.
spark.sql(f'''
    TRUNCATE TABLE delta.`{pathGold}/fat_sales`
''')

# Rebuild the sales fact table from the silver transactions, resolving each
# customer through the gold customer dimension and rendering the amount as a
# string prefixed with its currency symbol ('NA' for any other currency).
#
# The EUR branch used to emit 'Ç', which is the euro sign after a character
# encoding mix-up; it now emits the intended '€'.
#
# NOTE(review): customer_sk is not among the columns this notebook writes to
# dim_customer; presumably it is defined on the pre-existing Delta table
# (e.g. as a generated key) - confirm against the table DDL.
spark.sql(f'''
        insert into delta.`{pathGold}/fat_sales`
        (
            customer_sk,
            transaction_id,
            amount_sale,
            transaction_date
        )
        select
            dc.customer_sk,
            t.transaction_id,
            case t.currency
            when 'USD' then concat('$ ', cast(t.amount as string))
            when 'BRL' then concat('R$ ', cast(t.amount as string))
            when 'EUR' then concat('€', cast(t.amount as string))
            else 'NA'
            end as amount_sale,
            t.transaction_date
        from delta.`{pathSilver}/transactions` t
        join delta.`{pathSilver}/customers` c on c.customer_id = t.customer_id
        join delta.`{pathGold}/dim_customer` dc on dc.customer_id = c.customer_id
''')

Out[9]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]