In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F 
from pyspark.sql.types import *
from delta import *

In [9]:
# Build the Spark session for the "olist_demo" application.
# Each setting is listed exactly once so it is obvious which value is in effect.
builder = (
    SparkSession.builder.appName("olist_demo")
    # Register Delta Lake's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Arrow-based conversion and eager evaluation of DataFrames in the REPL/notebook.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    # Memory sizing for executors and the driver.
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
)

# Pass the builder through delta's pip helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Config

In [10]:
# Target environment name; the original note lists dev, uat and prd as the options.
# Left empty here.
environment = ""

In [None]:
# Root directory of the data files.
# This default will be overwritten by Papermill.
base_path = "../../../data"

In [11]:
# Storage locations derived from base_path:
#   'brz' -> input tables, keyed by table name
#   'snk' -> output location written at the end of the notebook
path = {
    'brz': {
        'orders': f'{base_path}/bronze/orders',
    },
    'snk': f'{base_path}/silver/evt/evt_orders',
}

## Load

In [12]:
# Read the orders input as a Delta table from its configured location.
df = spark.read.load(path['brz']['orders'], format='delta')

In [13]:
# Source column name -> output alias, listed in the order the columns are projected.
orders_column_aliases = {
    "order_id": "id_order",
    "customer_id": "id_customer",
    "order_status": "st_order",
    "order_purchase_timestamp": "dh_order_purchase",
    "order_approved_at": "dh_order_approved",
    "order_delivered_carrier_date": "dh_order_delivered_carrier",
    "order_delivered_customer_date": "dh_order_delivered_customer",
    "order_estimated_delivery_date": "dh_order_estimated_delivery",
}

# Keep only the mapped columns, each renamed to its alias.
df_snk = df.select(
    *[
        F.col(source_name).alias(target_name)
        for source_name, target_name in orders_column_aliases.items()
    ]
)

In [14]:
# Add (or replace) the "dh_insert_slv" column with Spark's current timestamp.
df_snk = df_snk.withColumn(
    colName="dh_insert_slv",
    col=F.current_timestamp(),
)

## Sink

In [15]:
# Write the result in Delta format to the sink location, overwriting what is there.
(
    df_snk.write
    .format('delta')
    .mode('overwrite')
    .save(path['snk'])
)

                                                                                