In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F 
from pyspark.sql.types import *
from delta import *

In [14]:
# Build the Spark session for this notebook with Delta Lake enabled.
# Each configuration key is set exactly once (the driver memory setting was
# previously repeated), and the chain is parenthesised so that no
# backslash line continuations are needed.
builder = (
    SparkSession.builder.appName("olist_demo")
    # Register Delta Lake's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Arrow-backed PySpark conversions and eager DataFrame display in the REPL.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    # Memory sizing for executors and the driver.
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
)

# configure_spark_with_delta_pip comes from the `delta` package; it returns a
# builder from which the session is obtained (or reused if one already exists).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Config

In [15]:
# Deployment environment label; expected values are dev, uat or prd.
# Left empty by default — presumably supplied at run time (TODO confirm).
environment = ""

In [16]:
# Relative root directory under which the input and output locations are built.
# Papermill overwrites this default when the notebook is run with parameters.
base_path = "../../data"

In [17]:
# Locations used by this notebook, all resolved under base_path:
#   path["stg"] -> input files, keyed by dataset name
#   path["snk"] -> output location written by the final cell
path = dict(
    stg=dict(order_items=f"{base_path}/stage/olist_order_items_dataset.csv"),
    snk=f"{base_path}/bronze/order_items",
)

In [18]:
# Options handed to the CSV reader: comma-delimited input with a header row,
# and column types inferred from the data.
options = dict(
    inferSchema="True",
    delimiter=",",
    header="True",
)

## Load

In [19]:
# Read the staged order-items CSV into a DataFrame using the reader options
# defined in the config section.
df = (
    spark.read
    .format("csv")
    .options(**options)
    .load(path["stg"]["order_items"])
)

                                                                                

In [20]:
# Project the columns to keep, casting each one to an explicit type.
# The (column, type) pairs are listed in output order; `cast` is the
# documented equivalent of `astype` on a Column.
df_snk = df.select(*[
    F.col(column_name).cast(spark_type)
    for column_name, spark_type in (
        ("order_id", "string"),
        ("order_item_id", "integer"),
        ("product_id", "string"),
        ("seller_id", "string"),
        ("shipping_limit_date", "timestamp"),
        ("price", "double"),
        ("freight_value", "double"),
    )
])

In [21]:
# Add an audit column holding the timestamp at which the rows are loaded.
df_snk = df_snk.withColumn(
    "datetime_insert_brz",
    F.current_timestamp(),
)

## Sink

In [22]:
# Write the result as a Delta table at the sink path, replacing any existing
# data; overwriteSchema is enabled so the stored schema may be replaced too.
(
    df_snk.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .save(path["snk"])
)

                                                                                