In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F 
from pyspark.sql.types import *
from delta import *
from IPython.display import display

In [22]:
# Assemble the session builder for the "olist_demo" application.
builder = (
    SparkSession.builder
    .appName("olist_demo")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Arrow and eager-evaluation settings for interactive PySpark use.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.repl.eagerEval.enabled", "true")
)

# Pass the builder through the Delta helper (presumably provided by
# `from delta import *`), then obtain the session: an existing one is reused,
# otherwise a new one is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Config

In [23]:
# Environment name interpolated into the staging path; the intended values are
# "dev", "uat" or "prd". It is left empty here.
environment = ""

In [24]:
# Relative locations used by this notebook: staged inputs and the output sink.
path = {
    # Staged source files keyed by entity name; the folder depends on `environment`.
    "stg": {
        "customer": f"../{environment}/data/stage/olist_customers_dataset.csv",
    },
    # Output location for the customer table.
    # NOTE(review): unlike the stage path, this one does not include
    # `environment` — confirm that is intended.
    "snk": "../data/bronze/customer",
}

In [25]:
# Reader options for the staged CSV: infer column types, comma-delimited,
# first row holds the column names. Values are kept as strings.
options = dict(
    inferSchema="True",
    delimiter=",",
    header="True",
)

## Load

In [26]:
# Read the staged customer CSV using the reader options defined in the config section.
# NOTE(review): with inferSchema enabled, customer_zip_code_prefix is typed as
# int (see the printed schema below); any leading zeros in the raw values would
# not survive — confirm whether this column should be read as a string.
df = (
    spark.read
    .format("csv")
    .options(**options)
    .load(path["stg"]["customer"])
)

In [27]:
# Print the DataFrame's string form to check the column names and types that were read.
print(df)

DataFrame[customer_id: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string]


In [28]:
# Project the five customer columns that are carried into the sink table.
# Column names pass through unchanged, so they are selected by name directly
# rather than aliasing each column to its own name.
df_snk = df.select(
    "customer_id",
    "customer_unique_id",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
)

In [29]:
# Stamp every row with the load marker column for the bronze layer.
# NOTE(review): the column name says "datetime" but F.current_date() carries no
# time-of-day component — confirm whether a timestamp was intended.
df_snk = df_snk.withColumn(
    "datetime_insert_brz",
    F.current_date(),
)

## Sink

In [30]:
# Write the prepared frame in Delta format to the sink path, using overwrite mode.
(
    df_snk.write
    .format("delta")
    .mode("overwrite")
    .save(path["snk"])
)