## Start Spark Session

In [1]:
import os

from pyspark.sql import SparkSession

# Credentials for the catalog database and the object store. The defaults are
# the sandbox values this notebook has always used; set the corresponding
# ICEBERG_SANDBOX_* environment variable to override one without editing
# (and committing) the notebook.
jdbc_user = os.environ.get("ICEBERG_SANDBOX_JDBC_USER", "iceberg")
jdbc_password = os.environ.get("ICEBERG_SANDBOX_JDBC_PASSWORD", "iceberg")
s3_access_key = os.environ.get("ICEBERG_SANDBOX_S3_ACCESS_KEY", "minioadmin")
s3_secret_key = os.environ.get("ICEBERG_SANDBOX_S3_SECRET_KEY", "minioadmin")

# Session settings, applied in the order listed.
spark_conf = {
    # Driver networking: advertised host name, bind address and fixed ports.
    "spark.driver.host": "iceberg-sandbox-jupyter",
    "spark.driver.bindAddress": "0.0.0.0",
    "spark.driver.port": "7078",
    "spark.blockManager.port": "7079",
    # Iceberg SQL extensions plus a catalog named `iceberg_jdbc` backed by the
    # JDBC catalog implementation, with its warehouse on s3a.
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.iceberg_jdbc": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.iceberg_jdbc.catalog-impl": "org.apache.iceberg.jdbc.JdbcCatalog",
    "spark.sql.catalog.iceberg_jdbc.uri": "jdbc:postgresql://iceberg-sandbox-postgres:5432/iceberg-jdbc-catalog",
    "spark.sql.catalog.iceberg_jdbc.jdbc.user": jdbc_user,
    "spark.sql.catalog.iceberg_jdbc.jdbc.password": jdbc_password,
    "spark.sql.catalog.iceberg_jdbc.jdbc.driver": "org.postgresql.Driver",
    "spark.sql.catalog.iceberg_jdbc.warehouse": "s3a://iceberg/warehouse",
    # S3A filesystem pointed at the sandbox endpoint, using path-style access.
    "spark.hadoop.fs.s3a.endpoint": "http://iceberg-sandbox-minio:9000",
    "spark.hadoop.fs.s3a.access.key": s3_access_key,
    "spark.hadoop.fs.s3a.secret.key": s3_secret_key,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Resource limits for this application.
    "spark.executor.cores": "1",
    "spark.executor.memory": "1g",
    "spark.cores.max": "1",
    "spark.ui.showConsoleProgress": "true",
}

builder = (
    SparkSession.builder
    .master("spark://iceberg-sandbox-spark-master:7077")
    .appName("iceberg-sandbox")
)
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = builder.getOrCreate()


In [2]:
# Show the address of the Spark web UI for this session.
spark.sparkContext.uiWebUrl

'http://iceberg-sandbox-jupyter:4041'

In [3]:
# Display the session object as the cell output.
spark

## Create Database

In [4]:
# Create the `ecommerce` namespace in the `iceberg_jdbc` catalog unless it already exists.
spark.sql("CREATE DATABASE IF NOT EXISTS iceberg_jdbc.ecommerce")

DataFrame[]

In [5]:
# List the namespaces registered in the `iceberg_jdbc` catalog.
spark.sql("SHOW SCHEMAS FROM iceberg_jdbc").show()

+---------+
|namespace|
+---------+
|ecommerce|
+---------+



In [6]:
# List the catalogs known to this session.
spark.sql("SHOW CATALOGS").show()

+-------------+
|      catalog|
+-------------+
| iceberg_jdbc|
|spark_catalog|
+-------------+



## Load Ecommerce Data

### Customers

In [7]:
# Read the customers CSV: the first row supplies the column names and the
# column types are inferred from the data.
# NOTE(review): the preview below shows 4-digit zip prefixes (e.g. 3112). If the
# CSV stores them zero-padded, inferSchema is parsing them as integers and
# dropping the leading zero — confirm, and consider an explicit string schema.
df_customers = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://dataset/ecommerce_order_dataset/train/df_Customers.csv")
)

# Register the DataFrame as a temp view so the CTAS statement can select from it.
df_customers.createOrReplaceTempView("customers_raw")

# Create (or replace) the Iceberg table from the temp view.
spark.sql(
    """CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.customers
USING iceberg
AS SELECT * FROM customers_raw;
"""
)

# Read a handful of rows back from the Iceberg table as a sanity check.
df_check = spark.sql("select * from iceberg_jdbc.ecommerce.customers limit 5")
df_check.show()

+------------+------------------------+------------------+--------------+
| customer_id|customer_zip_code_prefix|     customer_city|customer_state|
+------------+------------------------+------------------+--------------+
|hCT0x9JiGXBQ|                   58125|   varzea paulista|            SP|
|PxA7fv9spyhx|                    3112|armacao dos buzios|            RJ|
|g3nXeJkGI0Qw|                    4119|           jandira|            SP|
|EOEsCQ6QlpIg|                   18212|        uberlandia|            MG|
|mVz5LO2Vd6cL|                   88868|          ilhabela|            SP|
+------------+------------------------+------------------+--------------+



### Orders

In [8]:
# Read the orders CSV: the first row supplies the column names and the column
# types are inferred from the data.
df_orders = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://dataset/ecommerce_order_dataset/train/df_Orders.csv")
)

# Register the DataFrame as a temp view so the CTAS statement can select from it.
df_orders.createOrReplaceTempView("orders_raw")

# Create (or replace) the Iceberg table from the temp view.
spark.sql(
    """CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.orders
USING iceberg
AS SELECT * FROM orders_raw;
"""
)

# Read a handful of rows back from the Iceberg table as a sanity check.
df_check = spark.sql("select * from iceberg_jdbc.ecommerce.orders limit 5")
df_check.show()

+------------+------------+------------+------------------------+-------------------+-------------------------+-----------------------------+
|    order_id| customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_timestamp|order_estimated_delivery_date|
+------------+------------+------------+------------------------+-------------------+-------------------------+-----------------------------+
|Axfy13Hk4PIk|hCT0x9JiGXBQ|   delivered|     2017-10-22 18:57:54|2017-10-22 19:14:13|      2017-10-26 22:19:52|                   2017-11-09|
|v6px92oS8cLG|PxA7fv9spyhx|   delivered|     2018-06-20 21:40:31|2018-06-20 22:20:20|      2018-07-03 22:51:22|                   2018-07-24|
|Ulpf9skrhjfm|g3nXeJkGI0Qw|   delivered|     2018-02-16 16:19:31|2018-02-17 16:15:35|      2018-02-27 01:29:50|                   2018-03-08|
|bwJVWupf2keN|EOEsCQ6QlpIg|   delivered|     2018-08-18 18:04:29|2018-08-18 18:15:16|      2018-08-27 20:03:51|                   2018-09-19|
|Dd0Qn

### Order Items

In [9]:
# Read the order-items CSV: the first row supplies the column names and the
# column types are inferred from the data.
df_order_items = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://dataset/ecommerce_order_dataset/train/df_OrderItems.csv")
)

# Register the DataFrame as a temp view so the CTAS statement can select from it.
df_order_items.createOrReplaceTempView("order_items_raw")

# Create (or replace) the Iceberg table from the temp view.
spark.sql(
    """CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.order_items
USING iceberg
AS SELECT * FROM order_items_raw;
"""
)

# Read a handful of rows back from the Iceberg table as a sanity check.
df_check = spark.sql("select * from iceberg_jdbc.ecommerce.order_items limit 5")
df_check.show()

+------------+------------+------------+------+----------------+
|    order_id|  product_id|   seller_id| price|shipping_charges|
+------------+------------+------------+------+----------------+
|Axfy13Hk4PIk|90K0C1fIyQUf|ZWM05J9LcBSF|223.51|           84.65|
|v6px92oS8cLG|qejhpMGGVcsl|IjlpYfhUbRQs| 170.8|           23.79|
|Ulpf9skrhjfm|qUS5d2pEAyxJ|77p2EYxcM9MD|  64.4|           17.38|
|bwJVWupf2keN|639iGvMyv0De|jWzS0ayv9TGf| 264.5|           30.72|
|Dd0QnrMk9Cj5|1lycYGcsic2F|l1pYW6GBnPMr| 779.9|           30.66|
+------------+------------+------------+------+----------------+



### Payments

In [10]:
# Read the payments CSV: the first row supplies the column names and the column
# types are inferred from the data.
df_payments = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://dataset/ecommerce_order_dataset/train/df_Payments.csv")
)

# Register the DataFrame as a temp view so the CTAS statement can select from it.
df_payments.createOrReplaceTempView("payments_raw")

# Create (or replace) the Iceberg table from the temp view.
spark.sql(
    """CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.payments
USING iceberg
AS SELECT * FROM payments_raw;
"""
)

# Read a handful of rows back from the Iceberg table as a sanity check.
df_check = spark.sql("select * from iceberg_jdbc.ecommerce.payments limit 5")
df_check.show()

+------------+------------------+------------+--------------------+-------------+
|    order_id|payment_sequential|payment_type|payment_installments|payment_value|
+------------+------------------+------------+--------------------+-------------+
|Axfy13Hk4PIk|                 1| credit_card|                   1|       259.14|
|v6px92oS8cLG|                 1| credit_card|                   8|       382.39|
|Ulpf9skrhjfm|                 1| credit_card|                   4|       249.25|
|bwJVWupf2keN|                 1| credit_card|                   2|        27.79|
|Dd0QnrMk9Cj5|                 1| credit_card|                   1|        76.15|
+------------+------------------+------------+--------------------+-------------+



### Products

In [11]:
# Read the products CSV: the first row supplies the column names and the column
# types are inferred from the data.
df_products = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3a://dataset/ecommerce_order_dataset/train/df_Products.csv")
)

# Register the DataFrame as a temp view so the CTAS statement can select from it.
df_products.createOrReplaceTempView("products_raw")

# Create (or replace) the Iceberg table from the temp view.
spark.sql(
    """CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.products
USING iceberg
AS SELECT * FROM products_raw;
"""
)

# Read a handful of rows back from the Iceberg table as a sanity check.
df_check = spark.sql("select * from iceberg_jdbc.ecommerce.products limit 5")
df_check.show()

+------------+---------------------+----------------+-----------------+-----------------+----------------+
|  product_id|product_category_name|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+------------+---------------------+----------------+-----------------+-----------------+----------------+
|90K0C1fIyQUf|                 toys|           491.0|             19.0|             12.0|            16.0|
|qejhpMGGVcsl|        watches_gifts|           440.0|             18.0|             14.0|            17.0|
|qUS5d2pEAyxJ| costruction_tools...|          2200.0|             16.0|             16.0|            16.0|
|639iGvMyv0De|                 toys|          1450.0|             68.0|              3.0|            48.0|
|1lycYGcsic2F|                 toys|           300.0|             17.0|              4.0|            12.0|
+------------+---------------------+----------------+-----------------+-----------------+----------------+

