In [1]:
import os, sys
import logging
# Configure the root logger: INFO level, timestamped "time - level - message" lines.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Notebook-wide logger; its own level is pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Application started successfully.")

2025-06-30 07:18:57,749 - INFO - Application started successfully.


In [2]:
# Runtime settings. Everything except the branch name is read from the process
# environment; a missing variable raises KeyError right here.
REF = "etl"  # name of the Nessie branch used by the cells further down

NESSIE_URI = os.environ["NESSIE_URI"]
FULL_PATH_TO_WAREHOUSE = os.environ["WAREHOUSE"]
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

In [3]:
# set pyspark configuration
from pyspark import SparkConf

# Spark settings for a local run that uses all available cores.
# NOTE(review): the recorded output of the session cell warns "Using an existing
# Spark session", so these values may not have been applied — confirm.
conf = (
    SparkConf()
    .setAppName("Read_csv_from_azure_blob")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

In [4]:
from pyspark.sql import SparkSession

# Reuse the running Spark session if there is one, otherwise start one from `conf`.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
spark  # last expression of the cell: display the session

25/06/30 07:18:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
from pyspark.sql import SparkSession

# NOTE(review): this cell repeats the previous one. getOrCreate() hands back the
# session that already exists, so it looks redundant — confirm before removing.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
spark  # last expression of the cell: display the session

In [6]:
# Make sure the "silver" namespace exists on the main reference of the nessie catalog.
spark.sql("USE REFERENCE main IN nessie").show()

namespace = "silver"
namespaces_df = spark.sql("SHOW NAMESPACES in nessie")
namespace_exists = (
    namespaces_df
    .filter(namespaces_df.namespace == namespace)
    .count()
    > 0
)
logger.info(f"namespace {namespace} exists: {namespace_exists}")

# Create the namespace only when the listing above did not contain it.
if not namespace_exists:
    spark.sql(f"CREATE NAMESPACE nessie.{namespace}").show()

# Show the namespaces again so the result is visible in the cell output.
spark.sql("SHOW NAMESPACES in nessie").show()


+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch|main|358bf992df5de616a...|
+-------+----+--------------------+



25/06/30 07:19:05 WARN S3FileIO: Unclosed S3FileIO instance created by:
	org.apache.iceberg.aws.s3.S3FileIO.initialize(S3FileIO.java:444)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:402)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:349)
	org.apache.iceberg.nessie.NessieCatalog.initialize(NessieCatalog.java:132)
	org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:277)
	org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:331)
	org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:153)
	org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:752)
	org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:54)
	scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:54)
	org.apache.spark.sql.connector.

+---------+
|namespace|
+---------+
|   silver|
|   bronze|
+---------+



In [7]:
# Recreate the ETL branch from main: drop any leftover branch of the same name,
# then branch off main again. Every statement names the catalog explicitly so
# none of them depends on whichever catalog is current in the session.
spark.sql("USE REFERENCE main IN nessie").show()
spark.sql(f"DROP BRANCH IF EXISTS {REF} IN nessie").show()
spark.sql(f"CREATE BRANCH {REF} IN nessie FROM main").show()

# List the references so the new branch is visible in the cell output.
spark.sql("LIST REFERENCES IN nessie").show()

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch|main|358bf992df5de616a...|
+-------+----+--------------------+

+------+
|status|
+------+
|    OK|
+------+

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|358bf992df5de616a...|
+-------+----+--------------------+

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|358bf992df5de616a...|
| Branch|main|358bf992df5de616a...|
+-------+----+--------------------+



In [8]:
import notebooks.lib.silver_fact_adventureWorks_sql as sql
import importlib
# Re-import the SQL module so edits to it are picked up without restarting the kernel.
importlib.reload(sql)


def _source(table_name, sql_create_table, sql_select, enabled=True):
    """Build one `source_list` entry.

    Args:
        table_name: Name of the silver table (without namespace).
        sql_create_table: SQL statement that creates the silver table.
        sql_select: SQL statement that selects the rows to load.
        enabled: Whether the load cell should process this entry.

    Returns:
        A dict with the keys "table_name", "sql_create_table", "sql_select"
        and "enabled".
    """
    return {
        "table_name": table_name,
        "sql_create_table": sql_create_table,
        "sql_select": sql_select,
        "enabled": enabled,
    }


# One entry per silver table; the statements come from the reloaded `sql` module.
source_list = [
    _source(
        "SALES_HEADER",
        sql.sql_create_silver_sales_header,
        sql.sql_select_bronze_sales_header,
    ),
    _source(
        "SALES_DETAIL",
        sql.sql_create_silver_sales_detail,
        sql.sql_select_bronze_sales_detail,
    ),
    _source(
        "CURRENCY_RATE_HISTORY",
        sql.sql_create_silver_currency_rate_history,
        sql.sql_select_bronze_currency_rate_history,
    ),
]

In [9]:
import pyspark.sql.functions as sqlf

# Load every enabled entry of `source_list` into its silver table while the
# session points at the ETL branch.
spark.sql(f"USE REFERENCE {REF} IN nessie").show()

for item in source_list:
    if not item["enabled"]:
        continue

    table_name = f'{namespace}.{item["table_name"].upper()}'
    # Use one catalog-qualified name for the existence check, the truncate and
    # the append, so all three are guaranteed to address the same table.
    qualified_name = f"nessie.{table_name}"
    table_exists = spark.catalog.tableExists(qualified_name)

    if table_exists:
        # Empty the existing table so the append below does not duplicate rows.
        logger.info(f'truncate table: {table_name}')
        spark.sql(f"TRUNCATE TABLE {qualified_name}")

    # Run the table's create statement from the `sql` module.
    logger.info(f'creating table if not exists:{table_name}')
    spark.sql(item["sql_create_table"])

    # Select the rows to load using the entry's select statement.
    logger.info('selecting data from bronze')
    df = spark.sql(item["sql_select"])
    # NOTE(review): count() runs the select once just for the log line; the
    # append below evaluates it again.
    num_rows, num_columns = df.count(), len(df.columns)

    # Append the selected rows to the silver table.
    logger.info(f'inserting data ({num_rows} rows, {num_columns} columns) into {table_name}')
    df.writeTo(qualified_name).append()
    logger.info('data inserted successfully')



+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|358bf992df5de616a...|
+-------+----+--------------------+



2025-06-30 07:19:09,918 - INFO - creating table if not exists:silver.SALES_HEADER
25/06/30 07:19:41 WARN S3FileIO: Unclosed S3FileIO instance created by:
	org.apache.iceberg.aws.s3.S3FileIO.initialize(S3FileIO.java:444)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:402)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:349)
	org.apache.iceberg.nessie.NessieCatalog.initialize(NessieCatalog.java:132)
	org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:277)
	org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:331)
	org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:153)
	org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:752)
	org.apache.spark.sql.execution.datasources.v2.NessieCatalogBridge.setCurrentRefForSpark(NessieCatalogBridge.java:105)
	org.apache.spark.sql.execution.datasources.v2.UseReferenceExec.runInternal(UseReferenceExec.scala:44)
	org.apache.spark.sql.execution.datasources.v2.NessieExec

In [10]:
# List the references in the nessie catalog after the load; the recorded output
# shows the etl branch at a different hash than main.
spark.sql("LIST REFERENCES IN nessie").show()

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|2f9bce27970a3c95c...|
| Branch|main|358bf992df5de616a...|
+-------+----+--------------------+



In [11]:
# Publish the load: merge the ETL branch into main, then drop the branch.
# The order matters — the branch has to exist when it is merged.
spark.sql(f"MERGE BRANCH {REF} INTO main IN nessie").show()
spark.sql(f"DROP BRANCH {REF} IN nessie").show()

+----+--------------------+
|name|                hash|
+----+--------------------+
|main|ace3a8757bc1d4cf4...|
+----+--------------------+

+------+
|status|
+------+
|    OK|
+------+



In [12]:
# List the references once more after the merge and drop; the recorded output
# shows only main remaining.
spark.sql("LIST REFERENCES IN nessie").show()

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch|main|ace3a8757bc1d4cf4...|
+-------+----+--------------------+

