In [1]:
import os, sys
import logging
# Configure the root logger once: INFO and above, "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Notebook-level logger; its own level is pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Emit a first record so the logging setup can be checked in the cell output.
logger.info("Application started successfully.")

2025-06-13 09:40:32,246 - INFO - Application started successfully.


In [2]:
# Read connection settings from the process environment.
# Every lookup is mandatory: a missing variable raises KeyError right here.

# Azure storage account settings
storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]

# Nessie settings; REF is the reference name used by the branch statements further down
NESSIE_URI = os.environ["NESSIE_URI"]
REF = "etl"
FULL_PATH_TO_WAREHOUSE = os.environ["WAREHOUSE"]

# Values read from the AWS_* variables
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

In [3]:
# Build the Spark configuration for this notebook.
from pyspark import SparkConf

# NOTE(review): the session-creation output further down reports that an existing
# Spark session was reused ("only runtime SQL configurations will take effect"),
# so confirm that app name, master and memory settings actually apply.
conf = (
    SparkConf()
    .setAppName("Read_csv_from_azure_blob")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

# Set the default reference of the `nessie` catalog to REF.
# Left as the cell's last expression, so the SparkConf repr is displayed.
conf.set("spark.sql.catalog.nessie.ref", REF)

<pyspark.conf.SparkConf at 0x77252883e5f0>

In [4]:
from pyspark.sql import SparkSession

# Obtain the session configured by `conf`; getOrCreate() returns an already
# running session if one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

# Display the session as the cell result.
spark

25/06/13 09:40:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# The session is already created by the preceding cell, which is identical to what
# this cell used to do; getOrCreate() would only hand back that same session.
# Display it instead of importing and building it a second time.
spark

In [17]:
%%sql
DROP BRANCH IF EXISTS etl IN nessie

status
OK


In [18]:
# Create the working branch (REF) from main unless the catalog already lists it.
df = spark.sql("LIST REFERENCES IN nessie")
reference_exists = df.filter(df.name == REF).count() > 0
if not reference_exists:
    # Use REF here as well, so the branch created is the one that was checked for.
    spark.sql(f"CREATE BRANCH {REF} IN nessie FROM main").show()


+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|016d6a46b6679d403...|
+-------+----+--------------------+



25/06/13 09:51:41 WARN S3FileIO: Unclosed S3FileIO instance created by:
	org.apache.iceberg.aws.s3.S3FileIO.initialize(S3FileIO.java:444)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:402)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:349)
	org.apache.iceberg.nessie.NessieCatalog.initialize(NessieCatalog.java:132)
	org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:277)
	org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:331)
	org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:153)
	org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:752)
	org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:54)
	scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:54)
	org.apache.spark.sql.connector.

In [19]:
# Print the column names of the bronze table, one per line, each prefixed with ", ".
table_name = "nessie.bronze.currency_rate"
df = spark.sql(f"select * from {table_name}")

for column_name in df.columns:
    print(f", {column_name}")

, currency
, date
, rate
, source_filepath
, ingestion_datetime


In [28]:
import importlib
import notebooks.lib.silver_fact_adventureWorks_sql as sql

# Reload so edits to the SQL module are picked up without restarting the kernel.
importlib.reload(sql)

# One entry per silver table: target table name, the statement that creates it,
# the query that selects its rows, and a per-source on/off switch.
source_list = [
    {
        "table_name": target_table,
        "sql_create_table": create_statement,
        "sql_select": select_statement,
        "enabled": True,
    }
    for target_table, create_statement, select_statement in (
        (
            "nessie.silver.sales_header",
            sql.sql_create_silver_sales_header,
            sql.sql_select_bronze_sales_header,
        ),
        (
            "nessie.silver.sales_detail",
            sql.sql_create_silver_sales_detail,
            sql.sql_select_bronze_sales_detail,
        ),
        (
            "nessie.silver.currency_rate_history",
            sql.sql_create_silver_currency_rate_history,
            sql.sql_select_bronze_currency_rate_history,
        ),
    )
]

In [29]:
# Switch the nessie catalog to the REF reference and show the reference now in use.
(
    spark
    .sql(f"USE REFERENCE {REF} IN nessie")
    .show()
)

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|747137a0faba4562e...|
+-------+----+--------------------+



In [30]:
# Rebuild each enabled silver table in source_list from its select query.
for item in source_list:
    table_name = item["table_name"]

    # Honour the per-source switch declared in source_list; entries that lack the
    # key are still processed.
    if not item.get("enabled", True):
        logger.info('skipping disabled source:%s', table_name)
        continue

    # create table
    logger.info('creating table if not exists:%s', table_name)
    spark.sql(item["sql_create_table"])

    # select data from bronze table
    logger.info('selecting data from bronze')
    df = spark.sql(item["sql_select"])
    # count() is an action: it executes the select once just to report the row count.
    num_rows, num_columns = df.count(), len(df.columns)

    # insert data into silver
    logger.info('inserting data (%s rows, %s columns) into %s', num_rows, num_columns, table_name)
    # NOTE(review): createOrReplace() replaces an existing table's definition and data
    # with the DataFrame's, so whatever the create statement above defined is
    # superseded - confirm this is intended rather than an append/overwrite.
    df.writeTo(table_name).createOrReplace()
    logger.info('data inserted successfully')



2025-06-13 09:57:02,749 - INFO - creating table if not exists:nessie.silver.sales_header
2025-06-13 09:57:02,791 - INFO - selecting data from bronze
2025-06-13 09:57:03,127 - INFO - inserting data (27659 rows 12, columns) nessie.silver.sales_header
2025-06-13 09:57:03,976 - INFO - data inserted successfully
2025-06-13 09:57:03,978 - INFO - creating table if not exists:nessie.silver.sales_detail
2025-06-13 09:57:04,019 - INFO - selecting data from bronze
2025-06-13 09:57:04,098 - INFO - inserting data (60398 rows 9, columns) nessie.silver.sales_detail
2025-06-13 09:57:04,636 - INFO - data inserted successfully
2025-06-13 09:57:04,637 - INFO - creating table if not exists:nessie.silver.currency_rate_history
2025-06-13 09:57:04,658 - INFO - selecting data from bronze
2025-06-13 09:57:04,742 - INFO - inserting data (14264 rows 5, columns) nessie.silver.currency_rate_history
2025-06-13 09:57:05,067 - INFO - data inserted successfully


In [31]:
# Show every reference in the nessie catalog with its current hash (before the merge cell).
(
    spark
    .sql("LIST REFERENCES IN nessie")
    .show()
)

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|4c368f1399ae741aa...|
| Branch|main|016d6a46b6679d403...|
+-------+----+--------------------+



In [32]:
# Merge the REF branch into main and show the resulting main reference.
(
    spark
    .sql(f"MERGE BRANCH {REF} INTO main IN nessie")
    .show()
)

+----+--------------------+
|name|                hash|
+----+--------------------+
|main|5b392d39d1e805fce...|
+----+--------------------+



In [33]:
# List the references again so main's hash can be compared with the pre-merge listing.
(
    spark
    .sql("LIST REFERENCES IN nessie")
    .show()
)

+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|4c368f1399ae741aa...|
| Branch|main|5b392d39d1e805fce...|
+-------+----+--------------------+

