In [1]:
import os 
import logging
# Configure logging for the notebook: timestamped records at INFO level.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the cells below; its own level is pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Emit a first record so it is visible that the logging setup works.
logger.info("Application started successfully.")

2025-06-13 11:00:40,851 - INFO - Application started successfully.


In [7]:
# Read connection settings from the process environment.
# Every lookup indexes os.environ directly, so an unset variable raises
# KeyError in this cell instead of failing later.
env = os.environ

# Azure storage account settings
storage_account = env["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = env["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]

# Nessie settings; REF is the reference name used by the branch cells
NESSIE_URI = env["NESSIE_URI"]
REF = "etl"
FULL_PATH_TO_WAREHOUSE = env["WAREHOUSE"]

# S3 endpoint and credentials
AWS_S3_ENDPOINT = env["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = env["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = env["AWS_SECRET_ACCESS_KEY"]

In [3]:
# set pyspark configuration
from pyspark import SparkConf

# Parenthesised chain instead of backslash continuations: the previous version
# ended with a dangling "\" in front of a blank line, which is easy to break.
conf = (
    SparkConf()
    .setAppName("process_bronze_adventureWorks")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

# set MinIO config
# Spark only forwards properties prefixed with "spark.hadoop." to the Hadoop
# configuration that the S3A filesystem reads. Bare "fs.s3a.*" keys on a
# SparkConf are stored but never reach S3A, so the prefix is required.
s3a_settings = {
    "fs.s3a.access.key": AWS_ACCESS_KEY,
    "fs.s3a.secret.key": AWS_SECRET_KEY,
    "fs.s3a.endpoint": AWS_S3_ENDPOINT,
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.path.style.access": "true",
}
for hadoop_key, hadoop_value in s3a_settings.items():
    conf.set(f"spark.hadoop.{hadoop_key}", hadoop_value)

<pyspark.conf.SparkConf at 0x78d75960abc0>

In [4]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one;
# otherwise it starts a new session from `conf`.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Keep the session as the last expression so the notebook renders it.
spark

25/06/13 11:00:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Load AdventureWorks seed data into the bronze layer

In [16]:
# Make sure the bronze namespace exists in the nessie catalog.
namespace = "bronze"

# List the catalog's namespaces and count the rows matching the target name.
df = spark.sql("SHOW NAMESPACES in nessie")
matching_rows = df.where(df["namespace"] == namespace).count()
namespace_exists = matching_rows > 0

logger.info(f"namespace {namespace} exists: {namespace_exists}")

# Create the namespace only when the lookup above did not find it.
if not namespace_exists:
    spark.sql(f"CREATE NAMESPACE nessie.{namespace}").show()


2025-06-13 11:02:51,059 - INFO - namespace bronze exists: True


In [21]:
# Sources to load: each entry pairs a seed CSV with its bronze Iceberg table.
# The second element of each pair is the "enabled" flag; the loader loop only
# processes entries whose flag is True.
source_list = [
    {
        "path": f"s3a://seed/adventureWorks/{dataset}.csv",
        "iceberg_table": f"nessie.bronze.{dataset}",
        "enabled": enabled,
    }
    for dataset, enabled in (
        ("currency_rate", True),
        ("currency", True),
        ("customer", True),
        ("product", True),
        ("sales", True),
    )
]

In [22]:
# create etl branch
# Ensure the working branch named by REF exists in the nessie catalog,
# creating it from main when it is missing.
df = spark.sql("LIST REFERENCES IN nessie")
reference_exists = df.filter(df["name"] == REF).count() > 0
if not reference_exists:
    # Create the branch under the same name that was just checked. The previous
    # version hard-coded "etl" here, so changing REF would have checked one
    # branch and created another.
    # NOTE(review): nothing in this cell switches the session to REF; confirm
    # the catalog's configured reference is the branch the load should use.
    spark.sql(f"CREATE BRANCH {REF} IN nessie FROM main").show()


In [23]:
# Load every enabled CSV source into its bronze Iceberg table.
# Writes are best-effort: a failure is logged with its traceback and the loop
# continues with the next source. "Writing complete" is only logged when the
# write actually succeeded.
for item in source_list:
    if not item["enabled"]:
        continue

    # Read CSV from MinIO
    logger.info(f'read csv from bucket: {item["path"]}')
    df = spark.read.option("header", "true").csv(item["path"])

    # count() runs a Spark job over the file; it is only used for the log line.
    num_columns, num_rows = len(df.columns), df.count()

    # Write to Nessie table using createOrReplace
    table_name = item["iceberg_table"]
    logger.info(f'writing data ({num_rows} rows, {num_columns} columns) into: {table_name}')
    try:
        df.writeTo(table_name).createOrReplace()
    except Exception:
        # logger.exception logs at ERROR level and appends the traceback, so
        # the cause of the failure is not lost.
        logger.exception(f"Error writing into: {table_name}")
    else:
        logger.info(f'Writing complete: {table_name}')

2025-06-13 11:04:18,847 - INFO - read csv from bucket: s3a://seed/adventureWorks/currency_rate.csv
2025-06-13 11:04:19,160 - INFO - writing data (14264 rows, 5 columns) into: nessie.bronze.currency_rate
2025-06-13 11:04:19,730 - INFO - Writing complete: nessie.bronze.currency_rate
2025-06-13 11:04:19,732 - INFO - read csv from bucket: s3a://seed/adventureWorks/currency.csv
2025-06-13 11:04:20,053 - INFO - writing data (105 rows, 4 columns) into: nessie.bronze.currency
2025-06-13 11:04:20,399 - INFO - Writing complete: nessie.bronze.currency
2025-06-13 11:04:20,400 - INFO - read csv from bucket: s3a://seed/adventureWorks/customer.csv
2025-06-13 11:04:20,722 - INFO - writing data (18484 rows, 17 columns) into: nessie.bronze.customer
2025-06-13 11:04:21,511 - INFO - Writing complete: nessie.bronze.customer
2025-06-13 11:04:21,512 - INFO - read csv from bucket: s3a://seed/adventureWorks/product.csv
2025-06-13 11:04:21,848 - INFO - writing data (295 rows, 10 columns) into: nessie.bronze.pro

In [24]:
# Preview the currency_rate table after the load; show() prints the first rows.
currency_rate_preview = spark.sql("""
select * from nessie.bronze.currency_rate
""")
currency_rate_preview.show()

+--------+----------+------+--------------------+--------------------+
|currency|      date|  rate|     source_filepath|  ingestion_datetime|
+--------+----------+------+--------------------+--------------------+
|     ARS|2010-12-29|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2010-12-30|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2010-12-31|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-01|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-02|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-03|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-04|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-05|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-06|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-07|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     ARS|2011-01-08|1.0000|abfss://datalake@...|2025-06-13T10:59:...|
|     

In [25]:
# Publish the work: merge the REF branch into main, then drop the branch.
# The statements run in order and each result is printed with show().
publish_statements = (
    f"MERGE BRANCH {REF} INTO main IN nessie",
    f"DROP BRANCH {REF} IN nessie",
)
for statement in publish_statements:
    spark.sql(statement).show()

+----+--------------------+
|name|                hash|
+----+--------------------+
|main|87ae237b6dde6e090...|
+----+--------------------+

+------+
|status|
+------+
|    OK|
+------+

