In [1]:
import os, sys
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.NOTSET)

logger.info("Application started successfully.")


2025-06-17 10:47:51,773 - INFO - Application started successfully.


In [2]:
# load environment variables

storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]
NESSIE_URI = os.environ['NESSIE_URI']
REF = "etl"
FULL_PATH_TO_WAREHOUSE = os.environ['WAREHOUSE']
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_KEY  = os.environ['AWS_SECRET_ACCESS_KEY']

In [None]:
# set pyspark configuration
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("Read_csv_from_azure_blob") \
    .setMaster("local[*]") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory", "4g")

conf.set("spark.sql.shuffle.partitions", "200")
conf.set("spark.sql.shuffle.autoBroadcastJoinThreshold", "1")
conf.set("spark.spark.executor.extraJavaOptions", "-XX:+UseG1GC")
conf.set("spark.spark.driver.extraJavaOptions", "-XX:+UseG1GC")


<pyspark.conf.SparkConf at 0x7f14c14a1bd0>

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

25/06/17 10:47:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [6]:
# create namespace gold
namespace = "GOLD"
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {namespace}")
spark.sql(f"SHOW NAMESPACES IN nessie").show()

+---------+
|namespace|
+---------+
|   BRONZE|
|   SILVER|
|     GOLD|
+---------+



In [7]:
# create etl branch
spark.sql(f"USE REFERENCE main IN nessie")
spark.sql(f"DROP BRANCH IF EXISTS {REF}")
spark.sql(f"CREATE BRANCH {REF} IN nessie FROM main")
spark.sql(f"LIST REFERENCES IN nessie").show()


+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch| etl|ee87ff24009d348fc...|
| Branch|main|ee87ff24009d348fc...|
+-------+----+--------------------+



In [8]:
import notebooks.lib.gold_fact_sales_sql as sql
import importlib
importlib.reload(sql)

source_list = [
    {
        "table_name": "INTERNET_SALES_HEADER",
        "sql_create_table": sql.sql_create_gold_internet_sales_header,
        "sql_select_temp": sql.sql_select_temp_silver_internet_sales_header,
        "sql_merge": sql.sql_merge_gold_internet_sales_header,
        "enabled": True
    },
    {
        "table_name": "INTERNET_SALES_DETAIL",
        "sql_create_table": sql.sql_create_gold_internet_sales_detail,
        "sql_select_temp": sql.sql_select_temp_silver_internet_sales_header,
        "sql_merge": sql.sql_merge_gold_internet_sales_detail,
        "enabled": False
    }
]

In [None]:
# change branch to etl
spark.sql(f"USE REFERENCE main IN nessie").show()

for item in [item for item in source_list if item["enabled"]]:
    table_name = item["table_name"].upper()
    # create table
    # logger.info(f'creating table if not exists:{table_name}')
    spark.sql(item["sql_create_table"])

    # select data into temp tables
    df = spark.sql(item["sql_select_temp"])

    # repartition and store into temp tables
    staging_table_name = table_name + "_" + datetime.now().strftime("%Y%m%d%H%M%S")
    logger.info(f'repartitioning temp table: {staging_table_name}')
    df.repartition(200).createOrReplaceTempView(staging_table_name)

    # merge data into gold
    try:
        logger.info(f'merging into: {table_name}')
        spark.sql(item["sql_merge"].format(source=staging_table_name))
        logger.info(f'data merged successfully')
    except Exception as e:
        print(str(e))
    finally:
        logger.info(f'dropping table: {staging_table_name}')
        spark.sql(f"DROP TABLE IF EXISTS {staging_table_name}")



+-------+----+--------------------+
|refType|name|                hash|
+-------+----+--------------------+
| Branch|main|ee87ff24009d348fc...|
+-------+----+--------------------+



25/06/17 10:47:57 WARN S3FileIO: Unclosed S3FileIO instance created by:
	org.apache.iceberg.aws.s3.S3FileIO.initialize(S3FileIO.java:444)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:402)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:349)
	org.apache.iceberg.nessie.NessieCatalog.initialize(NessieCatalog.java:132)
	org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:277)
	org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:331)
	org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:153)
	org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:752)
	org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:54)
	scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:54)
	org.apache.spark.sql.connector.

An error occurred while calling o46.sql.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 18.0 failed 1 times, most recent failure: Lost task 0.0 in stage 18.0 (TID 577) (5b0bafe9d834 executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.io.ByteArrayOutputStream.<init>(ByteArrayOutputStream.java:81)
	at org.apache.iceberg.shaded.org.apache.parquet.hadoop.CodecFactory$HeapBytesCompressor.<init>(CodecFactory.java:222)
	at org.apache.iceberg.shaded.org.apache.parquet.hadoop.CodecFactory.createCompressor(CodecFactory.java:273)
	at org.apache.iceberg.shaded.org.apache.parquet.hadoop.CodecFactory.getCompressor(CodecFactory.java:255)
	at org.apache.iceberg.parquet.ParquetWriter.<init>(ParquetWriter.java:93)
	at org.apache.iceberg.parquet.Parquet$WriteBuilder.build(Parquet.java:428)
	at org.apache.iceberg.parquet.Parquet$DataWriteBuilder.build(Parquet.java:829)
	at org.apache.iceberg.data.BaseFileWriterFactory.newDataWriter(BaseFil

In [11]:
spark.sql(f"MERGE BRANCH {REF} INTO main IN nessie").show()
spark.sql(f"DROP BRANCH {REF} IN nessie").show()

2025-06-17 10:50:32,479 - INFO - Closing down clientserver connection


ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
spark.sql("LIST REFERENCES IN nessie").show()