## Data ETL Notebook

**Layer**: Bronze

**Domain**: Risk-free

**Action**: Ingest RBNZ Yields and Series Data

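The purpose of this notebook is to ingest the RBNZ workbooks `hb2-daily-close.xlsx` and/or `hb2-daily.xlsx` from the `raw_data` volume, apply SCD Type 2 data lineage, and write the yields to a bronze data table with full history. The series definitions contained in the same workbooks are upserted into a metadata table.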

In [0]:
# Install project requirements
# openpyxl is never imported directly in this notebook; it is the engine pandas uses
# when pd.read_excel opens the .xlsx workbooks in the cells below.
%pip install openpyxl

In [0]:
# Import libraries
import pandas as pd
import datetime
from pyspark.sql.functions import lit
from delta.tables import DeltaTable
import os

In [0]:
# Define path for input data file
source_directory = '/Volumes/workspace/riskfree_bronze/raw_data/'

# List regular files only (sub-folders such as the archive folder are skipped).
# os.listdir returns entries in arbitrary order, so the names are sorted to make the
# processing order of the cells below reproducible from run to run.
source_file_names = sorted(
    f for f in os.listdir(source_directory)
    if os.path.isfile(os.path.join(source_directory, f))
)
source_file_names

In [0]:
# Check source file names
# Every file in source_file_names is processed by the cells below, so the run is rejected
# up front when the folder is empty or contains anything other than the expected workbooks
# (checking only that *one* expected file exists would let stray files through).
valid_files = ['hb2-daily-close.xlsx', 'hb2-daily.xlsx']
unexpected_files = [name for name in source_file_names if name not in valid_files]
if not source_file_names or unexpected_files:
    raise ValueError(
        "Source file names must be one of: 'hb2-daily-close.xlsx', 'hb2-daily.xlsx'"
        f" (found: {source_file_names})"
    )

In [0]:
# Capture one timestamp for the whole run so that every row written by the cells below
# carries the same ingestion_timestamp / effective_start value.
# datetime.now() is called without a tz argument, so this is a naive local-time value.
ingestion_timestamp = datetime.datetime.now()
print('Ingestion timestamp:', ingestion_timestamp)

Ingest Series Definitions into the metadata table `workspace.riskfree_metadata.series_definitions` (merge / upsert on `series_id`)

In [0]:
# Create the Delta table if it does not exist.
# The DDL and the table handle do not depend on the file being processed, so they are
# set up once here instead of on every loop iteration.
spark.sql("""
CREATE TABLE IF NOT EXISTS workspace.riskfree_metadata.series_definitions (
    series_id STRING,
    group STRING,
    series STRING,
    unit STRING,
    note STRING,
    source_file_name STRING,
    ingestion_timestamp TIMESTAMP,
    PRIMARY KEY (series_id)
) USING DELTA
""")
delta_series_table = DeltaTable.forName(spark, "workspace.riskfree_metadata.series_definitions")

for file in source_file_names:

    # Set location
    excel_path = f"{source_directory}{file}"

    # Load 'Series Definitions' sheet
    df_series = pd.read_excel(excel_path, sheet_name="Series Definitions")

    # Clean column names: trimmed, snake_case, lower-case.
    # str() guards against non-string headers (e.g. a numeric or date header cell).
    df_series.columns = [str(col).strip().replace(" ", "_").lower() for col in df_series.columns]

    # Add source info
    df_series["source_file_name"] = file
    df_series["ingestion_timestamp"] = ingestion_timestamp

    # Convert to Spark
    spark_series = spark.createDataFrame(df_series)

    # Upsert on series_id: existing definitions are replaced by the incoming row,
    # unseen series are inserted.
    delta_series_table.alias("t").merge(
        spark_series.alias("s"),
        "t.series_id = s.series_id"
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print("Series metadata saved to workspace.riskfree_metadata.series_definitions from ", file)

Ingest Nominal Yield Data into bronze table (append, SCD Type 2)

In [0]:
# Imports & Config
# Sheet name, target table and its schema are the same for every file, so they are
# defined once before the loop.
sheet_name = "Data"
bronze_table = "workspace.riskfree_bronze.rbnz_yields_raw"

# Business key that identifies one observation; SCD2 versions are tracked per key.
key_columns = ["date", "series_id", "publish_date"]

# Columns of the bronze table, used to build the explicit insert clause of the merge.
bronze_columns = [
    "date",
    "series_id",
    "yield_percent",
    "publish_date",
    "source_file_name",
    "ingestion_timestamp",
    "effective_start",
    "effective_end",
    "is_current",
]

# Create Bronze Table (if needed)
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {bronze_table} (
    date DATE,
    series_id STRING,
    yield_percent DOUBLE,
    publish_date DATE,
    source_file_name STRING,
    ingestion_timestamp TIMESTAMP,
    effective_start TIMESTAMP,
    effective_end TIMESTAMP,
    is_current BOOLEAN
)
USING DELTA
""")
delta_table = DeltaTable.forName(spark, bronze_table)

for file in source_file_names:

    # Set location
    excel_path = f"{source_directory}{file}"

    # Get publish date from the 'Table Description' sheet
    df_description = pd.read_excel(excel_path, sheet_name="Table Description")
    published = df_description.loc[
        df_description["Published By"] == "Published Date",
        "Reserve Bank of New Zealand",
    ]
    if published.empty:
        # Fail with a clear message instead of an IndexError on .values[0]
        raise ValueError(f"No 'Published Date' row found in sheet 'Table Description' of {file}")
    publish_date = pd.to_datetime(published).dt.strftime('%Y-%m-%d').values[0]

    # Load Excel Sheet (the first 4 rows are skipped; the 'Series Id' column is renamed to 'date')
    df = pd.read_excel(excel_path, sheet_name=sheet_name, skiprows=4)
    df = df.rename(columns={"Series Id": "date"})

    df["source_file_name"] = file
    df["ingestion_timestamp"] = ingestion_timestamp
    df["publish_date"] = publish_date

    # Normalize to long format: one row per (date, series_id)
    df_long = pd.melt(
        df,
        id_vars=["date", "source_file_name", "ingestion_timestamp", "publish_date"],
        var_name="series_id",
        value_name="yield_percent"
    )
    df_long = df_long.dropna(subset=["yield_percent"])

    # Convert to Spark, align column types with the bronze table schema
    # (publish_date is a 'YYYY-MM-DD' string here), and add the SCD2 fields.
    spark_df = spark.createDataFrame(df_long).selectExpr(
        "CAST(`date` AS DATE) AS `date`",
        "series_id",
        "CAST(yield_percent AS DOUBLE) AS yield_percent",
        "CAST(publish_date AS DATE) AS publish_date",
        "source_file_name",
        "ingestion_timestamp",
    )
    spark_df = spark_df \
        .withColumn("effective_start", lit(ingestion_timestamp)) \
        .withColumn("effective_end", lit(None).cast("timestamp")) \
        .withColumn("is_current", lit(True))

    # Rows whose key already has a current version with a DIFFERENT yield.
    # A source row that matches in the merge can only expire the old version; it cannot
    # also be inserted. These rows are therefore staged a second time with a NULL merge
    # key so that they fall into the not-matched branch and become the new current version.
    current_values = (
        delta_table.toDF()
        .where("is_current = true")
        .selectExpr(*key_columns, "yield_percent AS current_yield_percent")
    )
    revised_rows = (
        spark_df.join(current_values, on=key_columns, how="inner")
        .where("yield_percent <> current_yield_percent")
        .select(*spark_df.columns)
    )
    staged_df = (
        revised_rows.withColumn("merge_series_id", lit(None).cast("string"))
        .unionByName(spark_df.withColumn("merge_series_id", spark_df["series_id"]))
    )

    # SCD Type 2 Merge
    #   matched + changed yield -> close the current version
    #   not matched             -> insert (brand-new keys and the re-staged revised rows)
    delta_table.alias("t").merge(
        staged_df.alias("s"),
        "t.date = s.date AND t.series_id = s.merge_series_id AND t.publish_date = s.publish_date AND t.is_current = true"
    ).whenMatchedUpdate(
        condition="t.yield_percent != s.yield_percent",
        set={
            "effective_end": "s.effective_start",
            "is_current": "false"
        }
    ).whenNotMatchedInsert(
        # Explicit column list: the staging column merge_series_id must not be written
        values={column: f"s.{column}" for column in bronze_columns}
    ).execute()

    print("✅ SCD Type 2 merge completed into:", bronze_table, " from ", file)

Move each processed xlsx file into the archive folder, prefixing the file name with its publish date

In [0]:
# Imports & Config
sheet_name = "Table Description"

# The archive folder lives inside the source directory; derive it from source_directory
# so the two paths cannot drift apart.
archive_directory = f"{source_directory}archive/"

for file in source_file_names:

    # Set location
    excel_path = f"{source_directory}{file}"

    # Load Excel Sheet and read the publish date used to prefix the archived file name
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    published = df.loc[df["Published By"] == "Published Date", "Reserve Bank of New Zealand"]
    if published.empty:
        # Fail with a clear message instead of an IndexError on .values[0]
        raise ValueError(f"No 'Published Date' row found in sheet 'Table Description' of {file}")
    # strftime already yields strings, so no further string conversion is needed
    publish_date = pd.to_datetime(published).dt.strftime('%Y-%m-%d').values[0]

    # Excel archive name
    excel_path_archive = f"{archive_directory}{publish_date}-{file}"

    # Move the processed Excel file to the archive directory
    dbutils.fs.mv(excel_path, excel_path_archive)