# Playground

## Initialize globals

In [2]:
import io
import os
import shutil
import sys
import time
from contextlib import redirect_stdout
from typing import Any, Callable
from uuid import uuid4

from pyspark.sql import SparkSession

sys.path.append("../src")

from fabricengineer.transform.mlv.mlv import MaterializedLakeView
from fabricengineer.transform.silver.insertonly import SilverIngestionInsertOnlyService, ConstantColumn, LakehouseTable as LakehouseTableIO, get_mock_save_path
from fabricengineer.logging import TimeLogger

mlv: MaterializedLakeView
timer: TimeLogger

In [3]:
class NotebookUtilsFSMock:
    """Local-disk stand-in for the ``fs`` helper of ``notebookutils``.

    All paths are interpreted relative to the current working directory.
    """

    def _get_path(self, file: str) -> str:
        """Return ``file`` joined onto the current working directory."""
        working_dir = os.getcwd()
        return os.path.join(working_dir, file)

    def exists(self, path: str) -> bool:
        """Tell whether ``path`` (relative to the working directory) exists."""
        resolved = self._get_path(path)
        return os.path.exists(resolved)

    def put(self, file: str, content: str, overwrite: bool = False) -> None:
        """Write ``content`` to ``file``, creating parent directories as needed.

        Args:
            file: Target path, relative to the working directory.
            content: Text to write.
            overwrite: Allow replacing an already existing file.

        Raises:
            FileExistsError: The target exists and ``overwrite`` is false.
        """
        target = self._get_path(file)
        parent_dir = os.path.dirname(target)
        # Parent directories are created before the overwrite check.
        os.makedirs(parent_dir, exist_ok=True)

        if not overwrite and os.path.exists(target):
            raise FileExistsError(f"File {target} already exists and overwrite is set to False.")

        with open(target, 'w') as handle:
            handle.write(content)


class NotebookUtilsMock:
    """Minimal stand-in for ``notebookutils`` that exposes only an ``fs`` attribute."""

    def __init__(self):
        # Back ``fs`` with the local-filesystem mock.
        fs_mock = NotebookUtilsFSMock()
        self.fs = fs_mock

# Notebook-wide stand-ins named `spark` and `notebookutils`
# (presumably mirroring the objects a hosted notebook runtime injects -- confirm).
# Assignments at cell/module level already create globals; a `global` statement
# at this scope has no effect, so none is used here.
spark: SparkSession = SparkSession.builder.appName("PlaygroundSparkSession").getOrCreate()

notebookutils = NotebookUtilsMock()

25/08/01 12:47:23 WARN Utils: Your hostname, MacBook-Air-von-Enrico.local resolves to a loopback address: 127.0.0.1; using 192.168.0.7 instead (on interface en0)
25/08/01 12:47:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 12:47:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def sniff_logs(fn: Callable[[], Any]) -> tuple[Any, list[str]]:
    """Call ``fn`` and capture everything it prints to stdout.

    Args:
        fn: Zero-argument callable to invoke.

    Returns:
        A ``(result, logs)`` tuple: the value returned by ``fn`` and the
        captured stdout split into lines (line terminators removed).

    Raises:
        Whatever ``fn`` raises; the exception is not caught here.
    """
    log_stream = io.StringIO()
    # stdout is redirected only for the duration of the call.
    with redirect_stdout(log_stream):
        result = fn()
    logs = log_stream.getvalue().splitlines()
    return result, logs

In [5]:
def cleanup_fs():
    """Delete the scratch directories ``Files``, ``tmp`` and ``../tmp`` if they exist.

    ``Files`` and ``tmp`` are resolved through the ``notebookutils`` mock;
    ``../tmp`` is used as written.
    """
    fs_mock = notebookutils.fs
    candidates = (
        fs_mock._get_path("Files"),
        fs_mock._get_path("tmp"),
        "../tmp",
    )
    # filter() is lazy, so each existence check runs right before its removal.
    for stale_dir in filter(os.path.exists, candidates):
        shutil.rmtree(stale_dir)

# Remove any existing scratch directories (Files, tmp, ../tmp) before the cells below run.
cleanup_fs()

## TimeLogger

In [6]:
# Execute the module source directly in the notebook namespace (rather than
# importing it), so the names it defines at module level become notebook globals.
with open("../src/fabricengineer/logging/timer.py") as f:
    code = f.read()
exec(code, globals())

# `timer` is only annotated (never assigned) in the first cell, so on a fresh
# kernel the instance displayed here is the one bound by the exec above.
timer

TimeLogger(start_time=None, end_time=None, elapsed_time=None)

In [7]:
# Time a one-second sleep; each .log() call prints one of the TIMER-* lines in the cell output.
timer.start().log()
time.sleep(1)
timer.stop().log()

TIMER-START:	2025-08-01 12:47:24
TIMER-END:	2025-08-01 12:47:25, ELAPSED: 1.0031s


## MaterializedLakeView

In [8]:
# Execute mlv.py in the notebook namespace so its module-level names are bound
# here; `mlv` is only annotated in the first cell and gets its value from this exec.
with open("../src/fabricengineer/transform/mlv/mlv.py") as f:
    code = f.read()
exec(code, globals())


# NOTE(review): is_testing_mock=True presumably keeps the view from touching a
# real lakehouse -- confirm in mlv.py.
mlv.init(
    lakehouse="Lakehouse",
    schema="schema",
    table="table",
    table_suffix=None,
    is_testing_mock=True
)

# Show the configured identifiers, including the composed table_path.
mlv.to_dict()

{'lakehouse': 'Lakehouse',
 'schema': 'schema',
 'table': 'table',
 'table_path': 'Lakehouse.schema.table'}

In [9]:
# Re-initialise the view under a random lakehouse name (a fresh uuid4 per run).
mlv.init(lakehouse=str(uuid4()), schema="schema", table="table", table_suffix=None, is_testing_mock=True)

sql = """
SELECT * FROM Lakehouse.schema.table
"""

# Four consecutive create_or_replace calls; the captured stdout of each is displayed.
for i in range(4):
    # Only the first call is told that the view does not exist yet.
    is_existing = i > 0
    if i == 2:
        # Switch to a different definition from the third call onwards.
        sql = """
        SELECT * FROM Lakehouse.schema.table WHERE 1=0
        """
    result, logs = sniff_logs(lambda: mlv.create_or_replace(sql, mock_is_existing=is_existing))
    print(f"Logs-{i+1}")
    display(logs)

Logs-1


['CREATE SCHEMA IF NOT EXISTS ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema',
 'CREATE MLV: ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table']

Logs-2


['Nothing has changed.']

Logs-3


['REPLACE MLV: ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table',
 'DROP MATERIALIZED LAKE VIEW ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table',
 'CREATE SCHEMA IF NOT EXISTS ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema',
 'CREATE MLV: ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table']

Logs-4


['Nothing has changed.']

In [10]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# its value is never displayed.
mlv.file_path
# Delete the file that mlv.file_path points to, so the next create_or_replace call runs without it.
os.remove(mlv.file_path)

In [11]:
# Call create_or_replace after the file was removed while still reporting the
# view as existing; the captured log shows the resulting WARN and recreate steps.
result, logs = sniff_logs(
    lambda: mlv.create_or_replace(sql, mock_is_existing=True)
)

logs

['WARN: file=None, is_existing=True. RECREATE.',
 'DROP MATERIALIZED LAKE VIEW ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table',
 'CREATE SCHEMA IF NOT EXISTS ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema',
 'CREATE MLV: ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table']

In [12]:
# Capture the statement printed for a full refresh (it ends in "FULL" in the output).
result, logs = sniff_logs(
    lambda: mlv.refresh(full_refresh=True)
)

logs

['REFRESH MATERIALIZED LAKE VIEW ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table FULL']

In [13]:
# Same call with full_refresh=False; the printed statement has no FULL suffix
# (note the trailing space in the recorded output).
result, logs = sniff_logs(
    lambda: mlv.refresh(full_refresh=False)
)

logs

['REFRESH MATERIALIZED LAKE VIEW ea286a6c-1b39-42d9-bd6b-68be3737e9df.schema.table ']

## Clean up the file system

In [14]:
# Remove the Files, tmp and ../tmp scratch directories again before the next section.
cleanup_fs()

## SilverIngestionInsertOnlyService

In [15]:
# Source and destination use the same schema and table name; only the lakehouse differs.
src_table = LakehouseTableIO(
    lakehouse="BronzeLakehouse",
    schema="schema",
    table="table1"
)
dest_table = LakehouseTableIO(
    lakehouse="SilverLakehouse",
    schema=src_table.schema,
    table=src_table.table
)

# Configure the insert-only ingestion service with "id" as the only NK column,
# no constant columns and no transformations.
# NOTE(review): is_testing_mock=True presumably redirects reads/writes to the
# local paths returned by get_mock_save_path -- confirm in the service module.
etl = SilverIngestionInsertOnlyService()
etl.init(
    spark=spark,
    source_table=src_table,
    destination_table=dest_table,
    nk_columns=["id"],
    constant_columns=[],
    is_delta_load=False,
    delta_load_use_broadcast=True,
    transformations={},
    exclude_comparing_columns=None,
    include_comparing_columns=None,
    historize=True,
    partition_by_columns=None,
    is_testing_mock=True
)

25/08/01 12:47:25 WARN SQLConf: The SQL config 'spark.sql.legacy.parquet.int96RebaseModeInRead' has been deprecated in Spark v3.2 and may be removed in the future. Use 'spark.sql.parquet.int96RebaseModeInRead' instead.
25/08/01 12:47:25 WARN SQLConf: The SQL config 'spark.sql.legacy.parquet.int96RebaseModeInWrite' has been deprecated in Spark v3.2 and may be removed in the future. Use 'spark.sql.parquet.int96RebaseModeInWrite' instead.
25/08/01 12:47:25 WARN SQLConf: The SQL config 'spark.sql.legacy.parquet.datetimeRebaseModeInRead' has been deprecated in Spark v3.2 and may be removed in the future. Use 'spark.sql.parquet.datetimeRebaseModeInRead' instead.
25/08/01 12:47:25 WARN SQLConf: The SQL config 'spark.sql.legacy.parquet.datetimeRebaseModeInWrite' has been deprecated in Spark v3.2 and may be removed in the future. Use 'spark.sql.parquet.datetimeRebaseModeInWrite' instead.


In [20]:
from pyspark.sql import functions as F, types as T

# All five columns are declared non-nullable; the two date columns start out
# as strings and are converted to timestamps further down.
schema = T.StructType([
    T.StructField(column, dtype, False)
    for column, dtype in (
        ("id", T.IntegerType()),
        ("name", T.StringType()),
        ("department_id", T.IntegerType()),
        ("created_at", T.StringType()),
        ("updated_at", T.StringType()),
    )
])

# (id, name, department_id) per row; every row gets the same created/updated date.
# NOTE(review): id 7 ("Grace", department 1) is deliberately absent here --
# presumably to exercise delete handling; confirm.
data = [
    (row_id, name, department_id, "2023-01-01", "2023-01-01")
    for row_id, name, department_id in (
        (1, "Alice", 1),
        (2, "u-Bob", 2),
        (3, "u-Charlie", 3),
        (4, "David", 1),
        (5, "Eve", 2),
        (6, "Frank", 3),
        (8, "Heidi", 2),
        (9, "Ivan", 3),
        (10, "Judy", 1),
        (11, "Judy-2", 1),
        (12, "Judy-3", 1),
        (13, "Judy-4", 1),
    )
]

df_bronze = (
    spark.createDataFrame(data, schema)
    .withColumn("created_at", F.to_timestamp("created_at"))
    .withColumn("updated_at", F.to_timestamp("updated_at"))
)

df_bronze.show(truncate=False)

# Overwrite the parquet data at the mock location of the service's source table.
bronze_path = get_mock_save_path(etl._src_table)
df_bronze.write.format("parquet").mode("overwrite").save(bronze_path)

+---+---------+-------------+-------------------+-------------------+
|id |name     |department_id|created_at         |updated_at         |
+---+---------+-------------+-------------------+-------------------+
|1  |Alice    |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|2  |u-Bob    |2            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|3  |u-Charlie|3            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|4  |David    |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|5  |Eve      |2            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|6  |Frank    |3            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|8  |Heidi    |2            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|9  |Ivan     |3            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|10 |Judy     |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|11 |Judy-2   |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|12 |Judy-3   |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|
|13 |Judy-4   |1    

In [23]:
# Run the ingestion and show the DataFrame it returns, ordered by id.
new_data = etl.ingest()
new_data.orderBy("id").show(truncate=False)

+---+---+---+----+-------------+----------+----------+------------+--------------+
|PK |NK |id |name|department_id|created_at|updated_at|ROW_LOAD_DTS|ROW_DELETE_DTS|
+---+---+---+----+-------------+----------+----------+------------+--------------+
+---+---+---+----+-------------+----------+----------+------------+--------------+



In [24]:
# Read back the parquet data at the mock location of the destination table,
# sorted by id and then by load timestamp.
silver_path = get_mock_save_path(etl._dest_table)
df = (
    spark.read
    .format("parquet")
    .load(silver_path)
    .orderBy(F.col("id").asc(), F.col("ROW_LOAD_DTS").asc())
)

df.show(truncate=False)

+------------------------------------+---+---+---------+-------------+-------------------+-------------------+--------------------------+--------------------------+
|PK                                  |NK |id |name     |department_id|created_at         |updated_at         |ROW_LOAD_DTS              |ROW_DELETE_DTS            |
+------------------------------------+---+---+---------+-------------+-------------------+-------------------+--------------------------+--------------------------+
|d37d08fa-8b08-4d1b-b3ef-1f3bf8eb3815|1  |1  |Alice    |1            |2023-01-01 00:00:00|2023-01-01 00:00:00|2025-08-01 12:47:33.309359|NULL                      |
|36b95c1c-0654-45b2-b13b-a7d8078385a1|2  |2  |Bob      |2            |2023-01-01 00:00:00|2023-01-01 00:00:00|2025-08-01 12:47:33.309359|NULL                      |
|b496ab4c-66f1-41a4-9624-fc7ac7e1ef06|2  |2  |u-Bob    |2            |2023-01-01 00:00:00|2023-01-01 00:00:00|2025-08-01 12:49:06.642621|NULL                      |
|7c570f19-