In [None]:
#
# Import Packages Required for this Notebook
#
import re
from typing import List

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Emit a confirmation so the cell output shows that the imports above completed.
print("Successfully imported all packages for this notebook.")

In [None]:
#
# Configure run-time parameters for this notebook
#
# Layer and application together name the source folder under /Files
layer = "bronze"
# Database schema that the loaded tables are created in
db_schema = "dbo"
application = "warehouse"
# GUIDs of the workspace and lakehouse used to build the OneLake path
workspace_id = "3ac7ce42-ae74-4e7d-8ac3-5ce8358a30df" ## Adv Wrks DE 3 Dev
lakehouse_id = "50402dac-ce50-4831-af2b-7d65ca8fe7db" ## AdventureWorks_Lakehouse

print("Successfully configured all parameters for this run.")

In [None]:
#
# Define the OneLake folder path
#
# Location of this layer/application's files, relative to the lakehouse root
folder = f"/Files/{layer}/{application}"
# Fully qualified ABFSS URI: <workspace>@<OneLake endpoint>/<lakehouse><folder>
folder_path = f"abfss://{workspace_id}@onelake.dfs.fabric.microsoft.com/{lakehouse_id}{folder}"

print(f"Configured to process files from:\n{folder_path}\ninto database schema '{db_schema}' tables.")

In [None]:
#
# Create the Spark session
#
app_name = "LoadLakehouseBronzeTables"

# getOrCreate() hands back the already-running session when there is one;
# otherwise it starts a new session under this application name
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

print(f"Spark session {app_name} has been created successfully.")

In [None]:
#
# Define the function to create a PySpark StructType from a T-SQL script
#
def tsql_to_structype(SqlScript: List) -> StructType:
    """
    Converts a T-SQL CREATE TABLE script into a PySpark StructType schema.

    Only column-definition lines are used: lines that, once trimmed, start
    with a bracketed column name immediately followed by a type token, either
    bracketed (``[Name] [nvarchar](50) NOT NULL``) or bare
    (``[Name] nvarchar(50) NOT NULL``). Every other line is ignored, including
    the CREATE TABLE header, constraint clauses and key-column entries such as
    ``[AddressID] ASC``.

    Parameters:
        SqlScript List[str]: The lines of the T-SQL script defining the table
            structure. A whole script passed as a single string is also
            accepted and is split into lines first.

    Returns:
        StructType: A PySpark StructType object representing the schema.
            A column is nullable unless its line contains NOT NULL
            (matched case-insensitively).
    """

    # Mapping from T-SQL data types to PySpark data types
    # NOTE: The Lakehouse is the implementation of the BRONZE layer
    #       It is the target for all RAW data
    #       String is the only data type used, ensuring all rows are read
    #       Data formats and constraints will be applied next in the SILVER layer
    sql_to_spark_type = {
        "bigint": StringType(),
        "binary": StringType(),
        "bit": StringType(),
        "char": StringType(),
        "date": StringType(),
        "datetime": StringType(),
        "datetime2": StringType(),
        "datetimeoffset": StringType(),
        "decimal": StringType(),
        "double": StringType(),
        "float": StringType(),
        "image": StringType(),
        "int": StringType(),
        "money": StringType(),
        "nchar": StringType(),
        "nvarchar": StringType(),
        "smallint": StringType(),
        "text": StringType(),
        "time": StringType(),
        "tinyint": StringType(),
        "uniqueidentifier": StringType(),
        "varbinary": StringType(),
        "varchar": StringType()
    }

    # "[name]" at the start of the line, then either "[type]" or a bare type token
    column_pattern = re.compile(
        r"^\[(?P<name>[^\]]+)\]\s*(?:\[(?P<bracketed>[^\]]+)\]|(?P<bare>[A-Za-z_]\w*))"
    )
    not_null_pattern = re.compile(r"\bNOT\s+NULL\b", re.IGNORECASE)
    # Bare words that follow a column name inside key/index column lists;
    # they are sort directions, not data types
    sort_keywords = {"asc", "desc"}

    # Iterating a plain string would walk it character by character
    script_lines = SqlScript.splitlines() if isinstance(SqlScript, str) else SqlScript

    # Extract column definitions from the SQL script
    fields = []
    for line in script_lines:
        if not line:
            continue
        line = line.strip()

        # Skip irrelevant lines
        match = column_pattern.match(line)
        if match is None:
            continue
        bare_type = match.group("bare")
        if bare_type is not None and bare_type.lower() in sort_keywords:
            continue

        column_name = match.group("name")
        column_type = (match.group("bracketed") or bare_type).lower()
        nullable = not_null_pattern.search(line) is None

        # Get the PySpark type or default to StringType
        spark_type = sql_to_spark_type.get(column_type.split("(")[0].strip(), StringType())
        fields.append(StructField(column_name, spark_type, nullable))

    return StructType(fields)

# Emit a confirmation so the cell output shows that the helper above has been defined.
print("The function 'tsql_to_structype' has been created successfully.")

In [None]:
#
# Load the BRONZE layer files into a Lakehouse table
#
# List the files in the source folder; only the "path" column is kept
file_list = (
    spark.read.format("binaryFile")
    .load(folder_path)
    .select("path")
    .collect()
)

# Load each CSV file into its own table
for file in file_list:
    file_path = file["path"]

    # Anything that is not a CSV data file is skipped
    if not file_path.endswith(".csv"):
        continue

    # The table is named after the file name, up to its first "."
    table_name = file_path.split("/")[-1].split(".")[0]
    full_table_name = f"{db_schema}.{table_name}"

    # The companion <table>.sql script in the same folder describes the columns
    script_file_path = f"{folder_path}/{table_name}.sql"
    df_script = spark.read.text(script_file_path)

    # Pull the single "value" column back to the driver as a list of script lines
    string_array = [row["value"] for row in df_script.collect()]

    # Build the PySpark schema from the script lines
    schema = tsql_to_structype(string_array)

    # Read the header-less, pipe-delimited CSV using that schema
    df = (
        spark.read.format("csv")
        .option("header", "false")
        .option("delimiter", "|")
        .schema(schema)
        .load(file_path)
    )

    # Drop any existing table first, then write the DataFrame as the table
    spark.sql(f"DROP TABLE IF EXISTS {full_table_name}")
    (
        df.write.mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(full_table_name)
    )

    print(f"Loaded file {file_path} into table {full_table_name}")

In [None]:
#
# Stop the Spark session
# NOTE: frees up limited F2 SKU capacity resources
#
# Stop the Spark session held in `spark`, then confirm in the cell output.
# NOTE(review): in a Fabric notebook the session utilities (e.g. notebookutils.session.stop())
# may be the intended way to release capacity — confirm spark.stop() is sufficient here.
spark.stop()

print("Spark session has been stopped successfully.")