In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, current_date, date_add

# Obtain the Spark session before anything uses it, so this cell also runs on
# a fresh kernel where no `spark` variable has been injected by the runtime.
spark = SparkSession.builder.appName("DimensionalModeling").getOrCreate()

# Start from a clean slate: drop the schema of every layer together with the
# tables registered in it, so the walkthrough can be re-run from the top.
spark.sql("DROP SCHEMA  IF EXISTS bronze CASCADE")
spark.sql("DROP SCHEMA  IF  EXISTS silver CASCADE")
spark.sql("DROP SCHEMA  IF  EXISTS gold CASCADE")

# Ingestion date stamped on the initial load: a Spark Column expression for
# "yesterday" (current_date() minus one day), evaluated when the DataFrames
# that use it are materialised.
ingestion_date = date_add(current_date(), -1)

# Example data for dimension tables.
# Each list holds plain tuples; the column order is fixed by the column-name
# lists passed to spark.createDataFrame further down in this cell.

# (customer_id, customer_name, city, state, country, customer_group)
raw_customer_data = [
    (1, "John Doe", "New York", "NY", "USA", "Group A"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA", "Group B"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA", "Group A"),
    (4, "Emily Brown", "Chicago", "IL", "USA", "Group C")
]

# (customer_group, group_description)
raw_customer_group_data = [
    ("Group A", "High Value Customers"),
    ("Group B", "Medium Value Customers"),
    ("Group C", "Low Value Customers")
]

# (product_id, product_name, product_category, product_group)
raw_product_data = [
    (1, "Product A", "Category X", "Group 1"),
    (2, "Product B", "Category Y", "Group 2"),
    (3, "Product C", "Category Z", "Group 3")
]

# (product_group, group_description)
raw_product_group_data = [
    ("Group 1", "Electronics"),
    ("Group 2", "Clothing"),
    ("Group 3", "Home & Kitchen")
]

# (order_date, full_date, year, month) -- both date fields are plain strings here
raw_date_data = [
    ("2024-03-01", "2024-03-01", 2024, 3),
    ("2024-03-02", "2024-03-02", 2024, 3),
    ("2024-03-03", "2024-03-03", 2024, 3),
    ("2024-03-04", "2024-03-04", 2024, 3)
]

# Example data for fact table
# (order_id, customer_id, product_id, order_date, quantity, amount)
raw_sales_data = [
    (101, 1, 1, "2024-03-01", 2, 100),
    (102, 2, 2, "2024-03-02", 1, 50),
    (103, 3, 3, "2024-03-02", 3, 200),
    (104, 1, 1, "2024-03-03", 1, 50),
    (105, 2, 2, "2024-03-04", 2, 150)
]

# Register one schema per layer; each statement is a no-op when the schema
# already exists.
for layer_name in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer_name}")

# Build one DataFrame per raw table and stamp every row with the ingestion date.
raw_customer_df = (
    spark.createDataFrame(
        raw_customer_data,
        ["customer_id", "customer_name", "city", "state", "country", "customer_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

raw_customer_group_df = (
    spark.createDataFrame(
        raw_customer_group_data,
        ["customer_group", "group_description"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

raw_product_df = (
    spark.createDataFrame(
        raw_product_data,
        ["product_id", "product_name", "product_category", "product_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

raw_product_group_df = (
    spark.createDataFrame(
        raw_product_group_data,
        ["product_group", "group_description"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

raw_date_df = (
    spark.createDataFrame(
        raw_date_data,
        ["order_date", "full_date", "year", "month"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

raw_sales_df = (
    spark.createDataFrame(
        raw_sales_data,
        ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

# Bronze Layer: write each raw table to its own Delta location (no joins) and
# register that location as a table in the bronze schema.
#
# No "zorder" writer option is passed: Delta's DataFrame writer does not define
# one, so such an option would have no effect. Z-ordering is applied
# separately, after the data is written, with
# `OPTIMIZE <table> ZORDER BY (<column>)`.
raw_customer_df.write.format("delta").mode("overwrite").partitionBy("state").save("/tmp/raw_customer")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_customer USING DELTA LOCATION '/tmp/raw_customer'")

raw_customer_group_df.write.format("delta").mode("overwrite").save("/tmp/raw_customer_group")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_customer_group USING DELTA LOCATION '/tmp/raw_customer_group'")

raw_product_df.write.format("delta").mode("overwrite").save("/tmp/raw_product")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_product USING DELTA LOCATION '/tmp/raw_product'")

raw_product_group_df.write.format("delta").mode("overwrite").save("/tmp/raw_product_group")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_product_group USING DELTA LOCATION '/tmp/raw_product_group'")

# Date rows are partitioned by calendar year and month.
raw_date_df.write.format("delta").mode("overwrite").partitionBy("year", "month").save("/tmp/raw_date")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_date USING DELTA LOCATION '/tmp/raw_date'")

# Sales rows are partitioned by order date.
raw_sales_df.write.format("delta").mode("overwrite").partitionBy("order_date").save("/tmp/raw_sales")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_sales USING DELTA LOCATION '/tmp/raw_sales'")


In [0]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta

# Reuse the running Spark session (or create one) for this experiment
spark = SparkSession.builder.appName("TestSparkSQL").getOrCreate()

# Ingestion window as 'YYYY-MM-DD' strings: seven days ago up to yesterday
ingestion_date_from = (datetime.now() - timedelta(days=7)).date().isoformat()
ingestion_date_to = (datetime.now() - timedelta(days=1)).date().isoformat()

# Echo the window so the cell output records what was queried
print(f"ingestion_date_from = {ingestion_date_from}")
print(f"ingestion_date_to = {ingestion_date_to}")

# Customer rows joined to their group description; the two :name markers are
# bound by Spark from the mapping passed to spark.sql below
query = """
-- Customer Dimension
SELECT
    cust.customer_id,
    cust.customer_name,
    cust.city,
    cust.state,
    cust.country,
    cust.customer_group,
    grp.group_description AS customer_group_description,
    cust.ingestion_date
FROM bronze.raw_customer AS cust
LEFT JOIN bronze.raw_customer_group AS grp
    ON cust.customer_group = grp.customer_group
WHERE cust.ingestion_date BETWEEN :ingestion_date_from AND :ingestion_date_to;
"""

# Marker name -> value
params = dict(
    ingestion_date_from=ingestion_date_from,
    ingestion_date_to=ingestion_date_to,
)
result_df = spark.sql(query, params)

# Display the matching rows
result_df.show()


In [0]:
class DataPipelineSQL:
    """Runs the SQL scripts listed in a configuration dictionary, in order."""

    def __init__(self, config_dict):
        """Store the configuration and attach to the active Spark session.

        config_dict: mapping whose "transformation_rules" entry is a list of
            dicts, each holding a "sql_file_path".
        """
        self.config_dict = config_dict
        self.spark = SparkSession.builder.getOrCreate()  # reuses an existing session if there is one

    def execute_sql_file(self, sql_file_path):
        """Read a SQL script and run each ';'-separated statement through Spark."""
        print(f"Executing SQL file: {sql_file_path}")
        with open(sql_file_path, 'r') as script:
            raw_text = script.read()

        # Fragments that are empty after trimming (e.g. the text following the
        # last ';') are skipped.
        statements = [fragment.strip() for fragment in raw_text.split(';')]
        for statement in statements:
            if not statement:
                continue
            self.spark.sql(statement)

    def run_pipeline(self):
        """Execute every configured SQL file, then print a completion message."""
        rules = self.config_dict["transformation_rules"]
        for rule in rules:
            self.execute_sql_file(rule["sql_file_path"])

        print("End of ETL pipeline workflow")


In [0]:
import logging
from pyspark.sql import SparkSession
import json

class DataPipelineSQL:
    """Runs the SQL scripts listed in a configuration dictionary, with logging.

    Named parameter markers (``:name``) in the scripts are bound by Spark from
    the ``params`` mapping instead of being pasted into the SQL text, so values
    are typed and quoted by the engine and markers that share a prefix
    (``:ingestion_date`` / ``:ingestion_date_from``) cannot corrupt each other.
    """

    def __init__(self, config_dict):
        """Store the configuration, attach to Spark and set up the logger.

        config_dict: mapping whose optional "transformation_rules" entry is a
            list of dicts, each optionally holding a "sql_file_path".
        """
        self.config_dict = config_dict
        self.spark = SparkSession.builder.getOrCreate()  # Use the existing Spark session
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging at INFO level and create ``self.logger``."""
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def execute_sql_file(self, sql_file_path, params=None):
        """Run every ';'-separated statement of a SQL script.

        Args:
            sql_file_path: path of the script to read.
            params: optional mapping of marker name -> value, handed to
                ``spark.sql`` as ``args`` for each statement.

        Returns:
            True when the file was read and every statement ran; False when
            reading or executing raised (the error is logged, not re-raised).
        """
        self.logger.info(f"Executing SQL file: {sql_file_path}")

        try:
            with open(sql_file_path, 'r') as file:
                sql_queries = file.read().split(';')  # Split the file content by semicolon

            for sql_query in sql_queries:
                sql_query = sql_query.strip()
                if not sql_query:  # Execute only non-empty queries
                    continue
                self.logger.debug(f"Executing query: {sql_query}")
                if params:
                    # Let Spark bind the :name markers; a copy keeps the
                    # caller's mapping untouched.
                    self.spark.sql(sql_query, args=dict(params))
                else:
                    self.spark.sql(sql_query)

        except Exception as e:
            self.logger.error(f"Error executing SQL file {sql_file_path}: {e}", exc_info=True)
            return False

        return True

    def run_pipeline(self, params=None):
        """Execute every configured SQL file in order and log the outcome.

        Args:
            params: optional mapping of marker name -> value forwarded to
                ``execute_sql_file`` for every script.

        A failing file does not stop the remaining files, but the final log
        line reports the failures instead of claiming success.
        """
        self.logger.info("Starting ETL pipeline workflow")

        failed_files = []
        try:
            for task in self.config_dict.get("transformation_rules", []):
                sql_file_path = task.get("sql_file_path")
                if sql_file_path and not self.execute_sql_file(sql_file_path, params):
                    failed_files.append(sql_file_path)

        except Exception as e:
            self.logger.error(f"Error in ETL pipeline: {e}", exc_info=True)
            return

        if failed_files:
            self.logger.error(f"ETL pipeline workflow finished with errors in: {failed_files}")
        else:
            self.logger.info("ETL pipeline workflow completed successfully")


In [0]:
%sh
# Create the directory that the following cells write the pipeline config
# and SQL scripts into (-p: no error if it already exists).
mkdir -p /dbfs/tmp/config

In [0]:
%sh
# Write the pipeline configuration: the SQL scripts to run, in order.
# The heredoc delimiter is quoted so the shell copies the body verbatim
# (no $variable, backtick or backslash processing).
cat <<'EOF' > /dbfs/tmp/config/config_sql1.json
{
    "transformation_rules": [
        {
            "sql_file_path": "/dbfs/tmp/config/silver_layer_transformations.sql"
        },
        {
            "sql_file_path": "/dbfs/tmp/config/gold_layer_transformations.sql"
        }
    ]
}
EOF

In [0]:
%sh
# Write the Silver-layer SQL script.
# - The heredoc delimiter is quoted so the shell copies the SQL verbatim
#   (no $variable, backtick or backslash processing).
# - Each load deletes the rows of the requested ingestion date before
#   inserting them, so re-running the script for the same date does not
#   duplicate rows.
# - Keep ';' out of the SQL comments: the pipeline splits the script on ';'.
cat <<'EOF' > /dbfs/tmp/config/silver_layer_transformations.sql
-- SQL Script to update Silver Layer with daily delta records

-- Step 1: Create Silver Layer tables if they don't exist
-- Customer Dimension
CREATE TABLE IF NOT EXISTS silver.tab_dim_customer (
    customer_id STRING,
    customer_name STRING,
    city STRING,
    state STRING,
    country STRING,
    customer_group STRING,
    customer_group_description STRING,
    ingestion_date DATE
)
USING DELTA
LOCATION '/path/to/delta/silver/tab_dim_customer';

-- Product Dimension
CREATE TABLE IF NOT EXISTS silver.tab_dim_product (
    product_id STRING,
    product_name STRING,
    product_category STRING,
    product_group_description STRING,
    ingestion_date DATE
)
USING DELTA
LOCATION '/path/to/delta/silver/tab_dim_product';

-- Date Dimension
CREATE TABLE IF NOT EXISTS silver.tab_dim_date (
    order_date DATE,
    full_date STRING,
    year INT,
    month INT,
    ingestion_date DATE
)
USING DELTA
LOCATION '/path/to/delta/silver/tab_dim_date';

-- Sales Fact Table
CREATE TABLE IF NOT EXISTS silver.tab_fact_sales (
    order_id STRING,
    customer_id STRING,
    product_id STRING,
    order_date DATE,
    quantity INT,
    amount DOUBLE,
    ingestion_date DATE
)
USING DELTA
LOCATION '/path/to/delta/silver/tab_fact_sales';

-- Step 2: Reload the rows of the requested ingestion date
-- (rows already loaded for that date are deleted first, which keeps re-runs idempotent)

-- Customer Dimension
DELETE FROM silver.tab_dim_customer
WHERE ingestion_date = :ingestion_date;

INSERT INTO silver.tab_dim_customer
SELECT
    cust.customer_id,
    cust.customer_name,
    cust.city,
    cust.state,
    cust.country,
    cust.customer_group,
    grp.group_description AS customer_group_description,
    cust.ingestion_date
FROM bronze.raw_customer AS cust
LEFT JOIN bronze.raw_customer_group AS grp
    ON cust.customer_group = grp.customer_group
WHERE cust.ingestion_date = :ingestion_date;

-- Product Dimension
DELETE FROM silver.tab_dim_product
WHERE ingestion_date = :ingestion_date;

INSERT INTO silver.tab_dim_product
SELECT
    prod.product_id,
    prod.product_name,
    prod.product_category,
    grp.group_description AS product_group_description,
    prod.ingestion_date
FROM bronze.raw_product AS prod
LEFT JOIN bronze.raw_product_group AS grp
    ON prod.product_group = grp.product_group
WHERE prod.ingestion_date = :ingestion_date;

-- Date Dimension
DELETE FROM silver.tab_dim_date
WHERE ingestion_date = :ingestion_date;

INSERT INTO silver.tab_dim_date
SELECT
    order_date,
    full_date,
    year,
    month,
    ingestion_date
FROM bronze.raw_date
WHERE ingestion_date = :ingestion_date;

-- Sales Fact Table
DELETE FROM silver.tab_fact_sales
WHERE ingestion_date = :ingestion_date;

INSERT INTO silver.tab_fact_sales
SELECT
    sales.order_id,
    sales.customer_id,
    sales.product_id,
    sales.order_date,
    sales.quantity,
    sales.amount,
    sales.ingestion_date
FROM bronze.raw_sales AS sales
WHERE sales.ingestion_date = :ingestion_date;

EOF


In [0]:
%sh
# Write the Gold-layer SQL script.
# - The heredoc delimiter is quoted so the shell copies the SQL verbatim
#   (no $variable, backtick or backslash processing).
# - The load deletes the rows of the requested ingestion date before
#   inserting them, so re-running the script for the same date does not
#   duplicate rows.
# - Keep ';' out of the SQL comments: the pipeline splits the script on ';'.
cat <<'EOF' > /dbfs/tmp/config/gold_layer_transformations.sql
-- SQL Script to update Gold Layer with daily delta records

-- Step 1: Create the Gold Layer table if it doesn't exist
CREATE TABLE IF NOT EXISTS gold.fact_sales_summary (
    order_id STRING,
    customer_id STRING,
    product_id STRING,
    order_date DATE,
    quantity INT,
    amount DOUBLE,
    customer_group_description STRING,
    product_group_description STRING,
    product_category STRING,
    state STRING,
    year INT,
    month INT,
    ingestion_date DATE
)
USING DELTA
LOCATION '/path/to/delta/gold/fact_sales_summary';

-- Step 2: Reload the requested ingestion date - enrich sales with dimension attributes
-- (rows already loaded for that date are deleted first, which keeps re-runs idempotent)
DELETE FROM gold.fact_sales_summary
WHERE ingestion_date = :ingestion_date;

INSERT INTO gold.fact_sales_summary
SELECT
    sales.order_id,
    sales.customer_id,
    sales.product_id,
    sales.order_date,  -- Explicit reference to fact_sales order_date
    sales.quantity,
    sales.amount,
    cust.customer_group_description,
    prod.product_group_description,
    prod.product_category,
    cust.state,
    dt.year,
    dt.month,
    sales.ingestion_date  -- Explicit reference to fact_sales ingestion_date
FROM
    silver.tab_fact_sales AS sales
LEFT JOIN
    silver.tab_dim_customer AS cust ON sales.customer_id = cust.customer_id
LEFT JOIN
    silver.tab_dim_product AS prod ON sales.product_id = prod.product_id
LEFT JOIN
    silver.tab_dim_date AS dt ON sales.order_date = dt.order_date
WHERE
    sales.ingestion_date = :ingestion_date;

EOF


In [0]:
# Rebuild the silver and gold schemas before the pipeline runs: drop both
# (with everything registered in them), then create them again empty.
# NOTE(review): the pipeline's tables are created with explicit LOCATION
# paths -- confirm whether dropping the schema also clears the files there.
for layer_name in ("silver", "gold"):
    spark.sql(f"DROP SCHEMA  IF  EXISTS {layer_name} CASCADE")
for layer_name in ("silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer_name}")

In [0]:
from pyspark.sql import SparkSession

class DataPipelineSQL:
    """Runs configured SQL scripts after filling their ``:name`` placeholders.

    Placeholder values come from ``params`` and are inserted into the SQL text
    as single-quoted string literals.
    """

    def __init__(self, config_dict, params):
        """Store configuration and parameters and attach to Spark.

        config_dict: mapping whose "transformation_rules" entry is a list of
            dicts, each holding a "sql_file_path".
        params: mapping of placeholder name -> value.
        """
        self.config_dict = config_dict
        self.params = params
        self.spark = SparkSession.builder.getOrCreate()  # Use the existing Spark session

    def replace_placeholders(self, sql_script):
        """Return ``sql_script`` with every known ``:name`` placeholder filled in.

        Differences from a plain chain of ``str.replace`` calls:
        * whole names only -- ``:ingestion_date`` is not replaced inside
          ``:ingestion_date_from``;
        * a single pass -- text inserted for one placeholder is never
          re-scanned for another;
        * ``::`` (cast syntax) is left alone;
        * backslashes and single quotes in a value are backslash-escaped so
          the value stays inside its string literal.
        Placeholders whose name is not in ``params`` are left unchanged.
        """
        import re  # local import: this notebook cell only imports SparkSession

        if not self.params:
            return sql_script

        values_by_name = {str(key): value for key, value in self.params.items()}
        # Longest names first so the alternation prefers the most specific match.
        names = sorted(values_by_name, key=len, reverse=True)
        marker = re.compile(
            r"(?<!:):(" + "|".join(re.escape(name) for name in names) + r")(?!\w)"
        )

        def _as_sql_literal(match):
            text = str(values_by_name[match.group(1)])
            text = text.replace("\\", "\\\\").replace("'", "\\'")
            return f"'{text}'"  # Ensure values are properly quoted

        return marker.sub(_as_sql_literal, sql_script)

    def execute_sql_file(self, sql_file_path):
        """Read a SQL script and run each ';'-separated statement.

        A failing statement is reported with ``print`` and the remaining
        statements are still attempted.
        """
        print(f"Executing SQL file: {sql_file_path}")
        with open(sql_file_path, 'r') as file:
            sql_script = file.read()
        # Split first, substitute second: a ';' inside a parameter value must
        # not cut a statement in two.
        for statement in sql_script.split(';'):
            statement = self.replace_placeholders(statement).strip()
            if statement:
                try:
                    self.spark.sql(statement)
                except Exception as e:
                    print(f"Error executing SQL statement: {e}")

    def run_pipeline(self):
        """Execute every configured SQL file in order, then print a completion message."""
        for task in self.config_dict["transformation_rules"]:
            self.execute_sql_file(task["sql_file_path"])
        print("End of ETL pipeline workflow")


In [0]:
from datetime import datetime, timedelta
import json

# Yesterday's date as a 'YYYY-MM-DD' string, taken from the driver's clock
ingestion_date = (datetime.now() - timedelta(days=1)).date().isoformat()
print("ingestion_date ----", ingestion_date)

if __name__ == "__main__":
    # Placeholder values handed to the pipeline
    params = dict(ingestion_date=ingestion_date)

    # Read the list of SQL scripts to run
    config_path = "/dbfs/tmp/config/config_sql1.json"
    with open(config_path, 'r') as config_file:
        config_dict = json.load(config_file)

    # Build the pipeline from the configuration and parameters, then run it
    pipeline = DataPipelineSQL(config_dict, params)
    pipeline.run_pipeline()


In [0]:
if __name__ == "__main__":
    # Imported here so the cell does not depend on imports made by other cells.
    import json
    from datetime import datetime, timedelta

    # Read the list of SQL scripts to run.
    config_path = "/dbfs/tmp/config/config_sql1.json"
    with open(config_path, 'r') as file:
        config_dict = json.load(file)

    # The SQL scripts filter on the :ingestion_date placeholder and the
    # DataPipelineSQL class defined above takes (config_dict, params), so the
    # value has to be supplied: yesterday as 'YYYY-MM-DD'.
    params = {"ingestion_date": (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')}

    pipeline = DataPipelineSQL(config_dict, params)
    pipeline.run_pipeline()


In [0]:
# Minimal named-parameter example: :x, :y and :z are bound from `args`.
spark.sql(
    "SELECT :x * :y * :z AS volume",
    args={"x": 3, "y": 4, "z": 5},
).show()

In [0]:
from datetime import datetime, timedelta

# Calculate ingestion_date using Python's datetime module (yesterday, 'YYYY-MM-DD')
ingestion_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
print("ingestion_date ----",ingestion_date)

if __name__ == "__main__":
    import json  # this cell does not import json itself

    # Also publish the date as a Spark SQL session setting.
    # The SQL scripts themselves read the :ingestion_date placeholder, which is
    # filled from the params passed to the pipeline below.
    spark.sql(f"SET ingestion_date = {ingestion_date}")
    print(f"SET ingestion_date = {ingestion_date}")
    config_path = "/dbfs/tmp/config/config_sql1.json"
    with open(config_path, 'r') as file:
        config_dict = json.load(file)

    # DataPipelineSQL as defined above takes (config_dict, params).
    pipeline = DataPipelineSQL(config_dict, {"ingestion_date": ingestion_date})
    pipeline.run_pipeline()


In [0]:
from datetime import datetime, timedelta
import json
from pyspark.sql import SparkSession

# Calculate ingestion_date using Python's datetime module (yesterday, 'YYYY-MM-DD')
ingestion_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
print("ingestion_date ----", ingestion_date)

if __name__ == "__main__":
    # Also publish the date, wrapped in single quotes, as a Spark SQL session
    # setting. The SQL scripts themselves read the :ingestion_date placeholder,
    # which is filled from the params passed to the pipeline below.
    spark.sql(f"SET ingestion_date = '{ingestion_date}'")
    print(f"SET ingestion_date = '{ingestion_date}'")

    # Read the list of SQL scripts to run
    config_path = "/dbfs/tmp/config/config_sql1.json"
    with open(config_path, 'r') as file:
        config_dict = json.load(file)

    # DataPipelineSQL as defined above takes (config_dict, params).
    pipeline = DataPipelineSQL(config_dict, {"ingestion_date": ingestion_date})
    pipeline.run_pipeline()


In [0]:
%sql
-- Ad-hoc check of the customer dimension query for one hard-coded ingestion
-- date: SET stores the value under the key ingestion_date and the WHERE
-- clause refers to it through ${ingestion_date}.
-- NOTE(review): the value given to SET already carries single quotes and the
-- reference is wrapped in quotes again -- confirm the substituted literal is
-- what is intended.
-- NOTE(review): bronze rows are stamped relative to the load day, so this
-- fixed date presumably matches nothing on other days -- verify.
SET ingestion_date = '2024-08-10';
SELECT
    cust.customer_id,
    cust.customer_name,
    cust.city,
    cust.state,
    cust.country,
    cust.customer_group,
    grp.group_description AS customer_group_description,
    cust.ingestion_date
FROM bronze.raw_customer AS cust
LEFT JOIN bronze.raw_customer_group AS grp
    ON cust.customer_group = grp.customer_group
WHERE cust.ingestion_date = '${ingestion_date}';


In [0]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta

# Reuse the running Spark session (or create one) for this experiment
spark = SparkSession.builder.appName("TestSparkSQL").getOrCreate()

# Ingestion window as 'YYYY-MM-DD' strings: seven days ago up to yesterday
ingestion_date_from = (datetime.now() - timedelta(days=7)).date().isoformat()
ingestion_date_to = (datetime.now() - timedelta(days=1)).date().isoformat()

# Echo the window so the cell output records what was queried
print(f"ingestion_date_from = {ingestion_date_from}")
print(f"ingestion_date_to = {ingestion_date_to}")

# Customer rows joined to their group description; the two :name markers are
# bound by Spark from the mapping passed to spark.sql below
query = """
-- Customer Dimension
SELECT
    cust.customer_id,
    cust.customer_name,
    cust.city,
    cust.state,
    cust.country,
    cust.customer_group,
    grp.group_description AS customer_group_description,
    cust.ingestion_date
FROM bronze.raw_customer AS cust
LEFT JOIN bronze.raw_customer_group AS grp
    ON cust.customer_group = grp.customer_group
WHERE cust.ingestion_date BETWEEN :ingestion_date_from AND :ingestion_date_to;
"""

# Marker name -> value
params = dict(
    ingestion_date_from=ingestion_date_from,
    ingestion_date_to=ingestion_date_to,
)
result_df = spark.sql(query, params)

# Display the matching rows
result_df.show()


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add

# Reuse the running Spark session (or create one) for the incremental load
spark = SparkSession.builder.appName("DailyDeltaLoad").getOrCreate()

# Ingestion date for this load: the current date (offset of 0 days)
ingestion_date = date_add(current_date(), 0)

# Rows arriving with the next day's load, in the same column order as the
# initial bronze data
next_day_customer_data = [
    (5, "Alice Williams", "Houston", "TX", "USA", "Group B"),
    (6, "Bob Davis", "Seattle", "WA", "USA", "Group A"),
]

next_day_product_data = [
    (4, "Product D", "Category W", "Group 2"),
]

next_day_sales_data = [
    (106, 5, 4, "2024-03-05", 1, 75),
    (107, 6, 4, "2024-03-05", 2, 100),
]

# One DataFrame per table, each stamped with the ingestion date
next_day_customer_df = (
    spark.createDataFrame(
        next_day_customer_data,
        ["customer_id", "customer_name", "city", "state", "country", "customer_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

next_day_product_df = (
    spark.createDataFrame(
        next_day_product_data,
        ["product_id", "product_name", "product_category", "product_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

next_day_sales_df = (
    spark.createDataFrame(
        next_day_sales_data,
        ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

# Bronze Layer: append the new rows to the existing Delta locations, using the
# same partition columns as the initial writes
(
    next_day_customer_df.write
    .format("delta")
    .mode("append")
    .partitionBy("state")
    .save("/tmp/raw_customer")
)
(
    next_day_product_df.write
    .format("delta")
    .mode("append")
    .save("/tmp/raw_product")
)
(
    next_day_sales_df.write
    .format("delta")
    .mode("append")
    .partitionBy("order_date")
    .save("/tmp/raw_sales")
)