In [0]:
#%run ./Mailchimp_ETL/Production_ETL/04_GoldMaster
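# Note: the %run above is currently disabled, so the test below rebuilds the gold ETL steps inline instead of calling 04_GoldMaster.
# If the %run is re-enabled, all functions and globals from that notebook are assumed to be defined in __main__ afterwards.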

In [0]:
import unittest
from unittest.mock import patch, MagicMock
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import *

# Reuse the active SparkSession when there is one; otherwise start a local
# two-thread session named "GoldETLTest" for the test run.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("GoldETLTest")
    .getOrCreate()
)

# Create a fake dbutils.fs.ls response object
class FakeFileInfo:
    """Minimal stand-in for one entry of a `dbutils.fs.ls` listing.

    Attributes:
        path: Full path of the listed entry.
        isDir: True when the entry represents a directory.

    NOTE(review): the FileInfo objects returned by the real Databricks
    `dbutils.fs.ls` may expose `isDir` as a method rather than a plain
    attribute - confirm against the target runtime.
    """

    def __init__(self, path, isDir):
        # Store both values unchanged; the test reads them back as attributes.
        self.path, self.isDir = path, isDir

# Create a fake dbutils object to simulate Databricks file listing
class FakeDBUtils:
    """Stand-in for the Databricks `dbutils` object; only `fs.ls` is simulated."""

    class FS:
        """Fake file-system helper that returns a fixed listing."""

        def ls(self, path):
            """Return two fake directory entries, `delta1` and `delta2`, under `path`."""
            return [
                FakeFileInfo(f"{path}/{child}", True)
                for child in ("delta1", "delta2")
            ]

    # Shared at class level, so every FakeDBUtils instance exposes the same `fs`.
    fs = FS()

# Bind the fake to the module-level name `dbutils` so the test below resolves
# `dbutils.fs.ls` to the simulated listing (as a Databricks notebook would provide `dbutils`).
# NOTE(review): when this cell runs inside Databricks it rebinds the real `dbutils`
# for any later cells - confirm that is intended.
dbutils = FakeDBUtils()

# Sample data mimicking records from silver (each record contains all required columns).
# Two records with distinct `unique_email_id` and `list_id` values; the test's
# de-duplication assertions rely on there being exactly two of each.
sample_data = [
    {
        "unique_email_id": "uid_1",
        "email_address": "john.doe@example.com",
        "full_name": "John Doe",
        "merge_FNAME": "John",
        "merge_LNAME": "Doe",
        "merge_PHONE": "1234567890",
        "address_addr1": "123 Main St",
        "address_addr2": "Apt 1",
        "address_city": "Metropolis",
        "address_state": "NY",
        "address_zip": "12345",
        "address_country": "USA",
        "language": "en",
        "vip": False,
        "list_id": "list_1",
        "list_name": "List One",
        "location_country_code": "US",
        "location_region": "NY",
        "location_timezone": "EST",
        "status": "subscribed",
        "stats_avg_open_rate": 0.5,
        "stats_avg_click_rate": 0.2,
        "timestamp_signup": "2024-01-01T00:00:00Z",
        "timestamp_opt": "2024-01-02T00:00:00Z",
        "last_changed": "2024-01-03T00:00:00Z",
        "member_rating": 5,
        "consents_to_one_to_one_messaging": True,
        "email_client": "Outlook",
        "email_type": "html",
        "web_id": 111
    },
    {
        "unique_email_id": "uid_2",
        "email_address": "jane.smith@example.com",
        "full_name": "Jane Smith",
        "merge_FNAME": "Jane",
        "merge_LNAME": "Smith",
        "merge_PHONE": "0987654321",
        "address_addr1": "456 Elm St",
        "address_addr2": "",
        "address_city": "Gotham",
        "address_state": "NJ",
        "address_zip": "54321",
        "address_country": "USA",
        "language": "en",
        "vip": True,
        "list_id": "list_2",
        "list_name": "List Two",
        "location_country_code": "US",
        "location_region": "NJ",
        "location_timezone": "EST",
        "status": "subscribed",
        "stats_avg_open_rate": 0.7,
        "stats_avg_click_rate": 0.3,
        "timestamp_signup": "2024-02-01T00:00:00Z",
        "timestamp_opt": "2024-02-02T00:00:00Z",
        "last_changed": "2024-02-03T00:00:00Z",
        "member_rating": 3,
        "consents_to_one_to_one_messaging": False,
        "email_client": "Gmail",
        "email_type": "html",
        "web_id": 222
    }
]

# Helper function to simulate spark.read.format("delta").load(path)
def fake_delta_load(path):
    """Return a DataFrame of `sample_data` tagged with the directory name of `path`.

    Stands in for `spark.read.format("delta").load(path)`: nothing is read from
    storage. The final path component is attached to every row as the literal
    column `list_name_source`.
    """
    source_dir = os.path.basename(path)
    sample_df = spark.createDataFrame(sample_data)
    return sample_df.withColumn("list_name_source", lit(source_dir))

class TestMailchimpGoldETL(unittest.TestCase):
    """Checks the gold-layer shaping steps (union, dimensions, fact) on in-memory sample data.

    The silver directory listing comes from the fake `dbutils`, Delta reads are
    redirected to `fake_delta_load`, and `DataFrameWriter.save` is mocked, so no
    storage is touched.
    """

    @patch("pyspark.sql.DataFrameWriter.save")
    def test_gold_etl(self, mock_save):
        """
        Test the gold ETL logic:
         - Union all delta files from silver layer.
         - Create dim_contact, dim_list, and fact_list_membership tables.
         - Write out using Delta mode (save() is mocked; the call count and target path are verified).
        """
        silver_base = "abfss://silver@mailchimpspnetwork.dfs.core.windows.net/mailchimp_transformed"
        GOLD_BASE = "abfss://gold@mailchimpspnetwork.dfs.core.windows.net"

        # Simulate dbutils.fs.ls output (provided by the fake dbutils) and keep directories only.
        delta_paths = [f.path for f in dbutils.fs.ls(silver_base) if f.isDir]
        self.assertTrue(delta_paths, "fake dbutils returned no silver directories")

        # `spark.read` returns a new DataFrameReader on every access, so assigning `load`
        # on one reader instance never affects the reader that performs the read.
        # Patch `load` on the reader's class instead; the patch is undone automatically
        # when the `with` block exits. The mock is not bound to the reader instance,
        # so it is invoked with the path alone.
        reader_cls = type(spark.read)
        with patch.object(reader_cls, "load", side_effect=fake_delta_load) as mock_load:
            all_data = None
            for path in delta_paths:
                df = spark.read.format("delta").load(path)
                # Add a column to identify the source list name using the directory name
                df = df.withColumn("list_name_source", lit(path.split("/")[-1]))
                all_data = df if all_data is None else all_data.unionByName(df)

        # Every silver directory must have been read exactly once through the fake loader.
        self.assertEqual(mock_load.call_count, len(delta_paths))
        for path in delta_paths:
            mock_load.assert_any_call(path)

        # Verify the union: one copy of sample_data per simulated directory.
        self.assertEqual(all_data.count(), len(sample_data) * len(delta_paths))

        # Each row must carry the name of the directory it was loaded from.
        observed_sources = {
            row["list_name_source"]
            for row in all_data.select("list_name_source").distinct().collect()
        }
        self.assertEqual(observed_sources, {p.split("/")[-1] for p in delta_paths})

        # -------------------------------
        # 1. Create the dimension table 'dim_contact'
        # -------------------------------
        dim_contact = all_data.select(
            "unique_email_id",
            "email_address",
            "full_name",
            "merge_FNAME",
            "merge_LNAME",
            "merge_PHONE",
            "address_addr1",
            "address_addr2",
            "address_city",
            "address_state",
            "address_zip",
            "address_country",
            "language",
            "vip"
        ).dropDuplicates(["unique_email_id"])

        # For sample_data, there are two unique email ids.
        self.assertEqual(dim_contact.count(), 2)

        # save() is mocked by the decorator, so nothing is written; verify the call and its target.
        dim_contact.write.format("delta").mode("overwrite").save(f"{GOLD_BASE}/dim_contact")
        mock_save.assert_called_with(f"{GOLD_BASE}/dim_contact")
        self.assertEqual(mock_save.call_count, 1)

        # -------------------------------
        # 2. Create the dimension table 'dim_list'
        # -------------------------------
        dim_list = all_data.select(
            "list_id",
            "list_name",
            "location_country_code",
            "location_region",
            "location_timezone"
        ).dropDuplicates(["list_id"])
        # There are two unique list_ids in our sample data.
        self.assertEqual(dim_list.count(), 2)

        dim_list.write.format("delta").mode("overwrite").save(f"{GOLD_BASE}/dim_list")
        mock_save.assert_called_with(f"{GOLD_BASE}/dim_list")
        self.assertEqual(mock_save.call_count, 2)

        # -------------------------------
        # 3. Create the fact table 'fact_list_membership'
        # -------------------------------
        fact_membership = all_data.select(
            "unique_email_id",
            "list_id",
            "status",
            "stats_avg_open_rate",
            "stats_avg_click_rate",
            "timestamp_signup",
            "timestamp_opt",
            "last_changed",
            "member_rating",
            "consents_to_one_to_one_messaging",
            "email_client",
            "email_type",
            "web_id"
        ).dropDuplicates(["unique_email_id", "list_id"])
        # Each sample record has a unique (unique_email_id, list_id) pair, so after the union
        # the composite-key de-duplication must leave exactly one row per sample record.
        self.assertEqual(fact_membership.count(), len(sample_data))

        fact_membership.write.format("delta").mode("overwrite").save(f"{GOLD_BASE}/fact_list_membership")
        mock_save.assert_called_with(f"{GOLD_BASE}/fact_list_membership")
        self.assertEqual(mock_save.call_count, 3)

# Run the tests in-process. argv=[''] keeps unittest from parsing the host process's
# command-line arguments, and exit=False stops unittest.main from calling sys.exit()
# when the run finishes, so the notebook session keeps running.
if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)
