In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import json
from datetime import datetime

In [2]:
# Initialize Spark Session with Delta Lake configuration
spark = SparkSession.builder \
    .appName("LocationDimensionProcessorV2") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

print(f"=== LOCATION DIMENSION PROCESSOR V2 ===")
print(f"Spark Version: {spark.version}")
print(f"Application: {spark.sparkContext.appName}")
print(f"Delta Lake support enabled")
print(f"Processing timestamp: {datetime.now()}")

25/08/28 17:13:53 WARN Utils: Your hostname, 3rnese resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/28 17:13:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ernese/miniconda3/envs/SO/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ernese/.ivy2/cache
The jars for the packages stored in: /home/ernese/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1bddc080-1ea0-4f0d-92ea-76537fbb7e23;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 154ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |

=== LOCATION DIMENSION PROCESSOR V2 ===
Spark Version: 3.4.0
Application: LocationDimensionProcessorV2
Delta Lake support enabled
Processing timestamp: 2025-08-28 17:13:56.022318


In [3]:
# Configuration
SILVER_PATH = "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver"
PROCESSING_TIMESTAMP = datetime.now()

# Create target directory if it doesn't exist
os.makedirs(SILVER_PATH, exist_ok=True)

print(f"=== CONFIGURATION ===")
print(f"Silver Path: {SILVER_PATH}")
print(f"Processing Time: {PROCESSING_TIMESTAMP}")

=== CONFIGURATION ===
Silver Path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver
Processing Time: 2025-08-28 17:14:01.882077


## Create Comprehensive Philippine Location Dataset

In [4]:
def create_comprehensive_philippine_locations():
    """Create comprehensive Philippine geographic entities with lat/long coordinates"""
    locations = [
        # Country Level
        {
            'name': 'Philippines', 
            'type': 'country', 
            'lat': 12.8797, 
            'lon': 121.7740, 
            'pop': 109035343,
            'iso_code': 'PHL',
            'region_code': None,
            'province_code': None
        },
        
        # Administrative Regions (17 regions)
        {
            'name': 'National Capital Region', 
            'type': 'region', 
            'lat': 14.6042, 
            'lon': 120.9822, 
            'pop': 13484462,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Cordillera Administrative Region', 
            'type': 'region', 
            'lat': 17.3494, 
            'lon': 121.0587, 
            'pop': 1797660,
            'iso_code': 'PHL',
            'region_code': 'CAR',
            'province_code': None
        },
        {
            'name': 'Ilocos Region', 
            'type': 'region', 
            'lat': 16.5938, 
            'lon': 120.6121, 
            'pop': 5301139,
            'iso_code': 'PHL',
            'region_code': 'I',
            'province_code': None
        },
        {
            'name': 'Cagayan Valley', 
            'type': 'region', 
            'lat': 17.6322, 
            'lon': 121.7231, 
            'pop': 3685744,
            'iso_code': 'PHL',
            'region_code': 'II',
            'province_code': None
        },
        {
            'name': 'Central Luzon', 
            'type': 'region', 
            'lat': 15.4833, 
            'lon': 120.7667, 
            'pop': 12422172,
            'iso_code': 'PHL',
            'region_code': 'III',
            'province_code': None
        },
        {
            'name': 'Calabarzon', 
            'type': 'region', 
            'lat': 14.1368, 
            'lon': 121.4647, 
            'pop': 16195042,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': None
        },
        {
            'name': 'MIMAROPA', 
            'type': 'region', 
            'lat': 10.6333, 
            'lon': 120.1833, 
            'pop': 3228558,
            'iso_code': 'PHL',
            'region_code': 'IV-B',
            'province_code': None
        },
        {
            'name': 'Bicol Region', 
            'type': 'region', 
            'lat': 13.4243, 
            'lon': 123.4116, 
            'pop': 6082165,
            'iso_code': 'PHL',
            'region_code': 'V',
            'province_code': None
        },
        {
            'name': 'Western Visayas', 
            'type': 'region', 
            'lat': 10.9575, 
            'lon': 122.4283, 
            'pop': 7954723,
            'iso_code': 'PHL',
            'region_code': 'VI',
            'province_code': None
        },
        {
            'name': 'Central Visayas', 
            'type': 'region', 
            'lat': 9.9298, 
            'lon': 123.7505, 
            'pop': 8081988,
            'iso_code': 'PHL',
            'region_code': 'VII',
            'province_code': None
        },
        {
            'name': 'Eastern Visayas', 
            'type': 'region', 
            'lat': 11.9994, 
            'lon': 124.9992, 
            'pop': 4547150,
            'iso_code': 'PHL',
            'region_code': 'VIII',
            'province_code': None
        },
        {
            'name': 'Zamboanga Peninsula', 
            'type': 'region', 
            'lat': 7.9833, 
            'lon': 122.8667, 
            'pop': 3875576,
            'iso_code': 'PHL',
            'region_code': 'IX',
            'province_code': None
        },
        {
            'name': 'Northern Mindanao', 
            'type': 'region', 
            'lat': 8.2280, 
            'lon': 124.6453, 
            'pop': 5022768,
            'iso_code': 'PHL',
            'region_code': 'X',
            'province_code': None
        },
        {
            'name': 'Davao Region', 
            'type': 'region', 
            'lat': 7.0649, 
            'lon': 125.6070, 
            'pop': 5243536,
            'iso_code': 'PHL',
            'region_code': 'XI',
            'province_code': None
        },
        {
            'name': 'SOCCSKSARGEN', 
            'type': 'region', 
            'lat': 6.5204, 
            'lon': 124.8450, 
            'pop': 4945885,
            'iso_code': 'PHL',
            'region_code': 'XII',
            'province_code': None
        },
        {
            'name': 'Caraga', 
            'type': 'region', 
            'lat': 9.1667, 
            'lon': 125.8333, 
            'pop': 2804788,
            'iso_code': 'PHL',
            'region_code': 'XIII',
            'province_code': None
        },
        {
            'name': 'Bangsamoro Autonomous Region in Muslim Mindanao', 
            'type': 'region', 
            'lat': 7.2189, 
            'lon': 124.2458, 
            'pop': 4404288,
            'iso_code': 'PHL',
            'region_code': 'BARMM',
            'province_code': None
        },
        
        # Major Provinces
        {
            'name': 'Rizal', 
            'type': 'province', 
            'lat': 14.6037, 
            'lon': 121.3084, 
            'pop': 3330143,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': 'RIZ'
        },
        {
            'name': 'Cavite', 
            'type': 'province', 
            'lat': 14.2456, 
            'lon': 120.8756, 
            'pop': 4344829,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': 'CAV'
        },
        {
            'name': 'Laguna', 
            'type': 'province', 
            'lat': 14.2691, 
            'lon': 121.3618, 
            'pop': 3382193,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': 'LAG'
        },
        {
            'name': 'Bulacan', 
            'type': 'province', 
            'lat': 14.7942, 
            'lon': 120.8794, 
            'pop': 3708890,
            'iso_code': 'PHL',
            'region_code': 'III',
            'province_code': 'BUL'
        },
        {
            'name': 'Batangas', 
            'type': 'province', 
            'lat': 13.7565, 
            'lon': 121.0583, 
            'pop': 2908494,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': 'BAT'
        },
        {
            'name': 'Cebu', 
            'type': 'province', 
            'lat': 10.3157, 
            'lon': 123.8854, 
            'pop': 5194086,
            'iso_code': 'PHL',
            'region_code': 'VII',
            'province_code': 'CEB'
        },
        {
            'name': 'Davao del Sur', 
            'type': 'province', 
            'lat': 6.7500, 
            'lon': 125.3500, 
            'pop': 688565,
            'iso_code': 'PHL',
            'region_code': 'XI',
            'province_code': 'DAS'
        },
        {
            'name': 'Pangasinan', 
            'type': 'province', 
            'lat': 16.0000, 
            'lon': 120.3333, 
            'pop': 3163190,
            'iso_code': 'PHL',
            'region_code': 'I',
            'province_code': 'PAN'
        },
        {
            'name': 'Negros Occidental', 
            'type': 'province', 
            'lat': 10.6500, 
            'lon': 122.9500, 
            'pop': 2623172,
            'iso_code': 'PHL',
            'region_code': 'VI',
            'province_code': 'NEC'
        },
        {
            'name': 'Iloilo', 
            'type': 'province', 
            'lat': 10.7167, 
            'lon': 122.5667, 
            'pop': 2051899,
            'iso_code': 'PHL',
            'region_code': 'VI',
            'province_code': 'ILO'
        },
        {
            'name': 'Pampanga', 
            'type': 'province', 
            'lat': 15.0794, 
            'lon': 120.6200, 
            'pop': 2609744,
            'iso_code': 'PHL',
            'region_code': 'III',
            'province_code': 'PAM'
        },
        {
            'name': 'Nueva Ecija', 
            'type': 'province', 
            'lat': 15.5833, 
            'lon': 120.9167, 
            'pop': 2310134,
            'iso_code': 'PHL',
            'region_code': 'III',
            'province_code': 'NUE'
        },
        
        # Major Cities (HUCs and Component Cities)
        {
            'name': 'Manila', 
            'type': 'city', 
            'lat': 14.5995, 
            'lon': 120.9842, 
            'pop': 1780148,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Quezon City', 
            'type': 'city', 
            'lat': 14.6760, 
            'lon': 121.0437, 
            'pop': 2960048,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Caloocan City', 
            'type': 'city', 
            'lat': 14.6507, 
            'lon': 120.9634, 
            'pop': 1661584,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Makati City', 
            'type': 'city', 
            'lat': 14.5547, 
            'lon': 121.0244, 
            'pop': 629616,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Pasig City', 
            'type': 'city', 
            'lat': 14.5764, 
            'lon': 121.0851, 
            'pop': 803159,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Taguig City', 
            'type': 'city', 
            'lat': 14.5176, 
            'lon': 121.0509, 
            'pop': 886722,
            'iso_code': 'PHL',
            'region_code': 'NCR',
            'province_code': None
        },
        {
            'name': 'Cebu City', 
            'type': 'city', 
            'lat': 10.3157, 
            'lon': 123.8854, 
            'pop': 964169,
            'iso_code': 'PHL',
            'region_code': 'VII',
            'province_code': 'CEB'
        },
        {
            'name': 'Davao City', 
            'type': 'city', 
            'lat': 7.1907, 
            'lon': 125.4553, 
            'pop': 1776949,
            'iso_code': 'PHL',
            'region_code': 'XI',
            'province_code': 'DAS'
        },
        {
            'name': 'Zamboanga City', 
            'type': 'city', 
            'lat': 6.9214, 
            'lon': 122.0790, 
            'pop': 977234,
            'iso_code': 'PHL',
            'region_code': 'IX',
            'province_code': None
        },
        {
            'name': 'Antipolo', 
            'type': 'city', 
            'lat': 14.5873, 
            'lon': 121.1759, 
            'pop': 887399,
            'iso_code': 'PHL',
            'region_code': 'IV-A',
            'province_code': 'RIZ'
        },
        
        # Regional Centers and Important Cities for Climate Data
        {
            'name': 'Baguio', 
            'type': 'city', 
            'lat': 16.4023, 
            'lon': 120.5960, 
            'pop': 366358,
            'iso_code': 'PHL',
            'region_code': 'CAR',
            'province_code': None
        },
        {
            'name': 'Cagayan de Oro', 
            'type': 'city', 
            'lat': 8.4542, 
            'lon': 124.6319, 
            'pop': 728402,
            'iso_code': 'PHL',
            'region_code': 'X',
            'province_code': None
        },
        {
            'name': 'Iloilo City', 
            'type': 'city', 
            'lat': 10.7202, 
            'lon': 122.5621, 
            'pop': 457626,
            'iso_code': 'PHL',
            'region_code': 'VI',
            'province_code': 'ILO'
        },
        {
            'name': 'Bacolod', 
            'type': 'city', 
            'lat': 10.6740, 
            'lon': 122.9500, 
            'pop': 600783,
            'iso_code': 'PHL',
            'region_code': 'VI',
            'province_code': 'NEC'
        },
        {
            'name': 'General Santos', 
            'type': 'city', 
            'lat': 6.1164, 
            'lon': 125.1716, 
            'pop': 697315,
            'iso_code': 'PHL',
            'region_code': 'XII',
            'province_code': None
        },
        
        # Additional climate monitoring locations and provincial capitals
        {
            'name': 'Aparri', 
            'type': 'city', 
            'lat': 18.3553, 
            'lon': 121.6403, 
            'pop': 68839,
            'iso_code': 'PHL',
            'region_code': 'II',
            'province_code': None
        },
        {
            'name': 'Butuan', 
            'type': 'city', 
            'lat': 8.9436, 
            'lon': 125.5281, 
            'pop': 372910,
            'iso_code': 'PHL',
            'region_code': 'XIII',
            'province_code': None
        },
        {
            'name': 'Legazpi', 
            'type': 'city', 
            'lat': 13.1391, 
            'lon': 123.7436, 
            'pop': 209533,
            'iso_code': 'PHL',
            'region_code': 'V',
            'province_code': None
        },
        {
            'name': 'Puerto Princesa', 
            'type': 'city', 
            'lat': 9.7392, 
            'lon': 118.7353, 
            'pop': 307079,
            'iso_code': 'PHL',
            'region_code': 'IV-B',
            'province_code': None
        },
        {
            'name': 'Tacloban', 
            'type': 'city', 
            'lat': 11.2447, 
            'lon': 125.0048, 
            'pop': 251881,
            'iso_code': 'PHL',
            'region_code': 'VIII',
            'province_code': None
        }
    ]
    
    # Standardize locations
    standardized_locations = []
    for loc in locations:
        location_code = loc['name'].replace(" ", "_").replace("-", "_").replace("(", "").replace(")", "").upper()
        if len(location_code) > 50:
            location_code = location_code[:50]
            
        standardized_locations.append({
            'location_name': loc['name'],
            'location_type': loc['type'],
            'location_code': location_code,
            'latitude': loc.get('lat'),
            'longitude': loc.get('lon'),
            'population': loc.get('pop'),
            'iso_code': loc.get('iso_code'),
            'region_code': loc.get('region_code'),
            'province_code': loc.get('province_code'),
            'is_active': True,
            'valid_from': PROCESSING_TIMESTAMP,
            'valid_to': None,
            'created_at': PROCESSING_TIMESTAMP,
            'updated_at': PROCESSING_TIMESTAMP
        })
    
    return standardized_locations

# Create comprehensive Philippine locations
philippine_locations = create_comprehensive_philippine_locations()
print(f"Created {len(philippine_locations)} Philippine locations with coordinates")

# Show distribution by type
type_counts = {}
for loc in philippine_locations:
    loc_type = loc['location_type']
    type_counts[loc_type] = type_counts.get(loc_type, 0) + 1

print("\nLocation Type Distribution:")
for loc_type, count in sorted(type_counts.items()):
    print(f"  {loc_type}: {count:,}")

# Show coordinate and metadata coverage
coords_count = len([loc for loc in philippine_locations if loc['latitude'] is not None])
region_codes_count = len([loc for loc in philippine_locations if loc['region_code'] is not None])
province_codes_count = len([loc for loc in philippine_locations if loc['province_code'] is not None])

print(f"\nMetadata Coverage:")
print(f"  Locations with coordinates: {coords_count}/{len(philippine_locations)} ({coords_count/len(philippine_locations)*100:.1f}%)")
print(f"  Locations with region codes: {region_codes_count}/{len(philippine_locations)} ({region_codes_count/len(philippine_locations)*100:.1f}%)")
print(f"  Locations with province codes: {province_codes_count}/{len(philippine_locations)} ({province_codes_count/len(philippine_locations)*100:.1f}%)")

Created 50 Philippine locations with coordinates

Location Type Distribution:
  city: 20
  country: 1
  province: 12
  region: 17

Metadata Coverage:
  Locations with coordinates: 50/50 (100.0%)
  Locations with region codes: 49/50 (98.0%)
  Locations with province codes: 17/50 (34.0%)


In [5]:
# Define explicit schema for location dimension
location_schema = StructType([
    StructField("location_name", StringType(), True),
    StructField("location_type", StringType(), True),
    StructField("location_code", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("population", LongType(), True),
    StructField("iso_code", StringType(), True),
    StructField("region_code", StringType(), True),
    StructField("province_code", StringType(), True),
    StructField("is_active", BooleanType(), True),
    StructField("valid_from", TimestampType(), True),
    StructField("valid_to", TimestampType(), True),
    StructField("created_at", TimestampType(), True),
    StructField("updated_at", TimestampType(), True)
])

# Create DataFrame with explicit schema
if philippine_locations:
    locations_df = spark.createDataFrame(philippine_locations, schema=location_schema)
    
    # Add location_id using window function
    window_spec = Window.orderBy("location_name", "location_type")
    locations_df = locations_df.withColumn("location_id", row_number().over(window_spec)) \
                               .withColumn("parent_location_id", lit(None).cast(LongType()))
    
    # Add hierarchy logic
    locations_df = locations_df.withColumn(
        "parent_location_id",
        when((col("location_type") == "city") & (col("region_code").isNotNull()),
            # Cities get the region as parent (this would need a lookup in real implementation)
            lit(None)
        ).when((col("location_type") == "province") & (col("region_code").isNotNull()),
                # Provinces get the region as parent
                lit(None)
        ).otherwise(lit(None))
    )
    
    # Add computed columns
    locations_df = locations_df.withColumn(
        "display_name",
        when(col("location_type") == "region", concat(col("location_name"), lit(" Region")))
        .when(col("location_type") == "province", concat(col("location_name"), lit(" Province")))
        .when(col("location_type") == "city", concat(col("location_name"), lit(" City")))
        .otherwise(col("location_name"))
    ).withColumn(
        "full_name",
        when(col("province_code").isNotNull() & col("region_code").isNotNull(),
            concat(col("location_name"), lit(", "), col("province_code"), lit(", "), col("region_code"))
        ).when(col("region_code").isNotNull(),
                concat(col("location_name"), lit(", "), col("region_code"))
        ).otherwise(col("location_name"))
    ).withColumn(
        "processing_version", lit("V2")
    )
    
    # Select final columns in optimized order
    locations_df = locations_df.select(
        "location_id", "location_code", "location_name", "display_name", "full_name",
        "location_type", "parent_location_id", "iso_code", "region_code", "province_code",
        "latitude", "longitude", "population", "is_active", "valid_from", "valid_to",
        "processing_version", "created_at", "updated_at"
    )
    
    record_count = locations_df.count()
    print(f"Location dimension created: {record_count:,} records")
    
    if record_count == 0:
        raise Exception("No location records created - check input data")
    
    print("\nSample locations by type with metadata:")
    
    for loc_type in sorted(type_counts.keys()):
        print(f"\n{loc_type.title()}:")
        sample_df = locations_df.filter(col("location_type") == loc_type).limit(3)
        for row in sample_df.collect():
            coords = f"({row.latitude:.4f}, {row.longitude:.4f})" if row.latitude is not None else "No coordinates"
            pop = f"Pop: {row.population:,}" if row.population is not None else "No population"
            region = f"Region: {row.region_code}" if row.region_code else "No region"
            print(f"  - {row.full_name} {coords} {pop} {region}")
            
else:
    raise Exception("Error: No Philippine locations were generated.")

                                                                                

Location dimension created: 50 records

Sample locations by type with metadata:

City:
  - Antipolo, RIZ, IV-A (14.5873, 121.1759) Pop: 887,399 Region: IV-A
  - Aparri, II (18.3553, 121.6403) Pop: 68,839 Region: II
  - Bacolod, NEC, VI (10.6740, 122.9500) Pop: 600,783 Region: VI

Country:
  - Philippines (12.8797, 121.7740) Pop: 109,035,343 No region

Province:
  - Batangas, BAT, IV-A (13.7565, 121.0583) Pop: 2,908,494 Region: IV-A
  - Bulacan, BUL, III (14.7942, 120.8794) Pop: 3,708,890 Region: III
  - Cavite, CAV, IV-A (14.2456, 120.8756) Pop: 4,344,829 Region: IV-A

Region:
  - Bangsamoro Autonomous Region in Muslim Mindanao, BARMM (7.2189, 124.2458) Pop: 4,404,288 Region: BARMM
  - Bicol Region, V (13.4243, 123.4116) Pop: 6,082,165 Region: V
  - Cagayan Valley, II (17.6322, 121.7231) Pop: 3,685,744 Region: II


In [10]:
from pyspark.sql.functions import col, lit, when, count, countDistinct, sum

print(f"\n=== LOCATION DIMENSION VALIDATION ===")

try:
    # Comprehensive validation
    validation_stats = locations_df.agg(
        countDistinct("location_id").alias("unique_ids"),
        countDistinct("location_code").alias("unique_codes"),
        countDistinct("location_name").alias("unique_names"),
        countDistinct("location_type").alias("unique_types"),
        count("*").alias("total_records"),  # FIX: Use count("*") and ensure 'count' is the imported function
        sum(when(col("latitude").isNull(), 1).otherwise(0)).alias("missing_latitude"),
        sum(when(col("longitude").isNull(), 1).otherwise(0)).alias("missing_longitude"),
        sum(when(col("population").isNull(), 1).otherwise(0)).alias("missing_population"),
        sum(when(col("region_code").isNull(), 1).otherwise(0)).alias("missing_region_codes")
    ).collect()[0]
    
    print(f"Location Dimension Validation:")
    print(f"  Unique IDs: {validation_stats['unique_ids']:,}")
    print(f"  Unique codes: {validation_stats['unique_codes']:,}")
    print(f"  Unique names: {validation_stats['unique_names']:,}")
    print(f"  Location types: {validation_stats['unique_types']:,}")
    print(f"  Total records: {validation_stats['total_records']:,}")
    
    # Data completeness
    total_records = validation_stats['total_records']
    if total_records > 0:
        coord_completeness = (total_records - validation_stats['missing_latitude']) / total_records * 100
        pop_completeness = (total_records - validation_stats['missing_population']) / total_records * 100
        region_completeness = (total_records - validation_stats['missing_region_codes']) / total_records * 100
        
        print(f"\nData Completeness:")
        print(f"  Coordinate completeness: {coord_completeness:.1f}%")
        print(f"  Population completeness: {pop_completeness:.1f}%")
        print(f"  Region code completeness: {region_completeness:.1f}%")
        
        # Quality score
        quality_score = (coord_completeness + pop_completeness + region_completeness) / 3
        print(f"  Data Quality Score: {quality_score:.1f}%")
        
        if quality_score > 90:
            print(f"  EXCELLENT: Data quality meets standards")
        elif quality_score > 80:
            print(f"  GOOD: Data quality acceptable for processing")
        else:
            print(f"  WARNING: Data quality issues detected - review required")
    else:
        print("\nData Completeness: Cannot be calculated, no records found.")

    # Show schema for validation
    print(f"\nLocation Dimension Schema:")
    locations_df.printSchema()
    
    print(f"\nSample Data:")
    locations_df.select(
        "location_id", "location_name", "location_type", "region_code", 
        "latitude", "longitude", "population", "processing_version"
    ).limit(10).show(truncate=False)
    
except Exception as e:
    print(f"Error in location validation: {e}")
    raise


=== LOCATION DIMENSION VALIDATION ===
Location Dimension Validation:
  Unique IDs: 50
  Unique codes: 50
  Unique names: 50
  Location types: 4
  Total records: 50

Data Completeness:
  Coordinate completeness: 100.0%
  Population completeness: 100.0%
  Region code completeness: 98.0%
  Data Quality Score: 99.3%
  EXCELLENT: Data quality meets standards

Location Dimension Schema:
root
 |-- location_id: integer (nullable = false)
 |-- location_code: string (nullable = true)
 |-- location_name: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- location_type: string (nullable = true)
 |-- parent_location_id: void (nullable = true)
 |-- iso_code: string (nullable = true)
 |-- region_code: string (nullable = true)
 |-- province_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- population: long (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- v

In [11]:
# Save location dimension with optimization
def save_location_dimension(df, table_name, partition_cols=None):
    """Save location dimension with optimization"""
    output_path = os.path.join(SILVER_PATH, table_name)
    
    print(f"\n=== SAVING TABLE: {table_name.upper()} ===")
    print(f"Target path: {output_path}")
    
    try:
        # Cache DataFrame for operations
        df.cache()
        
        # Get record count for validation
        record_count = df.count()
        print(f"Records to save: {record_count:,}")
        
        if record_count == 0:
            raise Exception(f"Cannot save {table_name} - no records to write")
        
        # Configure Delta Lake writer with optimizations
        writer = (
            df.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .option("dataChange", "true")
        )
        
        if partition_cols:
            writer = writer.partitionBy(*partition_cols)
            print(f"Partitioning applied on: {', '.join(partition_cols)}")
        
        # Save to Delta table
        writer.save(output_path)
        print(f"SUCCESS: Table '{table_name}' saved")
        
        # Post-save validation
        saved_df = spark.read.format("delta").load(output_path)
        saved_count = saved_df.count()
        
        print("\n--- Post-save Validation ---")
        print(f"  Expected records : {record_count:,}")
        print(f"  Saved records    : {saved_count:,}")
        if saved_count == record_count:
            print("  Validation result: PASSED")
        else:
            print("  Validation result: FAILED")
        
        # Show partition structure if applicable
        if partition_cols:
            print("\n--- Partition Structure ---")
            partition_sample = saved_df.select(*partition_cols).distinct().orderBy(*partition_cols)
            partition_sample.show(10)
        
        # Show location type distribution in saved data
        print("\n--- Saved Data Distribution ---")
        saved_df.groupBy("location_type").count().orderBy(desc("count")).show()
        
        return True, saved_count
        
    except Exception as e:
        print(f"ERROR saving table '{table_name}': {e}")
        import traceback
        traceback.print_exc()
        return False, 0
    finally:
        df.unpersist()

# Save location dimension
location_success, location_saved_count = save_location_dimension(
    locations_df, 
    "dim_location_v2", 
    ["location_type"]  # Partition by location type for analytics performance
)

# Overall save status
print("\n=== SAVE SUMMARY ===")
if location_success:
    print("Location dimension saved successfully")
    print(f"  Records saved: {location_saved_count:,}")
    print("  Delta Lake optimization: Enabled")
    print("  Partitioning: By location_type")
else:
    print("Location dimension save failed - check configuration")


=== SAVING TABLE: DIM_LOCATION_V2 ===
Target path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver/dim_location_v2
Records to save: 50
Partitioning applied on: location_type


                                                                                

SUCCESS: Table 'dim_location_v2' saved

--- Post-save Validation ---
  Expected records : 50
  Saved records    : 50
  Validation result: PASSED

--- Partition Structure ---
+-------------+
|location_type|
+-------------+
|         city|
|      country|
|     province|
|       region|
+-------------+


--- Saved Data Distribution ---
+-------------+-----+
|location_type|count|
+-------------+-----+
|         city|   20|
|       region|   17|
|     province|   12|
|      country|    1|
+-------------+-----+


=== SAVE SUMMARY ===
Location dimension saved successfully
  Records saved: 50
  Delta Lake optimization: Enabled
  Partitioning: By location_type


In [12]:
# Final comprehensive validation
try:
    # Load and validate saved table
    saved_location_path = os.path.join(SILVER_PATH, "dim_location_v2")
    test_df = spark.read.format("delta").load(saved_location_path)
    final_count = test_df.count()
    
    print(f"\nFinal Validation: Successfully created dim_location_v2 with {final_count:,} records")
    
    # Show geographic coverage validation
    print("\nGeographic Coverage Validation:")
    geographic_coverage = test_df.groupBy("location_type").agg(
        count("*").alias("count"),
        avg("latitude").alias("avg_lat"),
        avg("longitude").alias("avg_lon"),
        avg("population").alias("avg_population")
    ).orderBy(desc("count"))
    
    geographic_coverage.show()
    
    # Show regional distribution
    print("\nRegional Distribution:")
    regional_dist = test_df.filter(col("region_code").isNotNull()).groupBy("region_code").count().orderBy(desc("count"))
    regional_dist.show()
    
except Exception as e:
    print(f"Final validation failed: {e}")


Final Validation: Successfully created dim_location_v2 with 50 records

Geographic Coverage Validation:
+-------------+-----+------------------+------------------+--------------+
|location_type|count|           avg_lat|           avg_lon|avg_population|
+-------------+-----+------------------+------------------+--------------+
|         city|   20|          12.01895|122.46555000000004|      869387.7|
|       region|   17|11.701541176470588|122.90726470588234|     6416332.0|
|     province|   12|13.063683333333332|121.84213333333336|    3026278.25|
|      country|    1|           12.8797|           121.774|  1.09035343E8|
+-------------+-----+------------------+------------------+--------------+


Regional Distribution:
+-----------+-----+
|region_code|count|
+-----------+-----+
|        NCR|    7|
|       IV-A|    6|
|         VI|    5|
|        III|    4|
|        VII|    3|
|         XI|    3|
|        CAR|    2|
|       XIII|    2|
|       IV-B|    2|
|          V|    2|
|         