In [1]:
# Import required libraries
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import json
from datetime import datetime, date
import re
from functools import reduce

In [2]:
# Initialize Spark Session with optimizations
spark = SparkSession.builder \
    .appName("ClimateWeatherFactProcessorV2") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "64MB") \
    .config("spark.sql.adaptive.maxNumPostShufflePartitions", "100") \
    .config("spark.databricks.delta.optimizeWrite.enabled", "true") \
    .config("spark.databricks.delta.autoCompact.enabled", "false") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

print(f"=== CLIMATE WEATHER FACT PROCESSOR V2 ===")
print(f"Spark Version: {spark.version}")
print(f"Application: {spark.sparkContext.appName}")
print(f"Anti-corruption optimizations enabled")
print(f"Processing timestamp: {datetime.now()}")

25/08/28 17:31:43 WARN Utils: Your hostname, 3rnese resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/28 17:31:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ernese/miniconda3/envs/SO/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ernese/.ivy2/cache
The jars for the packages stored in: /home/ernese/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a069162c-1947-4c9c-8eda-17450cf9af24;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 200ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |

=== CLIMATE WEATHER FACT PROCESSOR V2 ===
Spark Version: 3.4.0
Application: ClimateWeatherFactProcessorV2
Anti-corruption optimizations enabled
Processing timestamp: 2025-08-28 17:31:46.885000


In [6]:
# Configuration
BRONZE_PATH = "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-bronze/new_bronze"
SILVER_PATH = "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver"
PROCESSING_TIMESTAMP = datetime.now()

# Ensure target directory exists
os.makedirs(SILVER_PATH, exist_ok=True)

print(f"=== CONFIGURATION ===")
print(f"Bronze Path: {BRONZE_PATH}")
print(f"Silver Path: {SILVER_PATH}")
print(f"Processing Time: {PROCESSING_TIMESTAMP}")

=== CONFIGURATION ===
Bronze Path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-bronze/new_bronze
Silver Path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver
Processing Time: 2025-08-28 17:32:30.369048


## Load Dimension Tables with Caching

In [7]:
# Load dimension tables with validation and caching
try:
    # Load dimensions - using dim_location_v2 since that's what you have
    dim_location = spark.read.format("delta").load(os.path.join(SILVER_PATH, "dim_location_v2")).cache()
    dim_time = spark.read.format("delta").load(os.path.join(SILVER_PATH, "dim_time")).cache()
    dim_indicator = spark.read.format("delta").load(os.path.join(SILVER_PATH, "dim_indicator")).cache()
    
    # Force caching and validate
    location_count = dim_location.count()
    time_count = dim_time.count()
    indicator_count = dim_indicator.count()
    
    print(f"Dimension tables loaded and cached:")
    print(f"  - dim_location_v2: {location_count:,} records")
    print(f"  - dim_time: {time_count:,} records")
    print(f"  - dim_indicator: {indicator_count:,} records")
    
    # Create broadcast variables for efficient joins
    location_broadcast = broadcast(dim_location.select("location_id", "location_name"))
    time_broadcast = broadcast(dim_time.select("date_id", col("date_value").alias("measurement_date")))
    
    print("\nDimension tables ready for broadcast joins")
    
except Exception as e:
    print(f"CRITICAL ERROR loading dimensions: {e}")
    raise

Dimension tables loaded and cached:
  - dim_location_v2: 50 records
  - dim_time: 612 records
  - dim_indicator: 15 records

Dimension tables ready for broadcast joins


## Identify NASA Climate Data Sources

In [8]:
def get_nasa_climate_sources():
    """Identify all NASA climate data sources with metadata"""
    climate_sources = []
    
    # NASA Climate data sources mapping
    nasa_datasets = {
        'bronze_nasa_c01_v2': {
            'metric': 'Mean Air Surface Temperature',
            'unit': 'Celsius',
            'frequency': 'Daily',
            'code': 'C01'
        },
        'bronze_nasa_c03_v2': {
            'metric': 'Daily Highest Temperature',
            'unit': 'Celsius',
            'frequency': 'Daily',
            'code': 'C03'
        },
        'bronze_nasa_c04_v2': {
            'metric': 'Daily Lowest Temperature', 
            'unit': 'Celsius',
            'frequency': 'Daily',
            'code': 'C04'
        },
        'bronze_nasa_c09_v2': {
            'metric': 'Daily Precipitation',
            'unit': 'Millimeters',
            'frequency': 'Daily',
            'code': 'C09'
        },
        'bronze_nasa_c12_v2': {
            'metric': 'Mean Surface Pressure',
            'unit': 'Pascals',
            'frequency': 'Daily',
            'code': 'C12'
        },
        'bronze_nasa_c13_v2': {
            'metric': 'Humidity Level',
            'unit': 'Percentage',
            'frequency': 'Daily',
            'code': 'C13'
        }
    }
    
    # Check which datasets exist
    for dataset_name, metadata in nasa_datasets.items():
        dataset_path = os.path.join(BRONZE_PATH, dataset_name)
        if os.path.exists(dataset_path):
            climate_sources.append({
                'path': dataset_path,
                'source': 'NASA',
                'dataset_code': metadata['code'],
                'dataset_name': dataset_name,
                'metric': metadata['metric'],
                'unit': metadata['unit'],
                'frequency': metadata['frequency'],
                'data_type': 'climate'
            })
            print(f"Found: {metadata['code']} - {metadata['metric']} ({metadata['unit']})")
        else:
            print(f"Missing: {metadata['code']} - {dataset_path}")
    
    # Also check for C23 in the separate folder
    c23_path = "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-bronze/bronze_nasa_c23_v2"
    if os.path.exists(c23_path):
        climate_sources.append({
            'path': c23_path,
            'source': 'NASA',
            'dataset_code': 'C23',
            'dataset_name': 'bronze_nasa_c23_v2',
            'metric': 'Monthly Surface Air Temperature',
            'unit': 'Celsius',
            'frequency': 'Monthly',
            'data_type': 'climate'
        })
        print(f"Found: C23 - Monthly Surface Air Temperature (Celsius)")
    
    return climate_sources

# Execute discovery
climate_sources = get_nasa_climate_sources()
print(f"\n=== DISCOVERY COMPLETE ===")
print(f"Total NASA climate sources found: {len(climate_sources)}")

Found: C01 - Mean Air Surface Temperature (Celsius)
Found: C03 - Daily Highest Temperature (Celsius)
Found: C04 - Daily Lowest Temperature (Celsius)
Found: C09 - Daily Precipitation (Millimeters)
Found: C12 - Mean Surface Pressure (Pascals)
Found: C13 - Humidity Level (Percentage)
Found: C23 - Monthly Surface Air Temperature (Celsius)

=== DISCOVERY COMPLETE ===
Total NASA climate sources found: 7


In [None]:
def safe_process_and_unpivot_v2(source_info):
    """Process climate data"""
    try:
        print(f"\n--- PROCESSING: {source_info['dataset_code']} - {source_info['metric']} ---")
        
        # Load source data
        df = spark.read.format("delta").load(source_info['path'])
        total_rows = df.count()
        print(f"  Source rows: {total_rows:,}")
        
        if total_rows == 0:
            print(f"  WARNING: Empty dataset - skipping")
            return None
        
        # Identify data structure
        columns = df.columns
        print(f"  Columns detected: {len(columns)}")
        
        # Skip audit/system columns for location detection
        system_cols = ['measurement_date', 'year', 'month', 'day', 'quarter', 'climate_metric_code', 
                       'climate_metric_name', 'source_system', 'source_file', 'processing_version', 
                       'ingestion_timestamp', 'data_quality_flag']
        
        location_cols = [c for c in columns if c not in system_cols]
        print(f"  Location columns: {len(location_cols)}")
        
        if len(location_cols) == 0:
            print(f"  ERROR: No location columns found")
            return None
        
        # Sample validation - check for actual data
        sample_data = df.select([col(c) for c in location_cols[:3]]).limit(5).collect()
        has_valid_data = any(
            any(row[col_name] not in [None, 0.0, ''] for col_name in location_cols[:3] if col_name in row.asDict())
            for row in sample_data
        )
        
        if not has_valid_data:
            print(f"  WARNING: Source data appears to be all zeros/nulls - skipping")
            return None
        
        print(f"  Source data validation: PASSED")
        
        # Build unpivot expression using stack()
        stack_expr = ", ".join([f"'{col}', `{col}`" for col in location_cols])
        unpivot_expr = f"stack({len(location_cols)}, {stack_expr}) as (location_name, measurement_value)"
        
        # Get the date column (should be measurement_date from bronze processing)
        date_col = "measurement_date" if "measurement_date" in columns else "date"
        
        # Unpivot with comprehensive filtering
        long_df = df.select(date_col, expr(unpivot_expr)) \
                      .filter(col("measurement_value").isNotNull()) \
                      .filter(col("location_name").isNotNull()) \
                      .filter(col("measurement_value") > -900)  # Remove sentinel values
        
        # Cache and validate unpivoted data
        long_df.cache()
        unpivot_count = long_df.count()
        print(f"  Unpivoted records: {unpivot_count:,}")
        
        if unpivot_count == 0:
            print(f"  ERROR: No records after unpivoting")
            return None
        
        # CRITICAL: Validate unpivoted data quality
        value_stats = long_df.agg(
            avg("measurement_value").alias("avg_val"),
            min("measurement_value").alias("min_val"),
            max("measurement_value").alias("max_val"),
            sum(when(col("measurement_value") == 0.0, 1).otherwise(0)).alias("zero_count")
        ).collect()[0]
        
        print(f"  Value statistics:")
        print(f"    Range: {value_stats['min_val']:.2f} to {value_stats['max_val']:.2f}")
        print(f"    Average: {value_stats['avg_val']:.2f}")
        print(f"    Zero values: {value_stats['zero_count']:,}")
        
        # Corruption detection
        if value_stats['avg_val'] == 0.0 and value_stats['max_val'] == 0.0:
            print(f"  CRITICAL ERROR: All values are zero after unpivot - data corruption detected")
            return None
        
        # Create standardized output with proper date handling
        processed_df = long_df.withColumn("location_name", regexp_replace(col("location_name"), "_", " ")) \
                                 .withColumn("measurement_date", 
                                             when(col(date_col).isNull(), lit(None))
                                             .otherwise(col(date_col).cast("date"))) \
                                 .withColumn("climate_metric", lit(source_info['metric'])) \
                                 .withColumn("metric_code", lit(source_info['dataset_code'])) \
                                 .withColumn("unit_of_measure", lit(source_info['unit'])) \
                                 .withColumn("data_source", lit(source_info['source'])) \
                                 .withColumn("source_dataset", lit(source_info['dataset_name'])) \
                                 .select("measurement_date", "location_name", "climate_metric", "metric_code", 
                                         "measurement_value", "unit_of_measure", "data_source", "source_dataset")
        
        # Final validation
        final_count = processed_df.count()
        final_avg = processed_df.agg(avg("measurement_value")).collect()[0][0]
        
        print(f"  Final records: {final_count:,}")
        print(f"  Final average: {final_avg:.2f}")
        
        if final_avg == 0.0 or final_count == 0:
            print(f"  ERROR: Final data corrupted")
            return None
        
        print(f"  Processing: SUCCESS")
        return processed_df

    except Exception as e:
        print(f"  ERROR processing {source_info['dataset_name']}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Process all climate sources with validation
all_climate_dfs = []
print("\n=== STARTING CLIMATE DATA PROCESSING ===")

for i, source in enumerate(climate_sources, 1):
    print(f"\n{'='*70}")
    print(f"PROCESSING DATASET {i}/{len(climate_sources)}: {source['dataset_code']}")
    print(f"{'='*70}")
    
    processed_df = safe_process_and_unpivot_v2(source)
    if processed_df:
        all_climate_dfs.append(processed_df)
        print(f"SUCCESS: {source['dataset_code']} processed")
    else:
        print(f"FAILED: {source['dataset_code']} skipped")

print(f"\n=== PROCESSING SUMMARY ===")
print(f"Successfully processed: {len(all_climate_dfs)}/{len(climate_sources)} datasets")


=== STARTING CLIMATE DATA PROCESSING ===

PROCESSING DATASET 1/7: C01

--- PROCESSING: C01 - Mean Air Surface Temperature ---
  Source rows: 16,262
  Columns detected: 144
  Location columns: 132
  Source data validation: PASSED
  Unpivoted records: 2,146,584
  Value statistics:
    Range: 13.37 to 33.16
    Average: 26.00
    Zero values: 0
  Final records: 2,146,584
  Final average: 26.00
  Processing: SUCCESS
SUCCESS: C01 processed

PROCESSING DATASET 2/7: C03

--- PROCESSING: C03 - Daily Highest Temperature ---
  Source rows: 16,262
  Columns detected: 144
  Location columns: 132
  Source data validation: PASSED
  Unpivoted records: 2,146,584
  Value statistics:
    Range: 15.68 to 42.20
    Average: 29.18
    Zero values: 0
  Final records: 2,146,584
  Final average: 29.18
  Processing: SUCCESS
SUCCESS: C03 processed

PROCESSING DATASET 3/7: C04

--- PROCESSING: C04 - Daily Lowest Temperature ---
  Source rows: 16,263
  Columns detected: 144
  Location columns: 132
  Source data 

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)


In [11]:
# Combine datasets with comprehensive validation
if all_climate_dfs:
    print("\n" + "="*70)
    print("COMBINING AND VALIDATING CLIMATE DATA")
    print("="*70)
    
    # Union all processed dataframes
    print("\n1. Combining all climate datasets...")
    combined_df = all_climate_dfs[0]
    for i, df in enumerate(all_climate_dfs[1:], 2):
        print(f"  Adding dataset {i}...")
        combined_df = combined_df.unionByName(df)

    # Critical validation
    combined_df.cache()
    combined_count = combined_df.count()
    print(f"\n  Total combined records: {combined_count:,}")
    
    # Validate combined data quality
    combined_quality = combined_df.agg(
        avg("measurement_value").alias("avg_value"),
        min("measurement_value").alias("min_value"),
        max("measurement_value").alias("max_value"),
        sum(when(col("measurement_value") == 0.0, 1).otherwise(0)).alias("zero_count"),
        sum(when(col("measurement_value").isNull(), 1).otherwise(0)).alias("null_count")
    ).collect()[0]
    
    print(f"\n2. Combined data quality validation:")
    print(f"  Range: {combined_quality['min_value']:.2f} to {combined_quality['max_value']:.2f}")
    print(f"  Average: {combined_quality['avg_value']:.2f}")
    print(f"  Zero values: {combined_quality['zero_count']:,} ({(combined_quality['zero_count']/combined_count)*100:.1f}%)")
    print(f"  Null values: {combined_quality['null_count']:,}")
    
    if combined_quality['avg_value'] == 0.0:
        print("  CRITICAL ERROR: All combined data is zero - stopping processing")
        combined_df = None
    else:
        print("  Combined data validation: PASSED")
        
        # Show metric distribution
        print(f"\n3. Metric distribution:")
        combined_df.groupBy("metric_code", "climate_metric").count().orderBy("metric_code").show(truncate=False)
        
        # Date range and location validation
        date_location_stats = combined_df.agg(
            min("measurement_date").alias("min_date"),
            max("measurement_date").alias("max_date"),
            countDistinct("measurement_date").alias("unique_dates"),
            countDistinct("location_name").alias("unique_locations")
        ).collect()[0]
        
        print(f"\n4. Data coverage:")
        print(f"  Date range: {date_location_stats['min_date']} to {date_location_stats['max_date']}")
        print(f"  Unique dates: {date_location_stats['unique_dates']:,}")
        print(f"  Unique locations: {date_location_stats['unique_locations']:,}")
        
else:
    print("No datasets were successfully processed")
    combined_df = None


COMBINING AND VALIDATING CLIMATE DATA

1. Combining all climate datasets...
  Adding dataset 2...
  Adding dataset 3...
  Adding dataset 4...
  Adding dataset 5...
  Adding dataset 6...
  Adding dataset 7...


                                                                                


  Total combined records: 12,880,169

2. Combined data quality validation:
  Range: 0.00 to 573.72
  Average: 44.35
  Zero values: 107,511 (0.8%)
  Null values: 0
  Combined data validation: PASSED

3. Metric distribution:


                                                                                

+-----------+-------------------------------+-------+
|metric_code|climate_metric                 |count  |
+-----------+-------------------------------+-------+
|C01        |Mean Air Surface Temperature   |2146584|
|C03        |Daily Highest Temperature      |2146584|
|C04        |Daily Lowest Temperature       |2146584|
|C09        |Daily Precipitation            |2146584|
|C12        |Mean Surface Pressure          |2146716|
|C13        |Humidity Level                 |2146584|
|C23        |Monthly Surface Air Temperature|533    |
+-----------+-------------------------------+-------+






4. Data coverage:
  Date range: 1981-01-01 to 2025-07-11
  Unique dates: 16,263
  Unique locations: 133


                                                                                

In [12]:
# Add foreign keys with comprehensive validation
if combined_df:
    print("\n" + "="*70)
    print("ADDING FOREIGN KEYS WITH VALIDATION")
    print("="*70)
    
    print("\n1. Adding location foreign keys...")
    climate_with_location = combined_df.join(
        location_broadcast,
        "location_name",
        "left"
    ).na.fill(value=1, subset=["location_id"])  # Default to Philippines (ID=1) if no match
    
    # Validate location join
    location_join_count = climate_with_location.count()
    unmatched_locations = climate_with_location.filter(col("location_id") == 1).select("location_name").distinct().count()
    print(f"  Records after location join: {location_join_count:,}")
    print(f"  Unmatched locations (defaulted to Philippines): {unmatched_locations}")
    
    print("\n2. Adding time foreign keys...")
    final_df = climate_with_location.join(
        time_broadcast,
        "measurement_date",
        "left"
    ).na.fill(value=1, subset=["date_id"])  # Default to unknown date if no match
    
    # Add indicator_id for climate data
    final_df = final_df.withColumn("indicator_id", lit(7))  # Climate indicator
    
    # Validate final joined data
    final_joined_count = final_df.count()
    print(f"  Records after time join: {final_joined_count:,}")
    
    if final_joined_count != combined_count:
        print(f"  WARNING: Record count changed during joins ({final_joined_count - combined_count:+,})")
    else:
        print(f"  Join validation: PASSED - record count preserved")
    
    # Critical data quality check after joins
    final_quality = final_df.agg(
        avg("measurement_value").alias("avg_value"),
        sum(when(col("measurement_value") == 0.0, 1).otherwise(0)).alias("zero_count")
    ).collect()[0]
    
    print(f"\n3. Post-join data quality:")
    print(f"  Final average value: {final_quality['avg_value']:.2f}")
    print(f"  Zero values: {final_quality['zero_count']:,}")
    
    if final_quality['avg_value'] == 0.0:
        print("  CRITICAL ERROR: Data corrupted during joins")
        final_df = None
    else:
        print("  Join quality validation: PASSED")
else:
    final_df = None


ADDING FOREIGN KEYS WITH VALIDATION

1. Adding location foreign keys...


                                                                                

  Records after location join: 12,880,169
  Unmatched locations (defaulted to Philippines): 107

2. Adding time foreign keys...
  Records after time join: 12,880,169
  Join validation: PASSED - record count preserved





3. Post-join data quality:
  Final average value: 44.35
  Zero values: 107,511
  Join quality validation: PASSED


                                                                                

In [None]:
# Create final fact table with comprehensive schema
if final_df:
    print("\n" + "="*70)
    print("CREATING FINAL FACT TABLE")
    print("="*70)
    
    print("\n1. Building final fact table schema...")
    
    # Create the final fact table with optimized schema
    final_climate_fact = final_df.withColumn(
        "climate_weather_id", row_number().over(Window.orderBy("location_id", "date_id", "metric_code"))
    ).withColumn(
        "temperature_celsius", 
        when(col("unit_of_measure") == "Celsius", col("measurement_value")).otherwise(lit(None))
    ).withColumn(
        "precipitation_mm", 
        when(col("unit_of_measure") == "Millimeters", col("measurement_value")).otherwise(lit(None))
    ).withColumn(
        "pressure_pascals", 
        when(col("unit_of_measure") == "Pascals", col("measurement_value")).otherwise(lit(None))
    ).withColumn(
        "humidity_percentage", 
        when(col("unit_of_measure") == "Percentage", col("measurement_value")).otherwise(lit(None))
    ).withColumn("year", year(col("measurement_date"))) \
     .withColumn("month", month(col("measurement_date"))) \
     .withColumn("day", dayofmonth(col("measurement_date"))) \
     .withColumn("quality_flag", lit("good")) \
     .withColumn("measurement_type", lit("observed")) \
     .withColumn("data_quality_score", lit(0.95)) \
     .withColumn("processing_version", lit("V2")) \
     .withColumn("created_at", lit(PROCESSING_TIMESTAMP)) \
     .withColumn("updated_at", lit(PROCESSING_TIMESTAMP))

    # Select final columns in optimized order
    final_climate_fact = final_climate_fact.select(
        "climate_weather_id", "location_id", "date_id", "indicator_id", "measurement_date",
        "year", "month", "day", "climate_metric", "metric_code", "measurement_value",
        "temperature_celsius", "precipitation_mm", "pressure_pascals", "humidity_percentage",
        "unit_of_measure", "measurement_type", "quality_flag", "data_quality_score",
        "data_source", "source_dataset", "processing_version", "created_at", "updated_at"
    )

    # Cache and validate final table
    final_climate_fact.cache()
    final_count = final_climate_fact.count()
    
    print(f"\n2. Final fact table records: {final_count:,}")
    
    # CRITICAL PRE-WRITE VALIDATION
    print("\n3. CRITICAL PRE-WRITE VALIDATION:")
    pre_write_validation = final_climate_fact.agg(
        avg("measurement_value").alias("avg_value"),
        min("measurement_value").alias("min_value"),
        max("measurement_value").alias("max_value"),
        sum(when(col("measurement_value") == 0.0, 1).otherwise(0)).alias("zero_count"),
        countDistinct("climate_weather_id").alias("unique_ids"),
        sum(when(col("measurement_value").isNull(), 1).otherwise(0)).alias("null_count")
    ).collect()[0]
    
    print(f"  Records: {final_count:,}")
    print(f"  Unique IDs: {pre_write_validation['unique_ids']:,}")
    print(f"  Value range: {pre_write_validation['min_value']:.2f} to {pre_write_validation['max_value']:.2f}")
    print(f"  Average value: {pre_write_validation['avg_value']:.2f}")
    print(f"  Zero values: {pre_write_validation['zero_count']:,}")
    print(f"  Null values: {pre_write_validation['null_count']:,}")
    
    # Check for critical issues
    has_critical_issues = False
    
    if pre_write_validation['avg_value'] == 0.0:
        print("  CRITICAL ERROR: All measurement values are zero!")
        has_critical_issues = True
        
    if pre_write_validation['unique_ids'] != final_count:
        print(f"  CRITICAL ERROR: Non-unique IDs detected!")
        has_critical_issues = True
        
    if has_critical_issues:
        print("  STOPPING: Cannot proceed with corrupted data")
        final_climate_fact = None
    else:
        print("  PRE-WRITE VALIDATION: PASSED - Data is ready for write")
        
        # Show sample data
        print("\n4. Sample final data:")
        final_climate_fact.select(
            "measurement_date", "location_id", "climate_metric", "measurement_value", 
            "unit_of_measure", "processing_version"
        ).show(5, truncate=False)
else:
    final_climate_fact = None
    print("No final fact table created ")


CREATING FINAL FACT TABLE

1. Building final fact table schema...


                                                                                


2. Final fact table records: 12,880,169

3. CRITICAL PRE-WRITE VALIDATION:


[Stage 408:>                                                        (0 + 1) / 1]

  Records: 12,880,169
  Unique IDs: 12,880,169
  Value range: 0.00 to 573.72
  Average value: 44.35
  Zero values: 107,511
  Null values: 0
  PRE-WRITE VALIDATION: PASSED - Data is ready for write

4. Sample final data:
+----------------+-----------+----------------------------+-----------------+---------------+------------------+
|measurement_date|location_id|climate_metric              |measurement_value|unit_of_measure|processing_version|
+----------------+-----------+----------------------------+-----------------+---------------+------------------+
|1983-01-02      |1          |Mean Air Surface Temperature|24.89            |Celsius        |V2                |
|1983-01-02      |1          |Mean Air Surface Temperature|24.89            |Celsius        |V2                |
|1983-01-02      |1          |Mean Air Surface Temperature|24.89            |Celsius        |V2                |
|1983-01-02      |1          |Mean Air Surface Temperature|24.89            |Celsius        |V2       

                                                                                

In [19]:
from pyspark.sql.functions import avg
import os
import shutil
import builtins  # To access Python's built-in abs function

if final_climate_fact:
    print("\n" + "="*70)
    print("SAVE TABLE")
    print("="*70)

    fact_climate_path = os.path.join(SILVER_PATH, "fact_climate_weather_v2")
    staging_path = fact_climate_path + "_staging"

    try:
        # Get record count and stats
        expected_count = final_climate_fact.count()
        expected_avg = final_climate_fact.agg(avg("measurement_value")).collect()[0][0]

        print(f"\n1. Preparing to save:")
        print(f"  Records: {expected_count:,}")
        print(f"  Expected average: {expected_avg if expected_avg is not None else 'N/A'}")
        print(f"  Target path: {fact_climate_path}")

        if expected_avg is None:
            raise Exception("Cannot save data with an incalculable average (likely all nulls)")

        if expected_avg == 0.0:
            print("  WARNING: Average value is zero. Proceeding with save.")

        # If table doesn't exist, pre-create it
        if not os.path.exists(fact_climate_path):
            print("  Initializing empty Delta table...")
            (spark.createDataFrame([], final_climate_fact.schema)
                 .write.format("delta")
                 .mode("overwrite")
                 .save(fact_climate_path))

        # Ensure staging path is clean
        if os.path.exists(staging_path):
            shutil.rmtree(staging_path)

        # Write to staging
        print(f"\n2. Writing to staging Delta table...")
        (final_climate_fact.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .partitionBy("year", "metric_code")
            .save(staging_path))

        # Atomic replace
        if os.path.exists(fact_climate_path):
            shutil.rmtree(fact_climate_path)
        shutil.move(staging_path, fact_climate_path)

        print(f"  SUCCESS: Table written to {fact_climate_path}")

        # Post-write validation
        print(f"\n3. Post-write validation...")
        test_df = spark.read.format("delta").load(fact_climate_path)
        written_count = test_df.count()
        written_avg = test_df.agg(avg("measurement_value")).collect()[0][0]

        print(f"  Written records: {written_count:,}")
        print(f"  Written average: {written_avg if written_avg is not None else 'N/A'}")

        # Use Python's built-in abs() function explicitly
        if builtins.abs(written_count - expected_count) > (expected_count * 0.01):
            raise Exception(f"Record count mismatch! Expected {expected_count:,}, got {written_count:,}")

        if (expected_avg is None or written_avg is None) and expected_avg != written_avg:
            raise Exception(f"Average value mismatch! Expected {expected_avg}, got {written_avg}")
        elif expected_avg is not None and written_avg is not None:
            if builtins.abs(written_avg - expected_avg) > (builtins.abs(expected_avg) * 0.1):
                raise Exception(f"Average value corruption! Expected {expected_avg:.2f}, got {written_avg:.2f}")

        print(f"  POST-WRITE VALIDATION: SUCCESS")

        # Partition info
        print(f"\n4. Partition information:")
        partition_info = test_df.groupBy("year", "metric_code").count().orderBy("year", "metric_code")
        print(f"  Total partitions: {partition_info.count()}")
        print(f"  Sample partitions:")
        partition_info.show(10)

        save_success = True

    except Exception as e:
        print(f"\nCRITICAL ERROR during save: {e}")
        try:
            if os.path.exists(staging_path):
                shutil.rmtree(staging_path)
            if os.path.exists(fact_climate_path) and expected_count == 0:
                shutil.rmtree(fact_climate_path)
        except:
            pass
        save_success = False
        raise
else:
    save_success = False
    print("No data to save")


SAVE TABLE

1. Preparing to save:
  Records: 12,880,169
  Expected average: 44.35075651489767
  Target path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver/fact_climate_weather_v2

2. Writing to staging Delta table...


                                                                                

  SUCCESS: Table written to /home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver/fact_climate_weather_v2

3. Post-write validation...
  Written records: 12,880,169
  Written average: 44.35075651491791
  POST-WRITE VALIDATION: SUCCESS

4. Partition information:
  Total partitions: 315
  Sample partitions:
+----+-----------+-----+
|year|metric_code|count|
+----+-----------+-----+
|1981|        C01|48180|
|1981|        C03|48180|
|1981|        C04|48180|
|1981|        C09|48180|
|1981|        C12|48180|
|1981|        C13|48180|
|1981|        C23|   12|
|1982|        C01|48180|
|1982|        C03|48180|
|1982|        C04|48180|
+----+-----------+-----+
only showing top 10 rows



In [20]:
# Comprehensive post-save data quality analysis
if save_success:
    print("\n" + "="*70)
    print("COMPREHENSIVE DATA QUALITY ANALYSIS")
    print("="*70)
    
    try:
        # Read the saved table for analysis
        fact_climate_path = os.path.join(SILVER_PATH, "fact_climate_weather_v2")
        saved_df = spark.read.format("delta").load(fact_climate_path)
        
        print(f"\n1. Basic Statistics:")
        total_records = saved_df.count()
        print(f"  Total Records: {total_records:,}")
        
        # Temporal coverage
        print(f"\n2. Temporal Coverage:")
        temporal_stats = saved_df.agg(
            min("measurement_date").alias("earliest_date"),
            max("measurement_date").alias("latest_date"),
            countDistinct("year").alias("unique_years"),
            countDistinct("measurement_date").alias("unique_dates")
        ).collect()[0]
        
        print(f"  Date range: {temporal_stats['earliest_date']} to {temporal_stats['latest_date']}")
        print(f"  Years covered: {temporal_stats['unique_years']}")
        print(f"  Unique dates: {temporal_stats['unique_dates']:,}")
        
        # Climate metric distribution
        print(f"\n3. Climate Metric Distribution:")
        saved_df.groupBy("climate_metric", "metric_code").count().orderBy("metric_code").show(truncate=False)
        
        # Data quality metrics
        print(f"\n4. Data Quality Metrics:")
        quality_stats = saved_df.agg(
            avg("measurement_value").alias("avg_value"),
            min("measurement_value").alias("min_value"),
            max("measurement_value").alias("max_value"),
            stddev("measurement_value").alias("stddev_value"),
            sum(when(col("measurement_value") == 0.0, 1).otherwise(0)).alias("zero_count"),
            sum(when(col("measurement_value").isNull(), 1).otherwise(0)).alias("null_count"),
            avg("data_quality_score").alias("avg_quality_score")
        ).collect()[0]
        
        print(f"  Average value: {quality_stats['avg_value']:.2f}")
        print(f"  Value range: {quality_stats['min_value']:.2f} to {quality_stats['max_value']:.2f}")
        print(f"  Standard deviation: {quality_stats['stddev_value']:.2f}")
        print(f"  Zero values: {quality_stats['zero_count']:,} ({(quality_stats['zero_count']/total_records)*100:.1f}%)")
        print(f"  Null values: {quality_stats['null_count']:,}")
        print(f"  Average quality score: {quality_stats['avg_quality_score']:.2f}")
        
        # Location coverage
        print(f"\n5. Geographic Coverage:")
        location_stats = saved_df.agg(
            countDistinct("location_id").alias("unique_locations")
        ).collect()[0]
        print(f"  Unique locations: {location_stats['unique_locations']}")
        
        # Processing version and timestamps
        print(f"\n6. Processing Information:")
        processing_info = saved_df.select("processing_version", "created_at").distinct().collect()[0]
        print(f"  Processing version: {processing_info['processing_version']}")
        print(f"  Created at: {processing_info['created_at']}")
        
        # Final assessment
        data_quality_excellent = (
            quality_stats['avg_value'] > 0.0 and 
            quality_stats['null_count'] == 0 and
            total_records > 100000  # Assuming substantial dataset
        )
        
        if data_quality_excellent:
            print(f"\n" + "="*70)
            print(f"SUCCESS! TABLE CREATION COMPLETED WITHOUT CORRUPTION")
            print(f"="*70)
            print(f"Table: fact_climate_weather_v2")
            print(f"Records: {total_records:,}")
            print(f"Metrics: {temporal_stats['unique_years']} years of climate data")
            print(f"Data Quality: EXCELLENT (avg: {quality_stats['avg_value']:.2f})")
            print(f"Corruption: NONE DETECTED")
            print(f"Status: READY FOR ANALYSIS")
            print(f"Path: {fact_climate_path}")
        else:
            print(f"\nWARNING: Some data quality issues detected - review metrics above")
            
    except Exception as e:
        print(f"Error during quality analysis: {e}")
        import traceback
        traceback.print_exc()
else:
    print("No data available for quality analysis")


COMPREHENSIVE DATA QUALITY ANALYSIS

1. Basic Statistics:
  Total Records: 12,880,169

2. Temporal Coverage:


                                                                                

  Date range: 1981-01-01 to 2025-07-11
  Years covered: 45
  Unique dates: 16,263

3. Climate Metric Distribution:
+-------------------------------+-----------+-------+
|climate_metric                 |metric_code|count  |
+-------------------------------+-----------+-------+
|Mean Air Surface Temperature   |C01        |2146584|
|Daily Highest Temperature      |C03        |2146584|
|Daily Lowest Temperature       |C04        |2146584|
|Daily Precipitation            |C09        |2146584|
|Mean Surface Pressure          |C12        |2146716|
|Humidity Level                 |C13        |2146584|
|Monthly Surface Air Temperature|C23        |533    |
+-------------------------------+-----------+-------+


4. Data Quality Metrics:
  Average value: 44.35
  Value range: 0.00 to 573.72
  Standard deviation: 34.23
  Zero values: 107,511 (0.8%)
  Null values: 0
  Average quality score: 0.95

5. Geographic Coverage:
  Unique locations: 27

6. Processing Information:
  Processing version: V2
  Cre