In [None]:
# Import required libraries and initialize a local Spark session with Delta Lake
import os
import warnings
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta import *

# Suppress every Python warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

print("=== V2 LOCAL GOLD EMISSIONS PROCESSOR ===")

# Describe the local session: Delta SQL extension and Delta catalog,
# driver memory set to 4g, and adaptive query execution with partition
# coalescing switched on.
builder = (
    SparkSession.builder
    .appName("v2-Local-Gold-Emissions")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
)

# Pass the builder through the delta helper before creating (or reusing)
# the session, then restrict Spark logging to ERROR.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

print(f"Spark Version: {spark.version}")
print("Delta Lake support enabled for local environment.")

In [None]:
# Local filesystem configuration for this run.
# Captured once so the whole run can share a single processing time
# (naive local time, as returned by datetime.now() without a tz argument).
PROCESSING_TIMESTAMP = datetime.now()

# Relative hop to the project root. os.path.abspath resolves it against the
# current working directory, which the original note expects to be
# 'notebooks/new/gold'.
base_path = "../../../"

# Absolute roots of the Silver (source) and Gold (target) layers.
silver_path, gold_path = (
    os.path.abspath(os.path.join(base_path, layer_dir))
    for layer_dir in ('final-spark-silver', 'final-spark-gold')
)

print("=== V2 LOCAL GOLD PROCESSING CONFIGURATION ===")
print(f"Silver source path: {silver_path}")
print(f"Gold target path: {gold_path}")

# Create the Gold root up front; a directory that already exists is fine.
os.makedirs(gold_path, exist_ok=True)

In [None]:
# Load Silver Layer data from local Delta paths
def load_climate_data_with_locations_local():
    """Read the Silver climate fact and location dimension and left-join them.

    Both inputs are Delta tables located under the module-level
    ``silver_path`` (folders ``fact_climate_weather`` and ``dim_location``)
    and are read through the module-level ``spark`` session. The row count
    of each input and of the joined result is printed.

    Returns:
        The climate DataFrame left-joined to the location dimension on
        ``location_id``, marked for caching and materialised by a count.

    Raises:
        Exception: any error raised while resolving, reading or counting
            the two input tables is reported on stdout and re-raised.
    """
    print("=== DATA LOADING (LOCAL DELTA PATHS) ===")

    def _read_delta(table_dir):
        # Every call starts from a fresh reader on the shared session.
        return spark.read.format("delta").load(table_dir)

    try:
        # Silver table folders; the climate folder name is an assumption
        # carried over from the original author's note.
        fact_dir = os.path.join(silver_path, 'fact_climate_weather')
        dim_dir = os.path.join(silver_path, 'dim_location')

        print(f'Reading climate data from: {fact_dir}')
        climate_fact = _read_delta(fact_dir)
        print(f'Reading location data from: {dim_dir}')
        location_lookup = _read_delta(dim_dir)

        # Counting forces both reads, so a bad path surfaces here.
        print(f"Climate data loaded: {climate_fact.count():,} records")
        print(f"Location dimension loaded: {location_lookup.count():,} records")
    except Exception as load_error:
        print(f"ERROR: Could not load Silver layer tables. Please check paths and table names. Error: {load_error}")
        raise

    # A left join keeps every climate row, matched to a location or not.
    # cache() returns the same DataFrame, so it can be chained here.
    enriched = climate_fact.join(location_lookup, on="location_id", how="left").cache()

    # The count materialises the cache and doubles as the progress message.
    print(f"Successfully joined climate and location data: {enriched.count():,} records")

    return enriched

# Run the loader once when this cell executes. `climate_data` is the cached,
# location-enriched climate DataFrame returned by the function above.
climate_data = load_climate_data_with_locations_local()

### Data Quality Analysis
The following cell profiles the loaded data: total record count, geographic coverage (distinct locations and regions), the number of distinct climate metrics, the measurement date range, and the record distribution per metric. This step is for validation only and does not transform the data.

In [None]:
# Read-only profiling of the joined Silver data; nothing is transformed here.
# NOTE: `countDistinct`, `count`, `min` and `max` below are the Spark column
# functions brought in by `from pyspark.sql.functions import *`, which shadows
# the Python built-ins `min`/`max` in this notebook.
print("=== DATA QUALITY ANALYSIS ===")
try:
    total_records = climate_data.count()
    print(f"Total records for analysis: {total_records:,}")

    # One global aggregate (no groupBy), so the result is a single row.
    geo_coverage = climate_data.agg(
        countDistinct("location_id").alias("unique_locations"),
        countDistinct("region_code").alias("unique_regions"),
        countDistinct("metric_code").alias("unique_metrics"),
        min("measurement_date").alias("earliest_date"),
        max("measurement_date").alias("latest_date")
    ).collect()[0]

    print("\nGeographic Coverage:")
    print(f"  Unique locations: {geo_coverage['unique_locations']:,}")
    print(f"  Unique regions: {geo_coverage['unique_regions']:,}")
    print(f"  Climate metrics: {geo_coverage['unique_metrics']:,}")
    print(f"  Date range: {geo_coverage['earliest_date']} to {geo_coverage['latest_date']}")

    # Record count per climate metric, ordered by metric code.
    print("\nClimate metrics distribution:")
    (
        climate_data.groupBy("metric_code")
        .agg(count("*").alias("record_count"))
        .orderBy("metric_code")
        .show()
    )

    # Only claim success when there was something to profile. An empty input
    # would otherwise be reported as "VALIDATED" with an empty date range.
    if total_records > 0:
        print("\nDATA QUALITY: VALIDATED")
    else:
        print("\nDATA QUALITY: NOT VALIDATED (no records loaded)")
except Exception as e:
    # Best-effort profiling: report the failure without stopping the notebook.
    print(f"Error in data quality analysis: {e}")