# Climate Data Quarterly & Annual Aggregations

Creates quarterly and annual aggregated views of climate data from the daily `fact_climate_weather_v2` table (falling back to `fact_climate_weather` when the v2 table cannot be loaded).
These aggregations enable efficient time-series analysis and reporting for the lakehouse chatbot.

**Quarterly Aggregations**: SUM() for precipitation, AVG() for temperature, pressure, humidity
**Annual Aggregations**: SUM() for precipitation, AVG() for temperature, pressure, humidity

**Outputs**:
- `fact_climate_quarterly` - Quarterly climate metrics (C01, C03, C04, C09, C12, C13)
- `fact_climate_annual` - Annual climate metrics (C01, C03, C04, C09, C12, C13)

**Climate Metrics**:
- C01: Mean air surface temperature (°C)
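- C03: Highest temperature (°C) — aggregated as the average of daily maximum temperatures
- C04: Lowest temperature (°C) — aggregated as the average of daily minimum temperatures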
- C09: Total precipitation (mm)
- C12: Mean surface pressure (Pascals)
- C13: Mean humidity level (%)

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime
import json

In [None]:
# Build (or reuse) the Spark session used by every later cell.
# The settings are listed as (key, value) pairs and applied in order, which
# is equivalent to chaining one .config(...) call per entry on the builder.
_spark_settings = (
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    ("spark.databricks.delta.optimizeWrite.enabled", "true"),
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0"),
)

_session_builder = SparkSession.builder.appName("ClimateAggregationsProcessor")
for _setting_key, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)
spark = _session_builder.getOrCreate()

# Restrict Spark's own logging to errors so the cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

print(f"Spark Version: {spark.version}")
print("Climate Aggregations Processor Started")

In [None]:
# Configuration
# Root directory of the silver layer; every table path in this notebook is
# built with os.path.join(SILVER_PATH, <table name>).
# The CLIMATE_SILVER_PATH environment variable overrides the built-in default
# so the notebook is not tied to one workstation's directory layout. An unset
# or empty variable falls back to the original location.
SILVER_PATH = os.environ.get("CLIMATE_SILVER_PATH") or (
    "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-silver"
)

# Captured once per run: stamped into created_at/updated_at on the aggregate
# rows and used as the start time in the final duration report.
PROCESSING_TIMESTAMP = datetime.now()

print(f"Silver Path: {SILVER_PATH}")
print(f"Processing Time: {PROCESSING_TIMESTAMP}")

## Load Daily Climate Data

In [None]:
# Load the daily climate weather fact table.
# The v2 table is preferred; the original table is used only when v2 cannot
# be loaded. `except Exception` (rather than a bare `except:`) keeps
# KeyboardInterrupt/SystemExit from being mistaken for a missing table.
try:
    # Try v2 first (latest version)
    climate_path = os.path.join(SILVER_PATH, "fact_climate_weather_v2")
    daily_climate = spark.read.format("delta").load(climate_path)
    print(f"Loaded fact_climate_weather_v2")
except Exception as v2_error:
    # Report why v2 was skipped so a misconfigured path is not silently
    # hidden behind the fallback.
    print(f"fact_climate_weather_v2 unavailable ({type(v2_error).__name__}); trying fact_climate_weather")
    try:
        # Fallback to original version
        climate_path = os.path.join(SILVER_PATH, "fact_climate_weather")
        daily_climate = spark.read.format("delta").load(climate_path)
        print(f"Loaded fact_climate_weather (fallback)")
    except Exception as e:
        print(f"Error loading climate data: {e}")
        raise

print(f"\n=== DAILY CLIMATE DATA OVERVIEW ===")
print(f"Total daily records: {daily_climate.count():,}")
daily_climate.printSchema()

# Show data quality summary
# NOTE: min/max/avg here are the pyspark.sql.functions aggregates brought in
# by the star import, not the Python builtins.
print(f"\nData Quality Summary:")
quality_stats = daily_climate.agg(
    min("measurement_date").alias("earliest_date"),
    max("measurement_date").alias("latest_date"),
    countDistinct("location_id").alias("unique_locations"),
    countDistinct("metric_code").alias("unique_metrics"),
    avg("measurement_value").alias("avg_measurement_value")
).collect()[0]

print(f"   Date range: {quality_stats['earliest_date']} to {quality_stats['latest_date']}")
print(f"   Unique locations: {quality_stats['unique_locations']:,}")
print(f"   Unique metrics: {quality_stats['unique_metrics']}")

# avg() yields NULL (None) when there are no non-null measurements; formatting
# None with ':.2f' would raise TypeError, so report it explicitly instead.
avg_measurement_value = quality_stats['avg_measurement_value']
if avg_measurement_value is None:
    print(f"   Average measurement value: n/a (no non-null measurements)")
else:
    print(f"   Average measurement value: {avg_measurement_value:.2f}")

# Show metric distribution
print(f"\nMetric Code Distribution:")
daily_climate.groupBy("metric_code", "climate_metric").count().orderBy("metric_code").show(truncate=False)

# Sample data
print(f"\nSample Daily Data:")
daily_climate.select(
    "measurement_date", "location_id", "metric_code", "climate_metric",
    "measurement_value", "unit_of_measure"
).show(10, truncate=False)

## Create Quarterly Aggregations

In [None]:
# Create quarterly aggregations
print("=== CREATING QUARTERLY CLIMATE AGGREGATIONS ===")

# Derive the quarter label ("<year>-Q<n>", built from the table's own `year`
# column), the quarter number, and the quarter start from each daily row.
quarterly_base = daily_climate.withColumn(
    "quarter",
    concat(col("year"), lit("-Q"), quarter(col("measurement_date")))
).withColumn(
    "quarter_number",
    quarter(col("measurement_date"))
).withColumn(
    "quarter_start_date",
    date_trunc("quarter", col("measurement_date"))
)

# One output row per location / indicator / quarter / metric.
# NOTE: sum/avg/min/max/count below are the pyspark.sql.functions aggregates
# brought in by the star import, not the Python builtins.
quarterly_climate = quarterly_base.groupBy(
    "location_id", "indicator_id", "year", "quarter", "quarter_number",
    "quarter_start_date", "metric_code", "climate_metric", "unit_of_measure"
).agg(
    # Use SUM for precipitation (C09), AVG for others
    when(col("metric_code") == "C09", sum("measurement_value"))
    .otherwise(avg("measurement_value")).alias("quarterly_value"),

    # Additional statistics
    min("measurement_value").alias("min_daily_value"),
    max("measurement_value").alias("max_daily_value"),
    stddev("measurement_value").alias("stddev_daily_value"),
    count("measurement_value").alias("daily_records_count"),

    # Date range actually covered by the daily rows in the group
    min("measurement_date").alias("period_start_date"),
    max("measurement_date").alias("period_end_date")
)

# Add aggregation type and computed columns
quarterly_climate = quarterly_climate.withColumn(
    "aggregation_type",
    when(col("metric_code") == "C09", lit("SUM")).otherwise(lit("AVG"))
).withColumn(
    "quarterly_climate_id",
    # The ordering covers every grouping key (the "quarter" label is derived
    # from year + quarter_number), so no two rows tie and the surrogate ID is
    # the same each time this uncached frame is re-evaluated. Ordering only on
    # the first four columns left the numbering arbitrary whenever rows
    # differed solely in the remaining keys.
    row_number().over(
        Window.orderBy(
            "location_id", "year", "quarter_number", "metric_code",
            "indicator_id", "climate_metric", "unit_of_measure",
            "quarter_start_date",
        )
    )
).withColumn(
    "created_at", lit(PROCESSING_TIMESTAMP)
).withColumn(
    "updated_at", lit(PROCESSING_TIMESTAMP)
)

# Add specific metric columns for easier querying: each row fills only the
# column matching its unit_of_measure; the others stay NULL.
quarterly_climate = quarterly_climate.withColumn(
    "temperature_celsius",
    when(col("unit_of_measure") == "Celsius", col("quarterly_value")).otherwise(lit(None))
).withColumn(
    "precipitation_mm",
    when(col("unit_of_measure") == "Millimeters", col("quarterly_value")).otherwise(lit(None))
).withColumn(
    "pressure_pascals",
    when(col("unit_of_measure") == "Pascals", col("quarterly_value")).otherwise(lit(None))
).withColumn(
    "humidity_percentage",
    when(col("unit_of_measure") == "Percentage", col("quarterly_value")).otherwise(lit(None))
)

# Select final quarterly columns
quarterly_climate = quarterly_climate.select(
    "quarterly_climate_id", "location_id", "indicator_id", "year", "quarter",
    "quarter_number", "quarter_start_date", "period_start_date", "period_end_date",
    "metric_code", "climate_metric", "quarterly_value", "aggregation_type",
    "temperature_celsius", "precipitation_mm", "pressure_pascals", "humidity_percentage",
    "unit_of_measure", "min_daily_value", "max_daily_value", "stddev_daily_value",
    "daily_records_count", "created_at", "updated_at"
)

print(f"Quarterly aggregations created: {quarterly_climate.count():,} records")
quarterly_climate.printSchema()

# Show quarterly summary
print(f"\nQuarterly Aggregation Summary:")
quarterly_summary = quarterly_climate.agg(
    countDistinct("location_id").alias("unique_locations"),
    countDistinct("quarter").alias("unique_quarters"),
    countDistinct("metric_code").alias("unique_metrics"),
    min("period_start_date").alias("earliest_quarter"),
    max("period_end_date").alias("latest_quarter")
).collect()[0]

print(f"   Locations: {quarterly_summary['unique_locations']:,}")
print(f"   Quarters: {quarterly_summary['unique_quarters']:,}")
print(f"   Metrics: {quarterly_summary['unique_metrics']}")
print(f"   Time range: {quarterly_summary['earliest_quarter']} to {quarterly_summary['latest_quarter']}")

# Show sample quarterly data
print(f"\nSample Quarterly Data by Metric:")
for metric in ["C01", "C09", "C12"]:
    print(f"\n{metric} Quarterly Sample:")
    quarterly_climate.filter(col("metric_code") == metric).select(
        "quarter", "location_id", "climate_metric", "quarterly_value",
        "aggregation_type", "daily_records_count"
    ).limit(3).show(truncate=False)

## Create Annual Aggregations

In [None]:
# Create annual aggregations
print("=== CREATING ANNUAL CLIMATE AGGREGATIONS ===")

# Add year start date (measurement_date truncated to the start of its year)
annual_base = daily_climate.withColumn(
    "year_start_date",
    date_trunc("year", col("measurement_date"))
)

# One output row per location / indicator / year / metric.
# NOTE: sum/avg/min/max/count below are the pyspark.sql.functions aggregates
# brought in by the star import, not the Python builtins.
annual_climate = annual_base.groupBy(
    "location_id", "indicator_id", "year", "year_start_date",
    "metric_code", "climate_metric", "unit_of_measure"
).agg(
    # Use SUM for precipitation (C09), AVG for others
    when(col("metric_code") == "C09", sum("measurement_value"))
    .otherwise(avg("measurement_value")).alias("annual_value"),

    # Additional statistics
    min("measurement_value").alias("min_daily_value"),
    max("measurement_value").alias("max_daily_value"),
    stddev("measurement_value").alias("stddev_daily_value"),
    count("measurement_value").alias("daily_records_count"),

    # Date range actually covered by the daily rows in the group
    min("measurement_date").alias("period_start_date"),
    max("measurement_date").alias("period_end_date")
)

# Add aggregation type and computed columns
annual_climate = annual_climate.withColumn(
    "aggregation_type",
    when(col("metric_code") == "C09", lit("SUM")).otherwise(lit("AVG"))
).withColumn(
    "annual_climate_id",
    # The ordering covers every grouping key, so no two rows tie and the
    # surrogate ID is the same each time this uncached frame is re-evaluated.
    # Ordering only on the first three columns left the numbering arbitrary
    # whenever rows differed solely in the remaining keys.
    row_number().over(
        Window.orderBy(
            "location_id", "year", "metric_code",
            "indicator_id", "climate_metric", "unit_of_measure",
            "year_start_date",
        )
    )
).withColumn(
    "created_at", lit(PROCESSING_TIMESTAMP)
).withColumn(
    "updated_at", lit(PROCESSING_TIMESTAMP)
)

# Add specific metric columns for easier querying: each row fills only the
# column matching its unit_of_measure; the others stay NULL.
annual_climate = annual_climate.withColumn(
    "temperature_celsius",
    when(col("unit_of_measure") == "Celsius", col("annual_value")).otherwise(lit(None))
).withColumn(
    "precipitation_mm",
    when(col("unit_of_measure") == "Millimeters", col("annual_value")).otherwise(lit(None))
).withColumn(
    "pressure_pascals",
    when(col("unit_of_measure") == "Pascals", col("annual_value")).otherwise(lit(None))
).withColumn(
    "humidity_percentage",
    when(col("unit_of_measure") == "Percentage", col("annual_value")).otherwise(lit(None))
)

# Select final annual columns
annual_climate = annual_climate.select(
    "annual_climate_id", "location_id", "indicator_id", "year",
    "year_start_date", "period_start_date", "period_end_date",
    "metric_code", "climate_metric", "annual_value", "aggregation_type",
    "temperature_celsius", "precipitation_mm", "pressure_pascals", "humidity_percentage",
    "unit_of_measure", "min_daily_value", "max_daily_value", "stddev_daily_value",
    "daily_records_count", "created_at", "updated_at"
)

print(f"Annual aggregations created: {annual_climate.count():,} records")
annual_climate.printSchema()

# Show annual summary
print(f"\nAnnual Aggregation Summary:")
annual_summary = annual_climate.agg(
    countDistinct("location_id").alias("unique_locations"),
    countDistinct("year").alias("unique_years"),
    countDistinct("metric_code").alias("unique_metrics"),
    min("period_start_date").alias("earliest_year"),
    max("period_end_date").alias("latest_year")
).collect()[0]

print(f"   Locations: {annual_summary['unique_locations']:,}")
print(f"   Years: {annual_summary['unique_years']:,}")
print(f"   Metrics: {annual_summary['unique_metrics']}")
print(f"   Time range: {annual_summary['earliest_year']} to {annual_summary['latest_year']}")

# Show sample annual data
print(f"\nSample Annual Data by Metric:")
for metric in ["C01", "C09", "C12"]:
    print(f"\n{metric} Annual Sample:")
    annual_climate.filter(col("metric_code") == metric).select(
        "year", "location_id", "climate_metric", "annual_value",
        "aggregation_type", "daily_records_count"
    ).limit(3).show(truncate=False)

## Data Quality Validation

In [None]:
# Comprehensive data quality validation
print("=== DATA QUALITY VALIDATION ===")


def _fmt_stat(value, spec):
    """Format an aggregate with ``spec``, or return 'n/a' when it is None.

    Spark aggregates (avg/min/max/sum) return NULL for an empty input, which
    arrives here as None and cannot be formatted with numeric format specs.
    """
    return "n/a" if value is None else f"{value:{spec}}"


# NOTE: avg/min/max/sum below are the pyspark.sql.functions aggregates brought
# in by the star import, not the Python builtins.

# Validate quarterly aggregations
print(f"\n1. Quarterly Aggregations Validation:")

quarterly_validation = quarterly_climate.agg(
    avg("quarterly_value").alias("avg_quarterly_value"),
    min("quarterly_value").alias("min_quarterly_value"),
    max("quarterly_value").alias("max_quarterly_value"),
    sum(when(col("quarterly_value") == 0.0, 1).otherwise(0)).alias("zero_values"),
    sum(when(col("quarterly_value").isNull(), 1).otherwise(0)).alias("null_values")
).collect()[0]

print(f"   Average quarterly value: {_fmt_stat(quarterly_validation['avg_quarterly_value'], '.2f')}")
print(f"   Value range: {_fmt_stat(quarterly_validation['min_quarterly_value'], '.2f')} to {_fmt_stat(quarterly_validation['max_quarterly_value'], '.2f')}")
print(f"   Zero values: {_fmt_stat(quarterly_validation['zero_values'], ',')}")
print(f"   Null values: {_fmt_stat(quarterly_validation['null_values'], ',')}")

# Validate aggregation consistency
print(f"\n   Quarterly Aggregation Type Distribution:")
quarterly_climate.groupBy("metric_code", "aggregation_type").count().orderBy("metric_code").show()

# Validate annual aggregations
print(f"\n2. Annual Aggregations Validation:")

annual_validation = annual_climate.agg(
    avg("annual_value").alias("avg_annual_value"),
    min("annual_value").alias("min_annual_value"),
    max("annual_value").alias("max_annual_value"),
    sum(when(col("annual_value") == 0.0, 1).otherwise(0)).alias("zero_values"),
    sum(when(col("annual_value").isNull(), 1).otherwise(0)).alias("null_values")
).collect()[0]

print(f"   Average annual value: {_fmt_stat(annual_validation['avg_annual_value'], '.2f')}")
print(f"   Value range: {_fmt_stat(annual_validation['min_annual_value'], '.2f')} to {_fmt_stat(annual_validation['max_annual_value'], '.2f')}")
print(f"   Zero values: {_fmt_stat(annual_validation['zero_values'], ',')}")
print(f"   Null values: {_fmt_stat(annual_validation['null_values'], ',')}")

print(f"\n   Annual Aggregation Type Distribution:")
annual_climate.groupBy("metric_code", "aggregation_type").count().orderBy("metric_code").show()

# Cross-validation: Check that precipitation uses SUM, others use AVG
print(f"\n3. Aggregation Logic Validation:")

precip_quarterly = quarterly_climate.filter(col("metric_code") == "C09")
temp_quarterly = quarterly_climate.filter(col("metric_code") == "C01")

precip_agg_type = precip_quarterly.select("aggregation_type").distinct().collect()
temp_agg_type = temp_quarterly.select("aggregation_type").distinct().collect()

print(f"   Precipitation (C09) uses: {[row.aggregation_type for row in precip_agg_type]}")
print(f"   Temperature (C01) uses: {[row.aggregation_type for row in temp_agg_type]}")

# Validate record completeness: distinct quarters present per location/year
expected_quarterly_records = quarterly_climate.groupBy("location_id", "year").agg(
    countDistinct("quarter_number").alias("quarters_per_year")
).agg(
    min("quarters_per_year").alias("min_quarters"),
    max("quarters_per_year").alias("max_quarters"),
    avg("quarters_per_year").alias("avg_quarters")
).collect()[0]

print(f"\n4. Completeness Validation:")
print(f"   Quarters per location/year - Min: {expected_quarterly_records['min_quarters']}, Max: {expected_quarterly_records['max_quarters']}, Avg: {_fmt_stat(expected_quarterly_records['avg_quarters'], '.1f')}")

# Final verdict. PASSED is printed only when none of the checks above flagged
# a problem: empty output, an all-zero average, or NULL aggregate values.
validation_warnings = []
if quarterly_validation['avg_quarterly_value'] is None:
    validation_warnings.append("Quarterly data has no non-null values to validate")
elif quarterly_validation['avg_quarterly_value'] == 0.0:
    validation_warnings.append("Quarterly data appears corrupted (all zeros)")
if quarterly_validation['null_values']:
    validation_warnings.append(
        f"Quarterly data contains {quarterly_validation['null_values']:,} null values"
    )
if annual_validation['null_values']:
    validation_warnings.append(
        f"Annual data contains {annual_validation['null_values']:,} null values"
    )

if validation_warnings:
    for warning_text in validation_warnings:
        print(f"\nWARNING: {warning_text}")
else:
    print(f"\nData quality validation PASSED - aggregations look healthy")

## Save Quarterly Aggregations

In [None]:
# Persist the quarterly aggregations as a Delta table under SILVER_PATH and
# read the table back to confirm the row count.
quarterly_output_path = os.path.join(SILVER_PATH, "fact_climate_quarterly")

print(f"=== SAVING QUARTERLY CLIMATE AGGREGATIONS ===")
print(f"Output path: {quarterly_output_path}")

# NOTE(review): any failure below is printed with its traceback but not
# re-raised, so later cells still run after a failed save.
try:
    # Mark the frame as cached before it is counted and written.
    quarterly_climate.cache()

    # Row count taken before the write; compared with the read-back count below.
    quarterly_count = quarterly_climate.count()
    print(f"Quarterly records to save: {quarterly_count:,}")

    # Full overwrite (schema included), partitioned by year and metric_code.
    quarterly_writer = (
        quarterly_climate.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("dataChange", "true")
        .partitionBy("year", "metric_code")
    )
    quarterly_writer.save(quarterly_output_path)

    print(f"SUCCESS: Quarterly aggregations saved to {quarterly_output_path}")

    # Read the table back and compare row counts.
    saved_quarterly = spark.read.format("delta").load(quarterly_output_path)
    saved_quarterly_count = saved_quarterly.count()

    if saved_quarterly_count == quarterly_count:
        quarterly_save_status = "PASSED"
    else:
        quarterly_save_status = "FAILED"

    print(f"\nPost-save validation:")
    print(f"   Expected records: {quarterly_count:,}")
    print(f"   Saved records: {saved_quarterly_count:,}")
    print(f"   Validation: {quarterly_save_status}")

    # Distinct (year, metric_code) pairs illustrate the partition layout.
    print(f"\nPartition structure sample:")
    quarterly_partitions = saved_quarterly.select("year", "metric_code").distinct()
    quarterly_partitions.orderBy("year", "metric_code").show(10)

except Exception as save_error:
    print(f"ERROR saving quarterly aggregations: {save_error}")
    import traceback
    traceback.print_exc()

## Save Annual Aggregations

In [None]:
# Persist the annual aggregations as a Delta table under SILVER_PATH and
# read the table back to confirm the row count.
annual_output_path = os.path.join(SILVER_PATH, "fact_climate_annual")

print(f"=== SAVING ANNUAL CLIMATE AGGREGATIONS ===")
print(f"Output path: {annual_output_path}")

# NOTE(review): any failure below is printed with its traceback but not
# re-raised, so later cells still run after a failed save.
try:
    # Mark the frame as cached before it is counted and written.
    annual_climate.cache()

    # Row count taken before the write; compared with the read-back count below.
    annual_count = annual_climate.count()
    print(f"Annual records to save: {annual_count:,}")

    # Full overwrite (schema included), partitioned by year and metric_code.
    annual_writer = (
        annual_climate.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("dataChange", "true")
        .partitionBy("year", "metric_code")
    )
    annual_writer.save(annual_output_path)

    print(f"SUCCESS: Annual aggregations saved to {annual_output_path}")

    # Read the table back and compare row counts.
    saved_annual = spark.read.format("delta").load(annual_output_path)
    saved_annual_count = saved_annual.count()

    if saved_annual_count == annual_count:
        annual_save_status = "PASSED"
    else:
        annual_save_status = "FAILED"

    print(f"\nPost-save validation:")
    print(f"   Expected records: {annual_count:,}")
    print(f"   Saved records: {saved_annual_count:,}")
    print(f"   Validation: {annual_save_status}")

    # Distinct (year, metric_code) pairs illustrate the partition layout.
    print(f"\nPartition structure sample:")
    annual_partitions = saved_annual.select("year", "metric_code").distinct()
    annual_partitions.orderBy("year", "metric_code").show(10)

except Exception as save_error:
    print(f"ERROR saving annual aggregations: {save_error}")
    import traceback
    traceback.print_exc()

## Create Summary Views for Chatbot Queries

In [None]:
# Create summary views optimized for chatbot queries
print("=== CREATING CHATBOT-OPTIMIZED SUMMARY VIEWS ===")

# Load the location dimension used to attach names/types to the aggregates.
# dim_location_fixed is preferred; dim_location is the fallback.
# `except Exception` (rather than a bare `except:`) keeps KeyboardInterrupt and
# SystemExit from being treated as "table missing", and the reason for the
# fallback is reported instead of being discarded.
try:
    dim_location = spark.read.format("delta").load(os.path.join(SILVER_PATH, "dim_location_fixed"))
    print("Loaded dim_location_fixed for enhanced summaries")
except Exception as fixed_error:
    print(f"dim_location_fixed unavailable ({type(fixed_error).__name__}); falling back to dim_location")
    dim_location = spark.read.format("delta").load(os.path.join(SILVER_PATH, "dim_location"))
    print("Loaded dim_location (fallback)")

# Create quarterly summary with location names.
# Left join: aggregate rows without a matching location keep NULL name/type.
# NOTE(review): assumes location_id is unique in dim_location; duplicates
# would multiply aggregate rows — confirm against the dimension table.
quarterly_summary_view = quarterly_climate.join(
    dim_location.select("location_id", "location_name", "location_type"),
    "location_id", "left"
).select(
    "quarterly_climate_id", "location_id", "location_name", "location_type",
    "year", "quarter", "quarter_number", "metric_code", "climate_metric",
    "quarterly_value", "aggregation_type", "temperature_celsius",
    "precipitation_mm", "pressure_pascals", "humidity_percentage",
    "unit_of_measure", "daily_records_count", "period_start_date", "period_end_date"
)

# Create annual summary with location names (same join as above)
annual_summary_view = annual_climate.join(
    dim_location.select("location_id", "location_name", "location_type"),
    "location_id", "left"
).select(
    "annual_climate_id", "location_id", "location_name", "location_type",
    "year", "metric_code", "climate_metric", "annual_value", "aggregation_type",
    "temperature_celsius", "precipitation_mm", "pressure_pascals", "humidity_percentage",
    "unit_of_measure", "daily_records_count", "period_start_date", "period_end_date"
)

print(f"\nQuarterly summary view: {quarterly_summary_view.count():,} records")
print(f"Annual summary view: {annual_summary_view.count():,} records")

# Show sample enhanced data
print(f"\nSample Enhanced Quarterly Data:")
quarterly_summary_view.filter(col("metric_code") == "C01").select(
    "location_name", "quarter", "climate_metric", "quarterly_value", "temperature_celsius"
).limit(5).show(truncate=False)

print(f"\nSample Enhanced Annual Data:")
annual_summary_view.filter(col("metric_code") == "C09").select(
    "location_name", "year", "climate_metric", "annual_value", "precipitation_mm"
).limit(5).show(truncate=False)

# Generate metric definitions for chatbot: human-readable descriptions keyed
# by "<metric_code>_<granularity>".
metric_definitions = {
    "C01_quarterly": "Quarterly mean air surface temperature (°C) - Average of daily temperatures",
    "C03_quarterly": "Quarterly highest temperature (°C) - Average of daily maximum temperatures",
    "C04_quarterly": "Quarterly lowest temperature (°C) - Average of daily minimum temperatures",
    "C09_quarterly": "Quarterly total precipitation (mm) - Sum of daily precipitation",
    "C12_quarterly": "Quarterly mean surface pressure (Pascals) - Average of daily pressure readings",
    "C13_quarterly": "Quarterly mean humidity level (%) - Average of daily humidity readings",
    "C01_annual": "Annual mean air surface temperature (°C) - Average of daily temperatures",
    "C03_annual": "Annual highest temperature (°C) - Average of daily maximum temperatures",
    "C04_annual": "Annual lowest temperature (°C) - Average of daily minimum temperatures",
    "C09_annual": "Annual total precipitation (mm) - Sum of daily precipitation",
    "C12_annual": "Annual mean surface pressure (Pascals) - Average of daily pressure readings",
    "C13_annual": "Annual mean humidity level (%) - Average of daily humidity readings"
}

print(f"\nMetric Definitions for Chatbot Integration:")
for metric, definition in metric_definitions.items():
    print(f"   {metric}: {definition}")

## Performance Optimization and Final Summary

In [None]:
# Final summary and performance optimization
print("=" * 70)
print("CLIMATE AGGREGATIONS - PROCESSING SUMMARY")
print("=" * 70)

# Wall-clock duration measured from the timestamp captured in the
# configuration cell.
processing_end = datetime.now()
processing_duration = processing_end - PROCESSING_TIMESTAMP

print(f"\nProcessing Details:")
print(f"   Start time: {PROCESSING_TIMESTAMP}")
print(f"   End time: {processing_end}")
print(f"   Duration: {processing_duration}")

# Final statistics (only when all three frames exist in this session, i.e.
# the earlier cells were run)
if 'daily_climate' in locals() and 'quarterly_climate' in locals() and 'annual_climate' in locals():
    daily_count = daily_climate.count()
    quarterly_count = quarterly_climate.count()
    annual_count = annual_climate.count()

    print(f"\nData Processing Results:")
    print(f"   Daily records processed: {daily_count:,}")
    print(f"   Quarterly records created: {quarterly_count:,}")
    print(f"   Annual records created: {annual_count:,}")

    # An empty aggregate would make the ratio a division by zero; report it
    # as unavailable instead of crashing the summary cell.
    if quarterly_count:
        print(f"   Data reduction ratio: {daily_count/quarterly_count:.1f}:1 (daily to quarterly)")
    else:
        print(f"   Data reduction ratio: n/a (no quarterly records)")
    if annual_count:
        print(f"   Data reduction ratio: {daily_count/annual_count:.1f}:1 (daily to annual)")
    else:
        print(f"   Data reduction ratio: n/a (no annual records)")

print(f"\nCreated Climate Aggregation Tables:")
print(f"   fact_climate_quarterly - Partitioned by year, metric_code")
print(f"   fact_climate_annual - Partitioned by year, metric_code")

print(f"\nAggregation Logic Applied:")
print(f"   Temperature metrics (C01, C03, C04): AVG() of daily values")
print(f"   Precipitation (C09): SUM() of daily values")
print(f"   Pressure (C12): AVG() of daily values")
print(f"   Humidity (C13): AVG() of daily values")

print(f"\nLakehouse Chatbot Benefits:")
print(f"   Fast quarterly and annual trend analysis")
print(f"   Efficient time-series queries across multiple years")
print(f"   Pre-computed aggregations for dashboard performance")
print(f"   Location-aware climate summaries")
print(f"   Comprehensive statistical metrics (min, max, stddev)")
print(f"   Metric-specific columns for easy filtering")

print(f"\nOptimal Query Patterns for Chatbot:")
print(f"   • 'Show quarterly temperature trends in Manila' → fact_climate_quarterly")
print(f"   • 'Annual rainfall comparison across regions' → fact_climate_annual")
print(f"   • 'Climate patterns over the last 5 years' → Both tables")
print(f"   • 'Seasonal analysis by quarter' → fact_climate_quarterly")

print(f"\nNext Integration Steps:")
print(f"   1. Register tables in chatbot metadata catalog")
print(f"   2. Create time-series visualization templates")
print(f"   3. Add aggregation tables to query optimizer")
print(f"   4. Implement intelligent query routing (daily vs quarterly vs annual)")

print(f"\n" + "=" * 70)
print("CLIMATE AGGREGATIONS PROCESSING COMPLETE")
print("=" * 70)

# Cleanup: release the frames cached by the save cells
if 'quarterly_climate' in locals():
    quarterly_climate.unpersist()
if 'annual_climate' in locals():
    annual_climate.unpersist()

# Stop Spark; no DataFrame defined above can be used after this point
spark.stop()
print("\nSpark session stopped.")