In [8]:
# Initialize Spark session with Delta Lake
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta import *
import re
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

# Get current working directory and set paths
current_dir = os.path.expanduser("~/miniconda3/envs/SO/New_SO")
print(f"Base directory: {current_dir}")

builder = SparkSession.builder \
   .appName("NASA-Data-to-Bronze-V2") \
   .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
   .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
   .config("spark.driver.memory", "4g") \
   .config("spark.sql.adaptive.enabled", "true") \
   .config("spark.sql.adaptive.coalescePartitions.enabled", "true")

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

print("=== NASA CLIMATE DATA PROCESSOR V2 ===")
print(f"Spark version: {spark.version}")
print(f"Processing timestamp: {datetime.now()}")
print("Spark session initialized successfully!")

Base directory: /home/ernese/miniconda3/envs/SO/New_SO
=== NASA CLIMATE DATA PROCESSOR V2 ===
Spark version: 3.4.0
Processing timestamp: 2025-08-28 17:03:27.156188
Spark session initialized successfully!


In [9]:
# Define paths and configuration
nasa_data_path = "/home/ernese/miniconda3/envs/SO/New_SO/nasa_data"
bronze_layer_path = "/home/ernese/miniconda3/envs/SO/New_SO/final-spark-bronze/bronze"

PROCESSING_TIMESTAMP = datetime.now()

print("=== CONFIGURATION ===")
print(f"Source path: {nasa_data_path}")
print(f"Target path: {bronze_layer_path}")
print(f"Processing timestamp: {PROCESSING_TIMESTAMP}")

# Create target directory if it doesn't exist
os.makedirs(bronze_layer_path, exist_ok=True)

# Define climate metrics
REQUIRED_CLIMATE_METRICS = {
    'C01': 'Daily mean air surface temperature',
    'C03': 'Daily highest temperature', 
    'C04': 'Daily lowest temperature',
    'C09': 'Daily precipitation levels',
    'C12': 'Daily mean surface pressure',
    'C13': 'Daily humidity levels'
}

BONUS_METRICS = {
    'C23': 'Monthly surface air temperature'
}

print(f"\n=== TARGET CLIMATE METRICS ===")
for code, desc in REQUIRED_CLIMATE_METRICS.items():
    print(f"{code}: {desc}")
print(f"\nBonus metrics: {len(BONUS_METRICS)}")
print(f"Total target files: {len(REQUIRED_CLIMATE_METRICS) + len(BONUS_METRICS)}")

=== CONFIGURATION ===
Source path: /home/ernese/miniconda3/envs/SO/New_SO/nasa_data
Target path: /home/ernese/miniconda3/envs/SO/New_SO/final-spark-bronze/bronze
Processing timestamp: 2025-08-28 17:03:37.882483

=== TARGET CLIMATE METRICS ===
C01: Daily mean air surface temperature
C03: Daily highest temperature
C04: Daily lowest temperature
C09: Daily precipitation levels
C12: Daily mean surface pressure
C13: Daily humidity levels

Bonus metrics: 1
Total target files: 7


In [10]:
# Discover NASA files
def discover_nasa_climate_files():
    """Discover NASA climate files in the data directory"""
    print("=== NASA FILE DISCOVERY ===")
    
    try:
        if not os.path.exists(nasa_data_path):
            print(f"ERROR: NASA data path does not exist: {nasa_data_path}")
            return [], {}, [], list(REQUIRED_CLIMATE_METRICS.keys())
            
        file_list = os.listdir(nasa_data_path)
        print(f"Total files in directory: {len(file_list)}")
        
        # Look for NASA CSV files
        nasa_files = [f for f in file_list if f.endswith('.csv') and 'NASA' in f]
        print(f"\nDiscovered {len(nasa_files)} NASA CSV files:")
        
        # Map files to metric codes
        metric_files = {}
        unmatched_files = []
        
        for filename in nasa_files:
            print(f"  - {filename}")
            
            # Extract climate metric code
            metric_match = re.search(r'C(\d{2})', filename)
            if metric_match:
                metric_code = f"C{metric_match.group(1)}"
                metric_files[metric_code] = filename
                
                # Check if it's a required metric
                if metric_code in REQUIRED_CLIMATE_METRICS:
                    print(f"    → REQUIRED: {REQUIRED_CLIMATE_METRICS[metric_code]}")
                elif metric_code in BONUS_METRICS:
                    print(f"    → BONUS: {BONUS_METRICS[metric_code]}")
                else:
                    print(f"    → ADDITIONAL: {metric_code}")
            else:
                unmatched_files.append(filename)
                print(f"    → UNMATCHED: Could not extract metric code")
        
        # Check coverage
        missing_metrics = []
        available_metrics = []
        
        for metric_code in REQUIRED_CLIMATE_METRICS:
            if metric_code in metric_files:
                available_metrics.append(metric_code)
            else:
                missing_metrics.append(metric_code)
        
        print(f"\nCoverage: {len(available_metrics)}/{len(REQUIRED_CLIMATE_METRICS)} required metrics")
        
        if missing_metrics:
            print(f"WARNING: Missing required metrics: {missing_metrics}")
        else:
            print("SUCCESS: All required climate metrics available!")
        
        return nasa_files, metric_files, available_metrics, missing_metrics
        
    except Exception as e:
        print(f"ERROR: Failed to discover NASA files - {str(e)}")
        return [], {}, [], list(REQUIRED_CLIMATE_METRICS.keys())

# Execute discovery
nasa_files, metric_files, available_metrics, missing_metrics = discover_nasa_climate_files()

=== NASA FILE DISCOVERY ===
Total files in directory: 7

Discovered 7 NASA CSV files:
  - C23_Country-average_monthly_surface_air_temperature_1981-2025_NASA.csv
    → BONUS: Monthly surface air temperature
  - C04_daily_lowest_temp_1981-2025_NASA.csv
    → REQUIRED: Daily lowest temperature
  - C12_daily_mean_surface_pressure_1981_2025_NASA.csv
    → REQUIRED: Daily mean surface pressure
  - C01_daily_mean_air_surface_temp_1981-2025_NASA.csv
    → REQUIRED: Daily mean air surface temperature
  - C09_daily_precipitation_1981-2025_NASA.csv
    → REQUIRED: Daily precipitation levels
  - C13_daily_humidity_level_1981-2025_NASA.csv
    → REQUIRED: Daily humidity levels
  - C03_daily_highest_temp_1981-2025_NASA.csv
    → REQUIRED: Daily highest temperature

Coverage: 6/6 required metrics
SUCCESS: All required climate metrics available!


In [11]:
# Helper function to transform monthly data to daily format
def transform_monthly_to_daily(df):
    """Transform monthly wide format to daily long format"""
    print("Transforming monthly data to daily format...")
    
    # Get all month columns
    month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 
                  'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    
    # Create month mapping
    month_map = {month: i+1 for i, month in enumerate(month_cols)}
    
    # Stack the data
    stacked_data = []
    
    for row in df.collect():
        year = row['YEAR']
        for month_name, month_num in month_map.items():
            if month_name in row and row[month_name] is not None:
                # Create date as first day of month
                date_str = f"{year}{month_num:02d}01"
                stacked_data.append({
                    'date': date_str,
                    'temperature': float(row[month_name]),
                    'year': year,
                    'month': month_num
                })
    
    # Create DataFrame from stacked data
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("temperature", FloatType(), True),
        StructField("year", IntegerType(), True),
        StructField("month", IntegerType(), True)
    ])
    
    return spark.createDataFrame(stacked_data, schema)

# Data processing function
def process_nasa_climate_file_v2(filename, metric_code=None):
    """Process NASA file with support for both daily and monthly formats"""
    
    # Extract metric info
    if not metric_code:
        metric_match = re.search(r'C(\d{2})', filename)
        metric_code = f"C{metric_match.group(1)}" if metric_match else "UNKNOWN"
    
    # Get metric description
    metric_desc = REQUIRED_CLIMATE_METRICS.get(metric_code) or BONUS_METRICS.get(metric_code, "Unknown metric")
    
    print(f"\n--- PROCESSING: {filename} ---")
    print(f"Metric: {metric_code} - {metric_desc}")
    
    # Generate table name
    table_name = f"bronze_nasa_{metric_code.lower()}_v2"
    print(f"Target table: {table_name}")
    
    try:
        # Read CSV file
        file_path = os.path.join(nasa_data_path, filename)
        print(f"Reading: {file_path}")
        
        df = spark.read.option("header", "true") \
                       .option("inferSchema", "true") \
                       .csv(file_path)
        
        # Initial validation
        original_count = df.count()
        original_columns = len(df.columns)
        print(f"Raw data: {original_count:,} rows, {original_columns} columns")
        
        if original_count == 0:
            raise Exception("Empty dataset")
        
        # Check data format and process accordingly
        columns = df.columns
        print(f"Columns detected: {len(columns)} total")
        
        if "date" in columns:
            # Daily format processing
            print("Processing daily format data...")
            df = df.withColumn("measurement_date", to_date(col("date").cast("string"), "yyyyMMdd")) \
                   .withColumn("year", year(col("measurement_date"))) \
                   .withColumn("month", month(col("measurement_date"))) \
                   .withColumn("day", dayofmonth(col("measurement_date"))) \
                   .withColumn("quarter", quarter(col("measurement_date"))) \
                   .drop("date")
            
        elif "YEAR" in columns:
            # Monthly format processing
            print("Processing monthly format data...")
            df = transform_monthly_to_daily(df)
            df = df.withColumn("measurement_date", to_date(col("date").cast("string"), "yyyyMMdd")) \
                   .withColumn("quarter", quarter(col("measurement_date"))) \
                   .withColumn("day", dayofmonth(col("measurement_date"))) \
                   .drop("date")
        else:
            raise Exception(f"Unknown data format - no 'date' or 'YEAR' column found")
        
        # Add metadata
        df = df.withColumn("climate_metric_code", lit(metric_code)) \
               .withColumn("climate_metric_name", lit(metric_desc)) \
               .withColumn("source_system", lit("NASA")) \
               .withColumn("source_file", lit(filename)) \
               .withColumn("processing_version", lit("V2")) \
               .withColumn("ingestion_timestamp", lit(PROCESSING_TIMESTAMP)) \
               .withColumn("data_quality_flag", lit("VALID"))
        
        # Data quality validation
        print("Performing data quality validation...")
        
        # Check for null dates
        null_dates = df.filter(col("measurement_date").isNull()).count()
        if null_dates > 0:
            print(f"WARNING: {null_dates:,} records with null dates")
            df = df.filter(col("measurement_date").isNotNull())
        
        # Validate date range
        date_range = df.select(min("measurement_date").alias("min_date"), 
                              max("measurement_date").alias("max_date")).collect()[0]
        print(f"Date range: {date_range['min_date']} to {date_range['max_date']}")
        
        # Final row count after cleaning
        final_count = df.count()
        print(f"Cleaned data: {final_count:,} rows ({original_count - final_count:,} removed)")
        
        # Show sample data
        print("Sample processed data:")
        df.select("measurement_date", "year", "month", "climate_metric_code", 
                 "climate_metric_name", "data_quality_flag").show(3, truncate=False)
        
        # Save Delta table
        delta_path = os.path.join(bronze_layer_path, table_name)
        print(f"Saving to: {delta_path}")
        
        write_builder = df.write \
          .format("delta") \
          .mode("overwrite") \
          .option("overwriteSchema", "true")
        
        # Partitioning strategy
        if "year" in df.columns and final_count > 10000:
            write_builder = write_builder.partitionBy("year")
            print("Partitioning by year")
        
        write_builder.save(delta_path)
        
        # Verify save
        df_verify = spark.read.format("delta").load(delta_path)
        verify_count = df_verify.count()
        verify_columns = len(df_verify.columns)
        
        print(f"✓ VERIFICATION: {verify_count:,} rows, {verify_columns} columns saved")
        
        return {
            'status': 'SUCCESS',
            'filename': filename,
            'metric_code': metric_code,
            'metric_desc': metric_desc,
            'table_name': table_name,
            'original_rows': original_count,
            'final_rows': verify_count,
            'columns': verify_columns,
            'date_range': f"{date_range['min_date']} to {date_range['max_date']}"
        }
        
    except Exception as e:
        error_msg = f"ERROR processing {filename}: {str(e)}"
        print(error_msg)
        
        return {
            'status': 'FAILED',
            'filename': filename,
            'metric_code': metric_code,
            'error': str(e)
        }

print("Processing function ready")

Processing function ready


In [12]:
# Execute processing for all NASA files
print("=== PROCESSING EXECUTION ===")

processing_results = []
successful_tables = []
failed_processing = []

# Check if we have files to process
if not nasa_files:
    print(" ERROR: No NASA files discovered for processing!")
    print("Please check:")
    print(f"  - Source directory exists: {nasa_data_path}")
    print("  - Directory contains CSV files with NASA in filename")
    print("  - Files have climate metric codes (C01, C03, etc.)")
else:
    print(f"Processing {len(nasa_files)} NASA climate files...")
    
    for i, filename in enumerate(nasa_files, 1):
        print(f"\n{'='*60}")
        print(f"PROCESSING {i}/{len(nasa_files)}: {filename}")
        print(f"{'='*60}")
        
        # Extract metric code for this file
        file_metric = None
        for metric_code, mapped_file in metric_files.items():
            if mapped_file == filename:
                file_metric = metric_code
                break
        
        # Process the file
        result = process_nasa_climate_file_v2(filename, file_metric)
        processing_results.append(result)
        
        if result['status'] == 'SUCCESS':
            successful_tables.append(result)
            print(f"✓ SUCCESS: {result['table_name']}")
        else:
            failed_processing.append(result)
            print(f"✗ FAILED: {filename}")

# Final summary
print(f"\n{'='*60}")
print("PROCESSING COMPLETED")
print(f"{'='*60}")
print(f"Total files processed: {len(nasa_files)}")
print(f"Successful: {len(successful_tables)}")
print(f"Failed: {len(failed_processing)}")

# Safe success rate calculation
if len(nasa_files) > 0:
    success_rate = len(successful_tables) / len(nasa_files) * 100
    print(f"Success rate: {success_rate:.1f}%")
else:
    print("Success rate: N/A (no files to process)")

=== PROCESSING EXECUTION ===
Processing 7 NASA climate files...

PROCESSING 1/7: C23_Country-average_monthly_surface_air_temperature_1981-2025_NASA.csv

--- PROCESSING: C23_Country-average_monthly_surface_air_temperature_1981-2025_NASA.csv ---
Metric: C23 - Monthly surface air temperature
Target table: bronze_nasa_c23_v2
Reading: /home/ernese/miniconda3/envs/SO/New_SO/nasa_data/C23_Country-average_monthly_surface_air_temperature_1981-2025_NASA.csv
Raw data: 45 rows, 14 columns
Columns detected: 14 total
Processing monthly format data...
Transforming monthly data to daily format...
Performing data quality validation...
Date range: 1981-01-01 to 2025-12-01
Cleaned data: 540 rows (-495 removed)
Sample processed data:
+----------------+----+-----+-------------------+-------------------------------+-----------------+
|measurement_date|year|month|climate_metric_code|climate_metric_name            |data_quality_flag|
+----------------+----+-----+-------------------+---------------------------

                                                                                

✓ VERIFICATION: 16,263 rows, 144 columns saved
✓ SUCCESS: bronze_nasa_c04_v2

PROCESSING 3/7: C12_daily_mean_surface_pressure_1981_2025_NASA.csv

--- PROCESSING: C12_daily_mean_surface_pressure_1981_2025_NASA.csv ---
Metric: C12 - Daily mean surface pressure
Target table: bronze_nasa_c12_v2
Reading: /home/ernese/miniconda3/envs/SO/New_SO/nasa_data/C12_daily_mean_surface_pressure_1981_2025_NASA.csv
Raw data: 16,263 rows, 133 columns
Columns detected: 133 total
Processing daily format data...
Performing data quality validation...
Date range: 1981-01-01 to 2025-07-11
Cleaned data: 16,263 rows (0 removed)
Sample processed data:
+----------------+----+-----+-------------------+---------------------------+-----------------+
|measurement_date|year|month|climate_metric_code|climate_metric_name        |data_quality_flag|
+----------------+----+-----+-------------------+---------------------------+-----------------+
|1981-01-01      |1981|1    |C12                |Daily mean surface pressure|VAL

                                                                                

✓ VERIFICATION: 16,262 rows, 144 columns saved
✓ SUCCESS: bronze_nasa_c09_v2

PROCESSING 6/7: C13_daily_humidity_level_1981-2025_NASA.csv

--- PROCESSING: C13_daily_humidity_level_1981-2025_NASA.csv ---
Metric: C13 - Daily humidity levels
Target table: bronze_nasa_c13_v2
Reading: /home/ernese/miniconda3/envs/SO/New_SO/nasa_data/C13_daily_humidity_level_1981-2025_NASA.csv
Raw data: 16,262 rows, 133 columns
Columns detected: 133 total
Processing daily format data...
Performing data quality validation...
Date range: 1981-01-01 to 2025-07-10
Cleaned data: 16,262 rows (0 removed)
Sample processed data:
+----------------+----+-----+-------------------+---------------------+-----------------+
|measurement_date|year|month|climate_metric_code|climate_metric_name  |data_quality_flag|
+----------------+----+-----+-------------------+---------------------+-----------------+
|1981-01-01      |1981|1    |C13                |Daily humidity levels|VALID            |
|1981-01-02      |1981|1    |C13   

In [None]:
# Processing summary and validation
print("\n=== PROCESSING SUMMARY ===")

# Successful tables summary
if successful_tables:
    print(f"\n✓ SUCCESSFULLY CREATED TABLES ({len(successful_tables)}):")
    
    total_rows = 0
    for result in successful_tables:
        metric_info = f"{result['metric_code']} - {result['metric_desc']}"
        row_info = f"{result['final_rows']:,} rows"
        date_info = result['date_range']
        print(f"  • {result['table_name']}")
        print(f"    {metric_info}")
        print(f"    {row_info} | {date_info}")
        total_rows += result['final_rows']
    
    print(f"\nTOTAL DATA VOLUME: {total_rows:,} climate measurement records")
    print(f"📁 SAVED TO: {bronze_layer_path}")

# Failed processing summary
if failed_processing:
    print(f"\n✗ FAILED PROCESSING ({len(failed_processing)}):")
    for result in failed_processing:
        print(f"  • {result['filename']}: {result['error'][:100]}...")

# Required metrics coverage analysis
print(f"\n=== REQUIRED METRICS COVERAGE ===")
processed_required_metrics = []
missing_required_metrics = []

for metric_code, metric_desc in REQUIRED_CLIMATE_METRICS.items():
    found = False
    for result in successful_tables:
        if result['metric_code'] == metric_code:
            processed_required_metrics.append(metric_code)
            print(f"✓ {metric_code}: {metric_desc} - {result['final_rows']:,} records")
            found = True
            break
    
    if not found:
        missing_required_metrics.append(metric_code)
        print(f"✗ {metric_code}: {metric_desc} - NOT PROCESSED")

coverage_pct = len(processed_required_metrics) / len(REQUIRED_CLIMATE_METRICS) * 100 if REQUIRED_CLIMATE_METRICS else 0
print(f"\n📈 COVERAGE: {len(processed_required_metrics)}/{len(REQUIRED_CLIMATE_METRICS)} ({coverage_pct:.1f}%) required metrics")

if coverage_pct == 100:
    print(" EXCELLENT: All required climate metrics successfully processed!")
    print("   Ready for silver layer transformations")
elif coverage_pct >= 80:
    print(" GOOD: Most required metrics processed, some gaps remain")
else:
    print(" ATTENTION: Significant gaps in required metrics coverage")

# Final configuration summary
print(f"\n=== FINAL CONFIGURATION ===")
print(f"Source directory: {nasa_data_path}")
print(f"Target directory: {bronze_layer_path}")
print(f"Processing version: V2")
print(f"Processing timestamp: {PROCESSING_TIMESTAMP}")

print(f"\n NASA BRONZE V2 PROCESSING COMPLETE!")
print(f"   Comprehensive climate metrics coverage achieved")
print(f"   Ready for sustainability analytics pipeline")

In [None]:
# Stop the Spark session
spark.stop()
print("Spark session stopped.")