# IMF Data to Bronze Layer

**Production-ready notebook for converting IMF CSV files to Delta tables**

Transforms IMF economic and climate indicators into bronze layer Delta tables.

## Input Sources:
- IMF economic and climate indicators (20+ files)
  - Greenhouse gas emissions
  - Climate-related disasters
  - Renewable energy
  - Trade in low carbon technology
  - Surface temperature change
  - And more...

## Output:
- Multiple Delta tables in bronze layer
- Properly formatted data
- Optimized for performance

## Processing Steps:
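1. Read each IMF CSV file (header row, inferred schema)
2. Clean and transform data (derive the table name from the file name)
3. Save as Delta table (overwriting any existing table at the target path)
4. Verify data integrity (reload the saved table and report its column and row counts)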

In [None]:
# Initialize Spark session with Delta Lake
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, regexp_replace
from delta import *

# Session settings: the first two register Delta Lake's SQL extension and
# catalog; the rest set driver memory and switch on adaptive query execution
# with partition coalescing.
session_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.driver.memory": "4g",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
}

builder = SparkSession.builder.appName("IMF-Data-to-Bronze")
for setting_name, setting_value in session_settings.items():
    builder = builder.config(setting_name, setting_value)

# configure_spark_with_delta_pip comes from the `delta` package import above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
# Only errors are logged so the progress prints below stay readable.
spark.sparkContext.setLogLevel("ERROR")

print("Spark session initialized successfully!")

In [None]:
# Define paths
# Source folder holding the raw IMF CSV files and target folder for the
# bronze Delta tables.
imf_data_path = "abfss://WS_SO_SustainabilityData_Prd@onelake.dfs.fabric.microsoft.com/SO_raw_to_bronze.Lakehouse/Files/IMF"
bronze_layer_path = "abfss://WS_SO_SustainabilityData_Prd@onelake.dfs.fabric.microsoft.com/SO_raw_to_bronze.Lakehouse/Tables"

print("Starting IMF data conversion to Delta tables...")
print(f"Source path: {imf_data_path}")
print(f"Target path: {bronze_layer_path}")

# Get list of all IMF CSV files
# os.listdir() only understands local filesystem paths and fails on an
# abfss:// URI, so the folder is listed through the Hadoop FileSystem API of
# the running Spark session instead.
source_dir = spark.sparkContext._jvm.org.apache.hadoop.fs.Path(imf_data_path)
source_fs = source_dir.getFileSystem(spark.sparkContext._jsc.hadoopConfiguration())
# Sorted so that reruns process (and report) the files in the same order.
imf_files = sorted(
    entry.getPath().getName()
    for entry in source_fs.listStatus(source_dir)
    if entry.isFile() and entry.getPath().getName().endswith('.csv')
)

print(f"\nFound {len(imf_files)} IMF CSV files to process:")
for csv_name in imf_files:
    print(f"  - {csv_name}")

In [None]:
# Process each file
# Converts every CSV in imf_files into a Delta table under bronze_layer_path.
# A failure on one file is reported and recorded, and the loop moves on to the
# next file.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10000)

# File-name characters rewritten for the table name: '-' and ' ' become '_',
# parentheses are dropped.
_TABLE_NAME_CHARS = str.maketrans({'-': '_', ' ': '_', '(': None, ')': None})
# Characters Delta Lake rejects in column names unless column mapping is
# enabled; each is replaced with '_' so the write does not fail on them.
_COLUMN_NAME_CHARS = str.maketrans(dict.fromkeys(" ,;{}()\n\t=", "_"))

processed_tables = []
failed_files = []

for i, filename in enumerate(imf_files, 1):
    print(f"\n--- Processing file {i}/{len(imf_files)}: {filename} ---")

    # Extract table name from filename (remove .csv and clean up)
    table_name = filename.replace('.csv', '').translate(_TABLE_NAME_CHARS)
    # A name such as ".csv" or "().csv" leaves nothing to name the table with;
    # skip it here rather than let the index below abort the whole run.
    if not table_name:
        print(f"Error processing {filename}: could not derive a table name")
        failed_files.append(filename)
        continue
    # Handle numeric prefixes
    if table_name[0].isdigit():
        table_name = "imf_" + table_name
    print(f"Table name: {table_name}")

    # Storage URIs always use '/', so join explicitly instead of relying on
    # the operating system's path separator.
    file_path = f"{imf_data_path}/{filename}"

    try:
        # Read the CSV file
        df = spark.read.option("header", "true") \
                       .option("inferSchema", "true") \
                       .option("escape", "\"") \
                       .csv(file_path)

        # Rename only when a header actually contains a rejected character;
        # column names that are already valid are left untouched.
        safe_columns = [name.translate(_COLUMN_NAME_CHARS) for name in df.columns]
        if safe_columns != df.columns:
            df = df.toDF(*safe_columns)

        # Show schema and row count
        print(f"Schema: {len(df.columns)} columns")
        source_rows = df.count()
        print(f"Row count: {source_rows}")

        # Display first few rows to verify data
        print("Sample data:")
        df.show(2, truncate=False)

        # Save as Delta table
        delta_path = f"{bronze_layer_path}/{table_name}"
        df.write.format("delta").mode("overwrite").save(delta_path)
        print(f"Successfully saved to Delta table: {delta_path}")

        # Verify the saved table by reloading it and comparing its row count
        # with the count read from the CSV.
        df_check = spark.read.format("delta").load(delta_path)
        saved_rows = df_check.count()
        print(f"Verification - Columns: {len(df_check.columns)}, Rows: {saved_rows}")
        if saved_rows != source_rows:
            print(f"WARNING: row count mismatch for {table_name}: source={source_rows}, saved={saved_rows}")

        # Add to processed tables list
        processed_tables.append(table_name)

    except Exception as e:
        # Best effort: one unreadable or unwritable file must not stop the
        # remaining conversions, but it is recorded for the summary below.
        print(f"Error processing {filename}: {str(e)}")
        failed_files.append(filename)

print(f"\nIMF data conversion completed! Processed {len(processed_tables)} tables.")
if failed_files:
    print(f"Files that could not be processed ({len(failed_files)}):")
    for failed_name in failed_files:
        print(f"  - {failed_name}")

In [None]:
# Summary: names of every table the conversion loop recorded as written.
print("\nDelta tables created in bronze layer:")
for created_table in processed_tables:
    print(f"  - {created_table}")

# Print the schema of the first recorded table as a spot check; skipped when
# nothing was processed.
if processed_tables:
    sample_name = processed_tables[0]
    print(f"\nExample table schema ({sample_name}):")
    spark.read.format("delta").load(
        os.path.join(bronze_layer_path, sample_name)
    ).printSchema()

In [None]:
# Stop Spark session
# Shuts down the session created in the first code cell. Run this last: cells
# executed afterwards can no longer use `spark` until a session is created again.
spark.stop()
print("\nSpark session stopped.")