# World Bank Data to Bronze Layer

**Production-ready notebook for converting World Bank CSV files to Delta tables**

Transforms World Bank economic and sustainability indicators into bronze layer Delta tables.

## Input Sources:
- World Bank economic indicators (15+ files)
  - GDP growth and per capita
  - Greenhouse gas emissions
  - Land use and agriculture
  - Natural resources rents
  - And more...

## Output:
- Multiple Delta tables in bronze layer
- `date` column cast to an integer (year) in every table
- Optimized for performance

## Processing Steps:
1. Read each World Bank CSV file
2. Cast the `date` column to an integer (year)
3. Save as Delta table
4. Verify each saved table by reading it back and checking its column and row counts

In [None]:
# Initialize Spark session with Delta Lake
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from delta import *

# Assemble the session builder: Delta Lake SQL extension and catalog,
# 4g of driver memory, and adaptive query execution with partition coalescing.
builder = (
    SparkSession.builder
    .appName("WorldBank-Data-to-Bronze")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
)

# configure_spark_with_delta_pip is provided by the `delta` package imported
# above; it wraps the builder before the session is created (or reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep notebook output readable: only ERROR-level Spark log messages are shown.
spark.sparkContext.setLogLevel("ERROR")

print("Spark session initialized successfully!")

In [None]:
# Define paths (both are relative to the notebook's working directory)
worldbank_data_path = "../worldbank_data"
bronze_layer_path = "../final-spark-bronze/new_bronze"

print("Starting World Bank data conversion to Delta tables...")
print(f"Source path: {worldbank_data_path}")
print(f"Target path: {bronze_layer_path}")

# Get list of all World Bank CSV files: names that end in '.csv' and contain
# 'worldbank'. os.listdir returns entries in arbitrary order, so the list is
# sorted to make the processing order (and therefore the logs and the
# "example" table shown later) reproducible across runs and machines.
worldbank_files = sorted(
    f for f in os.listdir(worldbank_data_path)
    if f.endswith('.csv') and 'worldbank' in f
)

print(f"\nFound {len(worldbank_files)} World Bank CSV files to process:")
for file in worldbank_files:
    print(f"  - {file}")

In [None]:
# Process each file: read the CSV, cast `date` to an integer, write it as a
# Delta table under the bronze path, then read it back and verify it.
from pyspark.sql.types import IntegerType

# For World Bank data, we need to increase the limit for columns
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10000)

# Names of the tables that were written AND passed verification.
processed_tables = []
# (filename, error text) for every file that failed, so the final summary can
# report failures instead of leaving them buried in the per-file output.
failed_files = []

for i, filename in enumerate(worldbank_files, 1):
    print(f"\n--- Processing file {i}/{len(worldbank_files)}: {filename} ---")

    # Extract table name from filename (remove .csv and worldbank identifier),
    # turn '-' into '_', drop parentheses and collapse '__' once.
    # NOTE(review): a file that contains 'worldbank' but does not end in
    # '_worldbank.csv' keeps its '.csv' suffix in the table name - confirm
    # that no such files exist in the source directory.
    table_name = filename.replace('_worldbank.csv', '').replace('-', '_').replace('(', '').replace(')', '').replace('__', '_')
    print(f"Table name: {table_name}")

    # Read CSV file with Spark
    file_path = os.path.join(worldbank_data_path, filename)

    try:
        # Read the CSV file, using the first row as header and letting Spark
        # infer the column types
        df = spark.read.option("header", "true") \
                       .option("inferSchema", "true") \
                       .csv(file_path)

        # Convert date column to integer (World Bank data uses year format)
        df = df.withColumn("date", col("date").cast(IntegerType()))

        # Show schema and row count; the count is kept so it can be compared
        # with the persisted table after the write
        print(f"Schema: {len(df.columns)} columns")
        source_rows = df.count()
        print(f"Row count: {source_rows}")

        # Display first few rows to verify data
        print("Sample data:")
        df.show(3, truncate=False)

        # Save as Delta table (replaces any existing table at this path)
        delta_path = os.path.join(bronze_layer_path, table_name)
        df.write.format("delta").mode("overwrite").save(delta_path)
        print(f"Successfully saved to Delta table: {delta_path}")

        # Verify the saved table by reading it back
        df_check = spark.read.format("delta").load(delta_path)
        saved_rows = df_check.count()
        print(f"Verification - Columns: {len(df_check.columns)}, Rows: {saved_rows}")

        # Printing the counts alone verifies nothing: compare them with the
        # source so a short or malformed write is reported as a failure.
        if saved_rows != source_rows or len(df_check.columns) != len(df.columns):
            raise ValueError(
                f"Verification mismatch for {table_name}: "
                f"source has {len(df.columns)} columns / {source_rows} rows, "
                f"saved table has {len(df_check.columns)} columns / {saved_rows} rows"
            )

        # Add to processed tables list
        processed_tables.append(table_name)

    except Exception as e:
        # Best-effort batch: one bad file must not stop the remaining files,
        # but the failure is recorded for the final summary.
        print(f"Error processing {filename}: {str(e)}")
        failed_files.append((filename, str(e)))

print(f"\nWorld Bank data conversion completed! Processed {len(processed_tables)} tables.")

# Surface failures explicitly so a partially successful run is not mistaken
# for a complete one.
if failed_files:
    print(f"{len(failed_files)} file(s) failed:")
    for failed_name, reason in failed_files:
        print(f"  - {failed_name}: {reason}")

In [None]:
# Summarize every Delta table recorded in processed_tables during this run
print("\nDelta tables created in bronze layer:")
for table in processed_tables:
    print("  - " + table)

# Use the first processed table as a representative example and print its
# schema as read back from the bronze layer; skipped when nothing was processed
if processed_tables:
    example_table = processed_tables[0]
    example_path = os.path.join(bronze_layer_path, example_table)

    print(f"\nExample table schema ({example_table}):")
    delta_reader = spark.read.format("delta")
    df_example = delta_reader.load(example_path)
    df_example.printSchema()

In [None]:
# Stop the Spark session created at the top of the notebook now that all
# tables have been written and inspected; this is the notebook's final step.
spark.stop()
print("\nSpark session stopped.")