# PSA Population Data to Bronze Layer - Improved

This notebook converts PSA Population CSV files into bronze-layer Delta tables, with improved encoding handling.
Population data includes birth statistics, death records, demographic data, and housing information.

Improvements:
- Multiple encoding support (utf-8, latin1, iso-8859-1, cp1252, utf-16)
- Skip bad lines automatically
- Better error handling and reporting
- Empty row filtering

Expected format:
- Row 1: Title (quoted string)
- Row 2: Empty
- Row 3: Column headers (semicolon-separated)
- Row 4+: Data (semicolon-separated)

In [None]:
# Initialize Spark session with Delta Lake
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, when
from delta import configure_spark_with_delta_pip
import re
import json
from datetime import datetime
import warnings

# Silence warnings for the rest of the session. Note that the blanket
# 'ignore' filter applies to every Python warning, not only pandas ones.
warnings.filterwarnings('ignore')

# Session builder: Delta SQL extension and catalog, 4g of driver memory,
# and adaptive query execution with partition coalescing switched on.
builder = (
    SparkSession.builder
    .appName("PSA-Population-Bronze-Improved")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created; Spark's own logging is then limited to errors.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

print("Spark session initialized successfully")
print(f"Spark version: {spark.version}")

In [None]:
# Define paths and helper functions
# Directory scanned for the source CSV files (relative path, resolved
# against the notebook's working directory).
psa_data_path = "../PSA/Population"
# Root directory under which each source file's Delta table and the
# processing report are written.
bronze_layer_path = "../final-spark-bronze/bronze_population_improved"

# Create output directory (no error if it already exists)
os.makedirs(bronze_layer_path, exist_ok=True)

def clean_column_name(col_name):
    """Return a Delta-compatible identifier derived from ``col_name``.

    Quotes are removed, every character outside ``[a-zA-Z0-9_]`` becomes an
    underscore, runs of underscores are collapsed and leading/trailing
    underscores are stripped. A result that starts with a digit is prefixed
    with ``col_``.

    Args:
        col_name: Raw column label. Non-string values are converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned name, or ``"unnamed_column"`` when the input is falsy,
        blank, or contains no usable characters.
    """
    if not col_name:
        return "unnamed_column"

    # Coerce before using any str method so a non-string label (for example
    # an integer year header) is cleaned instead of raising AttributeError.
    text = str(col_name).strip().replace('"', '').replace("'", '')
    cleaned = re.sub(r'[^a-zA-Z0-9_]', '_', text)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')

    if not cleaned:
        return "unnamed_column"
    if cleaned[0].isdigit():
        return f"col_{cleaned}"
    return cleaned

def clean_table_name(filename):
    """Build the bronze table name for a source CSV file.

    The trailing ``.csv`` extension is dropped, the remainder is cleaned with
    ``clean_column_name`` and prefixed with ``psa_population_``. Names longer
    than 100 characters are cut to 100 and any trailing underscore left by
    the cut is removed.

    Args:
        filename: File name of the source CSV (no directory part).

    Returns:
        The table name string.
    """
    # Remove only the extension at the end of the name (any letter case);
    # str.replace would also delete ".csv" appearing inside the name.
    stem = filename[:-4] if filename[-4:].lower() == '.csv' else filename
    table_name = f"psa_population_{clean_column_name(stem)}"

    if len(table_name) > 100:
        table_name = table_name[:100].rstrip('_')

    return table_name

def detect_encoding(file_path):
    """Guess the text encoding of ``file_path``.

    Detection order:

    1. A UTF-16 byte-order mark at the start of the file -> ``'utf-16'``.
    2. The first 1000 characters decode as UTF-8 -> ``'utf-8'``.
    3. Anything else -> ``'latin1'``.

    latin1 maps every possible byte to a character, so it can never fail to
    decode; trying further single-byte encodings after it would be dead
    code. UTF-16 therefore has to be recognised by its BOM *before* the
    decode attempts, otherwise such files would be reported as latin1.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        ``'utf-16'``, ``'utf-8'`` or ``'latin1'``. ``'latin1'`` is also
        returned when the file cannot be opened, leaving it to the caller's
        own read to surface the underlying I/O error.
    """
    import codecs

    fallback = 'latin1'

    try:
        with open(file_path, 'rb') as raw:
            leading_bytes = raw.read(2)
    except OSError:
        return fallback

    if leading_bytes in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            f.read(1000)  # Decoding a sample is enough to reject non-UTF-8
    except (UnicodeDecodeError, OSError):
        return fallback

    return 'utf-8'

def _drop_blank_rows(frame):
    """Return ``frame`` without rows whose cells are all missing or blank."""
    # A cell counts as blank when it is NaN/None or only whitespace. astype(str)
    # keeps the .str accessor usable whatever dtype pandas chose for a column.
    blank_cells = frame.apply(
        lambda column: column.isna() | column.astype(str).str.strip().eq('')
    )
    return frame[~blank_cells.all(axis=1)]


def _make_unique_column_names(names):
    """Return ``names`` with repeated entries disambiguated by ``_<n>``.

    The first occurrence keeps its name. Names are compared without regard
    to letter case, and a generated suffix is advanced until the candidate
    clashes with neither an original name nor one already handed out.
    """
    original_keys = {name.lower() for name in names}
    taken = set()
    last_suffix = {}
    unique_names = []

    for name in names:
        key = name.lower()
        candidate = name
        if key in taken:
            suffix = last_suffix.get(key, 0)
            while True:
                suffix += 1
                candidate = f"{name}_{suffix}"
                candidate_key = candidate.lower()
                if candidate_key not in taken and candidate_key not in original_keys:
                    break
            last_suffix[key] = suffix
        taken.add(candidate.lower())
        unique_names.append(candidate)

    return unique_names


def read_psa_csv_with_pandas(file_path):
    """Read one PSA CSV file with pandas and convert it to a Spark DataFrame.

    The file is parsed as semicolon-separated text with its first two lines
    skipped (title and empty line, per the expected format), every column
    read as string and malformed lines skipped. The detected encoding is
    tried first, followed by utf-8, latin1, iso-8859-1 and cp1252. Blank rows
    are removed, column names are cleaned and made unique, and the result is
    handed to Spark with an all-string schema.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(df_spark, success, error)``: on success the Spark
        DataFrame, ``True`` and ``None``; on failure ``None``, ``False`` and
        a message describing the problem. No exception escapes this function.
    """
    try:
        # First detect the most likely encoding
        detected_encoding = detect_encoding(file_path)
        print(f"    Detected encoding: {detected_encoding}")

        # dict.fromkeys removes duplicates while keeping the detected
        # encoding at the front of the list.
        encodings_to_try = list(dict.fromkeys(
            [detected_encoding, 'utf-8', 'latin1', 'iso-8859-1', 'cp1252']
        ))

        for encoding in encodings_to_try:
            try:
                df_pandas = pd.read_csv(
                    file_path,
                    sep=';',
                    skiprows=2,
                    encoding=encoding,
                    na_values=['', '....', 'null', 'NULL', 'n/a', 'N/A'],
                    keep_default_na=True,
                    dtype=str,
                    on_bad_lines='skip',
                    skipinitialspace=True
                )
            except UnicodeError:
                # UnicodeError also covers UnicodeDecodeError and the plain
                # UnicodeError some codecs (e.g. utf-16) raise.
                print(f"    Failed with encoding: {encoding}")
                continue
            except pd.errors.EmptyDataError:
                return None, False, "File is empty or has no valid data"

            print(f"    Successfully read with encoding: {encoding}")
            break
        else:
            return None, False, "Could not decode file with any supported encoding"

        # Validate DataFrame
        if df_pandas.empty:
            return None, False, "DataFrame is empty after parsing"

        original_shape = df_pandas.shape

        # Remove rows that carry no data at all (missing and/or blank cells)
        df_pandas = _drop_blank_rows(df_pandas)

        if df_pandas.empty:
            return None, False, "No valid data rows found after cleaning"

        # Clean column names, then make them unique. Uniqueness ignores case
        # because Spark resolves column names case-insensitively by default.
        df_pandas.columns = _make_unique_column_names(
            [clean_column_name(name) for name in df_pandas.columns]
        )

        print(f"    Cleaned data: {original_shape} -> {df_pandas.shape}")

        # pandas marks missing cells with float NaN. Replace them with None
        # and declare every column as string so Spark does not have to infer
        # types from mixed str/float values and stores real nulls.
        df_pandas = df_pandas.astype(object).where(df_pandas.notna(), None)
        string_schema = ", ".join(f"`{name}` string" for name in df_pandas.columns)

        # Convert to Spark DataFrame
        df_spark = spark.createDataFrame(df_pandas, schema=string_schema)

        return df_spark, True, None

    except Exception as e:
        # Boundary of the per-file reader: report instead of raising so the
        # caller can continue with the next file.
        return None, False, f"Unexpected error: {str(e)}"

# Confirm the cell ran and echo the configured source and target paths.
print("Helper functions defined")
print(f"Source path: {psa_data_path}")
print(f"Target path: {bronze_layer_path}")

In [None]:
# Get list of PSA Population CSV files
# Names in the source directory that end in '.csv', in sorted order.
psa_files = sorted(
    entry for entry in os.listdir(psa_data_path) if entry.endswith('.csv')
)

print(f"Found {len(psa_files)} PSA Population CSV files to process")
print("Files to process:")
for position, csv_name in enumerate(psa_files, start=1):
    print(f"  {position:2d}. {csv_name}")

In [None]:
# Process a test file first
# Smoke test: run the reader on the first file of the sorted list and show
# either a short preview or the reported error.
if psa_files:
    test_file = psa_files[0]
    test_file_path = os.path.join(psa_data_path, test_file)
    print(f"Testing with: {test_file}")

    df_test, success, error = read_psa_csv_with_pandas(test_file_path)

    if not success:
        print(f"Test failed: {error}")
    else:
        print(f"Test successful: {df_test.count()} rows, {len(df_test.columns)} columns")
        print("Sample columns:", df_test.columns[:5])
        df_test.show(3, truncate=True)

In [None]:
# Process all Population files
processed_tables = []    # table names written successfully
failed_files = []        # (filename, error message) tuples
processing_details = []  # one dict per attempted file, success or failure


def _record_failure(filename, message):
    """Register a failed file in both ``failed_files`` and ``processing_details``.

    Every failure path goes through here so the two collections (and the
    report built from them) always agree on which files failed.
    """
    failed_files.append((filename, message))
    processing_details.append({
        'filename': filename,
        'error': message,
        'status': 'failed'
    })


print(f"Processing all {len(psa_files)} files...")

for i, filename in enumerate(psa_files, 1):
    print(f"\nProcessing {i}/{len(psa_files)}: {filename}")

    table_name = clean_table_name(filename)
    file_path = os.path.join(psa_data_path, filename)

    try:
        df, parse_success, parse_error = read_psa_csv_with_pandas(file_path)

        if not parse_success:
            print(f"  Failed to parse: {parse_error}")
            _record_failure(filename, parse_error)
            continue

        row_count = df.count()
        col_count = len(df.columns)

        print(f"  Parsed: {row_count} rows, {col_count} columns")

        if row_count == 0:
            print("  Skipped: No data rows")
            _record_failure(filename, "No data rows found")
            continue

        # Save to Delta: one table directory per source file, replaced on
        # every run.
        delta_path = os.path.join(bronze_layer_path, table_name)

        df.write.format("delta") \
          .mode("overwrite") \
          .option("delta.columnMapping.mode", "name") \
          .option("delta.minReaderVersion", "2") \
          .option("delta.minWriterVersion", "5") \
          .save(delta_path)

        # Verify by reading the table back and counting its rows
        df_check = spark.read.format("delta").load(delta_path)
        verify_count = df_check.count()

        print(f"  Saved and verified: {verify_count} rows")
        processed_tables.append(table_name)

        processing_details.append({
            'filename': filename,
            'table_name': table_name,
            'rows': verify_count,
            'columns': col_count,
            'status': 'success'
        })

    except Exception as e:
        # Per-file boundary: record the failure and keep going so one bad
        # file does not abort the whole batch.
        error_msg = f"Processing error: {str(e)}"
        print(f"  {error_msg}")
        _record_failure(filename, error_msg)

print("\nProcessing complete!")
print(f"Successfully processed: {len(processed_tables)} tables")
print(f"Failed: {len(failed_files)} files")
if len(processed_tables) + len(failed_files) > 0:
    success_rate = len(processed_tables)/(len(processed_tables)+len(failed_files))*100
    print(f"Success rate: {success_rate:.1f}%")

In [None]:
# Display detailed results
print("\nDETAILED PROCESSING SUMMARY")
print("=" * 60)

# Entries of processing_details that describe a successfully written table.
successful_details = [
    entry for entry in processing_details if entry['status'] == 'success'
]

if failed_files:
    print(f"\nFailed files ({len(failed_files)}):")
    for rank, (failed_name, reason) in enumerate(failed_files, start=1):
        print(f"  {rank:2d}. {failed_name}")
        print(f"      Error: {reason}")

print(f"\nSuccessfully created Delta tables ({len(processed_tables)}):")
for rank, detail in enumerate(successful_details, start=1):
    print(f"  {rank:2d}. {detail['table_name']}")
    print(f"      Source: {detail['filename']}")
    print(f"      Data: {detail['rows']:,} rows, {detail['columns']} columns")

# Aggregate statistics, shown only when at least one table was written.
if successful_details:
    row_counts = [detail['rows'] for detail in successful_details]
    column_counts = [detail['columns'] for detail in successful_details]

    total_rows = sum(row_counts)
    avg_columns = sum(column_counts) / len(successful_details)

    print(f"\nStatistics:")
    print(f"  Total rows processed: {total_rows:,}")
    print(f"  Average columns per table: {avg_columns:.1f}")
    print(f"  Largest table: {max(row_counts):,} rows")
    print(f"  Smallest table: {min(row_counts):,} rows")

In [None]:
# Save comprehensive processing report
# Number of files that reached a final outcome (written or failed).
attempted_files = len(processed_tables) + len(failed_files)

# The failure count lives under its own key: a dict literal keeps only the
# last value of a repeated key, so sharing 'failed_files' between the count
# and the list would silently drop the count.
report = {
    'timestamp': datetime.now().isoformat(),
    'data_source': 'PSA Population',
    'total_files': len(psa_files),
    'successful_tables': len(processed_tables),
    'failed_file_count': len(failed_files),
    'success_rate': len(processed_tables) / attempted_files * 100 if attempted_files > 0 else 0,
    'processing_details': processing_details,
    'processed_tables': processed_tables,
    'failed_files': [{'filename': f, 'error': e} for f, e in failed_files]
}

# The report is written next to the Delta tables in the bronze directory.
report_path = os.path.join(bronze_layer_path, 'processing_report_detailed.json')
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print(f"\nDetailed processing report saved to: {report_path}")

In [None]:
# Example table validation
# Read back the first successfully written table and print its size,
# schema, a sample and the null share of its first five columns.
if processed_tables:
    print("\nEXAMPLE TABLE VALIDATION")
    print("=" * 50)

    example_table = processed_tables[0]
    example_path = os.path.join(bronze_layer_path, example_table)
    print(f"Validating: {example_table}")

    df_example = spark.read.format("delta").load(example_path)

    print(f"\nTable statistics:")
    print(f"  Rows: {df_example.count():,}")
    print(f"  Columns: {len(df_example.columns)}")

    print(f"\nSchema:")
    df_example.printSchema()

    print(f"\nSample data:")
    df_example.show(5, truncate=False)

    print(f"\nData quality check:")
    total_rows = df_example.count()
    leading_columns = df_example.columns[:5]
    for col_name in leading_columns:
        is_missing = col(col_name).isNull()
        null_count = df_example.filter(is_missing).count()
        if total_rows > 0:
            null_pct = (null_count / total_rows) * 100
        else:
            null_pct = 0
        print(f"  {col_name}: {null_count:,} nulls ({null_pct:.1f}%)")

In [None]:
# Stop Spark session
# Shut down the Spark session created at the top of the notebook; cells
# that use `spark` cannot be re-run after this without recreating it.
spark.stop()
print("Spark session stopped")
print("\nProcessing complete. Check the bronze_population_improved directory for Delta tables.")