In [3]:
"""
Simple test script to verify Athena connection
Run this FIRST to make sure everything is set up correctly
"""
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

print("=" * 60)
print("Testing Athena Connection")
print("=" * 60)
print()

# Step 1: Check environment variables
# Every setting the later steps read via os.getenv() must be present before
# any AWS call is attempted.
print("STEP 1: Checking Environment Variables")
print("-" * 60)

required_vars = {
    'AWS_REGION': os.getenv('AWS_REGION'),
    'S3_BUCKET': os.getenv('S3_BUCKET'),
    'ATHENA_DATABASE': os.getenv('ATHENA_DATABASE'),
    'ATHENA_OUTPUT_LOCATION': os.getenv('ATHENA_OUTPUT_LOCATION'),
}

all_set = True
for var, value in required_vars.items():
    if value:
        # Only show first 30 chars so full values are not echoed to the output
        display_value = value[:30] + '...' if len(value) > 30 else value
        print(f"✓ {var}: {display_value}")
    else:
        # An unset variable and an empty string are both treated as missing.
        print(f"✗ {var}: NOT SET")
        all_set = False

if not all_set:
    print("\n❌ Missing required environment variables!")
    print("Create a .env file with the following:")
    print("""
AWS_REGION=us-east-1
S3_BUCKET=your-bucket-name
ATHENA_DATABASE=climate_finance_db
ATHENA_OUTPUT_LOCATION=s3://your-bucket-name/athena-results/
""")
    # `exit` is an interactive helper injected by the `site` module (and
    # replaced by IPython in notebooks); raise SystemExit directly so the
    # script stops here no matter how it is run.
    raise SystemExit(1)

print()

# Step 2: Test AWS credentials
print("STEP 2: Testing AWS Credentials")
print("-" * 60)

# Check the dependency separately so a missing package is not reported as a
# credential problem by the broad handler below.
try:
    import boto3
except ImportError:
    print("❌ boto3 not installed")
    print("Run: pip install boto3")
    raise SystemExit(1)

try:
    # Try to access S3 to verify credentials
    s3_client = boto3.client('s3', region_name=os.getenv('AWS_REGION'))

    # List buckets (just to test credentials work)
    response = s3_client.list_buckets()
    bucket_names = [b['Name'] for b in response['Buckets']]
    print("✓ AWS credentials valid")
    print(f"✓ Found {len(bucket_names)} S3 buckets")

    # Check if your specific bucket exists
    bucket_name = os.getenv('S3_BUCKET')
    if bucket_name in bucket_names:
        print(f"✓ Bucket '{bucket_name}' exists")
    else:
        # Not fatal: only warn and show a few names to help spot a typo.
        print(f"⚠ Warning: Bucket '{bucket_name}' not found in your account")
        print(f"  Available buckets: {bucket_names[:5]}")

except Exception as e:
    print(f"❌ AWS credential error: {e}")
    print("\nMake sure you have AWS credentials configured:")
    print("  Option 1: Run 'aws configure'")
    print("  Option 2: Add to .env file:")
    print("    AWS_ACCESS_KEY_ID=your-key")
    print("    AWS_SECRET_ACCESS_KEY=your-secret")
    # `exit` is an interactive helper from `site`; SystemExit always works.
    raise SystemExit(1)

print()

# Step 3: Test Athena connection
print("STEP 3: Testing Athena Connection")
print("-" * 60)

# Only the PyAthena import is guarded here, so the "not installed" hint below
# can never be triggered by some other missing package.
try:
    from pyathena import connect
except ImportError:
    print("❌ PyAthena not installed")
    print("Run: pip install pyathena")
    raise SystemExit(1)

# Pre-bind both handles so the `finally` clause can tell what was opened.
connection = None
cursor = None
try:
    # Create connection
    connection = connect(
        region_name=os.getenv('AWS_REGION'),
        s3_staging_dir=os.getenv('ATHENA_OUTPUT_LOCATION'),
        schema_name=os.getenv('ATHENA_DATABASE')
    )

    print("✓ Connection created successfully")

    # Test simple query
    cursor = connection.cursor()
    print("✓ Cursor created")

    # Try to list tables in the database
    cursor.execute("SHOW TABLES")
    tables = cursor.fetchall()

    print("✓ Query executed successfully")
    print(f"✓ Found {len(tables)} tables in database")
    print("\nAvailable tables:")
    for table in tables:
        # Each row is a sequence; the table name is its first element.
        print(f"  - {table[0]}")

except Exception as e:
    print(f"❌ Athena connection error: {e}")
    print("\nPossible issues:")
    print("  1. Database name incorrect (check ATHENA_DATABASE)")
    print("  2. S3 staging location doesn't exist or no permissions")
    print("  3. IAM permissions missing for Athena")
    # `exit` is an interactive helper from `site`; SystemExit always works.
    raise SystemExit(1)
finally:
    # Release the cursor and the connection on success and on failure alike;
    # the next step opens its own connection.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()

print()

# Step 4: Test querying your institution table
print("STEP 4: Testing Institution Table Query")
print("-" * 60)

# pandas is only needed to display the result; report its absence clearly
# instead of as a query error.
try:
    import pandas as pd
except ImportError:
    print("❌ pandas not installed")
    print("Run: pip install pandas")
    raise SystemExit(1)

# Pre-bind both handles so the `finally` clause can tell what was opened.
connection = None
cursor = None
try:
    connection = connect(
        region_name=os.getenv('AWS_REGION'),
        s3_staging_dir=os.getenv('ATHENA_OUTPUT_LOCATION'),
        schema_name=os.getenv('ATHENA_DATABASE')
    )

    cursor = connection.cursor()

    # Query institution table
    query = "SELECT * FROM institution LIMIT 5"
    print(f"Running query: {query}")

    cursor.execute(query)

    # Build a DataFrame from the fetched rows, taking the column names from
    # the first field of each entry in cursor.description.
    columns = [desc[0] for desc in cursor.description]
    data = cursor.fetchall()
    df = pd.DataFrame(data, columns=columns)

    print("✓ Query successful!")
    print(f"✓ Retrieved {len(df)} rows")
    print("\nColumns in institution table:")
    for col in df.columns:
        print(f"  - {col}")

    print("\nSample data (first 2 rows):")
    print(df.head(2).to_string())

except Exception as e:
    print(f"❌ Query error: {e}")
    print("\nPossible issues:")
    print("  1. 'institution' table doesn't exist")
    print("  2. Table structure different than expected")
    print("  3. No data in the table")
    # `exit` is an interactive helper from `site`; SystemExit always works.
    raise SystemExit(1)
finally:
    # Close on every path so a failed query does not leak the connection.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()

print()
print("=" * 60)
print("✅ All tests passed! Your Athena connection is working.")
print("=" * 60)
print()
print("You can now run the main app with:")
print("  streamlit run app.py")

Testing Athena Connection

STEP 1: Checking Environment Variables
------------------------------------------------------------
✓ AWS_REGION: us-east-1
✓ S3_BUCKET: cpi-uk-us-datascience-stage
✓ ATHENA_DATABASE: ref_testing
✓ ATHENA_OUTPUT_LOCATION: s3://cpi-uk-us-datascience-sta...

STEP 2: Testing AWS Credentials
------------------------------------------------------------
✓ AWS credentials valid
✓ Found 10 S3 buckets
✓ Bucket 'cpi-uk-us-datascience-stage' exists

STEP 3: Testing Athena Connection
------------------------------------------------------------
✓ Connection created successfully
✓ Cursor created
✓ Query executed successfully
✓ Found 20 tables in database

Available tables:
  - country_coefficients
  - country_gearing_ratios
  - country_multipliers
  - data_source
  - double_counting_exclusions
  - exchange_rates
  - gender
  - geography
  - geography_indicators
  - geography_standardization
  - institution
  - institution_ownership
  - institution_standardization
  - instr

In [2]:
import pyarrow.parquet as pq

# S3 location of the parquet file inspected below.
GEOGRAPHY_PARQUET_URI = (
    "s3://cpi-uk-us-datascience-stage/auxiliary-data/reference-data/"
    "reference-db/geography/data.parquet"
)

parquet_file = pq.ParquetFile(GEOGRAPHY_PARQUET_URI)

# Print the file's parquet schema (column names and physical/logical types)
print(parquet_file.schema)

<pyarrow._parquet.ParquetSchema object at 0x10152e080>
required group field_id=-1 schema {
  optional int64 field_id=-1 id_geography_cpi;
  optional binary field_id=-1 country_cpi (String);
  optional binary field_id=-1 region_cpi (String);
  optional binary field_id=-1 region_cpi_granular (String);
  optional binary field_id=-1 region_cpi_additional (String);
  optional int64 field_id=-1 year_added;
  optional binary field_id=-1 oecd_membership (String);
  optional binary field_id=-1 dac_membership (String);
  optional binary field_id=-1 income_level (String);
  optional binary field_id=-1 unfccc_classification (String);
  optional binary field_id=-1 wb_classification (String);
  optional binary field_id=-1 r3_ipcc (String);
  optional binary field_id=-1 r6_ipcc (String);
  optional binary field_id=-1 r10_ipcc (String);
  optional binary field_id=-1 development_status (String);
  optional binary field_id=-1 development_status_2 (String);
  optional boolean field_id=-1 sids;
  optional

In [1]:
"""
Debug script to find exactly which import is failing
"""
import sys
import traceback

def test_import(module_path, class_name=None):
    """Try to import a module, or one name from it, and report the outcome.

    Prints a "✓" line on success. On failure prints a "✗" line, the error
    message, the full traceback and a blank line.

    Args:
        module_path: Dotted module path, e.g. "utils.text_processing".
        class_name: Optional name to import from ``module_path``, as in
            ``from module_path import class_name``. May be an attribute of
            the module or one of its submodules.

    Returns:
        True if the import succeeded, False otherwise.
    """
    # Function-scope import keeps this helper self-contained.
    import importlib

    # One label for both outcomes; no trailing dot when only a module is given.
    label = f"{module_path}.{class_name}" if class_name else module_path
    try:
        module = importlib.import_module(module_path)
        if class_name and not hasattr(module, class_name):
            # `from pkg import name` also accepts a not-yet-imported
            # submodule, so try that before declaring the name missing.
            submodule = f"{module_path}.{class_name}"
            try:
                importlib.import_module(submodule)
            except ModuleNotFoundError as exc:
                if exc.name != submodule:
                    # The submodule exists but one of its own imports is
                    # missing: surface that real cause unchanged.
                    raise
                raise ImportError(
                    f"cannot import name {class_name!r} from {module_path!r}"
                ) from None
    except Exception as e:
        print(f"✗ {label}")
        print(f"  Error: {e}")
        traceback.print_exc()
        print()
        return False
    print(f"✓ {label}")
    return True


# The name starts with "test_", so tell pytest not to collect this helper
# as a test case if the file is ever picked up by a test run.
test_import.__test__ = False

# Each step is a heading plus the argument tuples passed to test_import(),
# listed in the order they are checked.
import_steps = (
    ("STEP 1: Testing basic module imports", (
        ("database",),
        ("services",),
        ("ui",),
        ("utils",),
    )),
    ("STEP 2: Testing utils (no dependencies)", (
        ("utils.text_processing", "TextProcessor"),
        ("utils.fuzzy_matching", "FuzzyMatcher"),
    )),
    ("STEP 3: Testing database", (
        ("database.connection", "DatabaseConnection"),
        ("database.queries", "QueryService"),
    )),
    ("STEP 4: Testing services", (
        ("services.validation_service", "ValidationService"),
        ("services.enrichment_service", "EnrichmentService"),
        ("services.standardization_service", "StandardizationService"),
        ("services.audit_service", "AuditService"),
        ("services.institution_service", "InstitutionService"),
    )),
    ("STEP 5: Testing UI", (
        ("ui.components",),
        ("ui.institution_form",),
        ("ui.bulk_upload",),
    )),
)

# Opening banner
print("=" * 60)
print("Testing Imports")
print("=" * 60)
print()

# Run every step: heading, rule, one test_import() call per target, blank line.
for step_heading, import_targets in import_steps:
    print(step_heading)
    print("-" * 60)
    for import_target in import_targets:
        test_import(*import_target)
    print()

# Closing banner
print("=" * 60)
print("Diagnosis complete")
print("=" * 60)

Testing Imports

STEP 1: Testing basic module imports
------------------------------------------------------------
✓ database
✓ services
✓ ui
✓ utils

STEP 2: Testing utils (no dependencies)
------------------------------------------------------------
✓ utils.text_processing.TextProcessor
✓ utils.fuzzy_matching.FuzzyMatcher

STEP 3: Testing database
------------------------------------------------------------
✓ database.connection.DatabaseConnection
✓ database.queries.QueryService

STEP 4: Testing services
------------------------------------------------------------
✓ services.validation_service.ValidationService
✓ services.enrichment_service.EnrichmentService
✓ services.standardization_service.StandardizationService
✓ services.audit_service.AuditService
✓ services.institution_service.InstitutionService

STEP 5: Testing UI
------------------------------------------------------------
✓ ui.components
✓ ui.institution_form
✓ ui.bulk_upload

Diagnosis complete
