Clear out S3 partitions

In [14]:
# Clear S3 partitions - TESTING CLEANUP
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
import time

# Target location for the cleanup - adjust these to match your environment
S3_BUCKET = "healthcare-data-lake-prod"  # or your bucket name
S3_PREFIX = "partitioned-data"
REGION = "us-east-1"

# S3 client bound to REGION, with botocore retries enabled
# (adaptive mode, max_attempts set to 3)
s3_client = boto3.client(
    's3',
    region_name=REGION,
    config=Config(retries={'mode': 'adaptive', 'max_attempts': 3}),
)

def list_all_objects(bucket, prefix):
    """Collect every object stored under ``prefix`` in ``bucket``.

    Pages through ``list_objects_v2`` using the module-level ``s3_client``.

    Returns:
        A list of dicts with the keys ``'Key'``, ``'Size'`` and
        ``'LastModified'``. If the listing raises ``ClientError`` the error
        is printed and an empty list is returned (partial results are
        discarded).
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    found = []

    try:
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # Pages without matches carry no 'Contents' entry
            found.extend(
                {
                    'Key': entry['Key'],
                    'Size': entry['Size'],
                    'LastModified': entry['LastModified'],
                }
                for entry in page.get('Contents', [])
            )
    except ClientError as err:
        print(f"❌ Error listing objects: {err}")
        return []

    return found

def delete_objects_batch(bucket, objects, batch_size=1000):
    """Delete ``objects`` from ``bucket`` in batches via ``delete_objects``.

    Args:
        bucket: Name of the S3 bucket.
        objects: List of dicts that each provide at least ``'Key'`` and
            ``'Size'`` (the shape produced by ``list_all_objects``).
        batch_size: Maximum number of keys per request. Values above 1000
            are capped, because S3 DeleteObjects accepts at most 1000 keys
            per call.

    Returns:
        A ``(deleted_count, total_size)`` tuple: the number of objects S3
        reported as deleted and the summed ``'Size'`` of those objects.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    A batch that fails with ``ClientError`` is reported and skipped; the
    remaining batches are still attempted.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # S3 DeleteObjects rejects requests with more than 1000 keys
    batch_size = min(batch_size, 1000)

    deleted_count = 0
    total_size = 0

    batch_starts = range(0, len(objects), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = objects[start:start + batch_size]

        try:
            response = s3_client.delete_objects(
                Bucket=bucket,
                Delete={
                    'Objects': [{'Key': obj['Key']} for obj in batch],
                    'Quiet': False
                }
            )
        except ClientError as e:
            print(f"❌ Error deleting batch {batch_number}: {e}")
            continue

        deleted = response.get('Deleted', [])
        errors = response.get('Errors', [])

        # Build the key set once per batch so the size lookup stays linear
        deleted_keys = {entry['Key'] for entry in deleted}
        deleted_count += len(deleted)
        total_size += sum(
            obj['Size'] for obj in batch if obj['Key'] in deleted_keys
        )

        if errors:
            print(f"⚠️  Errors in batch {batch_number}:")
            for error in errors:
                # .get keeps a malformed error entry from aborting the run
                key = error.get('Key', '<unknown key>')
                message = error.get('Message', error.get('Code', 'unknown error'))
                print(f"    {key}: {message}")

        print(f"✅ Batch {batch_number}: Deleted {len(deleted)} objects")

    return deleted_count, total_size

def _summarize_partitions(objects):
    """Group objects by parent path and return ``(partitions, total_size)``."""
    partitions = {}
    total_size = 0

    for obj in objects:
        # Every object counts towards the total, including root-level keys
        total_size += obj['Size']

        # Partition path is everything before the final '/' in the key
        partition_path, separator, _filename = obj['Key'].rpartition('/')
        if not separator:
            continue

        stats = partitions.setdefault(partition_path, {'count': 0, 'size': 0})
        stats['count'] += 1
        stats['size'] += obj['Size']

    return partitions, total_size


def clear_s3_partitions(bucket, prefix, confirm=False):
    """Delete every object under ``prefix`` in ``bucket``.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix whose objects are deleted.
        confirm: When False (the default), a per-partition summary is printed
            and the user must type ``DELETE`` before anything is removed.
            When True, deletion starts without a preview or prompt.

    Returns:
        None. Progress, a deletion summary and a verification listing are
        printed.

    NOTE(review): S3 prefixes are plain string matches, so a prefix without a
    trailing '/' presumably also matches sibling prefixes that merely start
    with the same text - confirm the preview before typing DELETE.
    """
    objects = None

    if not confirm:
        print("⚠️  WARNING: This will delete ALL data under the specified prefix!")
        print(f"   Bucket: {bucket}")
        print(f"   Prefix: {prefix}")
        print("\n🔍 First, let's see what will be deleted...")

        # List objects to show what will be deleted
        objects = list_all_objects(bucket, prefix)

        if not objects:
            print("✅ No objects found under the specified prefix.")
            return

        print(f"\n�� Found {len(objects)} objects to delete:")

        partitions, total_size = _summarize_partitions(objects)

        print(f"\n🗂️  Partition summary:")
        for partition, stats in sorted(partitions.items()):
            size_mb = stats['size'] / (1024 * 1024)
            print(f"   {partition}: {stats['count']} files, {size_mb:.1f} MB")

        print(f"\n�� Total: {len(objects)} objects, {total_size / (1024 * 1024):.1f} MB")

        # Ask for confirmation
        print(f"\n⚠️  Are you sure you want to delete all {len(objects)} objects?")
        print("   Type 'DELETE' to confirm, or anything else to cancel:")
        user_input = input("> ").strip()

        if user_input != 'DELETE':
            print("❌ Deletion cancelled.")
            return

    print(f"\n🗑️  Starting deletion of S3 partitions...")
    start_time = time.time()

    # Reuse the previewed listing so that exactly the objects the user
    # confirmed are deleted; only list here when the preview was skipped.
    if objects is None:
        objects = list_all_objects(bucket, prefix)

    if not objects:
        print("✅ No objects found to delete.")
        return

    # Delete objects in batches
    deleted_count, deleted_size = delete_objects_batch(bucket, objects)

    duration = time.time() - start_time

    print(f"\n✅ Deletion completed!")
    print(f"   Deleted: {deleted_count} objects")
    print(f"   Size: {deleted_size / (1024 * 1024):.1f} MB")
    print(f"   Duration: {duration:.1f} seconds")

    # Verify deletion by listing the prefix again
    print(f"\n🔍 Verifying deletion...")
    remaining_objects = list_all_objects(bucket, prefix)

    if remaining_objects:
        print(f"⚠️  Warning: {len(remaining_objects)} objects still remain")
        for obj in remaining_objects[:5]:  # Show first 5 remaining
            print(f"   {obj['Key']}")
        if len(remaining_objects) > 5:
            print(f"   ... and {len(remaining_objects) - 5} more")
    else:
        print("✅ All objects successfully deleted!")

# Banner for the cleanup cell
print("�� S3 Partition Cleanup Tool", "=" * 50, sep="\n")

# The destructive call is deliberately disabled; uncomment it to run the cleanup
#clear_s3_partitions(S3_BUCKET, S3_PREFIX)

# Reminder shown while the call above stays commented out
print(
    "⚠️  Cleanup function is commented out for safety.",
    "   Uncomment the last line to run the cleanup.",
    sep="\n",
)

�� S3 Partition Cleanup Tool
⚠️  Cleanup function is commented out for safety.
   Uncomment the last line to run the cleanup.


In [None]:
# Basic notebook cell to inspect fact enriched data from S3
import polars as pl
import boto3
from botocore.exceptions import ClientError
import io

# Location of the partitioned data - adjust these to match your environment
S3_BUCKET = "healthcare-data-lake-prod"  # or your bucket name
S3_PREFIX = "partitioned-data"
REGION = "us-east-1"

# S3 client bound to REGION (no custom Config is passed here)
s3_client = boto3.client(service_name='s3', region_name=REGION)

def list_s3_partitions(bucket, prefix):
    """Return ``s3://`` URIs for every ``.parquet`` object under ``prefix``.

    Pages through ``list_objects_v2`` using the module-level ``s3_client``.
    If the listing raises ``ClientError`` the error is printed and an empty
    list is returned.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    parquet_uris = []

    try:
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # Pages without matches carry no 'Contents' entry
            keys = (entry['Key'] for entry in page.get('Contents', []))
            parquet_uris.extend(
                f"s3://{bucket}/{key}" for key in keys if key.endswith('.parquet')
            )
    except ClientError as err:
        print(f"Error listing partitions: {err}")
        return []

    return parquet_uris

def download_and_inspect_partition(s3_path, sample_rows=10):
    """Download one parquet object from S3 and print a summary of it.

    Args:
        s3_path: Object location as ``s3://bucket/key`` (the ``s3://`` scheme
            is optional).
        sample_rows: Number of leading rows printed as a sample.

    Returns:
        The loaded polars DataFrame, or ``None`` if the path could not be
        parsed or the object could not be downloaded/read (the error is
        printed).
    """
    try:
        # Parse into locals so s3_path keeps its original form in messages
        location = s3_path[5:] if s3_path.startswith('s3://') else s3_path
        bucket, key = location.split('/', 1)

        # Download parquet file
        response = s3_client.get_object(Bucket=bucket, Key=key)
        parquet_data = response['Body'].read()

        # Load into polars
        df = pl.read_parquet(io.BytesIO(parquet_data))
    except Exception as e:
        # Best-effort notebook helper: report the failure and carry on
        print(f"❌ Error downloading partition {s3_path}: {e}")
        return None

    print(f"📊 Partition: {s3_path}")
    print(f"📏 Shape: {df.height:,} rows × {len(df.columns)} columns")
    print(f"📅 Columns: {list(df.columns)}")
    print("\n�� Sample data:")
    print(df.head(sample_rows))

    print("\n📈 Basic statistics:")
    try:
        print(f"  - Negotiated rates: {df['negotiated_rate'].min():.2f} to {df['negotiated_rate'].max():.2f}")
        print(f"  - Unique codes: {df['code'].n_unique()}")
        print(f"  - Unique NPIs: {df['npi'].n_unique()}")
        print(f"  - States: {df['state'].unique().to_list()}")
    except Exception as e:
        # A missing column or an empty partition must not discard the
        # DataFrame that was already downloaded successfully
        print(f"  ⚠️  Could not compute statistics: {e}")

    return df

# List available partitions
print("�� Listing S3 partitions...")
partitions = list_s3_partitions(S3_BUCKET, S3_PREFIX)
print(f"Found {len(partitions)} partitions")

if partitions:
    # Inspect the first partition
    print(f"\n📋 Inspecting first partition:")
    sample_df = download_and_inspect_partition(partitions[0])
    # The helper returns None on failure (it prints the error itself),
    # so only preview rows when a DataFrame actually came back
    if sample_df is not None:
        print(sample_df.head(2))

    # Show partition structure
    print(f"\n🗂️  Partition structure:")
    for i, partition in enumerate(partitions[:5]):  # Show first 5 partitions
        print(f"  {i+1}. {partition}")

    if len(partitions) > 5:
        print(f"  ... and {len(partitions) - 5} more partitions")

else:
    print("❌ No partitions found. Make sure the ETL3 pipeline has been run and data is uploaded to S3.")

�� Listing S3 partitions...
Found 4 partitions

📋 Inspecting first partition:
📊 Partition: healthcare-data-lake-prod/partitioned-data/payer_slug=unitedhealthcare-of-georgia-inc/state=AL/billing_class=professional/procedure_set=Evaluation_and_Management/procedure_class=Behavioral_health_services/primary_taxonomy_code=133V00000X/stat_area_name=Birmingham-Cullman-Talladega,_AL_CSA/year=2025/month=08/fact_rate_enriched.parquet
📏 Shape: 10 rows × 57 columns
📅 Columns: ['fact_uid', 'state', 'year_month', 'payer_slug', 'billing_class', 'code_type', 'code', 'pg_uid', 'pos_set_id', 'negotiated_type', 'negotiation_arrangement', 'negotiated_rate', 'expiration_date', 'provider_group_id_raw', 'reporting_entity_name', 'code_description', 'code_name', 'proc_set', 'proc_class', 'proc_group', 'reporting_entity_name_right', 'version', 'payer_slug_right', 'provider_group_id_raw_right', 'version_right', 'pos_members', 'npi', 'organization_name', 'nppes_fetch_date', 'status', 'primary_taxonomy_code', 'repl

In [None]:
# Show the first two rows of the sampled partition as the cell's result
sample_df.head(2)


fact_uid,state,year_month,payer_slug,billing_class,code_type,code,pg_uid,pos_set_id,negotiated_type,negotiation_arrangement,negotiated_rate,expiration_date,provider_group_id_raw,reporting_entity_name,code_description,code_name,proc_set,proc_class,proc_group,reporting_entity_name_right,version,payer_slug_right,provider_group_id_raw_right,version_right,pos_members,npi,organization_name,nppes_fetch_date,status,primary_taxonomy_code,replacement_npi,credential,enumeration_date,primary_taxonomy_state,first_name,primary_taxonomy_desc,last_name,last_updated,sole_proprietor,enumeration_type,primary_taxonomy_license,nppes_fetched,tin_type,tin_value,state_geo,latitude,longitude,county_name,county_fips,stat_area_name,stat_area_code,matched_address,year,month,procedure_set,procedure_class
str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,i64,str,list[str],str,str,str,str,str,null,str,str,str,str,str,str,str,str,str,str,bool,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str
"""b19276e2e55bb4ad046bc4da98487e…","""AL""","""2025-08""","""unitedhealthcare-of-georgia-in…","""professional""","""CPT""","""99484""","""416b4940b4cda3351ac694eb901de7…","""17b00c58b3dcdb9c20cb2a70b52a4c…","""negotiated""","""ffs""",108.63,"""9999-12-31""","""882""","""UnitedHealthcare of Georgia In…","""Care management services for b…","""CARE MGMT SERVICES BEHAVIORAL …","""Evaluation and Management""","""Behavioral health services""","""Other""","""UnitedHealthcare of Georgia In…","""1.0.0""","""unitedhealthcare-of-georgia-in…",882,"""1.0.0""","[""02"", ""05"", … ""61""]","""1295227007""",,"""2018-06-06""","""A""","""133V00000X""",,"""RD""","""2018-06-06""","""AL""","""RAVEN""","""Dietitian, Registered""","""HOLMES""","""2018-06-06""","""NO""","""NPI-1""","""2615""",True,"""ein""","""580572465""","""AL""",33.508422,-86.788636,"""Jefferson County""","""01073""","""Birmingham-Cullman-Talladega, …","""142""","""833 ST VINCENTS DR, BIRMINGHAM…","""2025""","""08""","""Evaluation and Management""","""Behavioral health services"""
"""b19276e2e55bb4ad046bc4da98487e…","""AL""","""2025-08""","""unitedhealthcare-of-georgia-in…","""professional""","""CPT""","""99484""","""416b4940b4cda3351ac694eb901de7…","""17b00c58b3dcdb9c20cb2a70b52a4c…","""negotiated""","""ffs""",108.63,"""9999-12-31""","""882""","""UnitedHealthcare of Georgia In…","""Care management services for b…","""CARE MGMT SERVICES BEHAVIORAL …","""Evaluation and Management""","""Behavioral health services""","""Other""","""UnitedHealthcare of Georgia In…","""1.0.0""","""unitedhealthcare-of-georgia-in…",882,"""1.0.0""","[""02"", ""05"", … ""61""]","""1295227007""",,"""2018-06-06""","""A""","""133V00000X""",,"""RD""","""2018-06-06""","""AL""","""RAVEN""","""Dietitian, Registered""","""HOLMES""","""2018-06-06""","""NO""","""NPI-1""","""2615""",True,"""ein""","""611665353""","""AL""",33.508422,-86.788636,"""Jefferson County""","""01073""","""Birmingham-Cullman-Talladega, …","""142""","""833 ST VINCENTS DR, BIRMINGHAM…","""2025""","""08""","""Evaluation and Management""","""Behavioral health services"""


In [18]:
# Print the length of the sampled DataFrame
print(len(sample_df))

10


In [19]:
# Print the column names of the sampled DataFrame
print(sample_df.columns)

['fact_uid', 'state', 'year_month', 'payer_slug', 'billing_class', 'code_type', 'code', 'pg_uid', 'pos_set_id', 'negotiated_type', 'negotiation_arrangement', 'negotiated_rate', 'expiration_date', 'provider_group_id_raw', 'reporting_entity_name', 'code_description', 'code_name', 'proc_set', 'proc_class', 'proc_group', 'reporting_entity_name_right', 'version', 'payer_slug_right', 'provider_group_id_raw_right', 'version_right', 'pos_members', 'npi', 'organization_name', 'nppes_fetch_date', 'status', 'primary_taxonomy_code', 'replacement_npi', 'credential', 'enumeration_date', 'primary_taxonomy_state', 'first_name', 'primary_taxonomy_desc', 'last_name', 'last_updated', 'sole_proprietor', 'enumeration_type', 'primary_taxonomy_license', 'nppes_fetched', 'tin_type', 'tin_value', 'state_geo', 'latitude', 'longitude', 'county_name', 'county_fips', 'stat_area_name', 'stat_area_code', 'matched_address', 'year', 'month', 'procedure_set', 'procedure_class']
