In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

#folder paths
project_folder = '/content/drive/MyDrive/Healthcare_Analysis_Project'
data_raw_path = f'{project_folder}/data_raw'
data_processed_path = f'{project_folder}/data_processed'

Mounted at /content/drive


In [2]:
!pip install pandas requests openpyxl -q

import pandas as pd
import requests
import json
from datetime import datetime

Fetch CDC Social Vulnerability Index Data

In [5]:
def fetch_cdc_svi_data():
    """
    Fetch CDC Social Vulnerability Index data
    Source: CDC/ATSDR Social Vulnerability Index
    Updated: Uses ArcGIS REST API as the direct CSV downloads have moved
    """
    print("="*60)
    print("FETCHING CDC SOCIAL VULNERABILITY INDEX DATA")
    print("="*60)

    # Option 1: Try the ArcGIS REST API endpoint for county-level data
    # This is more reliable than the old direct CSV links
    base_url = "https://onemap.cdc.gov/onemapservices/rest/services/SVI/CDC_ATSDR_Social_Vulnerability_Index_2022_USA/FeatureServer/2/query"

    # Parameters to get all county records
    params = {
        'where': '1=1',  # Get all records
        'outFields': '*',  # Get all fields
        'f': 'json',  # Return format
        'resultRecordCount': 5000  # Maximum records per request
    }

    print(f"\n Connecting to CDC ArcGIS API...")
    print(f" URL: {base_url}")

    try:
        response = requests.get(base_url, params=params, timeout=60)

        if response.status_code == 200:
            data = response.json()

            if 'features' in data:
                # Extract features and convert to DataFrame
                records = []
                for feature in data['features']:
                    records.append(feature['attributes'])

                df = pd.DataFrame(records)

                # Save the data
                filepath = f"{data_raw_path}/cdc_svi_raw.csv"
                df.to_csv(filepath, index=False)

                print(f"\n SUCCESS! Data downloaded from ArcGIS API")
                print(f" Saved to: {filepath}")
                print(f"\n Dataset Info:")
                print(f"   ‚Ä¢ Total rows: {len(df):,}")
                print(f"   ‚Ä¢ Total columns: {len(df.columns)}")
                print(f"   ‚Ä¢ File size: {os.path.getsize(filepath) / 1_000_000:.1f} MB")

                return df
            else:
                print(f"\n ERROR: No features found in response")
                return None

        else:
            print(f"\n ERROR: Status code {response.status_code}")
            print("The CDC has moved their data. Please visit:")
            print("https://www.atsdr.cdc.gov/place-health/php/svi/svi-data-documentation-download.html")
            return None

    except Exception as e:
        print(f"\n ERROR: {str(e)}")
        print("\n  NOTE: The CDC changed their data distribution method.")
        print("   Alternative download locations:")
        print("   1. Interactive Map: https://www.atsdr.cdc.gov/place-health/php/svi/svi-interactive-map.html")
        print("   2. Data & Documentation: https://www.atsdr.cdc.gov/place-health/php/svi/svi-data-documentation-download.html")
        print("   3. ArcGIS Online: Search for 'CDC Social Vulnerability Index 2022'")
        return None

# Run the function
cdc_df = fetch_cdc_svi_data()

# Show preview if successful
if cdc_df is not None:
    print("\n" + "="*60)
    print("DATA PREVIEW")
    print("="*60)
    display(cdc_df.head())

    print("\n Available Columns:")
    for i, col in enumerate(cdc_df.columns[:20], 1):  # Show first 20
        print(f"   {i}. {col}")
    if len(cdc_df.columns) > 20:
        print(f"   ... and {len(cdc_df.columns) - 20} more columns")

FETCHING CDC SOCIAL VULNERABILITY INDEX DATA

üì° Connecting to CDC ArcGIS API...
üîó URL: https://onemap.cdc.gov/onemapservices/rest/services/SVI/CDC_ATSDR_Social_Vulnerability_Index_2022_USA/FeatureServer/2/query

‚úÖ SUCCESS! Data downloaded from ArcGIS API
üíæ Saved to: /content/drive/MyDrive/Healthcare_Analysis_Project/data_raw/cdc_svi_raw.csv

üìä Dataset Info:
   ‚Ä¢ Total rows: 2,000
   ‚Ä¢ Total columns: 161
   ‚Ä¢ File size: 1.5 MB

DATA PREVIEW


Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE,GRASP_ID,Shape__Area,Shape__Length
0,1,Alabama,AL,1001,Autauga County,1001020100,Census Tract 201; Autauga County; Alabama,3.793569,1865,368,...,2.0,0.0,2.0,5.5,4.2,0.0,2.0,1,0.000948,0.150433
1,1,Alabama,AL,1001,Autauga County,1001020200,Census Tract 202; Autauga County; Alabama,1.282174,1861,396,...,2.0,0.0,2.0,7.3,6.3,0.4,0.9,2,0.000319,0.092349
2,1,Alabama,AL,1001,Autauga County,1001020300,Census Tract 203; Autauga County; Alabama,2.065364,3492,593,...,1.1,0.0,1.1,4.2,5.2,0.0,1.1,3,0.000516,0.100281
3,1,Alabama,AL,1001,Autauga County,1001020400,Census Tract 204; Autauga County; Alabama,2.464984,3987,411,...,0.3,0.0,0.9,2.9,2.6,0.0,0.9,4,0.000607,0.113935
4,1,Alabama,AL,1001,Autauga County,1001020501,Census Tract 205.01; Autauga County; Alabama,2.395243,4121,709,...,0.9,0.0,0.9,1.0,1.3,0.0,0.9,5,0.000597,0.108863



üìã Available Columns:
   1. ST
   2. STATE
   3. ST_ABBR
   4. STCNTY
   5. COUNTY
   6. FIPS
   7. LOCATION
   8. AREA_SQMI
   9. E_TOTPOP
   10. M_TOTPOP
   11. E_HU
   12. M_HU
   13. E_HH
   14. M_HH
   15. E_POV150
   16. M_POV150
   17. E_UNEMP
   18. M_UNEMP
   19. E_HBURD
   20. M_HBURD
   ... and 141 more columns


 Fetch CMS Hospital Data via API

In [9]:
import json

def fetch_cms_hospital_data():
    """
    Fetch CMS Hospital General Information data
    Uses the data.json catalog to find the correct dataset
    """
    print("="*60)
    print("FETCHING CMS HOSPITAL DATA")
    print("="*60)

    print(f"\n Step 1: Fetching CMS data catalog...")

    try:
        # Get the catalog
        catalog_url = "https://data.cms.gov/data.json"
        catalog_response = requests.get(catalog_url, timeout=60)

        if catalog_response.status_code != 200:
            print(f" ERROR: Could not fetch catalog")
            return None

        catalog = catalog_response.json()

        # Search for hospital datasets
        print(f"\n Step 2: Searching for Hospital General Information...")

        hospital_keywords = ['hospital general', 'hospital compare', 'provider general']
        found_dataset = None

        for dataset in catalog['dataset']:
            title = dataset.get('title', '').lower()
            description = dataset.get('description', '').lower()

            # Look for hospital general information
            if any(keyword in title or keyword in description for keyword in hospital_keywords):
                if 'general information' in title or 'general information' in description:
                    found_dataset = dataset
                    print(f"‚úì Found: {dataset['title']}")
                    break

        if not found_dataset:
            print(" Searching for any hospital provider data...")
            # Fallback: look for any hospital provider dataset
            for dataset in catalog['dataset']:
                title = dataset.get('title', '').lower()
                if 'hospital' in title and 'provider' in title:
                    found_dataset = dataset
                    print(f" Found alternative: {dataset['title']}")
                    break

        if not found_dataset:
            print(" Could not find suitable hospital dataset")
            print("\n Try downloading manually from:")
            print("   https://data.cms.gov/provider-data/topics/hospitals")
            return None

        # Get the API endpoint
        api_endpoint = None
        download_url = None

        for distro in found_dataset['distribution']:
            if distro.get('format') == 'API' and distro.get('description') == 'latest':
                api_endpoint = distro.get('accessURL')
            if distro.get('mediaType') == 'text/csv' and distro.get('description') == 'latest':
                download_url = distro.get('downloadURL')

        # Try CSV download first (more reliable)
        if download_url:
            print(f"\n Step 3: Downloading CSV file...")
            print(f" URL: {download_url}")

            csv_response = requests.get(download_url, timeout=120)

            if csv_response.status_code == 200:
                filepath = f"{data_raw_path}/cms_hospitals_raw.csv"

                with open(filepath, 'wb') as f:
                    f.write(csv_response.content)

                df = pd.read_csv(filepath)

                print(f"\n SUCCESS! Data downloaded")
                print(f" Saved to: {filepath}")
                print(f"\n Dataset Info:")
                print(f"   ‚Ä¢ Total hospitals: {len(df):,}")
                print(f"   ‚Ä¢ Total columns: {len(df.columns)}")
                print(f"   ‚Ä¢ File size: {len(csv_response.content) / 1_000_000:.1f} MB")

                return df

        # Fallback to API
        if api_endpoint:
            print(f"\n Step 3: Using API endpoint...")
            print(f" URL: {api_endpoint}")

            # Get total count
            stats_response = requests.get(api_endpoint + "/stats", timeout=30)
            total_rows = 10000
            if stats_response.status_code == 200:
                total_rows = stats_response.json().get('total_rows', 10000)
                print(f"‚úì Total records: {total_rows:,}")

            # Fetch data with pagination
            all_data = []
            offset = 0
            size = 5000
            max_records = min(total_rows, 10000)

            while offset < max_records:
                params = {'size': size, 'offset': offset}
                print(f"   Fetching records {offset+1} to {min(offset+size, max_records)}...", end=" ")

                response = requests.get(api_endpoint, params=params, timeout=60)

                if response.status_code == 200:
                    data = response.json()
                    if not data:
                        break
                    all_data.extend(data)
                    print(f"‚úì")
                    if len(data) < size:
                        break
                    offset += size
                else:
                    print(f" Error {response.status_code}")
                    break

            if all_data:
                df = pd.DataFrame(all_data)
                filepath = f"{data_raw_path}/cms_hospitals_raw.csv"
                df.to_csv(filepath, index=False)

                print(f"\n SUCCESS! Data downloaded")
                print(f" Saved to: {filepath}")
                print(f"\n Dataset Info:")
                print(f"   ‚Ä¢ Total hospitals: {len(all_data):,}")
                print(f"   ‚Ä¢ Total columns: {len(df.columns)}")

                return df

        print(" No download method available")
        return None

    except Exception as e:
        print(f"\n ERROR: {str(e)}")
        print("\n Manual download:")
        print("   1. Visit: https://data.cms.gov/provider-data/topics/hospitals")
        print("   2. Click 'Export' and download CSV")
        return None

# Run the function
hospitals_df = fetch_cms_hospital_data()

# Show preview if successful
if hospitals_df is not None:
    print("\n" + "="*60)
    print("DATA PREVIEW")
    print("="*60)
    display(hospitals_df.head())

    print("\nüìã Available Columns:")
    for i, col in enumerate(hospitals_df.columns[:15], 1):
        print(f"   {i}. {col}")

    if len(hospitals_df.columns) > 15:
        print(f"   ... and {len(hospitals_df.columns) - 15} more columns")

FETCHING CMS HOSPITAL DATA

üì° Step 1: Fetching CMS data catalog...

üîç Step 2: Searching for Hospital General Information...
‚ö†Ô∏è  Searching for any hospital provider data...
‚úì Found alternative: Medicare Inpatient Hospitals - by Provider

üì• Step 3: Using API endpoint...
üîó URL: https://data.cms.gov/data-api/v1/dataset/ee6fb1a5-39b9-46b3-a980-a7284551a732/data
‚úì Total records: 3,093
   Fetching records 1 to 3093... ‚úì

‚úÖ SUCCESS! Data downloaded
üíæ Saved to: /content/drive/MyDrive/Healthcare_Analysis_Project/data_raw/cms_hospitals_raw.csv

üìä Dataset Info:
   ‚Ä¢ Total hospitals: 3,093
   ‚Ä¢ Total columns: 57

DATA PREVIEW


Unnamed: 0,Rndrng_Prvdr_CCN,Rndrng_Prvdr_Org_Name,Rndrng_Prvdr_St,Rndrng_Prvdr_City,Rndrng_Prvdr_Zip5,Rndrng_Prvdr_State_Abrvtn,Rndrng_Prvdr_State_FIPS,Rndrng_Prvdr_RUCA,Rndrng_Prvdr_RUCA_Desc,Tot_Benes,...,Bene_CC_PH_Diabetes_V2_Pct,Bene_CC_PH_HF_NonIHD_V2_Pct,Bene_CC_PH_Hyperlipidemia_V2_Pct,Bene_CC_PH_Hypertension_V2_Pct,Bene_CC_PH_IschemicHeart_V2_Pct,Bene_CC_PH_Osteoporosis_V2_Pct,Bene_CC_PH_Parkinson_V2_Pct,Bene_CC_PH_Arthritis_V2_Pct,Bene_CC_PH_Stroke_TIA_V2_Pct,Bene_Avg_Risk_Scre
0,10001,Southeast Health Medical Center,1108 Ross Clark Circle,Dothan,36301,AL,1,2,Metropolitan area high commuting: primary flow...,3088,...,0.4957901554,0.4381476684,0.75,0.75,0.5,0.1444300518,0.0272020725,0.6091321244,0.2678108808,2.0048648
1,10005,Marshall Medical Centers South Campus,2505 U S Highway 431 North,Boaz,35957,AL,1,4,Micropolitan area core: primary flow within an...,1123,...,0.4336598397,0.482635797,0.75,0.75,0.4737310775,0.1665182547,0.0471950134,0.606411398,0.2644701692,1.723296633
2,10006,North Alabama Medical Center,1701 Veterans Drive,Florence,35630,AL,1,1,Metropolitan area core: primary flow within an...,2634,...,0.4753227031,0.4202733485,0.75,0.75,0.5170842825,0.1924829157,0.0584662111,0.6511009871,0.2141230068,1.910259056
3,10007,Mizell Memorial Hospital,702 N Main St,Opp,36467,AL,1,7,Small town core: primary flow within an urban ...,252,...,0.5079365079,0.5238095238,0.75,0.75,0.4404761905,0.1111111111,0.0753968254,0.6031746032,0.1944444444,1.9350848218
4,10008,Crenshaw Community Hospital,101 Hospital Circle,Luverne,36049,AL,1,3,Metropolitan area low commuting: primary flow ...,89,...,0.4494382022,0.3033707865,0.75,0.75,0.4269662921,0.202247191,0.0337078652,0.5505617978,0.1797752809,1.6179516343



üìã Available Columns:
   1. Rndrng_Prvdr_CCN
   2. Rndrng_Prvdr_Org_Name
   3. Rndrng_Prvdr_St
   4. Rndrng_Prvdr_City
   5. Rndrng_Prvdr_Zip5
   6. Rndrng_Prvdr_State_Abrvtn
   7. Rndrng_Prvdr_State_FIPS
   8. Rndrng_Prvdr_RUCA
   9. Rndrng_Prvdr_RUCA_Desc
   10. Tot_Benes
   11. Tot_Submtd_Cvrd_Chrg
   12. Tot_Pymt_Amt
   13. Tot_Mdcr_Pymt_Amt
   14. Tot_Dschrgs
   15. Tot_Cvrd_Days
   ... and 42 more columns
