<a href="https://colab.research.google.com/github/chepalkalden/Artificial-Intellegence-Projects/blob/main/US%20Health-Care%20Claims%20Calssification/ETL_PIPELINE_CLAIMS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Downloading the data from CMS.gov into Google Drive

In [None]:
import os
import zipfile
import urllib.request
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

# Make sure the project's data folder (including any missing parent folders)
# is present; exist_ok=True keeps a re-run of this cell from raising.
os.makedirs(
    "/content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data",
    exist_ok=True,
)

# Dataset label -> URL of the CMS ZIP archive for that dataset.
# The label is reused for file/folder names and as the claim_type value, so
# the spelling "benificiary" (sic) is kept exactly as-is.
# NOTE(review): the "carrier" and "drug" archives are fetched over plain http;
# consider https if the host supports it — confirm before changing.
urls = dict(
    benificiary="https://www.cms.gov/research-statistics-data-and-systems/downloadable-public-use-files/synpufs/downloads/de1_0_2008_beneficiary_summary_file_sample_1.zip",
    carrier="http://downloads.cms.gov/files/DE1_0_2008_to_2010_Carrier_Claims_Sample_1A.zip",
    inpatient="https://www.cms.gov/research-statistics-data-and-systems/downloadable-public-use-files/synpufs/downloads/de1_0_2008_to_2010_inpatient_claims_sample_1.zip",
    outpatient="https://www.cms.gov/research-statistics-data-and-systems/downloadable-public-use-files/synpufs/downloads/de1_0_2008_to_2010_outpatient_claims_sample_1.zip",
    drug="http://downloads.cms.gov/files/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_1.zip",
)

def download_and_extract(
    name,
    url,
    data_dir="/content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data",
):
    """Download a ZIP archive and unpack it into a per-dataset folder.

    Args:
        name: Dataset label. The archive is stored as ``<name>.zip`` and
            extracted into a folder called ``<name>``, both inside
            ``data_dir``.
        url: Location of the ZIP archive; fetched with
            ``urllib.request.urlretrieve``.
        data_dir: Folder that receives the archive and the extracted files.
            Defaults to the notebook's Drive data folder, so existing
            two-argument calls behave as before.

    Returns:
        Path of the folder the archive was extracted into.

    Raises:
        urllib.error.URLError: If the archive cannot be retrieved.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP archive.
    """
    # Create the target folder here so the function does not depend on an
    # earlier notebook cell having been executed first.
    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(data_dir, f"{name}.zip")
    extract_path = os.path.join(data_dir, name)

    # The archive is always re-downloaded, overwriting any previous copy.
    urllib.request.urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"{name} data extracted to {extract_path}")
    return extract_path

Transform & Feature Engineering

In [None]:
def _parse_claim_dates(values):
    """Convert a claim-date column to datetime values.

    Numeric input is read as ``YYYYMMDD`` (e.g. ``20080904``). Handing numbers
    straight to ``pd.to_datetime`` would interpret them as nanoseconds since
    the Unix epoch, collapsing every date onto 1970-01-01.
    NOTE(review): assumes numeric claim dates are YYYYMMDD, as in the CMS
    DE-SynPUF files — confirm before reusing with other sources.
    Values that cannot be parsed become ``NaT``.
    """
    if pd.api.types.is_numeric_dtype(values):
        # A float column renders as "20080904.0"; strip the trailing ".0" so
        # the text matches the %Y%m%d format.
        as_text = values.astype(str).str.replace(r'\.0$', '', regex=True)
        return pd.to_datetime(as_text, format='%Y%m%d', errors='coerce')
    return pd.to_datetime(values, errors='coerce')


def clean_and_transform(df, claim_type):
    """Deduplicate a claims frame and add engineered features.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. If ``CLM_THRU_DT`` is present: drop rows where it is missing, parse
         it to datetime, and add ``day_of_week`` (Monday=0) and ``month``.
         Otherwise print a notice and skip the date features.
      3. Label-encode ``DGNS_CD_1``, ``PRCDR_CD_1`` and ``PRVDR_NUM`` where
         present. Values are stringified first, so a missing value becomes
         the text ``'nan'`` and receives its own label.
      4. If ``PRCDR_CD_1`` is present, add ``proc_freq``: how many rows share
         that row's (encoded) procedure code.
      5. Add a constant ``claim_type`` column.

    Args:
        df: Input DataFrame; it is not modified.
        claim_type: Label written to the ``claim_type`` column and used in the
            skip notice.

    Returns:
        A new DataFrame with the cleaned rows and added columns.
    """
    df = df.drop_duplicates()

    has_claim_date = 'CLM_THRU_DT' in df.columns
    if has_claim_date:
        df = df.dropna(subset=['CLM_THRU_DT'])

    # Work on an explicit copy: assigning columns onto a row-filtered frame
    # can raise SettingWithCopyWarning in pandas < 3.
    df = df.copy()

    if has_claim_date:
        # Date features
        df['CLM_THRU_DT'] = _parse_claim_dates(df['CLM_THRU_DT'])
        df['day_of_week'] = df['CLM_THRU_DT'].dt.dayofweek
        df['month'] = df['CLM_THRU_DT'].dt.month
    else:
        print(f"'CLM_THRU_DT' column not found in {claim_type} data. Skipping date feature creation.")

    # Normalize categorical features
    for col in ['DGNS_CD_1', 'PRCDR_CD_1', 'PRVDR_NUM']:
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Feature engineering: frequency of procedures
    if 'PRCDR_CD_1' in df.columns:
        proc_freq = df['PRCDR_CD_1'].value_counts().to_dict()
        df['proc_freq'] = df['PRCDR_CD_1'].map(proc_freq)

    df['claim_type'] = claim_type
    return df

In [None]:
Saving the cleaned data to Google Drive

In [None]:
def save_cleaned_data(
    df,
    name,
    data_dir="/content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data",
):
    """Write a cleaned DataFrame to ``cleaned_<name>.csv`` inside ``data_dir``.

    Args:
        df: DataFrame to save; written without its index.
        name: Dataset label used to build the output file name.
        data_dir: Destination folder. Defaults to the notebook's Drive data
            folder, so existing two-argument calls behave as before.

    Returns:
        None. The output path is reported via ``print``.
    """
    # Create the destination here so saving does not depend on an earlier
    # notebook cell having made the folder.
    os.makedirs(data_dir, exist_ok=True)

    output_path = os.path.join(data_dir, f"cleaned_{name}.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved cleaned {name} data to {output_path}")


Main entry point to run the ETL pipeline

In [None]:
# Run the pipeline for every dataset in `urls`: download + extract, load the
# first CSV from the extracted folder, clean it, and save the result.
if __name__ == "__main__":
    for name, url in urls.items():
        path = download_and_extract(name, url)

        # os.listdir returns entries in arbitrary order; sort so that
        # "the first CSV" is the same file on every run. The extension match
        # is case-insensitive so files ending in ".CSV" are found too.
        csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
        if not csv_files:
            # Say so explicitly instead of silently producing no output.
            print(f"No CSV file found in {path}; skipping {name}.")
            continue

        # Only the first CSV (in sorted order) of each archive is processed.
        df = pd.read_csv(os.path.join(path, csv_files[0]), low_memory=False)
        cleaned_df = clean_and_transform(df, claim_type=name)
        save_cleaned_data(cleaned_df, name)

benificiary data extracted to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/benificiary
'CLM_THRU_DT' column not found in benificiary data. Skipping date feature creation.
Saved cleaned benificiary data to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/cleaned_benificiary.csv
carrier data extracted to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/carrier
Saved cleaned carrier data to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/cleaned_carrier.csv
inpatient data extracted to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/inpatient
Saved cleaned inpatient data to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/cleaned_inpatient.csv
outpatient data extracted to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/outpatient
Saved cleaned outpatient data to /content/drive/MyDrive/Colab Notebooks/Claims-Categorization/data/cleaned_outpatient.csv
drug data extrac