# Load Data

## Import Libraries

In [118]:
# Import pandas, numpy and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# seaborn is a data visualization library built on matplotlib
import seaborn as sns

# Use seaborn's "whitegrid" style for plots created after this call
sns.set_style("whitegrid")

## Load Diabetes Datasets

### Diabetes 130-US Hospitals for Years 1999-2008

In [119]:
# Both UCI files are read from the same folder of the project repository
uci_data_root = "https://raw.githubusercontent.com/data5100-group/diabetes-risk-analysis/main/data"

# Main UCI table
diabetic_UCI = pd.read_csv(f"{uci_data_root}/diabetic_data.csv")

# Companion lookup table used later to translate the *_id codes
ids_mapping_UCI = pd.read_csv(f"{uci_data_root}/IDS_mapping.csv")

### Comprehensive Diabetes Clinical Dataset

In [120]:
# Read the Kaggle clinical dataset straight from the project repository
kaggle_data_url = (
    "https://raw.githubusercontent.com/data5100-group/diabetes-risk-analysis"
    "/main/data/diabetes_dataset.csv"
)
diabetes_kaggle = pd.read_csv(kaggle_data_url)

## Understanding the Data

In [121]:
# Print a structural summary of the Kaggle frame: row count, column names,
# non-null counts and dtypes
diabetes_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  100000 non-null  int64  
 1   gender                100000 non-null  object 
 2   age                   100000 non-null  float64
 3   location              100000 non-null  object 
 4   race:AfricanAmerican  100000 non-null  int64  
 5   race:Asian            100000 non-null  int64  
 6   race:Caucasian        100000 non-null  int64  
 7   race:Hispanic         100000 non-null  int64  
 8   race:Other            100000 non-null  int64  
 9   hypertension          100000 non-null  int64  
 10  heart_disease         100000 non-null  int64  
 11  smoking_history       100000 non-null  object 
 12  bmi                   100000 non-null  float64
 13  hbA1c_level           100000 non-null  float64
 14  blood_glucose_level   100000 non-null  int64  
 15  d

In [122]:
# Print a structural summary of the UCI frame: row count, column names,
# non-null counts and dtypes.
# NOTE(review): the listing reports object columns such as `weight` as fully
# non-null — presumably missing values are stored as a placeholder string
# rather than NaN; confirm before trusting these counts.
diabetic_UCI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

# Clean the Datasets

## Columns to Keep

In [123]:
# Keep only the UCI columns used in this notebook.
# The trailing .copy() turns the selection into an independent frame, so the
# column assignments made further down do not trigger SettingWithCopyWarning.
diabetic_UCI = diabetic_UCI[[
    # Identifiers: One patient number can have multiple encounter ids
    "encounter_id", "patient_nbr",

    # Demographics
    "race", "gender", "age",

    # Admission/Discharge info
    "admission_type_id", "discharge_disposition_id", "admission_source_id",

    # Hospital Stay/ Utilization
    "time_in_hospital", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient",

    # Diagnosis Codes
    "diag_1", "diag_2", "diag_3", "number_diagnoses",

    # Medication Information
    "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride",
    "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide",
    "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone", "change", "diabetesMed",

    # Outcome
    "readmitted"
]].copy()

In [124]:
# Columns retained from the Kaggle dataset, grouped by role.
# NOTE(review): "race:Hispanic" is listed by .info() but is the only race
# indicator not kept here — confirm the omission is intentional.
kaggle_demographics = [
    "age", "gender",
    "race:AfricanAmerican", "race:Asian", "race:Caucasian", "race:Other",
]
kaggle_factors = [
    "hypertension", "heart_disease", "smoking_history",
    "bmi", "hbA1c_level", "blood_glucose_level",
]
kaggle_outcome = ["diabetes"]

# Select in the order demographics -> factors -> outcome
diabetes_kaggle = diabetes_kaggle[kaggle_demographics + kaggle_factors + kaggle_outcome]

## Recombine diabetic_UCI and ids_mapping_UCI

In [125]:
# Translate the three *_id code columns of diabetic_UCI into text descriptions
# using ids_mapping_UCI, whose first two columns hold (id, description) rows
# for several lookup tables stacked on top of each other.
first, second = ids_mapping_UCI.columns[:2]

m = ids_mapping_UCI.copy()
m[first] = m[first].astype(str).str.strip()
# Give descriptions the same whitespace cleanup as the ids so that labels
# differing only by padding do not become separate categories. Non-string
# cells (missing descriptions) are passed through unchanged.
m[second] = m[second].map(lambda v: v.strip() if isinstance(v, str) else v)

SECS = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]

# A row whose first cell equals a section name starts that section; carry the
# name forward onto the rows beneath it.
m["section"] = m[first].where(m[first].isin(SECS)).ffill()

# The top block of the CSV (admission_type_id) has no explicit marker row,
# so any rows still unlabeled belong to the first section.
m["section"] = m["section"].fillna(SECS[0])

# Keep only the three sections we care about
m = m[m["section"].isin(SECS)].copy()

# Keep only numeric ID rows within each section (drops marker and blank rows)
m["id_num"] = pd.to_numeric(m[first], errors="coerce")
m = m[m["id_num"].notna()].copy()
m["id_num"] = m["id_num"].astype(int)

# Build one {id: description} dict per section
maps = {sec: dict(zip(g["id_num"], g[second])) for sec, g in m.groupby("section")}
print("Mapping sizes:", {k: len(v) for k, v in maps.items()})

# Replace codes with descriptions in the main dataframe
for col in SECS:
    if col not in diabetic_UCI.columns:
        # The column is gone (e.g. the rename cell already ran); re-running
        # this cell should be a no-op rather than a KeyError.
        continue

    codes = pd.to_numeric(diabetic_UCI[col], errors="coerce")
    if len(codes) > 0 and codes.isna().all():
        # Nothing numeric left: the column already holds descriptions from an
        # earlier run. Mapping again would turn every value into <NA>.
        continue

    mp = maps.get(col, {})
    codes = codes.astype("Int64")

    # Surface codes that have no entry in the lookup instead of silently
    # turning them into missing values.
    unmapped = codes.notna() & ~codes.isin(list(mp))
    if unmapped.any():
        print(f"Warning: {int(unmapped.sum())} rows in {col} have codes with no mapping entry")

    # assign() returns a new frame, so this never writes into a slice of the
    # originally loaded data.
    diabetic_UCI = diabetic_UCI.assign(**{col: codes.map(mp).astype("string")})

Mapping sizes: {'admission_source_id': 25, 'admission_type_id': 8, 'discharge_disposition_id': 30}


In [126]:
# Drop the "_id" suffix from the three admission/discharge columns
id_suffixed = ("admission_type_id", "discharge_disposition_id", "admission_source_id")
diabetic_UCI = diabetic_UCI.rename(
    columns={old: old[: -len("_id")] for old in id_suffixed}
)

# Exporting CSVs

In [129]:
# Write each cleaned frame to disk as CSV (UTF-8 with BOM, no index column).
# NOTE(review): the output names carry no ".csv" extension; the download cell
# refers to these exact names, so change both places together if renaming.
for out_name, frame in (
    ("diabetic_UCI", diabetic_UCI),
    ("diabetes_kaggle", diabetes_kaggle),
):
    frame.to_csv(out_name, encoding="utf-8-sig", index=False)

In [130]:
# Download datasets
# google.colab is only importable inside Colab, so the import is guarded here:
# everywhere else the notebook still runs to the end and simply skips the
# browser download.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("Not running in Google Colab; skipping browser download.")
else:
    for export_name in ('diabetic_UCI', 'diabetes_kaggle'):
        files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##