In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
import plotly.express as px

sys.path.insert(0, str(Path("").resolve().parents[1]))
from utils.db_manager import MimicDBManager

# Configuration

In [None]:
# Folder that receives every exported figure; created up front so the exports never hit a missing directory.
img_path = "/app/ml_model/EDA/images"
Path(img_path).mkdir(exist_ok=True, parents=True)

# Pixel dimensions shared by all plotly figures in this notebook.
width, height = 1200, 600

# Set to True to also render figures inline; the PNG export happens either way.
plot_fig = False

# Display up to 100 rows when a DataFrame is shown as cell output.
pd.set_option("display.max_rows", 100)

In [None]:
# Shared database accessor used by every query in this notebook, constructed with port=5432.
db_manager = MimicDBManager(port=5432)

# Quick Sanity check
Let's do a quick sanity check to make sure that the DB is being correctly accessed and works as expected

In [None]:
# Sanity check 1: show the table names the DB manager can see.
db_manager.retrieve_table_names()

In [None]:
# Sanity check 2: row counts for the first ten tables only, to keep the check quick.
first_tables = db_manager.retrieve_table_names()[:10]
[db_manager.count_rows(table_name=name) for name in first_tables]

In [None]:
# Sanity check 3: use the first table returned by the manager as a probe; `table_name` is reused by the next cell.
table_name = db_manager.retrieve_table_names()[0]
# Fetch the rows for one fixed subject_id to confirm keyed lookups work.
# NOTE(review): assumes the first table has a `subject_id` column — confirm.
db_manager.retrieve_id(table_name, id=18207287, id_column="subject_id")

In [None]:
# Sanity check 4: column -> data type listing for the probe table, shown as a two-column frame.
column_types = db_manager.retrieve_column_types(table_name)
column_types_df = pd.DataFrame.from_dict(column_types, orient="index", columns=["data_type"])
column_types_df.reset_index(names="column")

# EDA

In [None]:
# Collect the column types of every table, keyed by the fully qualified table name.
column_types_by_table = {}
for full_name in db_manager.retrieve_table_names():
    column_types_by_table[full_name] = db_manager.retrieve_column_types(full_name)

# One row per table, one column per column name; "-" marks columns a table does not have.
tables = pd.DataFrame.from_dict(column_types_by_table, orient="index")
tables = tables.fillna("-").sort_index().reset_index(names="full_table_name")

# Split the qualified name on "." into its two parts (stored as parent_table and table_name).
name_parts = tables["full_table_name"].str.split(".", expand=True)
tables[["parent_table", "table_name"]] = name_parts

## Table Descriptions

In [None]:
# Tables whose parent is "mimiciv_hosp".
tables.loc[tables["parent_table"] == "mimiciv_hosp"]

In [None]:
# Tables whose parent is "mimiciv_icu".
tables.loc[tables["parent_table"] == "mimiciv_icu"]

In [None]:
# Tables whose parent is "mimiciv_derived".
tables.loc[tables["parent_table"] == "mimiciv_derived"]

There are 63 tables created through the "concepts" database building process.

## Acute Renal Failure (ARF) / Acute Kidney Injury (AKI) EDA

Let's look now for patients with Acute Renal Failure, which should have an ICD-9 diagnostic code of 5848.

Note: ICD-9 Diagnostic codes can be found in: https://www2.gov.bc.ca/assets/gov/health/practitioner-pro/medical-services-plan/diag-codes_genitourinary.pdf

In [None]:
# Load all coded diagnoses and strip surrounding whitespace from the codes before comparing them.
data = db_manager.retrieve_all("mimiciv_hosp.diagnoses_icd")
data["icd_code"] = data.icd_code.str.strip()

# Keep only the diagnosis rows carrying exactly code 5848.
is_code_5848 = data["icd_code"].eq("5848")
arf = data[is_code_5848]
arf

In [None]:
# Attach the patient-level record to each code-5848 diagnosis row (inner join on subject_id).
patients = db_manager.retrieve_all("mimiciv_hosp.patients")
aki_patients = pd.merge(arf, patients, on="subject_id")
aki_patients

### Demographics

In [None]:
# Age distribution of the code-5848 cohort, overlaid by gender, with a box-plot marginal.
histogram_options = dict(
    x="anchor_age",
    color="gender",
    barmode="overlay",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title="Distribution of Patients with ICD-9 code 5848 in MIMIC-IV",
)
fig = px.histogram(aki_patients, **histogram_options)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
output_file = f"{img_path}/1.ICD-9_code-5848_distribution.png"
fig.write_image(output_file)

Not too many patients to work with. How about patients with *any* type of acute renal failure?

In [None]:
# Reload the diagnoses and normalise the codes by stripping surrounding whitespace.
data = db_manager.retrieve_all("mimiciv_hosp.diagnoses_icd")
data["icd_code"] = data.icd_code.str.strip()

# Every code beginning with "584", reduced to the first matching diagnosis row per patient.
has_584_prefix = data["icd_code"].str.startswith("584")
arf = data[has_584_prefix].drop_duplicates(subset="subject_id")
arf

In [None]:
# Attach demographics, the admission record and (when present) one ICU stay to each ARF patient.
patients = db_manager.retrieve_all("mimiciv_hosp.patients")
admissions = db_manager.retrieve_all("mimiciv_hosp.admissions")
# Keep a single ICU stay per admission: after sorting by outtime, the stay that ended first survives.
icu_stays = (
    db_manager.retrieve_all("mimiciv_icu.icustays")
    .sort_values("outtime")
    .drop_duplicates(subset=["subject_id", "hadm_id"])
)
aki_patients = (
    arf.merge(patients, on="subject_id")
    .merge(admissions, on=["subject_id", "hadm_id"])
    # Left join: admissions without an ICU stay are kept, with missing ICU columns.
    .merge(icu_stays, on=["subject_id", "hadm_id"], how="left")
)

# A patient is deceased only when a date of death is actually present. A missing date can surface
# as None, NaN or NaT depending on the column dtype; comparing the text against "None" alone would
# flag NaN/NaT (whose text is "nan"/"NaT") as deceased, so missing values are excluded explicitly.
# The literal text "None" is still treated as missing.
dod = aki_patients["dod"]
aki_patients["deceased"] = dod.notna() & (dod.astype(str).str.strip() != "None")

# ICU length of stay in days; patients without an ICU stay get 0.
icu_duration = aki_patients["outtime"] - aki_patients["intime"]
aki_patients["icu_stay_days"] = (icu_duration.dt.total_seconds() / 3600 / 24).fillna(0)
aki_patients

In [None]:
# List the columns available after the merges, as a reference for the plots below.
aki_patients.columns

In [None]:
# Count the patients flagged as deceased.
# NOTE(review): the message below contains the typo "druing", and `deceased` is derived from `dod`,
# which may not imply an in-hospital death — confirm the wording.
deceased_count = int(aki_patients["deceased"].eq(True).sum())
print(f"A total of {deceased_count} patients suffering ARF/AKI died druing their stay in the hospital")

# Age histogram split into one facet row per gender and coloured by deceased status.
histogram_options = dict(
    x="anchor_age",
    color="deceased",
    facet_row="gender",
    barmode="group",
    marginal="box",
    nbins=30,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age, deceased status, and gender",
)
fig = px.histogram(aki_patients, **histogram_options)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/2.1.anchor_age-deceased-gender.png")

In [None]:
# Age histogram with one colour per race, bars grouped side by side.
histogram_options = dict(
    x="anchor_age",
    color="race",
    barmode="group",
    nbins=20,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age and race",
)
fig = px.histogram(aki_patients, **histogram_options)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/2.2.anchor_age-race.png")

The top groups overshadow the rest, so let's remove the 3 largest to take a better look at the remaining groups

In [None]:
# The three most frequent race values (value_counts sorts by descending frequency).
race_counts = aki_patients.value_counts(subset="race").reset_index()
top_races = race_counts["race"].head(3).to_list()

# Same age/race histogram as before, restricted to patients outside those three groups.
remaining_patients = aki_patients[~aki_patients.race.isin(top_races)]
fig = px.histogram(
    remaining_patients,
    x="anchor_age",
    color="race",
    barmode="group",
    nbins=20,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age and race excluding top 3",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/2.3.anchor_age-race_no_top_3.png")

Now specifically let's look at hispanics

In [None]:
# Case-insensitive match on "hispanic". `na=False` makes a missing race count as "no match";
# without it the mask would contain NaN and boolean indexing would fail.
hispanic_mask = aki_patients.race.str.lower().str.contains("hispanic", na=False)

# Age histogram for the matching patients, one colour per race label.
fig = px.histogram(
    aki_patients[hispanic_mask],
    x="anchor_age",
    color="race",
    barmode="group",
    nbins=20,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age and race for hispanics",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/2.4.anchor_age-race_hispanic.png")

### By ICD codes

91 is the hard cap for age in the DB (to keep patients non-identifiable), which would explain the higher number of patients (both alive and deceased) in the last age group.

Let's take a look at the ICD codes as well

In [None]:
# Age histogram coloured by the specific ICD code, with a box-plot marginal.
histogram_options = dict(
    x="anchor_age",
    color="icd_code",
    barmode="group",
    marginal="box",
    nbins=10,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age and specific ICD-9 code",
)
fig = px.histogram(aki_patients, **histogram_options)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/3.1.anchor_age-icd_code.png")

In [None]:
# Patients with a positive ICU length of stay (0 is the filler for "no ICU stay").
icu_patients = aki_patients[aki_patients.icu_stay_days > 0]
print(f"A total of {len(icu_patients)} patients entered the ICU")

# ICU length-of-stay histogram coloured by ICD code.
fig = px.histogram(
    icu_patients,
    x="icu_stay_days",
    color="icd_code",
    barmode="group",
    marginal="box",
    nbins=50,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by ICU stay (in days) and specific ICD-9 code",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/3.2.icu_stay_days-icd_code.png")

## Creatinine measurements
Now let's take a look at some visualizations using the creatinine information in the derived tables

In [None]:
# Load the derived creatinine table and replace every missing value with 0.
# NOTE(review): 0 then shows up as a real value in the histograms below — confirm this is intended.
creatinine = db_manager.retrieve_all("mimiciv_derived.kdigo_creatinine")
creatinine = creatinine.fillna(0)
creatinine

### Creatinine low last 48 hours

In [None]:
filtering = "creat_low_past_48hr"

# Highest value of the chosen column per admission: sort descending, keep the first row per hadm_id.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
patients_with_peak = aki_patients.merge(peak_per_admission, on="hadm_id")

fig = px.histogram(
    patients_with_peak,
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.1.max_{filtering}.png")

In [None]:
filtering = "creat_low_past_48hr"

# Highest value of the chosen column per admission, joined onto the ARF cohort.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
data = aki_patients.merge(peak_per_admission, on="hadm_id")

# ICD codes ordered from most to least frequent; everything after the two most frequent is kept.
# NOTE(review): the title says "bottom 2", which only matches when exactly four codes are present — confirm.
codes_by_frequency = data.value_counts(subset="icd_code").reset_index().icd_code.to_list()
less_common_codes = codes_by_frequency[2:]

fig = px.histogram(
    data[data.icd_code.isin(less_common_codes)],
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}, bottom 2 ICD-9 codes",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.2.max_{filtering}_bottom_2.png")

### Creatinine low last 7 days

In [None]:
filtering = "creat_low_past_7day"

# Highest value of the chosen column per admission: sort descending, keep the first row per hadm_id.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
patients_with_peak = aki_patients.merge(peak_per_admission, on="hadm_id")

fig = px.histogram(
    patients_with_peak,
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.3.max_{filtering}.png")

In [None]:
filtering = "creat_low_past_7day"

# Highest value of the chosen column per admission, joined onto the ARF cohort.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
data = aki_patients.merge(peak_per_admission, on="hadm_id")

# ICD codes ordered from most to least frequent; everything after the two most frequent is kept.
# NOTE(review): the title says "bottom 2", which only matches when exactly four codes are present — confirm.
codes_by_frequency = data.value_counts(subset="icd_code").reset_index().icd_code.to_list()
less_common_codes = codes_by_frequency[2:]

fig = px.histogram(
    data[data.icd_code.isin(less_common_codes)],
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}, bottom 2 ICD-9 codes",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.4.max_{filtering}_bottom_2.png")

### Raw creatinine value

In [None]:
filtering = "creat"

# Highest value of the chosen column per admission: sort descending, keep the first row per hadm_id.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
patients_with_peak = aki_patients.merge(peak_per_admission, on="hadm_id")

fig = px.histogram(
    patients_with_peak,
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.5.max_{filtering}.png")

In [None]:
filtering = "creat"

# Highest value of the chosen column per admission, joined onto the ARF cohort.
peak_per_admission = creatinine.sort_values(filtering, ascending=False).drop_duplicates(subset=["hadm_id"])
data = aki_patients.merge(peak_per_admission, on="hadm_id")

# ICD codes ordered from most to least frequent; everything after the two most frequent is kept.
# NOTE(review): the title says "bottom 2", which only matches when exactly four codes are present — confirm.
codes_by_frequency = data.value_counts(subset="icd_code").reset_index().icd_code.to_list()
less_common_codes = codes_by_frequency[2:]

fig = px.histogram(
    data[data.icd_code.isin(less_common_codes)],
    x=filtering,
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of creatinine values based on {filtering}, bottom 2 ICD-9 codes",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/4.6.max_{filtering}_bottom_2.png")

## KDIGO stages

In [None]:
# Load the derived KDIGO staging table; reused by the AKI-stage plots below.
kdigo_data = db_manager.retrieve_all("mimiciv_derived.kdigo_stages")
kdigo_data

In [None]:
# Highest smoothed AKI stage per admission: sort descending and keep the first row for each
# (subject_id, hadm_id). The de-duplication key has to match the merge key below; keeping one row
# per subject only would drop the stage whenever the patient's peak fell in a different admission,
# and that admission would then be reported as stage 0.
max_stage_per_admission = kdigo_data.sort_values("aki_stage_smoothed", ascending=False).drop_duplicates(
    subset=["subject_id", "hadm_id"]
)
data = aki_patients.merge(max_stage_per_admission, on=["subject_id", "hadm_id"], how="left")

# Admissions without any KDIGO record count as stage 0. Only the stage column is filled, so
# unrelated columns (dates, ICU fields) keep their missing values instead of becoming 0.
data["aki_stage_smoothed"] = data["aki_stage_smoothed"].fillna(0).astype(int)

fig = px.histogram(
    data.sort_values("aki_stage_smoothed"),
    x="aki_stage_smoothed",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of Maximum AKI Stage",
)
# Treat the stage as a discrete category so each stage gets its own tick.
fig.update_xaxes(type="category")

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/5.1.max_aki_stage.png")

In [None]:
# ICD codes ordered from most to least frequent; everything after the two most frequent is kept.
# NOTE(review): the title says "bottom 2", which only matches when exactly four codes are present — confirm.
codes_by_frequency = data.value_counts(subset="icd_code").reset_index().icd_code.to_list()
less_common_codes = codes_by_frequency[2:]
less_common_rows = data[data.icd_code.isin(less_common_codes)]

fig = px.histogram(
    less_common_rows.sort_values("aki_stage_smoothed"),
    x="aki_stage_smoothed",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of Maximum AKI Stage, bottom 2 ICD-9 codes",
)
# Treat the stage as a discrete category so each stage gets its own tick.
fig.update_xaxes(type="category")

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/5.2.max_aki_stage_bottom_2.png")

## Medication Information

The main helpers to reduce AKI development are epinephrine, norepinephrine, dopamine and vasopressin. Let's take a look at their values for our populations to see if they were used during the patient's stay.

In [None]:
# Per-medication aggregates keyed by medication name; filled by the cells below.
meds = {}

### Norepinephrine

In [None]:
med = "norepinephrine"

# Total rate and amount per ICU stay. The aggregation is named by string ("sum") because passing
# the builtin `sum` to groupby aggregation is deprecated in pandas (since 2.1).
meds[med] = (
    db_manager.retrieve_all(f"mimiciv_derived.{med}")
    .groupby("stay_id")
    .aggregate({"vaso_rate": "sum", "vaso_amount": "sum"})
    .reset_index()
)
# Tag every row with the medication name so the frames can be combined later.
meds[med]["medication"] = med
meds[med]

In [None]:
# `med` and `meds[med]` come from the preceding aggregation cell.
med_totals = meds[med].sort_values("vaso_amount", ascending=False).drop_duplicates("stay_id")

# Left join keeps patients who never received the medication; they get amount 0 and the label "None".
data = aki_patients.merge(med_totals, on="stay_id", how="left")
data = data.fillna({"vaso_amount": 0, "medication": "None"})

# Only patients who actually received the medication are plotted.
treated = data[data.vaso_amount > 0]
fig = px.histogram(
    treated,
    x="vaso_amount",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of Max {med} Amount per icd_code",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/6.1.max_{med}_per_icd_code.png")

### Epinephrine

In [None]:
med = "epinephrine"

# Total rate and amount per ICU stay. The aggregation is named by string ("sum") because passing
# the builtin `sum` to groupby aggregation is deprecated in pandas (since 2.1).
meds[med] = (
    db_manager.retrieve_all(f"mimiciv_derived.{med}")
    .groupby("stay_id")
    .aggregate({"vaso_rate": "sum", "vaso_amount": "sum"})
    .reset_index()
)
# Tag every row with the medication name so the frames can be combined later.
meds[med]["medication"] = med
meds[med]

In [None]:
# `med` and `meds[med]` come from the preceding aggregation cell.
med_totals = meds[med].sort_values("vaso_amount", ascending=False).drop_duplicates("stay_id")

# Left join keeps patients who never received the medication; they get amount 0 and the label "None".
data = aki_patients.merge(med_totals, on="stay_id", how="left")
data = data.fillna({"vaso_amount": 0, "medication": "None"})

# Only patients who actually received the medication are plotted.
treated = data[data.vaso_amount > 0]
fig = px.histogram(
    treated,
    x="vaso_amount",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of Max {med} Amount per icd_code",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/6.2.max_{med}_per_icd_code.png")

### Dopamine

In [None]:
med = "dopamine"

# Total rate and amount per ICU stay. The aggregation is named by string ("sum") because passing
# the builtin `sum` to groupby aggregation is deprecated in pandas (since 2.1).
meds[med] = (
    db_manager.retrieve_all(f"mimiciv_derived.{med}")
    .groupby("stay_id")
    .aggregate({"vaso_rate": "sum", "vaso_amount": "sum"})
    .reset_index()
)
# Tag every row with the medication name so the frames can be combined later.
meds[med]["medication"] = med
meds[med]

In [None]:
# `med` and `meds[med]` come from the preceding aggregation cell.
med_totals = meds[med].sort_values("vaso_amount", ascending=False).drop_duplicates("stay_id")

# Left join keeps patients who never received the medication; they get amount 0 and the label "None".
data = aki_patients.merge(med_totals, on="stay_id", how="left")
data = data.fillna({"vaso_amount": 0, "medication": "None"})

# Only patients who actually received the medication are plotted.
treated = data[data.vaso_amount > 0]
fig = px.histogram(
    treated,
    x="vaso_amount",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of Max {med} Amount per icd_code",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/6.3.max_{med}_per_icd_code.png")

### Vasopressin

In [None]:
med = "vasopressin"

# Total rate and amount per ICU stay. The aggregation is named by string ("sum") because passing
# the builtin `sum` to groupby aggregation is deprecated in pandas (since 2.1).
meds[med] = (
    db_manager.retrieve_all(f"mimiciv_derived.{med}")
    .groupby("stay_id")
    .aggregate({"vaso_rate": "sum", "vaso_amount": "sum"})
    .reset_index()
)
# Tag every row with the medication name so the frames can be combined later.
meds[med]["medication"] = med
meds[med]

In [None]:
# `med` and `meds[med]` come from the preceding aggregation cell.
med_totals = meds[med].sort_values("vaso_amount", ascending=False).drop_duplicates("stay_id")

# Left join keeps patients who never received the medication; they get amount 0 and the label "None".
data = aki_patients.merge(med_totals, on="stay_id", how="left")
data = data.fillna({"vaso_amount": 0, "medication": "None"})

# Only patients who actually received the medication are plotted.
treated = data[data.vaso_amount > 0]
fig = px.histogram(
    treated,
    x="vaso_amount",
    color="icd_code",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title=f"Histogram of Max {med} Amount per icd_code",
)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/6.4.max_{med}_per_icd_code.png")

In [None]:
# Build one patient-level frame per medication, then stack them.
med_list = []
for med, med_totals in meds.items():
    per_stay = med_totals.sort_values("vaso_amount", ascending=False).drop_duplicates("stay_id")
    # Left join keeps patients without the medication; they get amount 0 and the label "None".
    merged = aki_patients.merge(per_stay, on="stay_id", how="left")
    med_list.append(merged.fillna({"vaso_amount": 0, "medication": "None"}))

# One row per (patient, medication label); the repeated "None" rows collapse to a single row per patient.
meds_df = pd.concat(med_list).drop_duplicates(subset=["subject_id", "medication"])
meds_df

In [None]:
# Only rows where a medication was actually given (amount above 0).
medicated = meds_df[meds_df.vaso_amount > 0]

# Amount histogram with one colour per medication.
histogram_options = dict(
    x="vaso_amount",
    color="medication",
    marginal="box",
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of Max Medication Amount per Medication type ",
)
fig = px.histogram(medicated, **histogram_options)

# Inline rendering is optional; the PNG export always happens.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/6.5.max_medications_combined.png")