In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
import plotly.express as px

# Put the directory two levels above the current working directory at the front
# of the import path so the project-local `utils` package below can be imported.
# NOTE(review): this depends on where the kernel was started - presumably the
# notebook's own folder; confirm.
project_root = Path("").resolve().parents[1]
sys.path.insert(0, str(project_root))
from utils.db_manager import MimicDBManager

In [None]:
# --- Notebook configuration ---------------------------------------------------
# Figure size (in pixels) passed to every plot in this notebook.
width = 1200
height = 600

# Directory where rendered figures are written. Defaults to the original
# hard-coded location; set the EDA_IMG_PATH environment variable to redirect the
# output when that absolute path is not available on the current machine.
img_path = os.environ.get("EDA_IMG_PATH", "/app/ml_model/EDA/images")
Path(img_path).mkdir(parents=True, exist_ok=True)

# When True the plotting cells also render figures inline; the image files are
# written regardless of this flag.
plot_fig = False

# Display up to 100 rows when a DataFrame is shown.
pd.set_option("display.max_rows", 100)

In [None]:
# Create the project's MIMIC DB manager, pointing it at port 5432, and show the
# table names it reports.
db_manager = MimicDBManager(port=5432)
available_tables = db_manager.retrieve_table_names()
available_tables

In [None]:
# Row counts for the first ten tables reported by the manager.
first_tables = db_manager.retrieve_table_names()[:10]
[db_manager.count_rows(table_name=name) for name in first_tables]

In [None]:
# Use the first listed table as a worked example and look up one subject in it.
example_subject_id = 18207287
table_name = db_manager.retrieve_table_names()[0]
db_manager.retrieve_id(table_name, id=example_subject_id, id_column="subject_id")

In [None]:
# Column types of the example table, shown as a two-column frame:
# "column" (taken from the mapping's keys) and "data_type" (its values).
column_types = db_manager.retrieve_column_types(table_name)
column_types_frame = pd.DataFrame.from_dict(column_types, orient="index", columns=["data_type"])
column_types_frame.reset_index(names="column")

In [None]:
# Gather the column types of every table into one overview frame: one row per
# table, one column per column name, and "-" wherever a value is missing.
column_types_by_table = {}
for full_name in db_manager.retrieve_table_names():
    column_types_by_table[full_name] = db_manager.retrieve_column_types(full_name)

tables = pd.DataFrame.from_dict(column_types_by_table, orient="index")
tables = tables.fillna("-").sort_index().reset_index(names="full_table_name")

# Split the dotted full name into the part before the dot and the part after it.
name_parts = tables["full_table_name"].str.split(".", expand=True)
tables[["parent_table", "table_name"]] = name_parts

In [None]:
# Overview rows whose parent is "mimiciv_hosp".
tables.loc[tables["parent_table"] == "mimiciv_hosp"]

In [None]:
# Overview rows whose parent is "mimiciv_icu".
tables.loc[tables["parent_table"] == "mimiciv_icu"]

In [None]:
# Overview rows whose parent is "mimiciv_derived".
tables.loc[tables["parent_table"] == "mimiciv_derived"]

There are 63 tables created through the "concepts" database building process.

Let's look now for patients with Acute Renal Failure.

Note: ICD-9 Diagnostic codes can be found in: https://www2.gov.bc.ca/assets/gov/health/practitioner-pro/medical-services-plan/diag-codes_genitourinary.pdf

In [None]:
# Load every diagnosis row and trim surrounding whitespace from the ICD codes so
# the exact string comparison below is reliable.
data = db_manager.retrieve_all("mimiciv_hosp.diagnoses_icd")
data["icd_code"] = data["icd_code"].str.strip()

# Keep only the rows coded exactly 5848.
is_code_5848 = data["icd_code"].eq("5848")
arf = data.loc[is_code_5848]
arf

In [None]:
# Attach the patient-level columns to each selected diagnosis row
# (inner join on subject_id).
patients = db_manager.retrieve_all("mimiciv_hosp.patients")
aki_patients = pd.merge(arf, patients, on="subject_id")
aki_patients

In [None]:
# Age histogram of the 5848 cohort: one overlaid series per gender, a box plot
# in the margin, and the bar counts printed on the bars.
fig = px.histogram(
    data_frame=aki_patients,
    x="anchor_age",
    color="gender",
    title="Distribution of Patients with ICD-9 code 5848 in MIMIC-IV",
    barmode="overlay",
    marginal="box",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/ICD-9_code-5848_distribution.png")

Not too many patients to work with. How about patients with *any* type of acute renal failure?

In [None]:
# Reload the diagnoses and trim surrounding whitespace from the ICD codes so the
# prefix test below is reliable.
data = db_manager.retrieve_all("mimiciv_hosp.diagnoses_icd")
data["icd_code"] = data["icd_code"].str.strip()

# Keep every row whose code starts with "584". `na=False` makes rows with a
# missing code count as "no match"; without it the mask would contain NaN and
# the boolean indexing would raise.
starts_with_584 = data["icd_code"].str.startswith("584", na=False)
arf = data[starts_with_584]
arf

In [None]:
# Build the analysis cohort: each selected diagnosis row enriched with patient,
# ICU-stay and admission columns.
patients = db_manager.retrieve_all("mimiciv_hosp.patients")
icu_stays = db_manager.retrieve_all("mimiciv_icu.icustays")
admissions = db_manager.retrieve_all("mimiciv_hosp.admissions")

# Patients are keyed by subject only, but diagnoses, ICU stays and admissions all
# belong to a specific hospital admission. Joining those on subject_id alone
# pairs every diagnosis with every ICU stay and every admission of the same
# patient (row fan-out, plus hadm_id_x / hadm_id_y duplicates), so they are
# joined on (subject_id, hadm_id) instead. The result has a single hadm_id column.
# NOTE(review): assumes diagnoses_icd, icustays and admissions all carry hadm_id,
# as in the published MIMIC-IV schema - confirm against this database.
admission_keys = ["subject_id", "hadm_id"]
aki_patients = (
    arf.merge(patients, on="subject_id")
    .merge(icu_stays, on=admission_keys)
    .merge(admissions, on=admission_keys)
)

# A patient counts as deceased when a date of death is recorded. Missing values
# can arrive as None, NaN or NaT depending on the column dtype, and the original
# comparison only recognised the text "None"; `notna()` covers all of them and
# the string check keeps the original handling of a literal "None".
date_of_death = aki_patients["dod"]
aki_patients["deceased"] = date_of_death.notna() & date_of_death.astype(str).str.strip().ne("None")

# ICU length of stay in (fractional) days: seconds -> hours -> days.
icu_duration = aki_patients["outtime"] - aki_patients["intime"]
aki_patients["icu_stay_days"] = icu_duration.dt.total_seconds() / 3600 / 24
aki_patients

In [None]:
# Age histogram (30 bins) with one facet row per gender, bars grouped and
# coloured by the deceased flag, and a box plot in the margin.
fig = px.histogram(
    data_frame=aki_patients,
    x="anchor_age",
    color="deceased",
    facet_row="gender",
    title="Histogram of ARF patients by age, deceased status, and gender",
    nbins=30,
    barmode="group",
    marginal="box",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/anchor_age-deceased-gender.png")

In [None]:
# Age histogram (20 bins) with bars grouped and coloured by race.
fig = px.histogram(
    data_frame=aki_patients,
    x="anchor_age",
    color="race",
    title="Histogram of ARF patients by age and race",
    nbins=20,
    barmode="group",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/anchor_age-race.png")

The top 2 groups overshadow the rest, so let's remove them to get a better look at the other groups.

In [None]:
# Find the two most frequent race values and drop their rows, so the smaller
# groups become visible.
race_counts = aki_patients.value_counts(subset="race").reset_index()
top_two_races = race_counts["race"].head(2).to_list()
outside_top_two = ~aki_patients["race"].isin(top_two_races)

# Same age/race histogram as before, restricted to the remaining rows.
fig = px.histogram(
    data_frame=aki_patients[outside_top_two],
    x="anchor_age",
    color="race",
    title="Histogram of ARF patients by age and race excluding top 2",
    nbins=20,
    barmode="group",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/anchor_age-race_no_top_2.png")

In [None]:
# Select the rows whose race mentions "hispanic" (case-insensitive). `na=False`
# treats a missing race as "no match"; without it the mask would contain NaN and
# the boolean indexing would raise.
is_hispanic = aki_patients["race"].str.lower().str.contains("hispanic", na=False)

# Age histogram (20 bins) of that subset, bars grouped and coloured by race.
fig = px.histogram(
    aki_patients[is_hispanic],
    x="anchor_age",
    barmode="group",
    color="race",
    nbins=20,
    text_auto=True,
    width=width,
    height=height,
    title="Histogram of ARF patients by age and race for hispanics",
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/anchor_age-race_hispanic.png")

91 is the hard limit for age in the DB, to guarantee that patients cannot be identified, which would explain the higher number of patients (both alive and deceased) in the last age group.

Let's take a look at the ICD codes as well

In [None]:
# Age histogram (10 bins) with bars grouped and coloured by ICD code, plus a
# box plot in the margin.
fig = px.histogram(
    data_frame=aki_patients,
    x="anchor_age",
    color="icd_code",
    title="Histogram of ARF patients by age and specific ICD-9 code",
    nbins=10,
    barmode="group",
    marginal="box",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/anchor_age-icd_code.png")

In [None]:
# Histogram of ICU length of stay in days (50 bins), with bars grouped and
# coloured by ICD code, plus a box plot in the margin.
fig = px.histogram(
    data_frame=aki_patients,
    x="icu_stay_days",
    color="icd_code",
    title="Histogram of ARF patients by ICU stay (in days) and specific ICD-9 code",
    nbins=50,
    barmode="group",
    marginal="box",
    text_auto=True,
    height=height,
    width=width,
)

# Inline rendering is optional; the PNG is always written.
if plot_fig:
    fig.show()
fig.write_image(f"{img_path}/icu_stay_days-icd_code.png")