In [1]:
import numpy as np
import pandas as pd
import datetime
import copy
import time
import os
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import operator

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
from datetime import timedelta

# Enable tqdm's pandas integration (registers the `progress_apply`-style methods on pandas objects).
# Note: `tqdm` here is the tqdm.notebook variant, because the later import above rebinds the name.
tqdm.pandas()

In [2]:
# Directory that holds the MIMIC CSV files; the value always ends with a path separator
# so that cells which append a file name directly to it keep working.
# Set the MIMIC_DATA_DIR environment variable, or edit the fallback below, to point to your MIMIC directory.
dataDirStr = os.path.join(os.environ.get('MIMIC_DATA_DIR') or '/Users/gmessier/data/mimic-1.4/', '')

In [3]:
# Load the DRG codes table from the MIMIC data directory.
# os.path.join keeps the path valid whether or not dataDirStr ends with a separator.
drgcodes_df = pd.read_csv(os.path.join(dataDirStr, "DRGCODES.csv"))
# Lower-case the column names so later cells can use attribute access (e.g. drgcodes_df.subject_id).
drgcodes_df.columns = drgcodes_df.columns.str.lower()
drgcodes_df

Unnamed: 0,row_id,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,342,2491,144486,HCFA,28,"TRAUMATIC STUPOR & COMA, COMA <1 HR AGE >17 WI...",,
1,343,24958,162910,HCFA,110,MAJOR CARDIOVASCULAR PROCEDURES WITH COMPLICAT...,,
2,344,18325,153751,HCFA,390,NEONATE WITH OTHER SIGNIFICANT PROBLEMS,,
3,345,17887,182692,HCFA,14,SPECIFIC CEREBROVASCULAR DISORDERS EXCEPT TRAN...,,
4,346,11113,157980,HCFA,390,NEONATE WITH OTHER SIGNIFICANT PROBLEMS,,
...,...,...,...,...,...,...,...,...
125552,123452,71582,101422,MS,221,CARDIAC VALVE & OTH MAJ CARDIOTHORACIC PROC W/...,,
125553,123453,46449,110075,APR,1653,Coronary Bypass w/ Cardiac Cath Or Percutaneou...,3.0,2.0
125554,123454,46449,110075,APR,1653,Coronary Bypass w/ Cardiac Cath Or Percutaneou...,3.0,2.0
125555,123455,46449,110075,MS,234,CORONARY BYPASS W CARDIAC CATH W/O MCC,,


`DRGCODES.csv` contains diagnosis-related groups (DRGs) for patients. This table links directly to the patients table (via `subject_id`) and the admissions table (via `hadm_id`).

In [4]:
# Number of distinct subject_id values present in the DRG table.
patient_total = drgcodes_df["subject_id"].nunique()
print(f"There are {patient_total} patients who are linked to a DRG")

There are 46511 patients who are linked to a DRG


`drg_type` is categorical data that gives the type of DRG code in the entry. There are three types of DRG codes: HCFA ("Health Care Financing Administration"), MS ("Medicare"), and APR ("All Payers Registry").

In [5]:
# Frequency of each DRG type: absolute counts next to the percentage share (2 decimals).
c = drgcodes_df["drg_type"].value_counts()
p = (drgcodes_df["drg_type"].value_counts(normalize=True) * 100).round(2)
pd.concat({"counts": c, "%": p}, axis=1)

Unnamed: 0,counts,%
APR,66634,53.07
HCFA,31644,25.2
MS,27279,21.73


`drg_code` contains a code which represents the diagnoses billed for by the hospital. There are a total of 1789 DRG codes.

In [6]:
# Five most frequent DRG codes: absolute counts next to the percentage share (2 decimals).
# value_counts() sorts by descending frequency, so the first five rows are the top five.
# .head(5) is used instead of [:5] because the index holds integer codes and .head is
# unambiguously positional.
c = drgcodes_df.drg_code.value_counts().head(5)
p = drgcodes_df.drg_code.value_counts(normalize=True).mul(100).round(2).head(5)
pd.concat([c, p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
7204,3042,2.42
391,2653,2.11
1662,1367,1.09
6402,1342,1.07
871,1226,0.98


`description` is a human-readable summary of the given `drg_code`.

In [7]:
# Five most common DRG descriptions: absolute counts next to the percentage share (2 decimals).
c = drgcodes_df["description"].value_counts().head(5)
p = (drgcodes_df["description"].value_counts(normalize=True) * 100).round(2).head(5)
pd.concat({"counts": c, "%": p}, axis=1)

Unnamed: 0,counts,%
Septicemia & Disseminated Infections,3854,3.07
NORMAL NEWBORN,2805,2.24
Cardiac Valve Procedures w/o Cardiac Catheterization,2486,1.98
Coronary Bypass w/o Cardiac Cath Or Percutaneous Cardiac Procedure,2378,1.89
"Neonate, Bwt > 2499g, Normal Newborn Or Neonate W Other Problem",2099,1.67


`drg_severity` and `drg_mortality` provide additional granularity to the DRG codes when the `drg_type` is "APR". Severity and mortality allow for higher billing costs when a diagnosis is more/less severe.