In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from src.parsers import mimic, hirid
from src.modeling import discovery, plots, querier
from src.utils import constants

In [6]:
import sys
# import win32com.client
import os
import pandas as pd
import win32com.client

def setup_io_config(root_path):
    """
    Resolve the dataset and result locations used by the MIMIC and HiRID pipelines.

    :root_path -> Repo path which contains the 'data' folder (or a Windows
                  'data.lnk' shortcut pointing at it) and the 'results' folder
    :return    -> tuple (mimic_data, mimic_path, hirid_data, hirid_path)
    """
    root_str = str(root_path)
    shortcut_name = "data.lnk"

    # MIMIC: the data folder may be a Windows shortcut rather than a real directory.
    if shortcut_name in os.listdir(root_path):
        shortcut_file = os.path.join(root_path, shortcut_name)
        wscript_shell = win32com.client.Dispatch("WScript.Shell")
        mimic_data = wscript_shell.CreateShortCut(shortcut_file).Targetpath
    else:
        mimic_data = os.path.join(root_str, "data")
    mimic_path = os.path.join(root_str, "results")

    # HIRID: always addressed relative to root_path (the shortcut is not consulted here).
    hirid_path = f'{root_path}/data/hirid-a-high-time-resolution-icu-dataset-1.1.1'
    hirid_data = f'{root_path}/data/hirid-a-high-time-resolution-icu-dataset-1.1.1/raw_stage/'

    return mimic_data, mimic_path, hirid_data, hirid_path

def setup_stratification_config():
    """Return the fixed cohort stratification settings as a 7-tuple.

    Order of the returned values:
    gender, age_a, age_b, ethnicity, lab_mapping, before_windows, after_windows.
    Note that age_a (80) is returned before age_b (40).
    """
    window = (0, 12)
    return (
        "MF",                    # gender
        80,                      # age_a
        40,                      # age_b
        "WHITE",                 # ethnicity
        constants.LAB_MAPPING,   # lab_mapping
        [window],                # before_windows
        [window],                # after_windows
    )

In [7]:
# IO Config
# NOTE(review): root_path is a machine-specific absolute path; it has to be edited
# by hand on every machine before the notebook can run.
root_path ="C:\\Users\\danco\\My Drive\\Master\\Datasets\\MIMIC iii"
#root_path = "/Users/pavan/Library/CloudStorage/GoogleDrive-f20190038@hyderabad.bits-pilani.ac.in/My Drive/TAU/Code/DrugLab"
# setup_io_config returns (mimic_data, mimic_path, hirid_data, hirid_path):
# data/res are the MIMIC input and results folders, raw_path/res_path the HiRID ones.
data, res, raw_path, res_path = setup_io_config(root_path=root_path)

# Stratification Config
# The unpacking order (age_a before age_b) matches the return order of setup_stratification_config.
gender, age_a, age_b, ethnicity, lab_mapping, before_windows, after_windows = setup_stratification_config()

In [8]:
import traceback
try:
    # Build the MIMIC parser for the configured cohort and load the two medication
    # tables plus the lab table through the parser's own loading options.
    mimic_parser = mimic.MIMICParser(data=data, res=res, gender=gender, age_b=age_b, age_a=age_a, ethnicity=ethnicity, load="AUTOMATIC_MAPPING_MIMIC")
    m_med1, m_med2, m_labs = mimic_parser.parse(use_pairs=False, load_from_raw=False, load_raw_chartevents=False)
except Exception:
    # Best-effort cell: show the traceback instead of aborting the run. Only
    # Exception is caught so that a kernel interrupt (KeyboardInterrupt) or
    # SystemExit is no longer swallowed. If parsing fails, m_med1/m_med2/m_labs
    # stay undefined and the cells that use them will fail.
    traceback.print_exc()

Loading med data...
Loaded med data.
Load 1st and 2nd medication data...
Loaded 1st and 2nd medication data.
Load Lab data...


  labs = pd.read_csv(os.path.join(self.data, constants.MIMIC_III_PREPROCESSED_PATH, constants.MIMIC_III_PREPROCESSED_LABDATA))


Loaded Lab data.


In [9]:
# Build the querier over the parsed MIMIC tables (labs plus both medication tables),
# using the same stratification settings that were passed to the parser.
mimic_data_querier = querier.DatasetQuerier(
    data = data,
    res = res,
    t_labs=m_labs, 
    t_med1=m_med1, 
    t_med2=m_med2,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
    lab_mapping=lab_mapping
)

In [10]:
# Before (b_w) and after (a_w) windows for a single medication/lab query.
# The commented-out lists are an alternative, finer-grained window split.
#b_w = [(0,6), (6,12)]
#a_w = [(0,4), (4,8), (8,12)]
b_w = [(0,12)]
a_w = [(0,12)]
# Query the 'Insulin - Regular' / 'Glucose' pair; element [0] of the result is used by a later cell.
med_lab_pair_1 = mimic_data_querier.query('Insulin - Regular', 'Glucose', b_w, a_w)

No data found for the given lab test Glucose


In [55]:
import scipy.stats as stats

# Ad-hoc version of the before/after significance tests for one medication/lab pair.
med_lab_data = med_lab_pair_1[0].copy()
lab_name = 'Glucose'
med_name = 'Insulin - Regular'

# One result row (dict) per (after window, before window) combination.
discovery_res = []
template = {
    "Lab Name" : lab_name,
    "Med Name": med_name
}

for aw in a_w:
    for bw in b_w:
        # Column names holding the lab values of this before/after window pair
        row = template.copy()
        t = "abs"
        a, b = f"after_{t}_{aw}_sp", f"before_{t}_{bw}_sp"

        # Drop incomplete rows for THIS pair only. Re-assigning med_lab_data here
        # would shrink the frame cumulatively and make every later window pair
        # depend on the pairs processed before it.
        pair_data = med_lab_data.dropna(subset=[a, b])

        # Performing tests (before values vs. after values)
        c_m, pval_m = stats.mannwhitneyu(pair_data[b], pair_data[a])
        c_t, pval_t = stats.ttest_ind(pair_data[b], pair_data[a])
        c_t_p, pval_t_p = stats.ttest_rel(pair_data[b], pair_data[a])

        # Adding data to the result row; the window columns hold the column names a/b
        row["Before Window (in Hours)"] = b
        row["After Window (in Hours)"] = a
        # Fixed: this previously stored pval_t, duplicating the independent t-test p-value
        row["Mannwhitneyu Test"] = pval_m
        row["TTest Independent"] = pval_t
        row["TTest Paired"] = pval_t_p

        discovery_res.append(row)
        

In [None]:
from statsmodels.stats.multitest import multipletests

# Multiple-testing correction of the Mann-Whitney p-values at alpha = 0.01.
# NOTE(review): `test_pval_data` is not defined in any visible cell, and the column
# name "MannwhitneyuTest" differs from the "Mannwhitneyu Test" key used by the
# statistical-test cells — confirm both before running.

### pvals - Bonferroni Analysis
bonferroni_analysis = multipletests(test_pval_data["MannwhitneyuTest"], alpha=0.01, method='bonferroni')
reject_bonferroni, pvals_corrected, _, alphacBonf = bonferroni_analysis
# Assign the corrected values positionally. Wrapping them in pd.Series would align
# on a fresh 0..n-1 index and produce NaN or misplaced values whenever
# test_pval_data does not carry the default index.
test_pval_data["BonferroniPvals"] = pvals_corrected

### pvals - FDR Analysis
fdr1_analysis = multipletests(test_pval_data["MannwhitneyuTest"], alpha=0.01, method='fdr_bh')
reject_fdr, pvals_corrected1, _, alphacBonf = fdr1_analysis
test_pval_data['FDR Benjamini Corrected'] = pvals_corrected1

# Boolean reject masks select rows positionally
significant_bonferroni = test_pval_data[reject_bonferroni]
significant_fdr = test_pval_data[reject_fdr]


In [56]:
res_m

{'MannwhitneyuTest_after_abs_(0, 4)_sp_before_abs_(0, 6)_sp': 1.3506194138148627e-05,
 'MannwhitneyuTest_after_abs_(0, 4)_sp_before_abs_(6, 12)_sp': 0.14239423323579198,
 'MannwhitneyuTest_after_abs_(4, 8)_sp_before_abs_(0, 6)_sp': 1.216036778397965e-10,
 'MannwhitneyuTest_after_abs_(4, 8)_sp_before_abs_(6, 12)_sp': 0.0006351553966462922,
 'MannwhitneyuTest_after_abs_(8, 12)_sp_before_abs_(0, 6)_sp': 6.818553458677554e-07,
 'MannwhitneyuTest_after_abs_(8, 12)_sp_before_abs_(6, 12)_sp': 0.04862437159069593}

In [57]:
res_t

{'TTest_after_abs_(0, 4)_sp_before_abs_(0, 6)_sp': 1.3506194138148627e-05,
 'TTest_after_abs_(0, 4)_sp_before_abs_(6, 12)_sp': 0.14239423323579198,
 'TTest_after_abs_(4, 8)_sp_before_abs_(0, 6)_sp': 1.216036778397965e-10,
 'TTest_after_abs_(4, 8)_sp_before_abs_(6, 12)_sp': 0.0006351553966462922,
 'TTest_after_abs_(8, 12)_sp_before_abs_(0, 6)_sp': 6.818553458677554e-07,
 'TTest_after_abs_(8, 12)_sp_before_abs_(6, 12)_sp': 0.04862437159069593}

In [58]:
res_t_p

{'TTest_paired_after_abs_(0, 4)_sp_before_abs_(0, 6)_sp': 2.0315631600371384e-11,
 'TTest_paired_after_abs_(0, 4)_sp_before_abs_(6, 12)_sp': 0.03490444161949971,
 'TTest_paired_after_abs_(4, 8)_sp_before_abs_(0, 6)_sp': 7.774602765517878e-15,
 'TTest_paired_after_abs_(4, 8)_sp_before_abs_(6, 12)_sp': 7.732270089274215e-06,
 'TTest_paired_after_abs_(8, 12)_sp_before_abs_(0, 6)_sp': 1.1014602916722819e-10,
 'TTest_paired_after_abs_(8, 12)_sp_before_abs_(6, 12)_sp': 0.006028434509404024}

In [60]:
# Generate the medication/lab data for all pairs over the configured windows.
# NOTE(review): the first three positional arguments are None here, whereas a later
# cell passes (m_labs, m_med1, m_med2) in the same positions — confirm the intent.
m_final_lab_med_data = mimic_data_querier.generate_med_lab_data(None, None, None, before_windows, after_windows)

  col_vals[i] = pd.merge(col_vals[i-1], col_vals[i], how="outer", on=list(t_med1.columns)+["LAB_ITEMID"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final["LAB_NAME"] = final["LAB_ITEMID"]


In [64]:
# Keep only the first element of the generated result; it is the frame handed to the analyzer below.
med_lab_pair_data = m_final_lab_med_data[0]

In [125]:
# NOTE(review): ClinicalDiscoveryAnalysis is defined in a cell further down this
# notebook; that cell must be executed before this one on a fresh kernel.
analyzer = ClinicalDiscoveryAnalysis(med_lab_pair_data)
lab_name = 'Glucose'
med_name = 'Insulin - Regular'

In [None]:
# Run the tests for every medication/lab pair, then drop rows whose paired
# t-test p-value is missing (NaN).
res_comp_fixed = analyzer.analyze(before_windows, after_windows)
res_comp_fixed = res_comp_fixed.dropna(subset=["TTest Paired"])

In [126]:
# Returns a 4-tuple: (all rows with corrected p-values, hard-threshold hits, Bonferroni hits, FDR hits).
sig_pairs = analyzer.generate_significant(res_comp_fixed)

In [91]:
# Tests for the single pair selected above; the result is a list of row dicts.
res_dis_1 = analyzer.statistical_tests(med_name, lab_name, before_windows, after_windows)

In [92]:
pd.DataFrame(res_dis_1)

Unnamed: 0,Lab Name,Med Name,Before Window (in Hours),After Window (in Hours),Mannwhitneyu Test,TTest Independent,TTest Paired
0,Glucose,Insulin - Regular,"(0, 12)","(0, 12)",1.29514e-43,1.7080099999999998e-19,6.66218e-50


In [None]:
pd.DataFrame(res_dis_1)

Unnamed: 0,Lab Name,Med Name,Before Window (in Hours),After Window (in Hours),Mannwhitneyu Test,TTest Independent,TTest Paired
0,Glucose,Insulin - Regular,"before_abs_(0, 6)_sp","after_abs_(0, 4)_sp",2.189386e-14,1.485992e-07,1.9236739999999998e-19
1,Glucose,Insulin - Regular,"before_abs_(6, 12)_sp","after_abs_(0, 4)_sp",4.326885e-13,0.07642086,0.01329148
2,Glucose,Insulin - Regular,"before_abs_(0, 6)_sp","after_abs_(4, 8)_sp",7.587147e-14,1.44419e-11,6.954229999999999e-20
3,Glucose,Insulin - Regular,"before_abs_(6, 12)_sp","after_abs_(4, 8)_sp",0.530481,0.1426278,0.07623842
4,Glucose,Insulin - Regular,"before_abs_(0, 6)_sp","after_abs_(8, 12)_sp",8.997221e-09,6.818553e-07,1.10146e-10
5,Glucose,Insulin - Regular,"before_abs_(6, 12)_sp","after_abs_(8, 12)_sp",0.08560342,0.04862437,0.006028435


In [124]:
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
import pandas as pd


class ClinicalDiscoveryAnalysis:
    """Before/after significance testing for medication<>lab test pairs.

    The wrapped frame must provide the columns ``MED_NAME`` and ``LAB_NAME`` plus one
    ``before_abs_{window}_sp`` / ``after_abs_{window}_sp`` column for every window
    that is analysed (``window`` is the ``str()`` of the window object, e.g. ``(0, 12)``).
    """

    def __init__(self, med_lab_pair_data):
        """Store the medication/lab frame; it is never modified by this class."""
        self.med_lab_pair_data = med_lab_pair_data

    def statistical_tests(self, med_name, lab_name, before_windows, after_windows):
        """Compare before vs. after lab values of one medication/lab pair.

        For every (after window, before window) combination the rows of the pair
        that have both values present are tested with Mann-Whitney U, an
        independent t-test and a paired t-test.

        Args:
            med_name: Value of ``MED_NAME`` to select.
            lab_name: Value of ``LAB_NAME`` to select.
            before_windows: Iterable of before windows, e.g. ``[(0, 12)]``.
            after_windows: Iterable of after windows, e.g. ``[(0, 12)]``.

        Returns:
            list[dict]: One dict per window combination with the keys "Lab Name",
            "Med Name", "Before Window (in Hours)", "After Window (in Hours)",
            "Mannwhitneyu Test", "TTest Independent" and "TTest Paired" (p-values).
            When no complete rows exist for a combination all three p-values are 1.
        """
        data = self.med_lab_pair_data
        # Boolean-mask selection returns a new frame, so the stored data is untouched.
        pair_rows = data[(data["LAB_NAME"] == lab_name) & (data["MED_NAME"] == med_name)]

        discovery_res = []
        for aw in after_windows:
            for bw in before_windows:
                a, b = f"after_abs_{aw}_sp", f"before_abs_{bw}_sp"
                row = {
                    "Lab Name": lab_name,
                    "Med Name": med_name,
                    "Before Window (in Hours)": bw,
                    "After Window (in Hours)": aw,
                }

                # Filter per window pair. Re-assigning the shared frame here (as the
                # previous version did) made every later pair see only rows that
                # survived all earlier pairs.
                window_rows = pair_rows.dropna(subset=[a, b])

                if window_rows.shape[0] == 0:
                    # Nothing to compare: report "not significant" for all tests.
                    row["Mannwhitneyu Test"] = 1
                    row["TTest Independent"] = 1
                    row["TTest Paired"] = 1
                    discovery_res.append(row)
                    continue

                # Performing tests (before values vs. after values)
                _, pval_m = stats.mannwhitneyu(window_rows[b], window_rows[a])
                _, pval_t = stats.ttest_ind(window_rows[b], window_rows[a])
                _, pval_t_p = stats.ttest_rel(window_rows[b], window_rows[a])

                row["Mannwhitneyu Test"] = pval_m
                row["TTest Independent"] = pval_t
                row["TTest Paired"] = pval_t_p
                discovery_res.append(row)

        return discovery_res

    def analyze(self, before_windows, after_windows):
        """Run ``statistical_tests`` for every medication<>lab pair in the data.

        Args:
            before_windows: Iterable of before windows.
            after_windows: Iterable of after windows.

        Returns:
            pandas.DataFrame: One row per (pair, window combination) with the
            columns produced by ``statistical_tests``.
        """
        pairs = self.med_lab_pair_data.groupby(["MED_NAME", "LAB_NAME"]).size().index
        discovery_res = []
        for med_name, lab_name in pairs:
            res = self.statistical_tests(
                med_name=med_name,
                lab_name=lab_name,
                before_windows=before_windows,
                after_windows=after_windows,
            )
            # extend, not append: each call returns a list of row dicts. Appending
            # the list itself produced a frame of dict-valued cells without the
            # named p-value columns.
            discovery_res.extend(res)
        return pd.DataFrame(discovery_res)

    def generate_significant(self, pvals_med_lab, alpha=0.01, statistical_test="TTest Paired"):
        """Select significant pairs by hard threshold, Bonferroni and Benjamini-Hochberg FDR.

        Args:
            pvals_med_lab: DataFrame of p-values, e.g. the output of ``analyze``.
            alpha: Significance level used for all three selections. Defaults to 0.01.
            statistical_test: Column holding the p-values to correct.
                Defaults to "TTest Paired".

        Returns:
            tuple: ``(test_pval_data, significant_hard_thres, significant_bonferroni,
            significant_fdr)`` where ``test_pval_data`` is a copy of the input with the
            added columns "BonferroniPvals" and "FDR Benjamini Corrected".
        """
        test_pval_data = pvals_med_lab.copy()

        # pvals - Bonferroni analysis
        reject_bonferroni, pvals_bonferroni, _, _ = multipletests(
            test_pval_data[statistical_test], alpha=alpha, method='bonferroni')
        # Assign arrays positionally. Wrapping them in pd.Series aligned on a fresh
        # 0..n-1 index and produced NaN or misplaced values for frames whose index
        # has gaps (e.g. after dropna).
        test_pval_data["BonferroniPvals"] = pvals_bonferroni

        # pvals - FDR analysis
        reject_fdr, pvals_fdr, _, _ = multipletests(
            test_pval_data[statistical_test], alpha=alpha, method='fdr_bh')
        test_pval_data['FDR Benjamini Corrected'] = pvals_fdr

        # choose significant
        significant_hard_thres = test_pval_data[test_pval_data[statistical_test] < alpha]
        significant_bonferroni = test_pval_data[reject_bonferroni]
        significant_fdr = test_pval_data[reject_fdr]

        return test_pval_data, significant_hard_thres, significant_bonferroni, significant_fdr

In [None]:
# MIMIC
# End-to-end MIMIC run: parse -> query -> discovery -> plots.
# NOTE(review): here MIMICParser receives lab_mapping=..., while the earlier parse
# cell passes load="AUTOMATIC_MAPPING_MIMIC" — confirm which signature is current.
mimic_parser = mimic.MIMICParser(data=data, res=res, gender=gender, age_b=age_b, age_a=age_a, ethnicity=ethnicity, lab_mapping=lab_mapping)
m_med1, m_med2, m_labs = mimic_parser.parse()
## Querier
mimic_data_querier = querier.DatasetQuerier(
    data = data,
    res = res,
    t_labs=m_labs, 
    t_med1=m_med1, 
    t_med2=m_med2,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
    lab_mapping=lab_mapping
)
# NOTE(review): called with two arguments here but with five positional arguments
# in other cells of this notebook — confirm against the querier module.
m_final_lab_med_data = mimic_data_querier.generate_med_lab_data(before_windows, after_windows)
## Discovery
# NOTE(review): constructed without arguments and the result is discarded; the
# notebook-local class of the same name requires med_lab_pair_data.
discovery.ClinicalDiscoveryAnalysis()
## Plots
plotter = plots.ClinicalPlotAnalysis(
    data = data,
    res = res,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
    lab_mapping=lab_mapping
)
m_corrs_data_df = plotter.plot(m_final_lab_med_data, m_labs, before_windows=before_windows, after_windows=after_windows)

In [None]:
# HIRID
# End-to-end HiRID run: parse -> query -> plots, using the HiRID paths from the IO config.
hirid_mapping = constants.HIRID_MAPPING
hirid_parser = hirid.HiRiDParser(data=raw_path, res=res_path, gender=gender, age_b=age_b, age_a=age_a)
h_med1, h_med2, h_labs = hirid_parser.parse()
# Flatten the mapping values into one list of lab ids and keep only those labs.
lab_ids = [l for k in hirid_mapping.values() for l in k]
# NOTE(review): h_labs_1 is not used below; the querier and plotter receive the unfiltered h_labs.
h_labs_1 = h_labs[h_labs.OldITEMID.isin(lab_ids)]

hirid_data_querier = querier.DatasetQuerier(
    data = raw_path,
    res = res_path,
    t_labs=h_labs, 
    t_med1=h_med1, 
    t_med2=h_med2,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
)
# The result is unpacked into a (final, raw) pair of objects.
final_h_final_lab_med_data, raw_h_final_lab_med_data = hirid_data_querier.generate_med_lab_data(before_windows, after_windows)

# The plotter is configured with an empty ethnicity and an empty lab mapping for HiRID.
h_plotter = plots.ClinicalPlotAnalysis(
    data = raw_path,
    res = res_path,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity="", 
    lab_mapping={}
)
h_corrs_data_df = h_plotter.plot(final_h_final_lab_med_data, h_labs, before_windows=before_windows, after_windows=after_windows)

In [7]:
# MIMIC
# Variant of the MIMIC pipeline in which the tables are passed to
# generate_med_lab_data instead of to the DatasetQuerier constructor.
mimic_parser = mimic.MIMICParser(data=data, res=res, gender=gender, age_b=age_b, age_a=age_a, ethnicity=ethnicity, lab_mapping=lab_mapping)
m_med1, m_med2, m_labs = mimic_parser.parse()
## Querier
mimic_data_querier = querier.DatasetQuerier(
    data = data,
    res = res,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
    lab_mapping=lab_mapping
)
# Positional order used here: labs, med1, med2, before windows, after windows.
m_final_lab_med_data = mimic_data_querier.generate_med_lab_data(m_labs, m_med1, m_med2, before_windows, after_windows)

  med2 = pd.read_csv(os.path.join(self.data, "mimiciii/1.4/preprocessed", "med2_vectorized.csv")) if use_med_vect and os.path.exists(os.path.join(self.data, "mimiciii/1.4/preprocessed", "med2_vectorized.csv")) else self.generate_med2_vect()
  col_vals[i] = pd.merge(col_vals[i-1], col_vals[i], how="outer", on=list(t_med1.columns)+["LAB_ITEMID"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final["LAB_NAME"] = final["LAB_ITEMID"]


In [82]:
# Plot the MIMIC medication/lab data for the configured windows.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'NoneType' object has no attribute 'groupby'" — one of the
# inputs was presumably None when it last ran; verify before relying on it.
plotter = plots.ClinicalPlotAnalysis(
    data = data,
    res = res,
    gender=gender, 
    age_b=age_b, 
    age_a=age_a, 
    ethnicity=ethnicity, 
    lab_mapping=lab_mapping
)
m_corrs_data_df = plotter.plot(m_final_lab_med_data, m_labs, before_windows=before_windows, after_windows=after_windows)
#m_final_lab_med_data[1]
#m_final_lab_med_data[1].groupby(['SUBJECT_ID'])['ITEMID'].count()

AttributeError: 'NoneType' object has no attribute 'groupby'

In [84]:
# Overview of LAB_MAPPING: for each top-level key, the names of its nested entries.
{k:list(v.keys()) for k, v in constants.LAB_MAPPING.items()}

{'ALT': ['Alanine aminotransferase'],
 'ANA': ['Fraction inspired oxygen'],
 'AST': ['Asparate aminotransferase'],
 'Hemoglobin': ['Hemoglobin percent',
  'Hemoglobin C',
  'Hemoglobin F',
  'Hemoglobin A2',
  'Hemoglobin'],
 'INR': ['Prothrombin time INR'],
 'bilirubin': ['Bilirubin'],
 'calcium': ['Calcium', 'Calcium ionized'],
 'creatinine': ['Creatinine', 'Creatinine ascites'],
 'glucose': ['Glucose'],
 'lactic acid': ['Lactic acid'],
 'magnesium': ['Magnesium', 'Magnesium, Urine'],
 'platelets': ['Platelets', 'Large Platelets'],
 'potassium': ['Potassium'],
 'sodium': ['Sodium'],
 'Uric acid': ['Uric Acid', 'Uric Acid, Urine'],
 'B12': ['Vitamin B12'],
 'prolactin': ['Prolactin'],
 'Amylase': ['Amylase'],
 'Lipase': ['Lipase'],
 'Aptt': ['PTT'],
 'Hematocrit': ['Hematocrit'],
 'Red blood cell': ['Red blood cell'],
 'Albumin': ['Albumin'],
 'Magnesium': ['Magnesium'],
 'CPK': []}

In [15]:
#os.mkdir('bb')
# Build the prefix in this cell. It used to exist only as a commented-out line, so
# the cell relied on a value left over from an earlier session and failed on a
# fresh kernel.
stratify_prefix = f"{age_b}-{age_a}_{gender}_{ethnicity}"
# os.path.join instead of a hard-coded "\\" so the path is also valid outside Windows.
res = os.path.join(root_path, "results")

# Target CSV for the documented medication mapping (the export itself stays commented out below).
ccc = os.path.join(res,f"before_after_windows_main_med_lab_first_val_{stratify_prefix}_doc_eval_new_win.csv")
ccc
#d_m_l_doc.to_csv(ccc)
#pd.DataFrame.from_dict(res_dict_mapping_med)

'C:\\Users\\danco\\My Drive\\Master\\Datasets\\MIMIC iii\\results\\before_after_windows_main_med_lab_first_val_40-80_MF_WHITE_doc_eval_new_win.csv'

In [12]:
import pandas as pd

# Load the preprocessed lab table from the MIMIC data folder.
labs = pd.read_csv(os.path.join(data, "mimiciii/1.4/preprocessed", "lab_patient_data_mimic_extract_2.csv")) 

In [27]:
# Load the medication mapping document and drop the saved index column.
d_m_l_doc = pd.read_csv(os.path.join(data, "mimiciii", "1.4","preprocessed", "mapping_med_itemid_doc.csv")).drop(columns=["Unnamed: 0"])
dict_d_m_l = d_m_l_doc.to_dict("records")
# Build {item id (int): medication name}. "ITEMID_with_manual" is parsed as a string:
# its first and last characters are stripped, the rest is split on "," and empty
# pieces are skipped. If an id occurs in several records, the last record wins.
# NOTE(review): the comprehension variable `id` shadows the builtin of the same name.
res_dict_mapping_med = {v:k["Medication"] for k in dict_d_m_l for v in [int(id) for id in k["ITEMID_with_manual"][1:-1].split(",") if id != '']}

In [28]:
import pandas as pd
# Stratification settings, repeated here with the same values as setup_stratification_config.
gender="MF"
age_b=40
age_a=80 
ethnicity="WHITE" 
lab_mapping= constants.LAB_MAPPING
before_windows = [(0,12)]
after_windows = [(0,12)]
use_med_vect = True

# Load the cached vectorized medication table if allowed and present.
# NOTE(review): the fallback generate_med1_vect() is not defined in any visible cell.
med1 = pd.read_csv(os.path.join(data, "mimiciii/1.4/preprocessed", "med1_vectorized.csv")) if use_med_vect and os.path.exists(os.path.join(
    data, "mimiciii/1.4/preprocessed", "med1_vectorized.csv")) else generate_med1_vect()
# Keep one admission per subject: the HADM_ID of the first row per SUBJECT_ID after
# sorting by (HADM_ID, STARTTIME).
# NOTE(review): this picks the lowest HADM_ID, which is only the earliest admission
# if HADM_ID values are chronological — confirm.
h_adm_1 = med1.sort_values(["HADM_ID", "STARTTIME"]).groupby("SUBJECT_ID").nth(0)["HADM_ID"].to_list()
med1 = med1[med1.HADM_ID.isin(h_adm_1)]
med1 = med1.drop(columns=["Unnamed: 0"])
# Cohort filters: age_b <= AGE <= age_a, optional gender filter ("MF" keeps all), ethnicity.
med1 = med1[med1["AGE"]>=age_b]
med1 = med1[med1["AGE"]<=age_a]
med1 = med1[med1["GENDER"]==gender] if gender != "MF" else med1
med1 = med1[med1["ETHNICITY"]==ethnicity]
# Use the mapped medication name when the ITEMID is in the mapping, otherwise the original LABEL.
med1["MIMICExtractLabel"] = med1.apply(lambda r: res_dict_mapping_med[r["ITEMID"]] if r["ITEMID"] in res_dict_mapping_med else r["LABEL"], axis=1)
med1["STARTTIME"] = pd.to_datetime(med1["STARTTIME"])
med1["ENDTIME"] = pd.to_datetime(med1["ENDTIME"])
med1["ADMITTIME"] = pd.to_datetime(med1["ADMITTIME"])
# Time from admission to the END of the medication event, also expressed in hours.
med1["MedTimeFromAdmit"] = med1["ENDTIME"]-med1["ADMITTIME"]
med1["hours_in"] = med1["MedTimeFromAdmit"].dt.total_seconds()/3600

In [77]:
#use_med_vect = True
# Reload the unfiltered cached table for comparison, then display the current med1.
med1_old = pd.read_csv(os.path.join(data, "mimiciii/1.4/preprocessed", "med1_vectorized.csv"))
med1

Unnamed: 0,SUBJECT_ID,ITEMID,HADM_ID,ROW_ID_x,ICUSTAY_ID,STARTTIME,ENDTIME,AMOUNT,AMOUNTUOM,RATE,...,MedTimeFromAdmit,ROW_ID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
45596,41624,226372,178751,1538868,277546.0,2162-05-06 11:00:00,2162-05-06 11:01:00,600.000000,ml,,...,0 days 00:01:00,14457,OR Cell Saver Intake,OR Cell Saver Intake,metavision,inputevents_mv,Blood Products/Colloids,mL,Solution,
103469,53729,226364,140334,1857294,289288.0,2150-09-30 13:00:00,2150-09-30 13:01:00,4000.000000,ml,,...,0 days 00:01:00,14450,OR Crystalloid Intake,OR Crystalloid Intake,metavision,inputevents_mv,Fluids/Intake,mL,Solution,
88606,50735,226363,175220,3253371,239857.0,2190-02-23 19:17:00,2190-02-23 19:18:00,600.000000,ml,,...,0 days 00:01:00,14449,Cath Lab Intake,Cath Lab Intake,metavision,inputevents_mv,Fluids/Intake,mL,Solution,
82501,49469,226361,160152,189972,208590.0,2111-11-06 22:16:00,2111-11-06 22:17:00,1000.000000,ml,,...,0 days 00:01:00,14447,Pre-Admission Intake,Pre-Admission Intake,metavision,inputevents_mv,Fluids/Intake,mL,Solution,
195772,72555,226361,194577,2690710,227291.0,2144-08-09 06:51:00,2144-08-09 06:52:00,2000.000000,ml,,...,0 days 00:01:00,14447,Pre-Admission Intake,Pre-Admission Intake,metavision,inputevents_mv,Fluids/Intake,mL,Solution,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291681,93321,225797,115396,3599579,248745.0,2128-09-14 17:58:00,2128-09-14 17:59:00,100.000000,ml,,...,173 days 18:58:00,13981,Free Water,Free Water,metavision,inputevents_mv,Fluids/Intake,mL,Solution,
291702,93321,225930,115396,3599570,248745.0,2128-09-15 08:22:00,2128-09-15 11:12:00,84.919287,ml,29.971513,...,174 days 12:11:00,14367,Peptamen 1.5 (Full),Peptamen 1.5 (Full),metavision,inputevents_mv,Nutrition - Enteral,mL,Solution,
291701,93321,225917,115396,3599565,248745.0,2128-09-14 17:53:00,2128-09-15 17:54:00,2497.699998,ml,103.998612,...,174 days 18:53:00,14355,TPN without Lipids,TPN without Lipids,metavision,inputevents_mv,Nutrition - Parenteral,mL,Solution,
291693,93321,225883,115396,3599574,248745.0,2128-09-16 00:30:00,2128-09-16 00:31:00,1.000000,dose,,...,175 days 01:30:00,14041,Meropenem,Meropenem,metavision,inputevents_mv,Antibiotics,dose,Solution,


In [73]:
#med1
#time_diff = (med1.ENDTIME - med1.STARTTIME).dt.total_seconds()/3600
#(time_diff < 2).value_counts()
# NOTE(review): m_p_df is created in a cell that appears later in this notebook;
# that cell must be run first. This cell also overwrites the med1 built earlier.
# Keep the first row per (HADM_ID, ITEMID) ...
med1 = m_p_df.groupby(["HADM_ID", "ITEMID"]).nth(0).reset_index()
med1 = med1.sort_values("MedTimeFromAdmit")
# ... drop events whose end time is not after admission ...
med1 = med1[med1["MedTimeFromAdmit"].dt.total_seconds()>0]
# ... then keep the first row per (SUBJECT_ID, ITEMID) in ADMITTIME order.
med1 = med1.sort_values(by=["ADMITTIME"]).groupby(["SUBJECT_ID", "ITEMID"]).nth(0).reset_index().sort_values(by=["MedTimeFromAdmit"])

# Number of rows per medication label
med1.groupby('LABEL')['SUBJECT_ID'].count()

LABEL
ACD-A Citrate (1000ml)      17205
ACD-A Citrate (500ml)         316
Abciximab (Reopro)              7
Acetaminophen-IV             4523
Acyclovir                    4258
                            ...  
Vitamin K (Phytonadione)      325
Vivonex (1/2)                  42
Vivonex (3/4)                  12
Vivonex (Full)               2082
Voriconazole                  873
Name: SUBJECT_ID, Length: 278, dtype: int64

In [76]:

# Same per-label row counts as the previous cell, most frequent labels first.
med1.groupby('LABEL')['SUBJECT_ID'].count().sort_values(ascending=False)

LABEL
NaCl 0.9%              15888
Dextrose 5%            13861
PO Intake              13360
Solution               11192
Magnesium Sulfate       9276
                       ...  
Isosource 1.5 (2/3)        1
Ensure (1/4)               1
Ensure (3/4)               1
Pulmocare (1/4)            1
Quinine                    1
Name: SUBJECT_ID, Length: 278, dtype: int64

In [80]:
# Flatten LAB_MAPPING ({group: {lab name: iterable of item ids}}) into {item id: lab name}
# and list the item ids. These two definitions used to be commented out, so the
# cell only worked with a leftover kernel variable and failed on a fresh kernel.
final_mapping_lab_itemids = {v2:k1 for k, v in constants.LAB_MAPPING.items() for k1, v1 in v.items() for v2 in v1}
final_itemids_list = list(final_mapping_lab_itemids.keys())
final_itemids_list

[50861,
 769,
 220644,
 189,
 50878,
 770,
 220587,
 50852,
 51224,
 51225,
 51223,
 814,
 220228,
 51222,
 50811,
 51237,
 815,
 1530,
 227467,
 51465,
 50883,
 803,
 225651,
 50885,
 1538,
 848,
 225690,
 50884,
 786,
 1522,
 3746,
 51029,
 50893,
 225625,
 50808,
 816,
 225667,
 3766,
 791,
 1525,
 220615,
 50912,
 50841,
 50931,
 807,
 811,
 1529,
 50809,
 3745,
 225664,
 220621,
 226537,
 818,
 225668,
 1531,
 50960,
 51088,
 51265,
 828,
 227457,
 51240,
 829,
 1535,
 227464,
 50971,
 50822,
 837,
 1536,
 220645,
 226534,
 50983,
 50824,
 51007,
 51105,
 51010,
 50973,
 50867,
 50956,
 825,
 1533,
 227466,
 51275,
 813,
 220545,
 51221,
 50810,
 51279,
 833,
 50862,
 772,
 1521,
 227456,
 821,
 1532,
 220635]

In [60]:
def change_col_to_datetime(inputevents_mv, feature):
    """Convert one column of a DataFrame to datetimes, in place.

    Args:
        inputevents_mv: DataFrame holding the column; it is modified in place.
        feature: Name of the column to parse with ``pd.to_datetime``.

    Returns:
        The same DataFrame object that was passed in.
    """
    parsed_column = pd.to_datetime(inputevents_mv[feature])
    inputevents_mv[feature] = parsed_column
    return inputevents_mv


# Load the raw MIMIC-III admissions and MetaVision input-event tables.
admits = pd.read_csv(os.path.join(data, "mimiciii/1.4/raw", "ADMISSIONS.csv.gz"))
inputevents_mv = pd.read_csv(os.path.join(data, "mimiciii/1.4/raw", "INPUTEVENTS_MV.csv.gz"))

### Merge medication and admission data
# Inner join: input events without a matching (HADM_ID, SUBJECT_ID) admission are dropped.
inputevents_mv = pd.merge(inputevents_mv, admits, how="inner", on=["HADM_ID", "SUBJECT_ID"])
inputevents_mv = change_col_to_datetime(inputevents_mv, 'ADMITTIME')
inputevents_mv = change_col_to_datetime(inputevents_mv, 'ENDTIME')
inputevents_mv = change_col_to_datetime(inputevents_mv, 'STARTTIME')
# Elapsed time from admission to the END of each input event.
inputevents_mv['MedTimeFromAdmit'] = inputevents_mv['ENDTIME']-inputevents_mv['ADMITTIME']

### Add medication information from D_ITEMS table in MIMIC III dataset (like label name)
med_data = pd.read_csv(os.path.join(data, "mimiciii/1.4/raw", "D_ITEMS.csv.gz"))
# Keep only items whose DBSOURCE is "metavision".
med_data = med_data[med_data["DBSOURCE"]=="metavision"]

# Merge medication data with medication labels
m_p_df = pd.merge(inputevents_mv, med_data, how="inner", on="ITEMID")

In [None]:
# Medication events with a positive hours_in value, reduced to the columns needed for the unit check.
temp = m_med1[m_med1["hours_in"]>0].copy()
temp = temp[["ITEMID", "HADM_ID", "hours_in", "AMOUNTUOM", "AMOUNT"]]

In [None]:
# After the second groupby, every column holds the number of distinct AMOUNTUOM values seen for that ITEMID.
uom_counts = temp.groupby(["ITEMID", "AMOUNTUOM"]).count().reset_index().groupby("ITEMID").count()

In [None]:
# ITEMIDs recorded with more than one unit of measure.
uom_out_itemids = uom_counts[uom_counts["HADM_ID"]>1].index

In [None]:
uom_out_itemids

Index(['Acetaminophen-IV', 'Acyclovir', 'Amino Acids', 'Ampicillin',
       'Ampicillin/Sulbactam (Unasyn)', 'Azithromycin', 'Bactrim (SMX/TMP)',
       'Caspofungin', 'Cefazolin', 'Cefepime', 'Ceftazidime', 'Ceftriaxone',
       'Ciprofloxacin', 'Clindamycin', 'Coumadin (Warfarin)', 'Daptomycin',
       'Dexmedetomidine (Precedex)', 'Dilantin', 'Enoxaparin (Lovenox)',
       'Epinephrine', 'Erythromycin', 'Factor VIIa', 'Famotidine (Pepcid)',
       'Fentanyl', 'Fentanyl (Concentrate)', 'Fluconazole', 'Folic Acid',
       'Gancyclovir', 'Gentamicin', 'Hydromorphone (Dilaudid)', 'KCl (CRRT)',
       'Levofloxacin', 'Linezolid', 'Magnesium Sulfate', 'Mannitol',
       'Meropenem', 'Metronidazole', 'Micafungin', 'Morphine Sulfate',
       'Nafcillin', 'Naloxone (Narcan)', 'OR FFP Intake', 'Octreotide',
       'Pantoprazole (Protonix)', 'Piperacillin',
       'Piperacillin/Tazobactam (Zosyn)', 'Propofol',
       'Ranitidine (Prophylaxis)', 'Rifampin', 'Sodium Bicarbonate 8.4%',
       'Th

In [None]:
# Rows of the medications with mixed units; show the unit distribution for one example ("Thiamine").
error_cases = temp[temp.ITEMID.isin(uom_out_itemids)]
error_cases[error_cases["ITEMID"]=="Thiamine"].AMOUNTUOM.value_counts()

mg      490
dose      2
Name: AMOUNTUOM, dtype: int64

In [None]:
# Same unit check for labs: number of distinct VALUEUOM values per lab ITEMID.
uom_labs_counts = m_labs[["HADM_ID", "ITEMID", "VALUE", "VALUEUOM"]].groupby(["ITEMID", "VALUEUOM"]).count().reset_index().groupby("ITEMID").count()

In [None]:
# Lab ITEMIDs recorded with more than one unit of measure.
uom_lab_out_itemids = uom_labs_counts[uom_labs_counts["HADM_ID"]>1].index

In [None]:
uom_lab_out_itemids

Index(['Calcium', 'Hemoglobin', 'Red blood cell'], dtype='object', name='ITEMID')

In [None]:
# Rows of the labs with mixed units; show the unit distribution for one example ("Red blood cell").
temp_lab =  m_labs[["HADM_ID", "ITEMID", "VALUE", "VALUEUOM"]]
error_lab_cases = temp_lab[temp_lab.ITEMID.isin(uom_lab_out_itemids)]
error_lab_cases[error_lab_cases["ITEMID"]=="Red blood cell"].VALUEUOM.value_counts()

m/uL      88432
/mic l     1180
Name: VALUEUOM, dtype: int64

In [None]:
# Reference table of per-lab value ranges (normal and "inhuman" bounds plus units).
# NOTE(review): machine-specific absolute path; os.path.join with a single argument is a no-op here.
mapping_inhumane = pd.read_csv(os.path.join("/Users/pavan/Library/CloudStorage/GoogleDrive-f20190038@hyderabad.bits-pilani.ac.in/.shortcut-targets-by-id/1ubRltB1OByqQIbGfHVuao0jJfcCh0KRe/TAU/temp_mapping_140722_partial - temp_mapping_140722_partial.csv"))

In [None]:
mapping_inhumane.dropna()

Unnamed: 0,belinson,ukb,tamsc,parsed_name,sheba,measurement_type,full_name,min_range,max_range,min_inhuman,max_inhuman,unit
9,C.REACTIVE.PROTEIN,crp_30710_0,High_sensetive_CRP,crp,CRP,comp_metabolic_panel,C-Reactive Protein,0.03,5,0,1000,mg/L
20,EOS.abs,eos_abs,EOS_NO,eos_abs,eos_abs,complete_blood_count,Eosinophil count,0,0.4,0,5,10e3/µL
21,EOS..,eos_per_30210_0,EOS_precent,eos_per,EOS,complete_blood_count,Eosinophils,0,7,0,20,%
22,HCT,hct_30030_0,HCT,hct,HCT,complete_blood_count,Hematocrit,36,50,20,60,%
23,HGB,hgb_30020_0,HGB,hgb,HGB,complete_blood_count,Hemoglobin,11.7,17.2,2,25,g/dL
24,LYMP.abs,lym_abs,LYM,lym_abs,lym_abs,complete_blood_count,Lymphocyte Count,1.2,3,0.1,40,10e3/µL
25,LYM.,lym_per_30180_0,LYM_precent,lym_per,LYMPH,complete_blood_count,Lymphocytes,16,43,0.1,99,%
28,MCV,mcv_30040_0,MCV,mcv,MCV,complete_blood_count,Mean corpuscular volume,76,100,40,150,fL
29,MONO.abs,mono_abs,MONO,mono_abs,mono_abs,complete_blood_count,Monocytes Count,0,1.3,0,10,10e3/µL
30,MONO.,mono_per_30190_0,MONO_precent,mono_per,MONO,complete_blood_count,Monocytes,2,13,0.1,99,%


In [None]:
# Maps the lab names used as ITEMID values in m_labs (keys) to the `full_name`
# values looked up in mapping_inhumane (values).
# NOTE(review): some values look misspelled ('Glocuse', 'Aspratate aminotransferase');
# they are matched verbatim against mapping_inhumane.full_name, so only change them
# if the reference table uses the corrected spelling.
map_inhum = {
    'Red blood cell':'Red Blood Cells',
    'Vitamin B12':'Vitamin B12',
    'Alanine aminotransferase':'Alanine amino transferase',
    'Asparate aminotransferase':'Aspratate aminotransferase',
    'Albumin':'Albumin',
    'Prothrombin time INR':'INR(PT)',
    'PTT':'PTT',
    'Bilirubin': 'Bilirubin',
    'Calcium':'Calcium',
    'Hematocrit':'Hematocrit',
    'Creatinine':'Creatinine',
    'Glucose':'Glocuse',
    'Magnesium':'Magnesium',
    'Platelets':'Platelet count',
    'Potassium':'Potassium',
    'Sodium':'Sodium',
    'Uric Acid':'Uric Acid'
}

In [None]:
# Reverse lookup: reference-table full_name -> lab name used in m_labs.
map_inhum_r = {v:k for k,v in map_inhum.items()}

In [None]:
# m_labs.apply(lambda r: remove_inhumane(r), axis=1)
# For every mapped lab, count the values outside [min_range, max_range] of the
# reference table and report how many rows would remain after removing them.
# NOTE(review): the bounds are taken from min_range/max_range although the table
# also has min_inhuman/max_inhuman columns and the printout says "Inhumane" —
# confirm which pair is intended.
for k, v in map_inhum_r.items():
    range_rows = mapping_inhumane[mapping_inhumane.full_name == k]
    lab_rows = m_labs[m_labs['ITEMID'] == v]  # computed once instead of four times
    if range_rows.empty or lab_rows.empty:
        # Previously an IndexError (no reference row) or ZeroDivisionError (no lab rows)
        print(f"Skipping {k}: no range definition or no lab rows found\n")
        continue

    t_row_l = range_rows.iloc[0]
    low, high = float(t_row_l["min_range"]), float(t_row_l["max_range"])
    temp_cal = lab_rows[(lab_rows['VALUE'] < low) | (lab_rows['VALUE'] > high)]

    total = lab_rows.shape[0]
    kept = total - temp_cal.shape[0]
    # `percent` is the share of rows KEPT (inside the range), not the share removed.
    percent = 100 * (kept / total)
    print(f"Measurement type : {temp_cal.VALUEUOM.unique()}")
    print(f"Inhumane values shape: {temp_cal.shape[0]}")
    print(f"Original shape : {total} and New Shape : {kept}")
    print(f"Percentage change in {k} measurement == {round(percent, 2)} %\n")
    # m_labs = m_labs.drop(temp_cal.index)

Measurement type : ['m/uL' '/mic l']
Inhumane values shape: 67830
Original shape : 89612 and New Shape : 21782
Percentage change in Red Blood Cells measurement == 24.31 %

Measurement type : ['pg/mL']
Inhumane values shape: 316
Original shape : 953 and New Shape : 637
Percentage change in Vitamin B12 measurement == 66.84 %

Measurement type : ['IU/L' nan]
Inhumane values shape: 17786
Original shape : 34983 and New Shape : 17197
Percentage change in Alanine amino transferase measurement == 49.16 %

Measurement type : ['IU/L' nan]
Inhumane values shape: 19548
Original shape : 34962 and New Shape : 15414
Percentage change in Aspratate aminotransferase measurement == 44.09 %

Measurement type : ['g/dL' nan]
Inhumane values shape: 13268
Original shape : 20873 and New Shape : 7605
Percentage change in Albumin measurement == 36.43 %

Measurement type : []
Inhumane values shape: 0
Original shape : 77537 and New Shape : 77537
Percentage change in INR(PT) measurement == 100.0 %

Measurement type