In [None]:
import numpy as np
import pandas as pd
import psycopg2
from scipy.stats import ks_2samp
import os
import random
from google.colab import auth
from tabulate import tabulate
# Create the directory that receives the CSV exports written by later cells.
# exist_ok=True makes this a no-op when the directory is already there, while
# still raising if "out" exists but is not a directory (the previous
# try/except silently accepted that case).
os.makedirs("out", exist_ok=True)

In [None]:
def run_query(query, project_id="mimic-project-324510"):
    """Submit ``query`` through ``pd.io.gbq.read_gbq`` and return its result.

    Args:
        query: SQL text to run.
        project_id: Project identifier passed to ``read_gbq``.

    Returns:
        Whatever ``read_gbq`` returns for the query, requested with the
        'standard' SQL dialect.
    """
    gbq_options = {"project_id": project_id, "dialect": 'standard'}
    return pd.io.gbq.read_gbq(query, **gbq_options)
  
def all_but_string(list_tup, combine="AND", variable="diag.long_title", equality="LIKE", then=False):
    """Build a SQL fragment of case-insensitive substring comparisons.

    Every non-empty entry of ``list_tup`` becomes
    ``LOWER ( <variable> ) <equality> '%<entry lower-cased>%'``; the pieces are
    joined with a newline followed by ``combine``. Empty strings are skipped.

    Args:
        list_tup: Iterable of search strings.
        combine: Keyword placed between consecutive comparisons
            (e.g. "AND", "OR", or "WHEN" when building CASE branches).
        variable: SQL expression that is compared.
        equality: Comparison operator placed before the pattern.
        then: When true, append ``THEN '<entry lower-cased>'`` to each comparison.

    Returns:
        The SQL fragment, or "" when there is no non-empty entry.

    Note:
        Values are interpolated into the SQL text, not bound as parameters.
        Only quote characters are handled; ``%``, ``_`` and backslashes inside
        an entry are passed through unchanged, so use trusted input only.
    """
    clauses = []
    for term in list_tup:
        if term == '':
            continue
        needle = term.lower()
        if "'" not in needle:
            quote = "'"
        else:
            # The literal has to be double-quoted because it contains an
            # apostrophe; escape embedded double quotes so that a term holding
            # both quote kinds no longer terminates the literal early.
            quote = '"'
            needle = needle.replace('"', '\\"')
        clause = f"LOWER ( {variable} ) {equality} {quote}%{needle}%{quote} "
        if then:
            clause += f"THEN {quote}{needle}{quote}"
        clauses.append(clause)
    return f"\n{combine} ".join(clauses)

def get_static_data_index(static_data, N_measr, N_unique, N_count, N_dicom):
    """Pick the measurement labels worth keeping and split them in two groups.

    ``static_data`` must contain the columns "label", "dicom_id" and "valuenum".

    Returns:
        A tuple ``(idxs, high_idxs, low_idxs)`` of label indexes where
        * ``idxs`` holds the labels that occur in at least ``N_measr``
          (label, dicom_id) groups and have at least ``N_unique`` distinct
          ``valuenum`` values;
        * ``high_idxs`` holds the labels of ``idxs`` for which more than
          ``N_dicom`` dicom_ids each carry more than ``N_count`` non-null values;
        * ``low_idxs`` holds the labels of ``idxs`` that are not in ``high_idxs``.
    """
    # Non-null value count for every (label, dicom_id) pair.
    per_pair = static_data.groupby(["label", "dicom_id"])["valuenum"].count()

    # How many dicom_id groups does each label appear in?
    dicoms_per_label = per_pair.index.get_level_values("label").value_counts()
    kept = dicoms_per_label[dicoms_per_label >= N_measr].index

    # How many distinct values does each label take?
    distinct_values = static_data.groupby(["label"])["valuenum"].unique().map(len)
    enough_distinct = distinct_values[distinct_values >= N_unique].index
    kept = kept[kept.isin(enough_distinct)]

    # Restrict the pair counts to the surviving labels.
    per_pair = per_pair[kept]

    # Labels with more than N_dicom "dense" dicoms (more than N_count values each).
    dense_pairs = per_pair[per_pair > N_count]
    dense_per_label = dense_pairs.index.get_level_values("label").value_counts()
    high_idxs = dense_per_label[dense_per_label > N_dicom].index
    low_idxs = kept[~kept.isin(high_idxs)]
    return kept, high_idxs, low_idxs

# Authenticate the current user via google.colab's auth helper; this runs
# before any of the queries issued in the cells below.
auth.authenticate_user()

Get the overlapping patients, i.e. patients with a chest X-ray who have also been in the ICU

In [None]:
# Collect the (subject_id, hadm_id) pairs for which a chest X-ray record of the
# same subject has an AcquisitionDate between the ICU stay's intime and outtime
# dates. CXR records and ICU stays are matched on subject_id only; the date
# comparison is done on strings (the DATE cast to STRING with '-' removed).
overlap_query = f"""
SELECT r.subject_id, i.hadm_id
FROM physionet-data.mimic_cxr.record_list r
JOIN physionet-data.mimic_icu.icustays i
ON r.subject_id = i.subject_id
JOIN physionet-data.mimic_cxr.dicom_metadata_string dicom
ON r.dicom_id = dicom.dicom
    AND dicom.AcquisitionDate between REPLACE ( CAST ( DATE ( i.intime ) AS STRING ), '-', '' ) and REPLACE ( CAST ( DATE ( i.outtime ) AS STRING ), '-', '' )
GROUP BY r.subject_id, i.hadm_id
"""
# `overlapping_subjects` is reused by the later cells to restrict their queries
# to these admissions.
overlapping_subjects = run_query(overlap_query)
display(overlapping_subjects)

Unnamed: 0,subject_id,hadm_id
0,10021487,28998349
1,10082560,23284776
2,10144089,24171172
3,10148417,29867930
4,10190445,27005502
...,...,...
18316,16337794,24707597
18317,14574668,24152736
18318,17356318,25063767
18319,19209496,27405242


Find the most common diagnoses in the overlapping dataset (ignoring the different ICD versions)

In [None]:
N_diagnoses = 250

# Render the admission ids as plain integers. Interpolating
# tuple(series.values) relies on the repr of NumPy scalars (which is
# "np.int64(...)" under NumPy >= 2) and leaves a trailing comma when there is a
# single id; neither is valid SQL.
overlap_hadm_ids = ", ".join(
    str(int(hadm_id)) for hadm_id in overlapping_subjects.hadm_id
)

# Count how often each diagnosis title is coded among the overlapping
# admissions and keep the N_diagnoses most frequent titles.
most_common_query = f"""
SELECT COUNT(i.long_title) as `total_cases`, i.long_title
from `physionet-data.mimic_hosp.d_icd_diagnoses` i JOIN `physionet-data.mimic_hosp.diagnoses_icd` d
ON d.icd_code = i.icd_code
    AND d.icd_version = i.icd_version
    AND d.hadm_id IN ({overlap_hadm_ids})
GROUP BY i.long_title
ORDER BY COUNT(i.long_title) DESC LIMIT {N_diagnoses}
"""
most_common = run_query(most_common_query)
display(most_common)
# NOTE(review): unlike the other exports this file is written to the working
# directory rather than "out/" - confirm whether that is intended.
most_common.to_csv("most_common_diagnoses.csv", index=False)

Unnamed: 0,total_cases,long_title
0,5734,Unspecified essential hypertension
1,4860,Other and unspecified hyperlipidemia
2,4817,"Acute kidney failure, unspecified"
3,3973,"Congestive heart failure, unspecified"
4,3798,Atrial fibrillation
...,...,...
245,268,Other late effects of cerebrovascular disease
246,267,Unspecified hereditary and idiopathic peripher...
247,263,"Other and unspecified alcohol dependence, cont..."
248,262,Adrenal cortical steroids causing adverse effe...


In [None]:
def count_diagnoses_query(diagnosis):
    """Return the SQL that counts admissions per diagnosis title matching ``diagnosis``.

    The query is restricted to the admissions in ``overlapping_subjects`` and to
    titles that contain ``diagnosis`` (case-insensitive substring match built by
    ``all_but_string``). It yields one row per matching ``long_title`` with the
    number of distinct ``hadm_id`` values, most frequent first.

    Args:
        diagnosis: Substring to look for in the diagnosis title. An empty
            string adds no title filter.

    Returns:
        The SQL text (str).
    """
    # Plain integers: the repr of a tuple of NumPy scalars is not valid SQL
    # under NumPy >= 2, and a one-element tuple leaves a trailing comma.
    hadm_id_list = ", ".join(
        str(int(hadm_id)) for hadm_id in overlapping_subjects.hadm_id
    )
    title_filter = all_but_string([diagnosis], variable="diag.long_title", equality="LIKE")
    # Avoid a dangling "AND" when there is nothing to filter on.
    title_clause = f"\n    AND {title_filter}" if title_filter else ""
    return f"""
SELECT COUNT ( DISTINCT ( d.hadm_id ) ) as count, diag.long_title
from `physionet-data.mimic_hosp.diagnoses_icd` d
JOIN `physionet-data.mimic_hosp.d_icd_diagnoses` diag
ON d.icd_code = diag.icd_code
    AND d.icd_version = diag.icd_version
    AND d.hadm_id IN ({hadm_id_list}){title_clause}
GROUP BY diag.long_title
ORDER BY COUNT ( DISTINCT ( d.hadm_id ) ) DESC
"""

In [None]:
# Count the admissions per diagnosis title that contains the search string.
diagnosis_ = "Pleural effusion"
count_diagnoses = run_query(count_diagnoses_query(diagnosis_))
# Sum of the per-title counts: the query counts distinct admissions per title,
# so an admission coded with several matching titles contributes once per title.
count_ = count_diagnoses["count"].sum()
print(f"The total count of {diagnosis_} is {count_}")
display(count_diagnoses)

The total count of Pleural effusion is 1362


Unnamed: 0,count,long_title
0,841,Unspecified pleural effusion
1,286,"Pleural effusion, not elsewhere classified"
2,193,Malignant pleural effusion
3,42,Pleural effusion in other conditions classifie...


### Possible diagnoses:

- Atelectasis, total cases: 202

- Cardiomegaly, total cases: 44
    - Heart failure is likely more frequently coded

- Diaphragmatic hernia, total cases: 316

- Infiltration, total cases: *not found*

- Mass (cancer)
    - Sarcoma, total cases: *very few in the chest region*

- Pleural thickening, total cases: *very few*

- Pleural effusion, total cases: 1361

- Pneumonia, total unique cases: 4686
  - Requires multiple modalities:
  - X-ray
  - Test results:
    - CRP
    - White blood cells (leucocytes)
    - Nasopharynx
    - Test spit
  - Time series:
    - O2-levels
    - Pulse
    - Respiratory rate

- Pneumothorax, total cases: 467

- Pulmonary edema, total cases: 218

- Pulmonary fibrosis, total cases: 250

- Pulmonary nodule, total cases: 316 


From "*Large Scale Automated Reading of Frontal and Lateral Chest X-Rays using Dual Convolutional Neural Networks*" (2018)

Find patients that fit the above diagnosis

In [None]:
# Candidate diagnosis search strings:
# 'Atelectasis'
# 'Cardiomegaly'
# 'Diaphragmatic hernia'
# 'Pneumonia'
# 'Pneumothorax'
# 'Pulmonary edema'
# 'Pleural effusion'
# 'Pulmonary fibrosis'
# 'Pulmonary nodule'

# Title variants matched by fewer admissions than this are added to the
# exclusion list of their diagnosis (see the loop below).
rare_disease_filter_N = 30
# Groups to sample. The empty string stands for the group without a target
# diagnosis (the sampling query adds no title filter for it).
diagnoses_string = [
    'Pleural effusion',
    '',
]
# For each group: title substrings that disqualify an admission from the group.
exceptions = {
    diagnoses_string[0] : [
        'Atelectasis',
        'Cardiomegaly',
        'Diaphragmatic hernia',
        'Pneumonia',
        'Pneumothorax',
        'Pulmonary fibrosis',
        'Pulmonary nodule',
    ],
    diagnoses_string[-1] : [
        'Atelectasis',
        'Cardiomegaly',
        'Diaphragmatic hernia',
        'Pneumonia',
        'Pneumothorax',
        'Pulmonary edema',
        'Pleural effusion',
        'Pulmonary fibrosis',
        'Pulmonary nodule',
    ]
}
# Extend the exclusions of every named diagnosis with its rare title variants.
# Note that the loop rebinds the notebook-level name `diagnosis_`.
for diagnosis_ in diagnoses_string:
    if diagnosis_ != '':
        diag_variants = run_query(count_diagnoses_query(diagnosis_))
        exceptions[diagnosis_].extend(diag_variants[diag_variants["count"] < rare_disease_filter_N].long_title.to_list())

# Sampling

In [None]:
N = np.inf
random_state = 3


def subj_query(diagnosis, not_diagnosis):
    """Return the SQL selecting the admissions that belong to one diagnosis group.

    An admission qualifies when it is one of ``overlapping_subjects``, has a
    diagnosis title containing ``diagnosis`` (no title filter when ``diagnosis``
    is the empty string), and has no diagnosis title containing any entry of
    ``not_diagnosis`` or of ``exceptions[diagnosis]``.

    Args:
        diagnosis: Target search string of the group ('' for "no target").
        not_diagnosis: Search strings of the other groups; empty strings are ignored.

    Returns:
        The SQL text (str); it yields distinct (subject_id, hadm_id) rows
        ordered by hadm_id.
    """
    # Plain integers: the repr of a tuple of NumPy scalars is not valid SQL
    # under NumPy >= 2, and a one-element tuple leaves a trailing comma.
    hadm_id_list = ", ".join(
        str(int(hadm_id)) for hadm_id in overlapping_subjects.hadm_id
    )

    wanted = all_but_string([diagnosis], variable="diag.long_title", equality="LIKE")
    wanted_clause = f"\n    AND ({wanted})" if wanted else ""

    # One OR-list for everything that disqualifies an admission. Building it
    # from the merged list avoids the empty "AND ()" / leading "OR" that the
    # piecewise construction produced when one of the two lists was empty.
    unwanted = all_but_string(
        list(not_diagnosis) + list(exceptions[diagnosis]),
        combine="OR",
        variable="diag2.long_title",
        equality="LIKE",
    )
    unwanted_clause = ""
    if unwanted:
        unwanted_clause = f"""
    WHERE NOT EXISTS (
        SELECT d2.hadm_id
        from `physionet-data.mimic_hosp.diagnoses_icd` d2
        JOIN `physionet-data.mimic_hosp.d_icd_diagnoses` diag2
        ON d2.icd_code = diag2.icd_code
            AND d2.icd_version = diag2.icd_version
            AND d.hadm_id = d2.hadm_id
            AND ({unwanted}) )"""

    return f"""
SELECT d.subject_id, d.hadm_id
from `physionet-data.mimic_hosp.diagnoses_icd` d
JOIN `physionet-data.mimic_hosp.d_icd_diagnoses` diag
ON d.icd_code = diag.icd_code
    AND d.icd_version = diag.icd_version
    AND d.hadm_id IN ({hadm_id_list}){wanted_clause}{unwanted_clause}
GROUP BY d.subject_id, d.hadm_id
ORDER BY d.hadm_id
"""


# One result table per group; every group excludes the other groups' targets.
diags_tables = [
    run_query(subj_query(
        target,
        diagnoses_string[:position] + diagnoses_string[position + 1:]
    ))
    for position, target in enumerate(diagnoses_string)
]
table_sizes = [len(table) for table in diags_tables]
N = min(*table_sizes, N) # Ensure equal sample size
assert N != 0, f"Length of diagnoses are, respectively {table_sizes}"

sample = pd.concat(
    [table.sample(n=N, replace=False, random_state=random_state) for table in diags_tables],
    ignore_index=True
)

# Share of distinct admissions among all sampled rows; the denominator uses the
# actual number of groups instead of a hard-coded 2.
print(f"{100 * len(sample.hadm_id.unique())/(len(diags_tables)*N)}% unique values, 100% suggests entirely separate populations")
# Shuffle so that the groups are interleaved.
sample = sample.sample(
    frac=1, replace=False, random_state = random_state
).reset_index(drop=True)
display(sample)

100.0% unique values, 100% suggests entirely separate populations


Unnamed: 0,subject_id,hadm_id
0,15185501,23389640
1,13261557,24351231
2,11031232,27641087
3,17347760,23962907
4,12435236,29255686
...,...,...
1375,19720782,27128215
1376,14589196,20789590
1377,10867166,20748929
1378,17391981,20265051


# CXR :
Combine *subject_id*, *study_id*, *dicom_id*, *AcquisitionDate*, image properties?  
Feature vectors linked to single X-rays.

Join tables **record_list** and **dicom_metadata_string**


# IV :
Combine with **CXR** for a given time window based on *AcquisitionDate* above.

Which features to extract?  
**Lab tests**, **chart events**


In [None]:
# Named (non-empty) target diagnoses.
target_terms = [term for term in diagnoses_string if term != '']
# CASE expression mapping a diagnosis title to the lower-cased target it
# contains, or to 'None'. It is built once and used in both SELECT and GROUP BY.
label_case = "CASE WHEN {} ELSE 'None' END".format(
    all_but_string(target_terms, combine="WHEN", variable="diag.long_title", equality="LIKE", then=True)
)
# Plain integers: the repr of a tuple of NumPy scalars is not valid SQL under
# NumPy >= 2, and a one-element tuple leaves a trailing comma.
sample_hadm_ids = ", ".join(str(int(hadm_id)) for hadm_id in sample.hadm_id)

labels_query = f"""
SELECT d.subject_id, d.hadm_id, {label_case} as long_title
FROM `physionet-data.mimic_hosp.diagnoses_icd` d
JOIN `physionet-data.mimic_hosp.d_icd_diagnoses` diag
ON d.icd_code = diag.icd_code
    AND d.icd_version = diag.icd_version
    AND d.hadm_id IN ({sample_hadm_ids})
GROUP BY d.subject_id, d.hadm_id, {label_case}
"""
labels_ = run_query(labels_query)

# Keep the rows that carry a target label, then add the remaining rows of the
# admissions that have no target label at all.
labels = labels_[labels_.long_title.isin([term.lower() for term in target_terms])]
unlabelled = labels_[~labels_.hadm_id.isin(labels.hadm_id)]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
labels = pd.concat([labels, unlabelled])

# Attach the label to every sampled admission. The former unseeded shuffle and
# the no-op rename are dropped; the stable sort keeps the result deterministic.
labels = (
    sample
    .merge(labels[["hadm_id", "long_title"]], on="hadm_id")
    .sort_values("hadm_id", kind="stable")
)
display(labels)
labels.to_csv("out/labels.csv", index=False)

Unnamed: 0,subject_id,hadm_id,long_title
558,16003661,20001305,
1273,14583219,20004577,pleural effusion
1084,13251065,20013244,pleural effusion
263,14861926,20031226,
413,18190098,20034658,
...,...,...,...
412,12897943,29942827,
198,18775665,29946087,
618,18322831,29964109,pleural effusion
543,15356161,29975784,pleural effusion


In [None]:
# Plain integers: the repr of a tuple of NumPy scalars is not valid SQL under
# NumPy >= 2, and a one-element tuple leaves a trailing comma.
sample_hadm_ids = ", ".join(str(int(hadm_id)) for hadm_id in sample.hadm_id)

# Every distinct diagnosis title of every sampled admission.
diags_query = f"""
SELECT d.subject_id, d.hadm_id, diag.long_title
FROM `physionet-data.mimic_hosp.diagnoses_icd` d
JOIN `physionet-data.mimic_hosp.d_icd_diagnoses` diag
ON d.icd_code = diag.icd_code
    AND d.icd_version = diag.icd_version
    AND d.hadm_id IN ({sample_hadm_ids})
GROUP BY d.subject_id, d.hadm_id, diag.long_title
"""
diags_ = run_query(diags_query)

# Indicator matrix: one row per admission, one column per diagnosis title,
# 1 where the admission carries the title and 0 elsewhere.
diags_table = diags_.assign(ones=1).pivot_table(
    index='hadm_id',
    columns='long_title',
    values='ones',
    fill_value=0,
)

# One row per sampled admission. A single label-based lookup replaces the
# former row-by-row `apply`; like before, an admission missing from the query
# result raises KeyError.
hadm_diags = diags_table.loc[sample.hadm_id.tolist()].sort_index()
display(hadm_diags)
hadm_diags.to_csv("out/hadm_diags.csv")

long_title,Abdominal aneurysm without mention of rupture,"Abdominal aortic aneurysm, without rupture","Abdominal or pelvic swelling, mass, or lump, unspecified site","Abdominal pain, generalized","Abdominal pain, other specified site","Abdominal pain, right lower quadrant","Abdominal pain, right upper quadrant","Abdominal pain, unspecified site",Abnormal coagulation profile,Abnormal findings on diagnostic imaging of other specified body structures,...,Weakness,"Wedge compression fracture of T11-T12 vertebra, initial encounter for closed fracture","Wedge compression fracture of first lumbar vertebra, initial encounter for closed fracture","Wedge compression fracture of second thoracic vertebra, initial encounter for closed fracture",Wegener's granulomatosis with renal involvement,Wheelchair dependence,Wheezing,Zoster encephalitis,Zoster without complications,Zygomycosis [Phycomycosis or Mucormycosis]
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20001305,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20004577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20013244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20031226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20034658,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29942827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29946087,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29964109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
29975784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# similarity_graph = pd.DataFrame(
#     hadm_diags.values @ hadm_diags.values.T,
#     index = hadm_diags.index.tolist(),
#     columns = hadm_diags.index.tolist()
# )
# display(similarity_graph)
# similarity_graph.to_csv("out/graph.csv")
# del similarity_graph
# del hadm_diags

Unnamed: 0,20001305,20004577,20013244,20031226,20034658,20041510,20047797,20062068,20063165,20063597,20067636,20076965,20085087,20092911,20095688,20098892,20111030,20111194,20126001,20126941,20129736,20133918,20136761,20140272,20143402,20147457,20149440,20158798,20162773,20165447,20167211,20174788,20175786,20179544,20185129,20188386,20197472,20202014,20205059,20223956,...,29703946,29704310,29706220,29713790,29717855,29719185,29724556,29729904,29741215,29742372,29748480,29765924,29773405,29775724,29784336,29785944,29789943,29809639,29817669,29834280,29834753,29842619,29844312,29859589,29873294,29873933,29883383,29884966,29892865,29899587,29905891,29919857,29925981,29933194,29940125,29942827,29946087,29964109,29975784,29981093
20001305,31,0,0,0,0,0,0,1,0,2,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,2,0,4,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,1
20004577,0,8,1,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,1,2,1,2,0,1,1,0,0,1,1,...,0,1,1,0,1,0,2,1,1,0,0,0,0,0,0,1,2,0,1,0,0,0,1,1,2,1,1,0,1,1,1,0,1,2,1,0,0,1,1,1
20013244,0,1,29,4,5,2,3,1,3,0,0,1,3,0,3,6,6,0,0,4,9,6,3,0,2,3,0,5,4,2,5,3,6,0,3,2,3,0,0,1,...,1,1,4,0,3,0,5,2,3,1,4,7,2,5,0,3,3,0,2,0,2,5,0,7,8,4,3,4,2,1,2,5,3,6,2,1,1,4,5,5
20031226,0,1,4,29,1,2,3,0,1,0,1,3,2,0,4,1,5,0,0,4,4,0,2,2,0,2,0,4,3,2,2,1,4,0,4,1,2,0,2,1,...,1,0,3,1,1,0,5,1,0,0,3,2,3,1,0,2,3,2,0,1,3,3,1,3,4,3,3,2,2,1,0,2,1,3,0,4,0,8,1,3
20034658,0,0,5,1,26,0,2,0,2,0,0,1,0,0,5,2,3,0,0,0,2,3,1,1,1,1,2,1,3,3,4,0,6,0,1,2,2,0,2,0,...,1,0,0,1,0,0,3,0,3,0,3,3,2,4,1,1,2,2,1,1,2,1,1,1,2,0,1,1,2,0,2,2,3,2,1,1,0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29942827,0,0,1,4,1,2,2,0,2,0,0,1,3,0,3,1,5,1,0,2,3,0,3,1,1,2,1,1,4,1,1,0,2,0,2,1,0,0,1,0,...,1,0,1,1,0,0,1,0,1,0,0,1,3,1,0,1,1,0,0,1,1,3,0,1,2,0,1,1,1,0,0,0,0,1,0,16,0,2,1,1
29946087,1,0,1,0,0,0,2,0,0,0,1,0,0,0,2,1,0,0,0,0,0,2,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,2,1,1,0,0,1,0,0,0,2,0,0,1,1,0,1,0,2,3,1,0,14,0,0,1
29964109,0,1,4,8,3,2,1,0,3,0,2,4,1,0,4,2,7,1,0,4,3,1,2,2,0,2,1,2,3,0,3,1,4,0,1,2,2,0,1,0,...,2,1,1,2,1,1,4,1,2,0,1,2,2,1,1,0,3,0,2,1,3,4,0,2,4,2,1,1,4,0,1,3,1,1,1,2,0,23,3,1
29975784,0,1,5,1,1,0,1,0,1,1,1,4,1,0,4,2,5,0,0,2,3,1,2,0,0,2,0,4,3,0,4,1,2,0,2,1,5,0,0,0,...,0,1,3,0,1,1,3,1,2,0,0,3,2,1,0,1,1,0,1,0,2,2,0,3,8,1,1,1,3,0,1,1,0,2,2,1,0,3,15,4


## Combine the info
**subject_id** in the list of patients with the diagnosis above,  
at the correct time, in the correct admission, between admission and discharge,  
with the correct image while in the ICU,  
with the correct radiology report.

In [None]:
# Plain integers: the repr of a tuple of NumPy scalars is not valid SQL under
# NumPy >= 2, and a one-element tuple leaves a trailing comma.
sample_hadm_ids = ", ".join(str(int(hadm_id)) for hadm_id in sample.hadm_id)

# For every ICU stay of a sampled admission, list the 'AP' view images of the
# same subject whose AcquisitionDate lies between the stay's intime and outtime
# dates, together with the image path and the path of the study.
image_icu_query = f"""
select
    a.subject_id,
    a.hadm_id,
    a.stay_id,
    b.dicom_id,
    d.AcquisitionDate as dicom_date,
    d.AcquisitionTime as dicom_time,
    b.path as dicom_path,
    s.path as study_path
from `physionet-data.mimic_icu.icustays` a
JOIN `physionet-data.mimic_cxr.record_list` b
ON a.subject_id = b.subject_id
    AND a.hadm_id IN ({sample_hadm_ids})
JOIN `physionet-data.mimic_cxr.dicom_metadata_string` d
ON b.dicom_id = d.dicom
    AND d.ViewPosition = 'AP'
    AND d.AcquisitionDate between REPLACE ( CAST ( DATE ( a.intime ) AS STRING ), '-', '' ) and REPLACE ( CAST ( DATE ( a.outtime ) AS STRING ), '-', '' )
JOIN `physionet-data.mimic_cxr.study_list` s
ON b.study_id = s.study_id
ORDER BY hadm_id
"""
# NOTE(review): the query does not deduplicate, so despite the name an image
# could presumably appear once per matching ICU stay - confirm.
unique_images = run_query(image_icu_query)
display(unique_images)
unique_images.to_csv('out/images.csv', index=False)

Unnamed: 0,subject_id,hadm_id,stay_id,dicom_id,dicom_date,dicom_time,dicom_path,study_path
0,16003661,20001305,36916968,67987284-c552a8ae-8e9b95f8-b9d11936-23c53d16,21780326,171350.546,files/p16/p16003661/s50124445/67987284-c552a8a...,files/p16/p16003661/s50124445.txt
1,16003661,20001305,36916968,53d303f7-0f40eb96-e78ce57d-fb450ece-04f676f8,21780327,110829.890,files/p16/p16003661/s50125053/53d303f7-0f40eb9...,files/p16/p16003661/s50125053.txt
2,16003661,20001305,36916968,fc3ce679-a44a58d2-c6630b2c-01629f50-a35f6511,21780326,043022.125,files/p16/p16003661/s50454860/fc3ce679-a44a58d...,files/p16/p16003661/s50454860.txt
3,16003661,20001305,36916968,f9e1801d-8d08e4d8-8fd5e4f4-d7b5e501-c81561f4,21780327,044440.031,files/p16/p16003661/s53044201/f9e1801d-8d08e4d...,files/p16/p16003661/s53044201.txt
4,16003661,20001305,36916968,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,025044.828,files/p16/p16003661/s54974641/649e8d38-22bccc2...,files/p16/p16003661/s54974641.txt
...,...,...,...,...,...,...,...,...
5449,18775665,29946087,33560412,a4fd21e2-262e3b5f-cf7fe4f2-80f05fc3-c24146f3,22071007,153101.468,files/p18/p18775665/s55047642/a4fd21e2-262e3b5...,files/p18/p18775665/s55047642.txt
5450,18322831,29964109,30754159,8222faef-a8d9460d-0862c2b2-1fdb8b1a-93f73d21,21320328,121640.109,files/p18/p18322831/s51578095/8222faef-a8d9460...,files/p18/p18322831/s51578095.txt
5451,15356161,29975784,38014038,4f1e33f9-ae460af2-ae4d0ffa-4ae94555-843a4ccb,21360901,050546.734,files/p15/p15356161/s53165593/4f1e33f9-ae460af...,files/p15/p15356161/s53165593.txt
5452,15356161,29975784,38014038,b5f3657a-808dbd8f-1d6a749b-01b9a55b-c8fe6160,21360830,134845.515,files/p15/p15356161/s59217608/b5f3657a-808dbd8...,files/p15/p15356161/s59217608.txt


Grab values from mimic_icu.datetimeevents, mimic_icu.chartevents, and mimic_icu.procedureevents that overlap in time window with the AcquisitionDate + AcquisitionTime of each dicom_id

In [None]:
def time_query(period, time_col="charttime"):
    """Return the SQL condition that ties an event row (alias ``a``) to an X-ray (alias ``d``).

    Args:
        period: Either
            * "day"  - the X-ray's AcquisitionDate equals the event date;
            * "stay" - the AcquisitionDate lies between the ICU intime date
              (alias ``icu``) and the event date;
            * an int - the AcquisitionDate lies within the ``period`` days up to
              and including the event date.
        time_col: Name of the event timestamp column on alias ``a``.

    Returns:
        The SQL condition (str), to be used in a JOIN ... ON clause.

    Raises:
        AssertionError: If ``period`` is neither an int nor "day"/"stay".

    NOTE(review): the original comment described the int form as "the number of
    days prior to the day of CXR", but the condition (unchanged in direction
    here) anchors the window on the event date, i.e. it selects events on or up
    to ``period`` days *after* the X-ray. Confirm which direction is intended.
    """
    # bool is a subclass of int but was never accepted as a period.
    if isinstance(period, int) and not isinstance(period, bool):
        # Real date arithmetic: subtracting days from the YYYYMMDD value
        # interpreted as a number is wrong across month and year boundaries
        # (e.g. 20210301 - 3 = 20210298). SAFE.PARSE_DATE yields NULL for an
        # unparsable AcquisitionDate, which simply fails the condition.
        event_date = f"DATE ( a.{time_col} )"
        return (
            f"SAFE.PARSE_DATE ( '%Y%m%d', d.AcquisitionDate ) "
            f"between DATE_SUB ( {event_date}, INTERVAL {period} DAY ) and {event_date}"
        )
    assert period in ("day", "stay"), "Period given as string type must be either 'day' or 'stay'."
    if period == "day":
        return f"d.AcquisitionDate = REPLACE ( CAST ( DATE ( a.{time_col} ) AS STRING ), '-', '' )"
    return f"d.AcquisitionDate between REPLACE ( CAST ( DATE ( icu.intime ) AS STRING ), '-', '' ) and REPLACE ( CAST ( DATE ( a.{time_col} ) AS STRING ), '-', '' )"
    
def extract_icu_data(
    table,
    period="day",
    id_table="d_items",
    value_col="valuenum",
    time_col="charttime",
    hadm_sample=None
):
    """Query a ``mimic_icu`` event table for values recorded around each X-ray.

    Event rows of ``table`` are joined to the ICU stay, to the subject's X-ray
    records, to the X-ray metadata (restricted by ``time_query(period, ...)``)
    and to ``id_table`` for the item label.

    Args:
        table: Name of the event table inside ``physionet-data.mimic_icu``.
        period: Time window specification, see ``time_query``.
        id_table: Name of the item dictionary table inside
            ``physionet-data.mimic_icu`` (joined on ``itemid``).
        value_col: Column of ``table`` holding the value to return.
        time_col: Timestamp column of ``table``.
        hadm_sample: Iterable of admission ids to restrict the query to.
            ``None`` (default) means the ``hadm_id`` column of the notebook's
            current ``sample`` frame, looked up at call time.

    Returns:
        The result of ``run_query`` with the columns dicom_id, ``time_col``
        (time of day with ':' removed), ``value_col`` and label.
    """
    # Resolve the default when called; a default evaluated in the signature
    # would freeze the ids of whatever `sample` was when this cell ran.
    if hadm_sample is None:
        hadm_sample = sample.hadm_id
    # Plain integers: the repr of a tuple of NumPy scalars is not valid SQL
    # under NumPy >= 2, and a one-element tuple leaves a trailing comma.
    hadm_id_list = ", ".join(str(int(hadm_id)) for hadm_id in hadm_sample)
    query_ = f"""
select
    b.dicom_id,
    REPLACE ( CAST ( TIME ( a.{time_col} ) AS STRING ), ':', '' ) as {time_col},
    a.{value_col},
    items.label
FROM `physionet-data.mimic_icu.icustays` icu
JOIN `physionet-data.mimic_icu.{table}` a
on icu.stay_id = a.stay_id
JOIN `physionet-data.mimic_cxr.record_list` b
ON b.subject_id = a.subject_id
    AND a.hadm_id IN ({hadm_id_list})
JOIN `physionet-data.mimic_cxr.dicom_metadata_string` d
ON b.dicom_id = d.dicom
    AND {time_query(period, time_col=time_col)}
JOIN `physionet-data.mimic_icu.{id_table}` items
ON a.itemid = items.itemid
ORDER BY a.hadm_id, a.{time_col}
"""
    return run_query(query_)

def extract_hosp_data(
    table,
    id_table,
    period="day",
    id_col="itemid",
    value_col="valuenum",
    time_col="charttime",
    hadm_sample=None
):
    """Query a ``mimic_hosp`` event table for values recorded around each X-ray.

    Event rows of ``table`` are joined to the ICU stays of the same admission,
    to the subject's X-ray records, to the X-ray metadata (restricted by
    ``time_query(period, ...)``) and to ``id_table`` for the item label.

    Args:
        table: Name of the event table inside ``physionet-data.mimic_hosp``.
        id_table: Name of the item dictionary table inside
            ``physionet-data.mimic_hosp``.
        period: Time window specification, see ``time_query``.
        id_col: Column used to join ``table`` with ``id_table``.
        value_col: Column of ``table`` holding the value to return.
        time_col: Timestamp column of ``table``.
        hadm_sample: Iterable of admission ids to restrict the query to.
            ``None`` (default) means the ``hadm_id`` column of the notebook's
            current ``sample`` frame, looked up at call time.

    Returns:
        The result of ``run_query`` with the columns subject_id, hadm_id,
        dicom_id, date (event date with '-' removed), ``time_col`` (time of day
        with ':' removed), ``value_col`` and label.
    """
    # Resolve the default when called; a default evaluated in the signature
    # would freeze the ids of whatever `sample` was when this cell ran.
    if hadm_sample is None:
        hadm_sample = sample.hadm_id
    # Plain integers: the repr of a tuple of NumPy scalars is not valid SQL
    # under NumPy >= 2, and a one-element tuple leaves a trailing comma.
    hadm_id_list = ", ".join(str(int(hadm_id)) for hadm_id in hadm_sample)
    query_ = f"""
select
    a.subject_id,
    a.hadm_id,
    b.dicom_id,
    REPLACE ( CAST ( DATE ( a.{time_col} ) AS STRING ), '-', '' ) as date,
    REPLACE ( CAST ( TIME ( a.{time_col} ) AS STRING ), ':', '' ) as {time_col},
    a.{value_col},
    items.label
FROM `physionet-data.mimic_icu.icustays` icu
JOIN `physionet-data.mimic_hosp.{table}` a
ON icu.hadm_id = a.hadm_id
    AND icu.hadm_id IN ({hadm_id_list})
JOIN `physionet-data.mimic_cxr.record_list` b
ON a.subject_id = b.subject_id
JOIN `physionet-data.mimic_cxr.dicom_metadata_string` d
ON b.dicom_id = d.dicom
    AND {time_query(period, time_col=time_col)}
JOIN `physionet-data.mimic_hosp.{id_table}` items
ON a.{id_col} = items.{id_col}
ORDER BY a.hadm_id, a.{time_col}
"""
    return run_query(query_)

## Chart data extract: REPLACE WITH VITAL SIGNS
- Vital signs
- ?

In [None]:
# Age per admission. The query is not restricted to the sampled admissions.
# NOTE(review): `age` is not used by any other cell visible in this part of the
# notebook - confirm it is still needed.
age = run_query(
f"""
select
    subject_id,
    hadm_id,
    age
from `physionet-data.mimic_derived.age`
order by hadm_id
"""
)

# Plain integers: the repr of a tuple of NumPy scalars is not valid SQL under
# NumPy >= 2, and a one-element tuple leaves a trailing comma.
sample_hadm_ids = ", ".join(str(int(hadm_id)) for hadm_id in sample.hadm_id)

# Vital-sign rows of the sampled admissions' ICU stays, paired with every X-ray
# of the same subject that was acquired on the day of the measurement.
vital_signs = run_query(
f"""
select
    b.hadm_id,
    r.dicom_id,
    a.*
except (temperature_site)
from `physionet-data.mimic_derived.vitalsign` a
join `physionet-data.mimic_icu.icustays` b
on a.stay_id = b.stay_id
    and b.hadm_id IN ({sample_hadm_ids})
JOIN `physionet-data.mimic_cxr.record_list` r
ON r.subject_id = b.subject_id
JOIN `physionet-data.mimic_cxr.dicom_metadata_string` d
ON d.dicom = r.dicom_id
    AND d.AcquisitionDate = REPLACE ( CAST ( DATE ( a.charttime ) AS STRING ), '-', '' )
order by b.hadm_id, a.charttime
"""
)

In [None]:
# Identifier / timestamp columns that are not measurements. "dicom_id" has to
# be listed as well: it is never null, so leaving it in the mask made
# `is_nan.all(axis=1)` false for every row and the filter below dropped nothing.
id_columns = ["hadm_id", "dicom_id", "subject_id", "stay_id", "charttime"]
is_nan = vital_signs.loc[:, ~vital_signs.columns.isin(id_columns)].isna()
nan_count = is_nan.sum()
print(f"Percent NaNs :\n{100 * nan_count / len(vital_signs)}")
# Drop the rows in which every measurement is missing.
vital_signs = vital_signs[~is_nan.all(axis=1)]
display(vital_signs)

Percent NaNs :
dicom_id        0.000000
heart_rate     30.693351
sbp            31.976744
dbp            31.996086
mbp            30.525531
sbp_ni         60.112411
dbp_ni         60.122651
mbp_ni         60.101602
resp_rate      30.192168
temperature    80.661494
spo2           31.901652
glucose        85.359305
dtype: float64


Unnamed: 0,hadm_id,dicom_id,subject_id,stay_id,charttime,heart_rate,sbp,dbp,mbp,sbp_ni,dbp_ni,mbp_ni,resp_rate,temperature,spo2,glucose
0,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,16003661,36916968,2178-03-25 05:00:00,,,,,,,,21.0,,,
1,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,16003661,36916968,2178-03-25 05:32:00,76.0,,,,,,,18.0,,,
2,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,16003661,36916968,2178-03-25 05:35:00,,89.0,36.0,49.0,89.0,36.0,49.0,,,,
3,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,16003661,36916968,2178-03-25 05:49:00,,,,,,,,,,100.0,
4,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,16003661,36916968,2178-03-25 05:56:00,,,,,,,,,36.83,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175779,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,10449297,32514622,2174-09-22 20:00:00,67.0,,,,,,,17.0,36.28,96.0,
175780,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,10449297,32514622,2174-09-22 20:12:00,,98.0,50.0,61.0,98.0,50.0,61.0,,,,
175781,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,10449297,32514622,2174-09-22 21:00:00,65.0,106.0,51.0,63.0,106.0,51.0,63.0,16.0,,95.0,
175782,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,10449297,32514622,2174-09-22 22:00:00,66.0,116.0,51.0,65.0,116.0,51.0,65.0,18.0,,96.0,


In [None]:
# Index by image so the measurements can be grouped per X-ray. The explicit
# check keeps the cell re-runnable once "dicom_id" has already become the index.
if "dicom_id" in vital_signs.columns:
    vital_signs = vital_signs.set_index("dicom_id")

# NOTE(review): `identity` and `aggdict` are not used by any cell visible in
# this part of the notebook; they are kept in case a later cell relies on them.
identity = lambda x: x
aggdict = {
    "hadm_id": identity,
    "subject_id": identity,
    "stay_id": identity,
    "charttime": identity,
    "heart_rate": [identity, lambda x: x],
}

id_columns = {"hadm_id", "subject_id", "stay_id", "charttime"}
summary_functions = [
    "first",
    "last",
    np.nanmedian,
    np.nanstd,
    np.nanmean,
    np.nanmax,
    np.nanmin
]
# Walk the columns in frame order: iterating a set gave a column order (and
# therefore a CSV layout) that could change from run to run.
value_columns = [column for column in vital_signs.columns if column not in id_columns]
summaries = [
    vital_signs[column].groupby(level=0).agg(summary_functions).add_prefix(f"{column}_")
    for column in value_columns
]
# Assemble once instead of inserting the columns block by block, and keep the
# rows in order of first appearance of each image.
image_order = vital_signs.index.unique()
if summaries:
    vital_signs_fin = pd.concat(summaries, axis=1).reindex(image_order)
else:
    vital_signs_fin = pd.DataFrame(index=image_order)
display(vital_signs_fin)
vital_signs_fin.to_csv("out/vital_signs.csv")

Unnamed: 0_level_0,temperature_first,temperature_last,temperature_nanmedian,temperature_nanmean,temperature_nanmax,temperature_nanmin,glucose_first,glucose_last,glucose_nanmedian,glucose_nanmean,...,sbp_nanmedian,sbp_nanmean,sbp_nanmax,sbp_nanmin,spo2_first,spo2_last,spo2_nanmedian,spo2_nanmean,spo2_nanmax,spo2_nanmin
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,36.83,36.61,36.670,36.668000,36.83,36.56,154.0,131.0,149.0,144.666667,...,91.0,96.035714,164.0,67.0,100.0,94.0,96.0,94.346154,100.0,75.0
67987284-c552a8ae-8e9b95f8-b9d11936-23c53d16,36.67,36.11,36.500,36.418333,36.67,36.06,131.0,101.0,101.0,110.333333,...,93.0,98.482759,195.0,56.0,93.0,95.0,95.0,94.148148,98.0,79.0
fc3ce679-a44a58d2-c6630b2c-01629f50-a35f6511,36.67,36.11,36.500,36.418333,36.67,36.06,131.0,101.0,101.0,110.333333,...,93.0,98.482759,195.0,56.0,93.0,95.0,95.0,94.148148,98.0,79.0
53d303f7-0f40eb96-e78ce57d-fb450ece-04f676f8,37.00,37.00,37.000,37.034000,37.78,36.56,96.0,132.0,114.0,114.000000,...,97.0,97.714286,140.0,47.0,96.0,94.0,94.0,94.375000,100.0,91.0
f9e1801d-8d08e4d8-8fd5e4f4-d7b5e501-c81561f4,37.00,37.00,37.000,37.034000,37.78,36.56,96.0,132.0,114.0,114.000000,...,97.0,97.714286,140.0,47.0,96.0,94.0,94.0,94.375000,100.0,91.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0a51ce08-6f9ca6c2-ba6175fc-948bb295-c1289bb2,36.33,36.44,36.610,36.528889,36.72,36.33,271.0,255.0,173.0,186.760000,...,113.0,115.407407,146.0,90.0,99.0,96.0,97.0,96.851852,99.0,95.0
0fbce255-918e6730-854ac62f-05b91758-0b3de7f0,36.33,36.44,36.610,36.528889,36.72,36.33,271.0,255.0,173.0,186.760000,...,113.0,115.407407,146.0,90.0,99.0,96.0,97.0,96.851852,99.0,95.0
b5f3657a-808dbd8f-1d6a749b-01b9a55b-c8fe6160,36.78,36.11,36.445,36.445000,36.78,36.11,,,,,...,183.5,177.250000,209.0,138.0,94.0,95.0,97.0,96.875000,99.0,94.0
4f1e33f9-ae460af2-ae4d0ffa-4ae94555-843a4ccb,36.67,37.06,36.780,36.836667,37.06,36.67,93.0,93.0,93.0,93.000000,...,139.0,132.000000,162.0,97.0,96.0,93.0,95.0,94.636364,98.0,91.0


In [None]:
# static_data = extract_icu_data(
#     table="chartevents"
# )
# # vital_signs = [
# #     "blood pressure",
# #     "bp",
# #     "o2",
# #     "respiratory rate",
# #     "saturation",
# #     "temperature"
# # ]
# # static_data = static_data[static_data.label.str.lower().str.contains("|".join(vital_signs))]
# nan_count = static_data["valuenum"].isna().sum()
# print(f"{nan_count} ({100*nan_count/len(static_data)}%) NaNs")
# static_data = static_data[static_data["valuenum"].notna()]
# display(static_data)

3705398 (60.7093365681954%) NaNs


Unnamed: 0,dicom_id,charttime,valuenum,label
0,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,025900,155.0,Height (cm)
1,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,025900,61.0,Height
2,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,025900,44.0,Admission Weight (Kg)
3,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,033400,1.0,Unable to assess psychological
4,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,033400,1.0,Unable to assess teaching / learning needs
...,...,...,...,...
6103498,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,230000,110.0,Non Invasive Blood Pressure systolic
6103500,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,230000,42.0,Non Invasive Blood Pressure diastolic
6103502,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,230000,19.0,Respiratory Rate
6103503,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,230000,94.0,O2 saturation pulseoxymetry


In [None]:
# NOTE(review): in the visible part of this notebook `static_data` is only
# assigned inside the commented-out cell directly above, so on a fresh kernel
# this cell raises NameError unless that cell is re-enabled.
# Select the labels that are measured often enough and with enough distinct
# values (thresholds below), split into high-count and low-count labels.
idxs, high_idxs, low_idxs = get_static_data_index(
    static_data=static_data,
    N_measr = 100,
    N_unique = 12,
    N_count = 5,
    N_dicom = 5
)
print(f"Count, high-count measurements: {len(high_idxs)}\nCount, low-count measurements: {len(low_idxs)}")
# Keep only the rows of the selected labels (rebinds `static_data`).
static_data = static_data[static_data.label.isin(idxs)]

Count, high-count measurements: 135
Count, low-count measurements: 65


Separate unique item IDs

Remove binary and scarce measurements (fewer than `cutoff_uniques_N` unique values).  
Keep columns that are represented in `(100 * cutoff_percent) %` or more samples.

In [None]:
# Aggregations computed for every (dicom_id, label) combination.
aggfuncs =  [
    "first",
    "last",
    np.nanmedian,
    np.nanmean,
    np.nanstd,
    np.nanmax,
    np.nanmin,
]
# One row per image; the columns are addressed first by aggregation name
# (e.g. num_val["nanmedian"]) and then by label.
num_val = static_data.pivot_table(
    index=['dicom_id'],
    columns=['label'],
    values='valuenum',
    aggfunc=aggfuncs
)
# Number of distinct values observed per label.
uniques = static_data.pivot_table(columns=['label'], values='valuenum', aggfunc=lambda x: len(x.unique())).loc["valuenum"]

cutoff_uniques_N = 12
cutoff_percent = .30

# Label has at least `cutoff_uniques_N` distinct values ...
idx_1 = uniques >= cutoff_uniques_N
# ... and a non-null median for at least int(number of images * cutoff_percent) images.
idx_2 = num_val["nanmedian"].apply(lambda col: True if col.notna().sum() >= int(len(num_val["nanmedian"])*cutoff_percent) else False, axis=0)
idx = idx_1 & idx_2

# For every aggregation keep the columns of the selected labels and rename them
# to "<aggregation>_<label>". `new_df` starts out as a list of frames and is
# rebound to the concatenated frame after the loop.
new_df = []
for parameter in aggfuncs:
    # Functions are looked up by their __name__, strings by their own value.
    if type(parameter) is not str:
        name = parameter.__name__
    else:
        name = parameter
    _ = num_val[name].loc[:,num_val[name].columns.isin(idx[idx].index)]
    _.columns = pd.Series(_.columns).apply(lambda x: "_".join((name, x)))
    new_df.append(_)
new_df = pd.concat(new_df, axis=1)

display(new_df)
new_df.to_csv("out/chartevents.csv")

label,first_ALT,first_AST,first_Admission Weight (lbs.),first_Alkaline Phosphate,first_Anion gap,first_Apnea Interval,first_Arterial Base Excess,first_Arterial Blood Pressure Alarm - High,first_Arterial Blood Pressure Alarm - Low,first_Arterial Blood Pressure diastolic,first_Arterial Blood Pressure mean,first_Arterial Blood Pressure systolic,first_Arterial CO2 Pressure,first_Arterial O2 pressure,first_BUN,first_Calcium non-ionized,first_Chloride (serum),first_Creatinine (serum),first_Daily Weight,first_Fspn High,first_Glucose (serum),first_Glucose finger stick (range 70-100),first_HCO3 (serum),first_Heart Rate,first_Heart Rate Alarm - Low,first_Heart rate Alarm - High,first_Hematocrit (serum),first_Hemoglobin,first_INR,first_Inspiratory Time,first_Inspired O2 Fraction,first_Ionized Calcium,first_Lactic Acid,first_Magnesium,first_Mean Airway Pressure,first_Minute Volume,first_Minute Volume Alarm - High,first_Minute Volume Alarm - Low,first_Non Invasive Blood Pressure diastolic,first_Non Invasive Blood Pressure mean,...,nanmin_Minute Volume,nanmin_Minute Volume Alarm - High,nanmin_Minute Volume Alarm - Low,nanmin_Non Invasive Blood Pressure diastolic,nanmin_Non Invasive Blood Pressure mean,nanmin_Non Invasive Blood Pressure systolic,nanmin_Non-Invasive Blood Pressure Alarm - High,nanmin_Non-Invasive Blood Pressure Alarm - Low,nanmin_O2 Flow,nanmin_O2 Saturation Pulseoxymetry Alarm - High,nanmin_O2 Saturation Pulseoxymetry Alarm - Low,nanmin_O2 saturation pulseoxymetry,nanmin_PEEP set,nanmin_PH (Arterial),nanmin_PTT,nanmin_Paw High,nanmin_Peak Insp. 
Pressure,nanmin_Phosphorous,nanmin_Platelet Count,nanmin_Potassium (serum),nanmin_Prothrombin time,nanmin_Resp Alarm - High,nanmin_Resp Alarm - Low,nanmin_Respiratory Rate,nanmin_Respiratory Rate (Set),nanmin_Respiratory Rate (Total),nanmin_Respiratory Rate (spontaneous),nanmin_Sodium (serum),nanmin_SpO2 Desat Limit,nanmin_TCO2 (calc) Arterial,nanmin_Temperature Fahrenheit,nanmin_Tidal Volume (observed),nanmin_Tidal Volume (set),nanmin_Tidal Volume (spontaneous),nanmin_Total Bilirubin,nanmin_Ventilator Mode,nanmin_Ventilator Tank #1,nanmin_Ventilator Tank #2,nanmin_Vti High,nanmin_WBC
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d,88.0,229.0,272.8,46.0,24.0,20.0,-8.0,,,,,,46.0,163.0,57.0,8.5,104.0,7.1,,10.0,139.0,146.0,20.0,78.0,60.0,120.0,40.1,12.6,1.2,0.75,60.0,1.06,1.1,2.5,13.0,10.0,20.0,8.0,56.0,65.0,...,7.3,20.0,6.0,43.0,55.0,90.0,160.0,90.0,,100.0,92.0,92.0,10.0,7.23,28.1,40.0,23.0,10.1,274.0,3.9,13.0,35.0,8.0,0.0,16.0,16.0,0.0,142.0,88.0,18.0,97.9,96.0,480.0,,0.5,49.0,1600.0,2200.0,1.8,5.4
000b0846-2938ab13-3021af67-6b7ba76c-0df8e965,,,,,,,,160.0,85.0,54.0,71.0,103.0,,,,,,,98.8,,,157.0,,86.0,60.0,120.0,23.6,7.8,1.2,,,,,,,,,,61.0,71.0,...,,,,61.0,71.0,107.0,,,2.0,100.0,90.0,90.0,,,31.4,,,,76.0,,13.4,30.0,8.0,11.0,,,,,88.0,,98.5,,,,,,,,,7.1
00339787-e7b65a33-fee7d476-8f8446c0-920633e0,49.0,59.0,220.0,79.0,12.0,20.0,-5.0,110.0,65.0,75.0,88.0,105.0,60.0,67.0,18.0,7.2,102.0,1.1,,10.0,155.0,116.0,23.0,87.0,50.0,130.0,47.0,15.7,1.0,0.80,100.0,1.06,1.6,1.8,18.0,10.7,16.0,8.0,77.0,92.0,...,10.7,16.0,8.0,22.0,36.0,83.0,160.0,90.0,,100.0,92.0,55.0,5.0,7.14,27.2,45.0,33.0,3.6,206.0,5.9,11.4,35.0,8.0,26.0,28.0,28.0,0.0,131.0,88.0,20.0,,394.0,400.0,,0.6,49.0,2800.0,2200.0,1.5,14.1
003b3964-e6812182-5587a18a-93a736ee-29899109,,,,,18.0,20.0,-11.0,,,,,,20.0,48.0,16.0,7.7,101.0,0.9,,10.0,145.0,,20.0,70.0,60.0,120.0,32.9,10.0,,0.75,100.0,,1.2,1.8,15.0,8.5,14.0,6.5,66.0,77.0,...,7.6,14.0,6.5,56.0,63.0,84.0,110.0,60.0,,100.0,92.0,95.0,10.0,7.32,,40.0,24.0,3.3,257.0,3.8,,35.0,8.0,7.0,18.0,18.0,0.0,135.0,85.0,12.0,97.9,429.0,420.0,,,49.0,2000.0,3000.0,1.4,10.9
0045ff3e-572980c4-8b389876-402feb7f-c2d14e7b,30.0,62.0,169.4,201.0,17.0,20.0,-3.0,,,,,,82.0,197.0,24.0,6.9,109.0,0.5,,20.0,177.0,,21.0,136.0,90.0,160.0,28.0,8.3,1.6,0.90,60.0,1.04,10.2,2.1,17.0,14.9,19.0,2.0,29.0,34.0,...,14.9,19.0,2.0,13.0,29.0,52.0,160.0,90.0,,100.0,90.0,49.0,10.0,7.14,37.7,40.0,25.0,7.2,478.0,4.6,17.7,32.0,8.0,14.0,26.0,26.0,0.0,142.0,85.0,30.0,,464.0,500.0,,0.2,49.0,,,1.5,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff9ae68e-68ad6a29-79c55228-7185ab29-b4f6ca62,61.0,191.0,184.8,87.0,17.0,,,,,,,,,,35.0,7.6,101.0,1.4,,,133.0,138.0,22.0,80.0,60.0,120.0,32.2,11.0,1.1,,,,,1.4,,,,,60.0,80.0,...,,,,58.0,78.0,114.0,160.0,90.0,,100.0,92.0,92.0,,,33.3,,,2.2,53.0,3.7,12.4,40.0,8.0,22.0,,,,132.0,85.0,,98.2,,,,,,,,,6.8
ffba94d5-43e5e000-6b6a8c1c-60bb5fd2-241f6e0e,49.0,128.0,,228.0,15.0,,,,,,,,,,109.0,7.9,106.0,3.3,,,136.0,136.0,25.0,81.0,60.0,120.0,23.3,7.7,1.1,,,1.09,1.7,2.1,,,,,58.0,74.0,...,,,,51.0,66.0,117.0,140.0,90.0,2.0,100.0,92.0,90.0,,,27.2,,,1.7,118.0,4.0,11.5,35.0,8.0,15.0,,,,142.0,85.0,,97.9,,,,1.7,,,,,18.8
ffbd2604-770c9d9f-de92c5a1-3f4ee627-c2d98572,33.0,40.0,160.8,55.0,12.0,,,,,,,,,,26.0,7.1,113.0,0.6,,,131.0,,21.0,62.0,50.0,120.0,28.1,9.1,1.1,,,,,1.7,,,,,83.0,87.0,...,,,,45.0,53.0,67.0,160.0,90.0,,100.0,92.0,95.0,,,32.8,,,3.4,172.0,3.4,11.6,35.0,8.0,11.0,,,,143.0,85.0,,86.5,,,,0.1,,,,,4.4
ffd60688-5da7c1d3-4229e284-c84ba788-c00f4302,,,,,16.0,,,,,,,,,,47.0,9.4,105.0,2.1,,,113.0,120.0,22.0,113.0,60.0,120.0,,,,,,,,1.7,,,,,101.0,106.0,...,,,,69.0,80.0,121.0,160.0,90.0,,100.0,92.0,89.0,,,,,,3.0,,4.1,,40.0,8.0,23.0,,,,139.0,85.0,,98.4,,,,,,,,,


## Now for lab tests

In [None]:
# Load the lab events through extract_hosp_data (defined elsewhere in this
# notebook) and drop every row that has no numeric value, so all downstream
# aggregations operate on measured numbers only.
lab_data = extract_hosp_data(
    table="labevents", id_table="d_labitems", period=1
).dropna(subset=["valuenum"])
display(lab_data)

Unnamed: 0,subject_id,hadm_id,dicom_id,date,charttime,valuenum,label
0,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,30.90,PTT
1,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,100.00,Platelet Count
2,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,9.30,Hemoglobin
3,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,0.10,Basophils
4,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,93.30,Neutrophils
...,...,...,...,...,...,...,...
1314331,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,12.30,CK-MB Index
1314332,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,53.40,PTT
1314333,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,253.00,Creatine Kinase (CK)
1314334,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,31.00,"Creatine Kinase, MB Isoenzyme"


In [None]:
# Select lab labels with get_static_data_index. Per that function's own comments,
# N_measr is the minimum number of measurements and N_unique the minimum number
# of distinct values a label needs; see the function for N_count and N_dicom.
idxs, high_idxs, low_idxs = get_static_data_index(
    static_data=lab_data, N_measr=100, N_unique=12, N_count=5, N_dicom=5
)
print(f"Count, high-count measurements: {len(high_idxs)}\nCount, low-count measurements: {len(low_idxs)}")
# Preview of the rows whose label was selected.
# NOTE(review): this filtered frame is only displayed, never assigned — confirm
# that the following cells are meant to keep using the unfiltered lab_data.
lab_data.loc[lab_data["label"].isin(idxs)]

Count, high-count measurements: 110
Count, low-count measurements: 26


Unnamed: 0,subject_id,hadm_id,dicom_id,date,charttime,valuenum,label
0,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,30.90,PTT
1,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,100.00,Platelet Count
2,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,9.30,Hemoglobin
3,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,0.10,Basophils
4,16003661,20001305,649e8d38-22bccc26-32b1b52f-67f944ec-22fef2c1,21780325,040000,93.30,Neutrophils
...,...,...,...,...,...,...,...
1314331,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,12.30,CK-MB Index
1314332,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,53.40,PTT
1314333,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,253.00,Creatine Kinase (CK)
1314334,10449297,29981093,144841f5-0126909a-cde81d66-1db1375d-b3ed7127,21740923,103900,31.00,"Creatine Kinase, MB Isoenzyme"


In [None]:
# Sanity check: report how many lab rows lack a numeric value. lab_data is
# filtered to non-null `valuenum` when it is loaded, so this is expected to be 0.
nan_count_lab = lab_data.valuenum.isna().sum()
print(f"{nan_count_lab} ({100*nan_count_lab/len(lab_data)}%) NaNs")

# Aggregations computed for every (dicom_id, label) pair. The resulting table is
# keyed by aggregation name: the string itself, or the function's __name__.
aggfuncs = [
    "first",
    "last",
    np.nanmedian,
    np.nanmean,
    np.nanmax,
    np.nanmin,
]
num_val_lab = lab_data.pivot_table(
    index=["dicom_id"],
    columns=["label"],
    values="valuenum",
    aggfunc=aggfuncs,
)
# Number of distinct measured values per label (NaN is not counted as a value).
uniques = lab_data.groupby("label")["valuenum"].nunique()

# A label is kept when it has more than `cutoff_uniques_N` distinct values AND
# its per-DICOM median is present for at least `cutoff_percent` of all DICOMs.
cutoff_uniques_N = 11
cutoff_percent = .30

idx_1 = uniques > cutoff_uniques_N
# The minimum row count is computed once rather than once per column.
median_table_lab = num_val_lab["nanmedian"]
min_present_lab = int(len(median_table_lab) * cutoff_percent)
idx_2 = median_table_lab.notna().sum(axis=0) >= min_present_lab
idx = idx_1 & idx_2
selected_labels_lab = idx[idx].index

# Build one sub-table per aggregation, restricted to the selected labels, and
# prefix every column with the aggregation name ("<aggregation>_<label>") so the
# columns stay distinguishable after the sub-tables are concatenated side by side.
# A dedicated loop variable is used instead of `_`, which IPython reserves for
# the previous cell output.
stat_frames_lab = []
for aggfunc in aggfuncs:
    stat_name = aggfunc if isinstance(aggfunc, str) else aggfunc.__name__
    stat_table = num_val_lab[stat_name]
    stat_frames_lab.append(
        stat_table.loc[:, stat_table.columns.isin(selected_labels_lab)].add_prefix(f"{stat_name}_")
    )
new_lab_df = pd.concat(stat_frames_lab, axis=1)

display(new_lab_df)
new_lab_df.to_csv("out/labevents.csv")

0 (0.0%) NaNs


label,first_Alanine Aminotransferase (ALT),first_Alkaline Phosphatase,first_Anion Gap,first_Asparate Aminotransferase (AST),first_Base Excess,first_Bicarbonate,"first_Bilirubin, Total","first_Calcium, Total",first_Calculated Total CO2,first_Chloride,...,nanmin_Platelet Count,nanmin_Potassium,nanmin_RDW,nanmin_Red Blood Cells,nanmin_Sodium,nanmin_Urea Nitrogen,nanmin_White Blood Cells,nanmin_pCO2,nanmin_pH,nanmin_pO2
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d,83.0,45.0,22.0,198.0,-8.0,19.0,0.5,8.3,20.0,105.0,...,314.0,3.7,13.1,3.74,138.0,44.0,10.9,34.0,7.23,91.0
0001bcde-bb754f26-6b32a474-90653a5d-3730aa89,,,16.0,,7.0,23.0,,7.4,33.0,100.0,...,176.0,3.2,13.7,2.72,136.0,9.0,7.1,39.0,7.52,70.0
000b0846-2938ab13-3021af67-6b7ba76c-0df8e965,,,9.0,,,31.0,,8.1,,105.0,...,76.0,3.8,15.5,2.48,141.0,22.0,6.2,,,
00126448-9dfa1383-58c9cb80-9b78c170-5ca8bf1c,,,12.0,,,22.0,,8.1,,101.0,...,330.0,3.8,14.6,2.15,131.0,21.0,8.9,,,
0013ea88-79a25fd6-fb56b969-9a224975-85d836d2,11.0,50.0,26.0,10.0,2.0,23.0,0.6,8.6,25.0,93.0,...,161.0,3.3,15.4,2.26,136.0,53.0,11.0,30.0,7.51,316.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffd60688-5da7c1d3-4229e284-c84ba788-c00f4302,,,16.0,,,22.0,,9.4,,105.0,...,585.0,4.1,16.0,3.41,137.0,47.0,13.2,,,
ffd81c9c-a7f1e1b9-eb6fb574-0066af97-9d20f9a9,23.0,39.0,13.0,49.0,-3.0,19.0,12.8,7.5,20.0,108.0,...,106.0,4.1,15.2,2.58,135.0,44.0,7.8,31.0,5.50,83.0
fff1796e-a0026f0b-0aed51ba-3f623f18-cb7c9654,,,13.0,,,31.0,,8.2,,99.0,...,745.0,3.9,16.9,3.12,139.0,22.0,12.2,,,
fff4ed16-e7de1438-96706a1f-7839fce2-cc4cbc48,,,15.0,,3.0,30.0,2.1,8.2,30.0,94.0,...,65.0,3.6,15.8,3.20,135.0,65.0,6.6,46.0,7.41,94.0
