This notebook contains code partially adapted from MedFuse (https://github.com/nyuad-cai/MedFuse).

In [10]:
import pickle
import pandas as pd
from datetime import timedelta

In [31]:
def loadmetadata(cxr_data_dir, ehr_data_dir, window_hours=48, view_position='AP'):
    """Pick, for every ICU stay, the latest chest X-ray taken early in the stay.

    Reads ``{cxr_data_dir}/mimic-cxr-2.0.0-metadata.csv`` and
    ``{ehr_data_dir}/root/all_stays.csv``, inner-joins them on ``subject_id``
    and keeps images whose study time lies in
    ``[intime, intime + window_hours]`` (both ends inclusive) and whose
    ``ViewPosition`` equals ``view_position``. For each ``stay_id`` only the
    image with the greatest ``StudyDateTime`` is kept.

    Parameters
    ----------
    cxr_data_dir : str
        Directory containing ``mimic-cxr-2.0.0-metadata.csv``.
    ehr_data_dir : str
        Directory containing ``root/all_stays.csv``.
    window_hours : int or float, default 48
        Length of the window after ``intime`` in which a study must fall.
    view_position : str, default 'AP'
        Value of ``ViewPosition`` an image must have to be selected.

    Returns
    -------
    pandas.DataFrame
        One row per ``stay_id``, ordered by ``stay_id``, with a fresh
        0..n-1 index. Contains the merged metadata columns plus:

        * ``index`` - row label of the image in the merged frame,
        * ``StudyTime`` - rewritten as a zero-padded ``HHMMSS`` string,
        * ``StudyDateTime`` - parsed study timestamp,
        * ``cxr_time_offset_in_seconds`` - float seconds between ``intime``
          and ``StudyDateTime``.

        The frame is empty (same columns) when no image qualifies.
    """
    cxr_metadata = pd.read_csv(f'{cxr_data_dir}/mimic-cxr-2.0.0-metadata.csv')
    icu_stay_metadata = pd.read_csv(f'{ehr_data_dir}/root/all_stays.csv')
    columns = ['subject_id', 'stay_id', 'intime', 'outtime']

    # only common subjects with both icu stay and an xray
    cxr_merged_icustays = cxr_metadata.merge(icu_stay_metadata[columns], how='inner', on='subject_id')

    # combine study date time; StudyTime may carry fractional seconds, which
    # are truncated before zero-padding to HHMMSS
    cxr_merged_icustays['StudyTime'] = cxr_merged_icustays['StudyTime'].apply(lambda x: f'{int(float(x)):06}')
    cxr_merged_icustays['StudyDateTime'] = pd.to_datetime(
        cxr_merged_icustays['StudyDate'].astype(str) + ' ' + cxr_merged_icustays['StudyTime'].astype(str),
        format="%Y%m%d %H%M%S",
    )

    cxr_merged_icustays['intime'] = pd.to_datetime(cxr_merged_icustays['intime'])
    cxr_merged_icustays['outtime'] = pd.to_datetime(cxr_merged_icustays['outtime'])

    end_time = cxr_merged_icustays['intime'] + pd.Timedelta(hours=window_hours)

    in_window = (
        (cxr_merged_icustays['StudyDateTime'] >= cxr_merged_icustays['intime'])
        & (cxr_merged_icustays['StudyDateTime'] <= end_time)
    )
    has_view = cxr_merged_icustays['ViewPosition'] == view_position
    cxr_selected = cxr_merged_icustays.loc[in_window & has_view]

    # Latest study per stay without a Python-level loop. The stable sort keeps
    # rows with identical timestamps in their original order, so keep='last'
    # picks the same row that sort + tail(1) would. Unlike pd.concat over a
    # list of groups, this also works when nothing was selected.
    groups = (
        cxr_selected
        .sort_values('StudyDateTime', kind='stable')
        .drop_duplicates(subset='stay_id', keep='last')
        .sort_values('stay_id')
        .reset_index()  # keeps the merged-frame row label as an 'index' column
    )

    # .dt.total_seconds() yields float seconds on every pandas version;
    # .astype('timedelta64[s]') stopped doing so in pandas 2.0.
    groups['cxr_time_offset_in_seconds'] = (groups['StudyDateTime'] - groups['intime']).dt.total_seconds()
    return groups

In [32]:
# Directory passed to loadmetadata as cxr_data_dir; it must contain
# mimic-cxr-2.0.0-metadata.csv.
# NOTE(review): absolute, machine-specific path - adjust for your environment.
cxr_dir = '/hdd2/mimic_cxr_jpg/physionet.org/files/mimic-cxr-jpg/2.0.0'
# Directory from which root/all_stays.csv and the phenotyping/*_listfile.csv
# files are read.
ehr_data_dir = '../../datasets/mimic4extract/data'

In [33]:
# For each ICU stay, keep the most recent AP chest X-ray taken within the
# first 48 hours after the stay's intime.
metadata = loadmetadata(cxr_dir, ehr_data_dir)

In [34]:
# Display the selected per-stay CXR metadata (the notebook shows a truncated view).
metadata

Unnamed: 0,index,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,stay_id,intime,outtime,StudyDateTime,cxr_time_offset_in_seconds
0,217628,0194f433-caba299e-5f1d3bed-d7a0fa6c-afa63996,15904173,52277620,CHEST (PORTABLE AP),AP,2539,3050,21621227,055810,CHEST (PORTABLE AP),antero-posterior,Erect,30001947,2162-12-26 15:04:30,2162-12-27 22:31:00,2162-12-27 05:58:10,53620.0
1,291809,5dfd960b-2e6378a2-1de9c84f-24ec4b38-f9d1a19a,17938576,59469162,CHEST (PORTABLE AP),AP,2539,3050,21580123,213357,CHEST (PORTABLE AP),antero-posterior,Erect,30002498,2158-01-23 16:00:00,2158-01-24 17:36:04,2158-01-23 21:33:57,20037.0
2,34480,6d89c83e-c451a4ea-55a00c77-cef04030-6b19b58a,11101925,53005350,CHEST (PORTABLE AP),AP,3050,2539,21581229,011443,CHEST (PORTABLE AP),antero-posterior,,30004018,2158-12-27 21:32:00,2159-01-12 23:47:50,2158-12-29 01:14:43,99763.0
3,323857,d238c1d6-78a3fb3c-34adc7dd-51498676-636e5e3b,18730522,58044295,CHEST (PORTABLE AP),AP,3050,1884,21530907,105103,CHEST (PORTABLE AP),antero-posterior,,30004391,2153-09-05 13:12:00,2153-09-13 18:21:18,2153-09-07 10:51:03,164343.0
4,91956,b07e6c45-cab5a33c-bb1cade8-a043a627-48572ac5,12509799,55465259,CHEST (PORTABLE AP),AP,2544,3056,21650802,092211,CHEST (PORTABLE AP),antero-posterior,Erect,30004530,2165-07-31 09:40:35,2165-08-03 16:29:09,2165-08-02 09:22:11,171696.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10632,31636,7f938ee9-d7e12901-72879fdc-4082cff0-58f78efd,10992814,53391370,CHEST (PORTABLE AP),AP,2544,3056,21140611,171056,CHEST (PORTABLE AP),antero-posterior,Erect,39992167,2114-06-10 19:00:00,2114-06-20 20:09:43,2114-06-11 17:10:56,79856.0
10633,360660,b33b112f-50796485-620de905-6062b8b3-636db0d4,19774838,51812444,CHEST (PORTABLE AP),AP,3056,2544,21870915,101226,CHEST (PORTABLE AP),antero-posterior,Recumbent,39992578,2187-09-14 17:39:09,2187-09-16 19:17:27,2187-09-15 10:12:26,59597.0
10634,219921,52ffb631-5b194fcd-049139ea-97662cf2-a6c920d0,15952397,58072214,CHEST (PORTABLE AP),AP,2539,3050,21641212,113104,CHEST (PORTABLE AP),antero-posterior,Erect,39996044,2164-12-10 17:50:47,2164-12-17 18:23:16,2164-12-12 11:31:04,150017.0
10635,262132,f05b9731-d6bf3b29-6197f242-4cc974a3-fe0f5b56,17112432,57935403,CHEST (PORTABLE AP),AP,3056,2544,21100913,054250,CHEST (PORTABLE AP),antero-posterior,Erect,39997753,2110-09-11 23:32:00,2110-09-13 16:59:48,2110-09-13 05:42:50,108650.0


In [35]:
# Read the phenotyping listfile of every split (train / val / test).
splits_labels_train, splits_labels_val, splits_labels_test = [
    pd.read_csv(f'{ehr_data_dir}/phenotyping/{split_name}_listfile.csv')
    for split_name in ('train', 'val', 'test')
]

In [36]:
# Attach each split's labels to the selected CXR metadata; the inner join on
# stay_id keeps only stays present in both frames.
train_meta_with_labels, val_meta_with_labels, test_meta_with_labels = [
    metadata.merge(split_labels, how='inner', on='stay_id')
    for split_labels in (splits_labels_train, splits_labels_val, splits_labels_test)
]

In [42]:
# Persist the three labelled splits together as one pickled dict keyed by split name.
with open('./metas_with_labels_phenotyping_first_48h.pkl', 'wb') as pkl_file:
    pickle.dump(
        dict(
            train=train_meta_with_labels,
            val=val_meta_with_labels,
            test=test_meta_with_labels,
        ),
        pkl_file,
    )