In [515]:
import pandas as pd
import numpy as np

# Input CSV locations, given as relative paths.
# JPG_PATH: lookup table read below with the columns `dicom_id` and `path`.
JPG_PATH = "data/csv_files/jpg_file_paths.csv"
# META_DATA: per-image metadata; the notebook reads `dicom_id`, `subject_id`,
# `study_id`, `ViewPosition` and `PatientOrientation` from it.
META_DATA = "data/csv_files/mimic-cxr-2.0.0-metadata.csv"
# LABEL_PATH: per-study finding labels, keyed by `subject_id` and `study_id`.
LABEL_PATH = "data/csv_files/mimic-cxr-2.0.0-chexpert.csv"

In [516]:
# Build the dicom_id -> JPG file path lookup table.
# `usecols` only restricts which columns are parsed; the explicit `.loc`
# selection pins their order before `dicom_id` is moved into the index.
jpg_paths = (
    pd.read_csv(JPG_PATH, usecols=['dicom_id', 'path'])
    .loc[:, ['dicom_id', 'path']]
    .set_index(keys=['dicom_id'])
)
print(f"df_jpg_paths shape: {jpg_paths.shape}")
jpg_paths.head(10)

df_jpg_paths shape: (377110, 1)


Unnamed: 0_level_0,path
dicom_id,Unnamed: 1_level_1
00000218-9fb20d4e-86045713-8013e08b-0d5bebba,/data8/jpg/files/p17/p17191670/s51332563/00000...
000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d,/data8/jpg/files/p10/p10934092/s55245226/00004...
00005197-869d72f3-66210bf4-fa2c9d83-b613c4e7,/data8/jpg/files/p14/p14851532/s59116935/00005...
00005944-56375fa0-8329f4ae-e3dee4c3-4c4bc572,/data8/jpg/files/p13/p13424439/s53667003/00005...
00009bca-0893e5e4-c1296676-181f5233-25dfa9a3,/data8/jpg/files/p15/p15342241/s58246830/00009...
0000c2f5-f02f9f3c-1ed14642-958de0ad-d6ce4d20,/data8/jpg/files/p17/p17744443/s56580856/0000c...
0000d3be-591ae3b7-b03a7497-8319c02b-650bb4ab,/data8/jpg/files/p19/p19298916/s50189753/0000d...
0000e00d-372d678f-659e32be-5b82fe32-0d73ef1a,/data8/jpg/files/p15/p15468322/s59674182/0000e...
00010785-aefa6d86-8d00afab-09e76c6a-edb6b458,/data8/jpg/files/p18/p18778431/s59340602/00010...
00010923-3e9327ed-01db276b-5872bf82-5c844e7f,/data8/jpg/files/p14/p14373210/s56025085/00010...


In [517]:
# Load the image metadata and reduce it to one row per erect AP image, carrying
# only the composite study key and the JPG path.
meta_data = (
    pd.read_csv(
        META_DATA,
        usecols=['dicom_id', 'subject_id', 'study_id', 'ViewPosition', 'PatientOrientation'],
    )
    .loc[:, ['dicom_id', 'subject_id', 'study_id', 'ViewPosition', 'PatientOrientation']]
    .set_index('dicom_id')
    # Composite key: subject_id shifted left by eight decimal digits plus study_id.
    # NOTE(review): this is only collision-free if study_id < 100_000_000 - confirm.
    .assign(
        subject_study_id=lambda frame: frame['subject_id'] * 100_000_000 + frame['study_id']
    )
    # Keep only images taken in AP view with the patient erect.
    .loc[lambda frame: frame['ViewPosition'].eq('AP') & frame['PatientOrientation'].eq('Erect')]
    .drop(columns=['subject_id', 'study_id', 'ViewPosition', 'PatientOrientation'])
    # Left join on the dicom_id index to attach each image's file path.
    .join(jpg_paths)
)
print(f"meta_data shape: {meta_data.shape}")
meta_data.head(10)

meta_data shape: (116390, 2)


Unnamed: 0_level_0,subject_study_id,path
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d,1093409255245226,/data8/jpg/files/p10/p10934092/s55245226/00004...
00005197-869d72f3-66210bf4-fa2c9d83-b613c4e7,1485153259116935,/data8/jpg/files/p14/p14851532/s59116935/00005...
00030f67-6141afb8-618a939e-80b252b4-c7d59aec,1538842154221476,/data8/jpg/files/p15/p15388421/s54221476/00030...
00036658-c5362a79-400c2649-efb1304e-53c92b83,1827601050461256,/data8/jpg/files/p18/p18276010/s50461256/00036...
00038f14-86928b2a-f3e34fd7-d44ecdff-5b8660a8,1770854550391163,/data8/jpg/files/p17/p17708545/s50391163/00038...
00046130-fd952ef0-57f2948d-491a16b4-5db3a18c,1518483659382057,/data8/jpg/files/p15/p15184836/s59382057/00046...
00046bce-20d53086-5ffea7f9-66324915-834d4778,1525512051300200,/data8/jpg/files/p15/p15255120/s51300200/00046...
000512db-61f1b4e0-5b16d40e-f6aeb922-21f71de8,1595802455562738,/data8/jpg/files/p15/p15958024/s55562738/00051...
00054ff1-47f5c5c2-e8dc7c52-1deafa05-e1174cd0,1062755655547878,/data8/jpg/files/p10/p10627556/s55547878/00054...
0005ad61-eaa2d768-c8aff20f-dc56514a-47de997c,1188823952421950,/data8/jpg/files/p11/p11888239/s52421950/0005a...


In [518]:
# Load the per-study finding labels and reduce them to a single binary target
# column, indexed by the composite subject/study key.
labels = pd.read_csv(LABEL_PATH)
# Same composite key as used for the metadata: subject_id shifted left by eight
# decimal digits plus study_id.
labels['subject_study_id'] = labels['subject_id'] * 100_000_000 + labels['study_id']
labels = labels.drop(columns=['subject_id', 'study_id'])

choose_column = 'Pleural Effusion'
findings = ['Atelectasis','Cardiomegaly','Consolidation', 'Edema', 'Enlarged Cardiomediastinum','Fracture',
             'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other','Pneumonia',
             'Pneumothorax', 'Support Devices']

# Drop every finding except the chosen one. Compare by equality: the previous
# `f not in choose_column` was a substring test against a string.
labels = labels.drop(columns=[f for f in findings if f != choose_column])
labels = labels.fillna(0)
# Binarize the target: exactly 1.0 becomes 1, every other value (including
# missing entries, already filled with 0 above) becomes 0.
# This previously iterated over an undefined name `pe`, which only worked
# through leftover kernel state and fails on a fresh kernel.
labels[choose_column] = labels[choose_column].eq(1.0).astype('int64')
labels = labels.set_index('subject_study_id')

print(f"Shape of labels: {labels.shape}")
labels.head(20)

Shape of labels: (227827, 1)


Unnamed: 0_level_0,Pleural Effusion
subject_study_id,Unnamed: 1_level_1
1000003250414267,0
1000003253189527,0
1000003253911762,0
1000003256699142,0
1000076457375967,0
1000089850771383,0
1000089854205396,0
1000093550578979,1
1000093551178377,0
1000093555697293,0


In [519]:
# Attach the binary label to every image and key the result by JPG path.
# Guard the re-index so the cell can be re-run: after the first run
# `subject_study_id` is already the index, and an unconditional
# `set_index('subject_study_id')` would raise KeyError.
if meta_data.index.name != 'subject_study_id':
    meta_data = meta_data.set_index('subject_study_id')
# Left join on the composite key; images whose study has no label row get NaN,
# which is why the label column comes out as float here.
data_and_labels = meta_data.join(labels)
data_and_labels = data_and_labels.set_index('path')
print(data_and_labels.dtypes)
# Sort by label, then drop rows with any missing value (unlabelled studies or
# images without a path).
data_and_labels = data_and_labels.sort_values(by=['Pleural Effusion'])
data_and_labels = data_and_labels.dropna()
print(data_and_labels['Pleural Effusion'].unique())
print(data_and_labels['Pleural Effusion'].value_counts())
print(f"Shape of data_and_labels: {data_and_labels.shape}")
data_and_labels.tail(10)

Pleural Effusion    float64
dtype: object
[0. 1.]
0.0    81272
1.0    35115
Name: Pleural Effusion, dtype: int64
Shape of data_and_labels: (116387, 1)


Unnamed: 0_level_0,Pleural Effusion
path,Unnamed: 1_level_1
/data8/jpg/files/p12/p12284340/s52113276/91e1b0a5-a78d00e5-aba9a96d-96372d72-8290a4c8.jpg,1.0
/data8/jpg/files/p12/p12284340/s51758459/ac2d47ca-fad71d02-3b94fa17-609ab6c3-6a45fb2e.jpg,1.0
/data8/jpg/files/p16/p16716950/s55203576/9b61151e-0862adb4-7a7aa314-a08cccb5-1924d356.jpg,1.0
/data8/jpg/files/p16/p16716950/s59808837/fdead906-3c04f5ba-84804eac-fdf706eb-097b0d0b.jpg,1.0
/data8/jpg/files/p12/p12284340/s50503887/2fa6e0a4-a8bddbdc-1ad75842-056fba04-455ad24f.jpg,1.0
/data8/jpg/files/p12/p12284340/s50002735/b7a9b520-200acc4b-f875f649-611b51f8-fe91ad79.jpg,1.0
/data8/jpg/files/p16/p16717658/s51493203/b12a6ddb-9d744573-56aa6dad-91f4e840-7a6cd576.jpg,1.0
/data8/jpg/files/p16/p16717658/s54828705/45b237fc-02369cf1-b84ca915-b0951f69-47ddfa03.jpg,1.0
/data8/jpg/files/p12/p12283783/s58481108/07dfe68b-17c63c34-df757038-3dd971bd-26c653f0.jpg,1.0
/data8/jpg/files/p15/p15013421/s55314230/ba5ca039-5009ab8f-9e9f3360-79a47779-d0a64d20.jpg,1.0
