Project DeepHealth, UC5 "Deep Image Annotation"

Franco Alberto Cardillo, ILC-CNR (UNITO) 

<francoalberto.cardillo@ilc.cnr.it>

PREPROCESS AND ENCODE THE MIMIC-CXR DATASET

In [27]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image
import pickle
from nltk import sent_tokenize

# folders with images
jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"
dcm_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-dcm/physionet.org/files/mimic-cxr/2.0.0"

# metadata (report-based)
filename_metadata = join(jpg_fld, "mimic-cxr-2.0.0-metadata.csv")
# labels (image-based)
filename_chexpert = join(jpg_fld, "mimic-cxr-2.0.0-chexpert.csv") #labels

# output files saved here
meta_fld = "/mnt/datasets/mimic-cxr/meta"


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# add VIEW col to file "metadata"
# add SPLIT (train-valid-test) to file "metadata"
meta_df = pd.read_csv(filename_metadata)

# if true, rows without a view in [frontal, lateral] are dropped
DROP_ROWS_WITHOUT_VIEW = False

# initialize view with a mapping from ViewPosition
VIEW_MAP = {
    'AP': 'frontal',
    'PA': 'frontal',
    'LATERAL': 'lateral',
    'LL': 'lateral',
    'LPO': 'other',
    'RAO': 'other',
    'RPO': 'other',
    'LAO': 'other',
    # the below are overwritten in some instances by manual review
    'AP AXIAL': 'other',
    'XTABLE LATERAL': 'other',
    'AP LLD': 'other',
    'PA LLD': 'other',
    'L5 S1': 'other',
    'SWIMMERS': 'other',
    'AP RLD': 'other',
    'PA RLD': 'other',
}

meta_df['view'] = meta_df['ViewPosition'].map(VIEW_MAP)

# for 'other' category, currently many of these are simply unknown
# so try to update them with acq device map
ADPD_MAP = {
    'CHEST, LATERAL': 'lateral',
    'CHEST, PA': 'frontal',
    # manually checked 100 records, below is always frontal
    'CHEST, PORTABLE': 'frontal',
    'CHEST, PA X-WISE': 'frontal',
    'CHEST, AP (GRID)': 'frontal',
    'CHEST LAT': 'lateral',
    'CHEST PA': 'frontal',
    'CHEST, AP NON-GRID': 'frontal',
    'CHEST AP NON GRID': 'frontal',
    'CHEST PA X-WISE': 'frontal',
    'CHEST AP GRID': 'frontal',
    'CHEST, PORTABLE X-WISE': 'other',
    # below have < 25 samples each
    'CHEST PORT': 'frontal',
    'CHEST PORT X-WISE': 'frontal',
    # manually classified below
    'SHOULDER': 'other',
    'CHEST, PEDI (4-10 YRS)': 'other',
    'LOWER RIBS': 'other',
    'CHEST, DECUB.': 'other',
    'ABDOMEN, PORTABLE': 'other',
    'UPPER RIBS': 'frontal',
    'STERNUM, LATERAL': 'lateral',
    'KNEE, AP/OBL': 'other',
    'STERNUM, PA/OBL.': 'other',
    'CLAVICLE/ AC JOINTS': 'other',
    'ABDOMEN,GENERAL': 'other',
    'LOWER RIB': 'other',
    'SCOLIOSIS AP': 'frontal'
}

good_view = ['frontal', 'lateral']
idxUpdate = ~meta_df['view'].isin(good_view)

# ! this field is not present in current file
#c = 'AcquisitionDeviceProcessingDescription'
#idx = (df_metadata[c].notnull()) & idxUpdate
#df_metadata.loc[idx, 'view'] = df_metadata.loc[idx, c].map(ADPD_MAP)

DICOM_TO_VIEW = {
    '2164992c-f4abb30a-7aaaf4f4-383cab47-4e3eb1c8': ['PA', 'frontal'],
    '5e6881e2-ff4254e0-b99f0c2f-8964482a-031364db': ['LL', 'lateral'],
    'fcdf7a30-3236b74e-65b97587-cdd4cfde-63cd1de0': ['PA', 'frontal'],
    'fb074ec1-6715839c-84fa75e6-adc3f026-448b1481': ['PA', 'frontal'],
    'dfb8080a-8506e43e-840d9d58-0f738f41-82c120b0': ['PA', 'frontal'],
    '4b32608b-c2ead7c4-1fe5565f-42f7ab80-9dad30de': ['LL', 'lateral'],
    '53663e89-8f9ca9bb-df1bf434-8d6b1283-2b612609': ['LL', 'lateral'],
    # below are AP, but incorrectly in View Position
    '8672a4e7-366801a0-26cf2395-9344335c-aac8d728': ['AP', 'frontal'],
    '9800b28e-3ff3b417-18473be2-1a66131d-aca88488': ['AP', 'frontal'],
    '598cfe48-33a8643e-843e27e2-5dd584e7-3cd5f1c0': ['AP', 'frontal']
}

# we manually reviewed a few DICOMs to keep them in
for dcm, row in DICOM_TO_VIEW.items():
    view = row[1]
    idx = meta_df['dicom_id'] == dcm
    if idx.any():
        meta_df.loc[idx, 'view'] = view

# remove rows that do not have a good view


if DROP_ROWS_WITHOUT_VIEW:
    idxUpdate = ~meta_df['view'].isin(good_view)
    print(f"removing {nnz(idxUpdate)} rows")
    n_rows = meta_df.shape[0]
    meta_df.drop(df[idxUpdate].index, inplace=True)
    assert nnz(idxUpdate) == n_rows - meta_df.shape[0]

print(f"using {meta_df.shape[0]} examples")

# display(meta_df.head().T)

# * cols to keep
keep_cols = ["dicom_id", "study_id", "view"]
meta_df = meta_df.drop(columns=[c for c in meta_df.columns if c not in keep_cols])
print("METADATA DF:", len(meta_df))
display(meta_df.head())

# add columns "split" 
filename_split = join(jpg_fld, "mimic-cxr-2.0.0-split.csv") #suggested training-test split
split_df = pd.read_csv(filename_split)
print("SPLIT DF:", len(split_df))
display(split_df.head())
assert len(split_df) == len(meta_df)
n_rows_meta = meta_df.shape[0]

merged = pd.merge(meta_df, split_df, on=['dicom_id', 'study_id'], how='left')
print("MERGED DF:", len(merged))
display(merged.head())
merged.to_pickle(join (meta_fld, "mimic_ds_without_path.pkl"))

using 377110 examples
METADATA DF: 377110


Unnamed: 0,dicom_id,study_id,view
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal


SPLIT DF: 377110


Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train


MERGED DF: 377110


Unnamed: 0,dicom_id,study_id,view,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train


In [29]:
# add path
merged = pd.read_pickle( join(meta_fld, "mimic_ds_without_path.pkl"))

filename_records = join(dcm_fld, "cxr-record-list.csv") # path to dmc images (jpg are basically mirrored)
df = pd.read_csv(filename_records, header=0, sep=',')

# * replace .dcm with .jpg
df["path"] = df["path"].apply(lambda s: s.replace(".dcm", ".jpg"))
# display(df.head())
df = df[["study_id", "dicom_id", "path"]]

tmp = merged.merge(df, on=['dicom_id', 'study_id'], how='left')
assert tmp.shape[0] == merged.shape[0]
merged = tmp
print("ADDED PATH WITH JPG EXT:", len(merged))
display(merged.head())

merged.to_pickle(join (meta_fld, "mimic_ds.pkl"))

ADDED PATH WITH JPG EXT: 377110


Unnamed: 0,dicom_id,study_id,view,subject_id,split,path
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...


In [30]:
# labels
# using chexpert labels

mimic_ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))
print("CHECK EXT: !!!")
print(mimic_ds.iloc[0].path)

n_images = mimic_ds.shape[0]
print("MIMIC DS:", n_images)
display(mimic_ds.head())

chex = pd.read_csv(filename_chexpert)
print("CHEXPERT LABELS:", len(chex))
chex_studies = chex.study_id.tolist()

# ! POLICY FOR UNCERTAIN FINDING: U-NEG
# uncertain finddings (those with 0) treated as zeros (absent)
chex[chex.isna()] = 0
chex[chex.lt(0)] = 0
chex.drop(columns=["subject_id"], inplace=True)
display(chex.head())

# we will keep the original columns minus study_id + dicom_id

merged = chex.merge(mimic_ds, on="study_id")
print("CHEX + MIMIC:", merged.shape)
# print(merged.shape)


display(merged.head().T)
chex_cols = chex.columns.tolist()
chex_cols.remove("study_id")
chex_cols = chex_cols + ["dicom_id", "path"]
merged = merged[chex_cols]


merged.drop(columns=["dicom_id"], inplace=True)
merged.rename(columns={'path':'filename'}, inplace=True)
merged.set_index("filename", inplace=True)
print("LABELS:", len(merged))
display(merged.head().T)

merged.to_pickle( join(meta_fld, "img_dataset.pkl") )
print("saved:", join(meta_fld, "img_dataset.pkl"))

print(merged.index.values)
mimic2 = mimic_ds[mimic_ds.path.isin(merged.index.values)]
print("mimic after removing rows not in chexpert:", mimic2.shape)
mimic2.to_pickle( join(meta_fld, "mimic_ds.pkl") )


# we now need to add the path for each row
# filename_records = join(dcm_fld, "cxr-record-list.csv") # path to dmc images (jpg are basically mirrored)
# rec_df = pd.read_csv(filename_records)
# rec_df.drop(columns=["study_id", "subject_id"], inplace=True)
# display(rec_df.head())

# merged2 = merged.merge(rec_df, on="dicom_id")
# print("FINAL")
# display(merged2.head().T)

CHECK EXT: !!!
files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg
MIMIC DS: 377110


Unnamed: 0,dicom_id,study_id,view,subject_id,split,path
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...


CHEXPERT LABELS: 227827


Unnamed: 0,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,50414267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,53189527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,53911762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,56699142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,57375967,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


CHEX + MIMIC: (377095, 20)


Unnamed: 0,0,1,2,3,4
study_id,50414267,50414267,53189527,53189527,53911762
Atelectasis,0.0,0.0,0.0,0.0,0.0
Cardiomegaly,0.0,0.0,0.0,0.0,0.0
Consolidation,0.0,0.0,0.0,0.0,0.0
Edema,0.0,0.0,0.0,0.0,0.0
Enlarged Cardiomediastinum,0.0,0.0,0.0,0.0,0.0
Fracture,0.0,0.0,0.0,0.0,0.0
Lung Lesion,0.0,0.0,0.0,0.0,0.0
Lung Opacity,0.0,0.0,0.0,0.0,0.0
No Finding,1.0,1.0,1.0,1.0,1.0


LABELS: 377095


filename,files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg,files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg,files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg,files/p10/p10000032/s53189527/e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c.jpg,files/p10/p10000032/s53911762/68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg
Atelectasis,0.0,0.0,0.0,0.0,0.0
Cardiomegaly,0.0,0.0,0.0,0.0,0.0
Consolidation,0.0,0.0,0.0,0.0,0.0
Edema,0.0,0.0,0.0,0.0,0.0
Enlarged Cardiomediastinum,0.0,0.0,0.0,0.0,0.0
Fracture,0.0,0.0,0.0,0.0,0.0
Lung Lesion,0.0,0.0,0.0,0.0,0.0
Lung Opacity,0.0,0.0,0.0,0.0,0.0
No Finding,1.0,1.0,1.0,1.0,1.0
Pleural Effusion,0.0,0.0,0.0,0.0,0.0


saved: /mnt/datasets/mimic-cxr/meta/img_dataset.pkl
['files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg'
 'files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'
 'files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg'
 ...
 'files/p19/p19999987/s55368167/58766883-376a15ce-3b323a28-6af950a0-16b793bd.jpg'
 'files/p19/p19999987/s58621812/7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08.jpg'
 'files/p19/p19999987/s58971208/1a1fe7e3-cbac5d93-b339aeda-86bb86b5-4f31e82e.jpg']
mimic after removing rows not in chexpert: (377095, 6)


In [31]:
# add text

merged = pd.read_pickle( join(meta_fld, "mimic_ds.pkl"))
filename_sections = join(dcm_fld, "mimic-cxr-sections", "mimic_cxr_sectioned.csv")
text_df = pd.read_csv(filename_sections, header=0, sep=',', na_filter=False)
display(text_df.head())

# notice:
# text_df has column "study" with values s+int
# corresponding column in merge is "study_id" whose values are int, without the leading s
merged["study"] = merged["study_id"].apply(lambda x: "s" + str(x))
display(merged.head())

text_df = text_df[["study", "impression", "findings"]]
display(text_df.head())

T = merged.merge(text_df, on="study", how="left")

T = T.drop(columns=["study"])
merged = T
display(merged.head())

merged.to_pickle(join (meta_fld, "mimic_ds.pkl"))


Unnamed: 0,study,impression,findings,last_paragraph,comparison
0,s53189527,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar...",,___
1,s53911762,No acute intrathoracic process.,Single frontal view of the chest provided.\n \...,,Chest radiograph ___
2,s50414267,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus...",,None.
3,s56699142,No acute cardiopulmonary process.,"The lungs are clear of focal consolidation, pl...",,"Radiographs from ___, ___ and ___."
4,s57375967,"Focal consolidation at the left lung base, pos...",PA and lateral views of the chest provided. ...,,


Unnamed: 0,dicom_id,study_id,view,subject_id,split,path,study
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,s50414267
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,s50414267
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,s53189527
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train,files/p10/p10000032/s53189527/e084de3b-be89b11...,s53189527
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,s53911762


Unnamed: 0,study,impression,findings
0,s53189527,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar..."
1,s53911762,No acute intrathoracic process.,Single frontal view of the chest provided.\n \...
2,s50414267,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus..."
3,s56699142,No acute cardiopulmonary process.,"The lungs are clear of focal consolidation, pl..."
4,s57375967,"Focal consolidation at the left lung base, pos...",PA and lateral views of the chest provided. ...


Unnamed: 0,dicom_id,study_id,view,subject_id,split,path,impression,findings
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus..."
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus..."
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar..."
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train,files/p10/p10000032/s53189527/e084de3b-be89b11...,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar..."
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,No acute intrathoracic process.,Single frontal view of the chest provided.\n \...


In [32]:
# start text processing

merged = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))
display(merged.head())

# merged["impression"] = merged["impression"].fillna("")
# merged["findings"] = merged["findings"].fillna("")

# all text to lowercase
# all to lowercase:
columns = ["impression", "findings"]
for c in columns:
    merged[c] = merged[c].fillna("")

def process_raw_text(s):
    res = ""
    if s is None or len(s.strip()) == 0 or s.strip() == ".":
        res = ""
    else:
        s = s.strip()
        res = s if s.endswith(".") else s + "."
    return res

for c in ["impression", "findings"]:
    merged[c] = merged[c].apply(lambda s: process_raw_text(s))



def to_lowercase(s):
    if isinstance(s, str):
        return s.strip().lower()
    elif isinstance(s, list):
        return [to_lowercase(x.strip()) for x in s]
    else:
        print("errors, unexpected type:", type(s))

for c in columns:
    merged[c] = merged[c].apply(to_lowercase)


# split text into sentences

columns = ["impression", "findings"]
for c in columns:
    merged[c + "_sents"] = merged[c].apply(lambda s: sent_tokenize(s))

for c in columns:
    merged["len_" + c] = merged[c].apply(lambda s: len(s))

for c in columns:
    merged["nsents_" + c] = merged[c + "_sents"].apply(lambda l: len(l))

for c in columns:
    print("column:", c)
    print(merged["nsents_" + c].value_counts())

# some reports have 0 sentences either in "findings" or "impression"
def concat_columns(row):
    f = row["findings"].strip()
    i = row["impression"].strip()
    l = len(f) + len(i)
    return (f + " " + i).strip() if l > 0 else ""
    
merged["raw_text"] = merged.apply(concat_columns, axis=1)

merged["len_raw_text"] = merged.raw_text.apply(len)
merged["nsents_raw_text"] = merged["nsents_findings"] + merged["nsents_impression"]

assert (merged.nsents_raw_text == merged.nsents_findings + merged.nsents_impression).all()
# assert (merged.len_raw_text == merged.len_findings + merged.len_impression + 1).all()

print("text to lowercase, done")

merged.to_pickle(join(meta_fld, "mimic_ds.pkl"))
print(f"saved: {join(meta_fld, 'mimic_ds.pkl')}")


Unnamed: 0,dicom_id,study_id,view,subject_id,split,path,impression,findings
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,frontal,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus..."
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,lateral,10000032,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,No acute cardiopulmonary process.,"There is no focal consolidation, pleural effus..."
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,frontal,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar..."
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,lateral,10000032,train,files/p10/p10000032/s53189527/e084de3b-be89b11...,No acute cardiopulmonary abnormality.,"The cardiac, mediastinal and hilar contours ar..."
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,frontal,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,No acute intrathoracic process.,Single frontal view of the chest provided.\n \...


column: impression
1     146559
2      72108
0      54118
3      34711
4      22113
5      18393
6      14007
7       8446
8       3996
9       1620
10       681
11       239
12        75
13        19
15         5
14         3
16         1
17         1
Name: nsents_impression, dtype: int64
column: findings
0     100333
5      59960
4      53110
6      51231
7      33664
3      32900
8      18220
2       9486
9       8620
10      3572
1       3401
11      1518
12       612
13       224
14       110
15        65
16        37
17        16
18         6
22         4
19         3
20         2
21         1
Name: nsents_findings, dtype: int64
text to lowercase, done
saved: /mnt/datasets/mimic-cxr/meta/mimic_ds.pkl


In [38]:
# remove from merged rows with sentence length = 0

empty_text = merged.len_raw_text == 0

print(f"rows with en empty text: { nnz(empty_text) } /  {merged.shape[0] }")
nrows1 = merged.shape[0]

img_ds = pd.read_pickle(join(meta_fld, "img_dataset.pkl"))
img_ds2 = img_ds.reset_index()

print("IMG DS 2")
display(img_ds2.head().T)

empty_reports = merged[empty_text]
display(empty_reports.head().T)

images_to_remove = empty_reports.path
img_ds_cleaned = img_ds.drop(images_to_remove)

print(img_ds.shape[0] - img_ds_cleaned.shape[0])

merged = merged[~empty_text]
removed = merged.shape[0] - nrows1
print("removed:", removed)

merged.to_pickle(join(meta_fld, "mimic_ds.pkl"))
print(f"saved: {join(meta_fld, 'mimic_ds.pkl')}")

img_ds_cleaned.to_pickle(join(meta_fld, "img_dataset.pkl"))
print(f"saved: {join(meta_fld, 'img_dataset.pkl')}")


rows with en empty text: 14902 /  377095
IMG DS 2


Unnamed: 0,0,1,2,3,4
filename,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53911762/68b5c4b1-227d048...
Atelectasis,0.0,0.0,0.0,0.0,0.0
Cardiomegaly,0.0,0.0,0.0,0.0,0.0
Consolidation,0.0,0.0,0.0,0.0,0.0
Edema,0.0,0.0,0.0,0.0,0.0
Enlarged Cardiomediastinum,0.0,0.0,0.0,0.0,0.0
Fracture,0.0,0.0,0.0,0.0,0.0
Lung Lesion,0.0,0.0,0.0,0.0,0.0
Lung Opacity,0.0,0.0,0.0,0.0,0.0
No Finding,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,22,23,124,125,132
dicom_id,6fa5997e-b1dfecf8-3c174666-8815c84a-32db59ff,f1adcae3-2921c0a8-5d9652f9-4191ecd7-f2a96f35,1e647043-eed3576e-3123c170-780cb897-93a89502,899e40a8-0562ad87-0f3253ea-4d376ad5-ad830fc2,940de880-0e87eceb-571361e8-e19b8de7-1a8e714f
study_id,56522600,56522600,52163036,52163036,55608920
view,lateral,frontal,frontal,lateral,lateral
subject_id,10000935,10000935,10002013,10002013,10002013
split,train,train,train,train,train
path,files/p10/p10000935/s56522600/6fa5997e-b1dfecf...,files/p10/p10000935/s56522600/f1adcae3-2921c0a...,files/p10/p10002013/s52163036/1e647043-eed3576...,files/p10/p10002013/s52163036/899e40a8-0562ad8...,files/p10/p10002013/s55608920/940de880-0e87ece...
impression,,,,,
findings,,,,,
impression_sents,[],[],[],[],[]
findings_sents,[],[],[],[],[]


14902
removed: -14902
saved: /mnt/datasets/mimic-cxr/meta/mimic_ds.pkl
saved: /mnt/datasets/mimic-cxr/meta/img_dataset.pkl


In [39]:
# text cleaning

merged = pd.read_pickle( join(meta_fld, "mimic_ds.pkl"))

import re
from nltk import word_tokenize
def clean_text_v1(text, verbose=False):
    
    def subst_numbers(token):
        s = re.sub(r"\A\d+(,|\.)\d+", "_NUM_", token)  # _DEC_ for finer texts
        s = re.sub(r"\A\d+", "_NUM_", s)
        return s

    def subst_meas(text):
        # substitute measures
        e = r"(_NUM_|_DEC_)\s?(cm|mm|in|xxxx)|_NUM_ x _MEAS_|_DEC_ x _MEAS_|_MEAS_ x _MEAS_ x _MEAS|_MEAS_ x _MEAS_"
        t1 = text
        while True:
            t2 = re.sub(e, "_MEAS_", t1)
            if t1 == t2:
                break
            else:
                t1 = t2
        return t1

    text2 = text.replace(" ", " ")
    text2 = text2.replace("..", ".")


    symbols = ",;:?)(!"

    e = "|".join([re.escape(s) for s in symbols])
    text2 = re.sub(e, " ", text2)
    # text2 = " ".join( [t.strip() for t in text2.split(" ")])
    # numbered list items
    text2 = re.sub(r"\s\d+\. ", " ", text2)
    # dash
    text2 = re.sub(r"-", "_", text2)
    # percentages
    text2 = re.sub(r"\d+%\s", "_PERC_ ", text2)
    # XXXX XXXX -> XXXX_XXX
    text2 = re.sub(r"xxxx(\sxxxx)+", "xxxx", text2)
    # ordinals
    text2 = re.sub(r"1st|2nd|3rd|[0-9]+th ", "_N_TH_ ", text2)


    sentences = []
    for sent in sent_tokenize(text2):
        new_tokens = [subst_numbers(token) for token in word_tokenize(sent)[:-1]]  # [:-1] not using last dot
        # for token in word_tokenize(sent):
        #     w = subst_numbers(token)
        #     new_tokens.append(w)
    
        sent = " ".join(new_tokens)
        sent = subst_meas(sent)
        sentences.append(sent)

    text2 = ". ".join(sentences) + "."  # dots, and in particular the last ., were not removed by word_tokenize

    if verbose and text != text2:   # and "_MEAS_" in text2:
        print("* IN (it has been modified):")
        print(text)
        print("* OUT:")
        print(text2)
        print(10 * "*** ")

    return text2


merged["text"] = merged.raw_text.apply(lambda text: clean_text_v1(text, verbose=False))

display(df.head().T)

out_fn = join(meta_fld, "mimic_ds.pkl")
merged.to_pickle(out_fn)
print(f"saved raw reports, step2, in: {out_fn}")

Unnamed: 0,0,1,2,3,4
study_id,50414267,50414267,53189527,53189527,53911762
dicom_id,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714
path,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53911762/68b5c4b1-227d048...


saved raw reports, step2, in: /mnt/datasets/mimic-cxr/meta/mimic_ds.pkl


In [40]:
# vocabulary with all the word in dataset
from utils.vocabulary import Vocabulary

merged = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))


out_filename = "all_vocab.pkl"
text_col = merged["raw_text"]
vocab = Vocabulary()
for (id, text) in text_col.iteritems():
    for sentence in sent_tokenize(text):
        if len(sentence) == 0:
            print("ERROR, sentence length == 0")
        vocab.add_sentence(sentence)

print("number of distinct words:", len(vocab.word2idx))
print("total number of words:", vocab.word_count)

out_fn = join(meta_fld, out_filename)
with open(out_fn, "wb") as fout:
    pickle.dump(vocab, fout)

number of distinct words: 36255
total number of words: 20591929


In [41]:
# vocabulary, restricted to n_words
# number selected in order to "cover" about 97.something of the dataset (as for chest iu when using 1000 words)
with open (join(meta_fld, "all_vocab.pkl"), "rb") as fin:
    vocab = pickle.load(fin)
print("read:",join(meta_fld, "all_vocab.pkl"))
print("(all) num words:", len(vocab.word2idx))
print("(all) word count:", vocab.word_count)

sel_words = []
n_words = 3000
wc = list(vocab.word2count.items())
# words sorted according to their absolute frequency in the dataset
wc = sorted(wc, key=lambda elem: -elem[1])
n = 0
for w, c in wc:
    n += c

wc2 = wc[:n_words]

n2 = 0
for w, c in wc2:
    # print(f"{w}:{c}")
    n2 += c

# for word, count in vocab.word2count.items():
#     print(f"{word}: {count}")

print("(all) n:", n)
print("(filt) n2:", n2)

print("coverage after filtering:", n2 / n)
print("diff in word count:", n2 - n)
vocab.keep_n_words(n_words)
print("saved word count:", vocab.word_count)
out_vocab_fn = join(meta_fld,  f"vocab_{n_words}.pkl")
with open(out_vocab_fn, "wb") as fout:
    pickle.dump(vocab, fout)
print("saved:", out_vocab_fn)

read: /mnt/datasets/mimic-cxr/meta/all_vocab.pkl
(all) num words: 36255
(all) word count: 20591929
(all) n: 20591929
(filt) n2: 20107476
coverage after filtering: 0.9764736465437502
diff in word count: -484453
(vocabulary) initial word count (total): 20591929
(vocabulary) initial number of words: 36251
(vocabulary) after iterating with add_word number of words: 3000
(vocabulary) final word_count (total):  20107476
(vocabulary) final number of words: 3000
saved word count: 20107476
saved: /mnt/datasets/mimic-cxr/meta/vocab_3000.pkl


In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image
import pickle
from nltk import sent_tokenize
import yaml

from utils.vocabulary import Vocabulary

meta_fld = "/mnt/datasets/mimic-cxr/meta"

ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))
print(ds.shape)

display(ds.head())

ds = pd.read_pickle(join(meta_fld, "img_dataset.pkl"))
print(ds.shape)
display(ds.head())