In [17]:
import numpy as np
import pandas as pd

import shutil
import os

from tqdm import tqdm

In [2]:
# Per-image metadata table (includes the "Finding Labels" column used below).
data_entry = pd.read_csv("./data/NIH_CHEST_XRAYS/Data_Entry_2017.csv")

# Both split files are parsed the same way: tab-delimited, no header row, and a
# single column that is named "filename". The test list is read first, then
# the train/val list.
test_list, train_val_list = (
    pd.read_csv(
        split_path,
        sep="\t",
        header=None,
        names=["filename"],
        index_col=None,
    )
    for split_path in (
        "./data/NIH_CHEST_XRAYS/test_list.txt",
        "./data/NIH_CHEST_XRAYS/train_val_list.txt",
    )
)

In [3]:
# Flag every row whose label string contains a "|" character; such rows are
# treated as multi-label further down. A literal (non-regex) substring match is
# used so no escaping is needed -- the previous "\|" in a normal string literal
# is an invalid escape sequence that newer Python versions warn about.
data_entry["Multi Class"] = data_entry["Finding Labels"].str.contains(
    "|", regex=False
)

In [4]:
# Count how many rows were flagged True vs. False in the "Multi Class" column.
data_entry["Multi Class"].value_counts()

Multi Class
False    91324
True     20796
Name: count, dtype: int64

In [5]:
# Keep only the rows whose "Multi Class" flag is False (single-label rows).
data_entry_single_disease = data_entry.loc[data_entry["Multi Class"].eq(False)]

In [6]:
# (rows, columns) of the single-label subset.
data_entry_single_disease.shape

(91324, 13)

In [7]:
# Persist the single-label subset next to the original metadata file.
# `index` is documented as a bool; pass False explicitly (instead of relying on
# None being falsy) so the row index is not written as an extra column.
data_entry_single_disease.to_csv(
    "./data/NIH_CHEST_XRAYS/Data_Entry_2017_Single_Class.csv", index=False
)

In [23]:
# Dataset locations. These are resolved from the same relative
# "./data/NIH_CHEST_XRAYS" root that the CSV / split files are loaded from, so
# the notebook no longer depends on one machine's absolute home directory.
# abspath() keeps the entries collected in `not_found_images` absolute.
NIH_CHEST_XRAYS_DIR = os.path.abspath(os.path.join(".", "data", "NIH_CHEST_XRAYS"))
IMAGES_DIR = os.path.join(NIH_CHEST_XRAYS_DIR, "images")
DATASET_DIR = os.path.join(NIH_CHEST_XRAYS_DIR, "dataset")
TEST_CROP = os.path.join(DATASET_DIR, "test_crop")
TEST_FULL = os.path.join(DATASET_DIR, "test_full")
TRAIN_CORNERS = os.path.join(DATASET_DIR, "train_corners")
TRAIN_CROP = os.path.join(DATASET_DIR, "train_crop")

# Sets give O(1) membership checks for the per-image split lookup below.
train_set = set(train_val_list["filename"])
test_set = set(test_list["filename"])

# Source paths of images listed in the metadata but missing from IMAGES_DIR.
not_found_images = []
unique_labels = data_entry_single_disease["Finding Labels"].unique()
for label in unique_labels:
    print(f"\nCopying for class {label}...")
    df_label = data_entry_single_disease.query("`Finding Labels` == @label")

    # Only the file name is needed per row, so iterate over that column
    # directly instead of materialising every row with iterrows().
    for image_name in tqdm(
        df_label["Image Index"], total=df_label.shape[0], desc="Images"
    ):
        source_image_path = os.path.join(IMAGES_DIR, image_name)

        # Missing source files are recorded and skipped rather than aborting.
        if not os.path.exists(source_image_path):
            not_found_images.append(source_image_path)
            continue

        # Each image is copied into two folders of its split:
        #   train/val -> train_crop and train_corners
        #   test      -> test_crop  and test_full
        if image_name in train_set:
            destination_roots = (TRAIN_CROP, TRAIN_CORNERS)
        elif image_name in test_set:
            destination_roots = (TEST_CROP, TEST_FULL)
        else:
            # A bare `raise` here has no active exception and only produces
            # "No active exception to reraise"; raise the same exception type
            # with a message that identifies the offending image instead.
            raise RuntimeError(
                f"Image {image_name!r} (class {label!r}) is listed in neither "
                "train_val_list nor test_list."
            )

        for destination_root in destination_roots:
            class_dir = os.path.join(destination_root, label)
            # shutil.copy does not create missing parent directories; create
            # the class folder lazily so only classes that actually receive an
            # image get a folder in a given split.
            os.makedirs(class_dir, exist_ok=True)
            shutil.copy(source_image_path, os.path.join(class_dir, image_name))


Copying for class Cardiomegaly...


Images: 100%|██████████| 1093/1093 [00:01<00:00, 1056.45it/s]



Copying for class No Finding...


Images: 100%|██████████| 60361/60361 [00:56<00:00, 1069.43it/s] 



Copying for class Hernia...


Images: 100%|██████████| 110/110 [00:00<00:00, 797.08it/s]



Copying for class Infiltration...


Images: 100%|██████████| 9547/9547 [00:08<00:00, 1146.09it/s]



Copying for class Nodule...


Images: 100%|██████████| 2705/2705 [00:02<00:00, 986.86it/s] 



Copying for class Emphysema...


Images: 100%|██████████| 892/892 [00:00<00:00, 957.30it/s]



Copying for class Effusion...


Images: 100%|██████████| 3955/3955 [00:03<00:00, 1060.85it/s]



Copying for class Atelectasis...


Images: 100%|██████████| 4215/4215 [00:04<00:00, 1042.86it/s]



Copying for class Pleural_Thickening...


Images: 100%|██████████| 1126/1126 [00:01<00:00, 1059.25it/s]



Copying for class Pneumothorax...


Images: 100%|██████████| 2194/2194 [00:01<00:00, 1198.50it/s]



Copying for class Mass...


Images: 100%|██████████| 2139/2139 [00:02<00:00, 1051.84it/s]



Copying for class Fibrosis...


Images: 100%|██████████| 727/727 [00:00<00:00, 885.98it/s]



Copying for class Consolidation...


Images: 100%|██████████| 1310/1310 [00:01<00:00, 1052.40it/s]



Copying for class Edema...


Images: 100%|██████████| 628/628 [00:00<00:00, 1092.09it/s]



Copying for class Pneumonia...


Images: 100%|██████████| 322/322 [00:00<00:00, 1103.60it/s]


In [58]:
# Preview the first rows of the single-label subset.
data_entry_single_disease.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,Multi Class
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,,False
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,,False
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,,False
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168,,False
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168,,False


In [1]:
# Number of metadata rows whose image file was missing during the copy step.
# NOTE(review): `not_found_images` is only defined by the copy cell; the
# recorded NameError comes from running this as In [1] in a kernel where that
# cell had not been executed. Re-run the copy cell first in the same kernel.
len(not_found_images)

NameError: name 'not_found_images' is not defined