## Data preprocessing

##### Copyright (C) Microsoft Corporation.  
see license file for details 

In [1]:
# Allow multiple displays per cell: with ast_node_interactivity set to "all",
# IPython shows the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Enable the autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Base data directory: four levels above the current working directory.
# (AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file)

import os

baseDataDir = os.path.join(os.getcwd(), '..', '..', '..', '..')
baseDataDir

'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../..'

In [4]:
# import utility functions

import os
import sys

# directories holding the project's helper modules (../src relative to the working directory)
paths_to_append = [os.path.join(os.getcwd(), '..', 'src')]
def add_path_to_sys_path(path_to_append):
    """Append ``path_to_append`` to ``sys.path`` unless that exact entry is already present.

    The check is an exact list-membership test. A substring test would wrongly
    skip the path whenever some longer ``sys.path`` entry merely contains it
    (e.g. '/prj/src' vs. an existing '/prj/src_extra').

    Args:
        path_to_append: directory path (str) to make importable.
    """
    if path_to_append not in sys.path:
        sys.path.append(path_to_append)

# Register every helper directory on sys.path. A plain loop is used instead of a
# list comprehension evaluated only for its side effects, which would build a
# throw-away list and echo a meaningless [None] as cell output.
for crt_path in paths_to_append:
    add_path_to_sys_path(crt_path)

# project helper module; importable once the directories above are on sys.path
import azure_chestxray_utils

[None]

#### Path variables

In [5]:
# Build the base input/output directories that all later file path variables hang off.
# Paths are typically container-level dirs mapped to a host dir for data persistence.

prj_consts = azure_chestxray_utils.chestxray_consts()

data_base_input_dir, data_base_output_dir = (
    os.path.join(baseDataDir, os.path.join(*relative_parts))
    for relative_parts in (prj_consts.BASE_INPUT_DIR_list, prj_consts.BASE_OUTPUT_DIR_list)
)

data_base_input_dir
data_base_output_dir

'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8'

'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/output'

In [6]:
# chest xray images are in nih_chest_xray_data_dir
# (the project's image dir components plus an 'images' sub-directory, under data_base_input_dir)
nih_chest_xray_data_dir=os.path.join(data_base_input_dir, 
                                     os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list+['images'])))
nih_chest_xray_data_dir

# check if we have all 112120 images in nih_chest_xray_data_dir
# The file count is only printed, not compared against 112120. The shell capture
# returns the command's output lines, so the printed value is a one-element list.
# NOTE(review): the path is interpolated unquoted into the shell command, so a
# directory containing whitespace would presumably break the count - confirm.
orig_images_no = !find $nih_chest_xray_data_dir -type f | wc -l
print("orig images number:{} ".format(orig_images_no))

'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8/ChestXray-NIHCC/images'

orig images number:['112120'] 


In [7]:
# check if we have patients file list Data_Entry_2017.csv and BBox_List_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC)
# blacklist.csv is genrated by data scientists with no medical background

other_data_dir=os.path.join(nih_chest_xray_data_dir, os.path.join(*(['..','..'])))

other_data_dir

!ls $other_data_dir/*.csv

# data is split into train/test/validation partitions
data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))  
data_partitions_dir
!mkdir -p {data_partitions_dir}

'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8/ChestXray-NIHCC/images/../..'

/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8/ChestXray-NIHCC/images/../../BBox_List_2017.csv
/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8/ChestXray-NIHCC/images/../../Data_Entry_2017.csv
/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/ChestX-ray8/ChestXray-NIHCC/images/../../blacklist.csv


'/local_dir/prj/AzureChestXRayNoAML/code/02_Model/../../../../data/chestxray/output/data_partitions'

In [8]:
import pickle
import random
import re
import tqdm

import cv2
import numpy as np
import pandas as pd
import sklearn.model_selection 

#### Train/Validation/Test Data partitioning 
 - Remove the images listed in blacklist.csv, i.e. images judged to be of low quality.
 - Keep the patients that have NIH bounding box annotations out of the train/validation pool; they are saved for later evaluation (the code below adds them to the test set).
 - Divide the patients into train/valid/test sets using a 7:1:2 ratio.

In [9]:
# Remove the NIH manually annotated data (ground truth with heavy pathologies, no healthy
# patients) from the patient pool, and collect the images that visually look bad to data
# scientists with no medical background.
# TODO: this should probably be a generic function


total_patient_number = 30805
NIH_annotated_file = 'BBox_List_2017.csv' # pathology annotated by radiologists, excluded from train
manually_selected_bad_images_file = 'blacklist.csv' # images that visually look bad, excluded

# patient ids run from 1 to total_patient_number inclusive
patient_id_original = list(range(1, total_patient_number + 1))

# ignored images list is used later, since this is not a patient ID level issue
ignored_images_set = set()
with open(os.path.join(other_data_dir, manually_selected_bad_images_file), 'r') as f:
    for line in f:
        # strip() removes the line terminator without eating a real character when
        # the last line has no trailing newline; blank lines are skipped
        image_name = line.strip()
        if not image_name:
            continue
        ignored_images_set.add(image_name)
        # image names look like '<8 digit patient id>_<nnn>.png'; report entries whose
        # patient id lies outside the known 1..total_patient_number range
        if int(image_name.split('_')[0]) > total_patient_number:
            print(image_name)

bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
# characters 3..7 of the image name hold the significant digits of the patient id
bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

# Series.iteritems() was removed in pandas 2.0; a vectorised cast yields the same
# list of Python ints (one entry per annotated row, duplicates included)
bbox_patient_index_list = bbox_patient_index_df.astype(int).tolist()

# patients without bounding box annotations form the pool that gets shuffled and split
patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
print("len of original patient id is", len(patient_id_original))
print("len of cleaned patient id is", len(patient_id))
print("len of unique patient id with annotated data",
      len(set(bbox_patient_index_list)))
print("len of patient id with annotated data",bbox_df.shape[0])


len of original patient id is 30805
len of cleaned patient id is 30079
len of unique patient id with annotated data 726
len of patient id with annotated data 984


In [10]:
# Shuffle the cleaned patient ids in place with a fixed seed
random.seed(0)
random.shuffle(patient_id)

print("first ten patient ids are", patient_id[:10])

# training:valid:test=7:1:2 -- the cut points are fractions of the *total* patient count
train_end, valid_end = (int(total_patient_number * fraction) for fraction in (0.7, 0.8))

patient_id_train = patient_id[:train_end]
patient_id_valid = patient_id[train_end:valid_end]
# the rest of the shuffled ids plus all bounding box patients make up the test set
# (going through a set drops the repeated bounding box patient ids)
patient_id_test = list(set(patient_id[valid_end:] + bbox_patient_index_list))


print("train:{} valid:{} test:{}".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))

first ten patient ids are [24303, 16035, 4967, 28624, 5378, 20335, 17069, 12271, 16975, 4469]
train:21563 valid:3081 test:6161


In [11]:
# Add a few more project constants

# pathology names; process_data uses each name's position as its index in the label vector
pathologies_name_list = prj_consts.DISEASE_list
# csv read into labels_df below; process_data reads its 'Patient ID', 'Image Index'
# and 'Finding Labels' columns
NIH_patients_and_labels_file = 'Data_Entry_2017.csv'

#### Finally do preprocessing
Save labels and partitions

In [12]:
# Load the NIH patients-and-labels csv from other_data_dir; this frame is the input to process_data
labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))

In [13]:
def process_data(current_df, patient_ids):
    """Collect image names and multi-hot pathology labels for the given patients.

    Reads two module-level names: ``ignored_images_set`` (image names to skip)
    and ``pathologies_name_list`` (one pattern per label position, matched
    case-insensitively as a regular expression against 'Finding Labels').

    Args:
        current_df: DataFrame with 'Patient ID', 'Image Index' and
            'Finding Labels' columns.
        patient_ids: iterable of patient ids. Results follow this order; within
            one patient they follow the row order of ``current_df``. Ids with
            no rows in ``current_df`` contribute nothing.

    Returns:
        A ``(image_name_index, image_labels)`` tuple: the list of kept image
        names, and a dict mapping each kept image name to a ``np.uint8`` vector
        with one entry per pathology (1 where the pathology name matches).
    """
    # Compile each pathology pattern once instead of once per image row.
    pathology_patterns = [re.compile(name, re.IGNORECASE) for name in pathologies_name_list]
    # The label width follows the pathology list instead of a hard-coded 14.
    label_width = len(pathology_patterns)

    # Index the rows by patient in a single pass; filtering the whole frame with
    # a boolean mask for every patient costs O(patients x rows).
    rows_by_patient = {}
    for patient, image_name, findings in zip(current_df['Patient ID'],
                                             current_df['Image Index'],
                                             current_df['Finding Labels']):
        rows_by_patient.setdefault(patient, []).append((image_name, findings))

    image_name_index = []
    image_labels = {}
    for individual_patient in tqdm.tqdm(patient_ids):
        for processed_image_name, findings in rows_by_patient.get(individual_patient, ()):
            # blacklisted images are left out of both the index and the labels
            if processed_image_name in ignored_images_set:
                continue
            label_vector = np.zeros(label_width, dtype=np.uint8)
            for disease_index, pattern in enumerate(pathology_patterns):
                if pattern.search(findings):
                    label_vector[disease_index] = 1
            image_name_index.append(processed_image_name)
            image_labels[processed_image_name] = label_vector
    return image_name_index, image_labels


In [14]:
# Build the train/valid/test image lists with their labels, then persist them

train_data_index, train_labels = process_data(labels_df, patient_id_train)
valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
test_data_index, test_labels = process_data(labels_df, patient_id_test)

print("train, valid, test image number is:", len(train_data_index), len(valid_data_index), len(test_data_index))

# merge the per-partition label dicts (a later partition wins on a duplicate image name)
labels_all = {**train_labels, **valid_labels, **test_labels}

partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}

# (file name, object) pairs written to data_partitions_dir; the last entry also saves
# the patient id partitions and the unique bounding box patient ids for pytorch training
pickle_payloads = [
    ('labels14_unormalized_cleaned.pickle', labels_all),
    ('partition14_unormalized_cleaned.pickle', partition_dict),
    ('train_test_valid_data_partitions.pickle',
     [patient_id_train, patient_id_valid, patient_id_test, list(set(bbox_patient_index_list))]),
]
for pickle_name, payload in pickle_payloads:
    with open(os.path.join(data_partitions_dir, pickle_name), 'wb') as f:
        pickle.dump(payload, f)


100%|██████████| 21563/21563 [00:40<00:00, 532.42it/s]
100%|██████████| 3081/3081 [00:05<00:00, 541.45it/s]
100%|██████████| 6161/6161 [00:15<00:00, 408.49it/s]


train, valid, test image number is: 68508 9495 32893


In [15]:
# sanity check: show the container type and the first five train label entries

type(train_labels)
dict(list(train_labels.items())[:5])

dict

{'00024303_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00016035_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00016035_001.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00004967_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 '00004967_001.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)}

In [16]:
# Export the notebook 000_preprocess.ipynb to HTML via nbconvert
!jupyter nbconvert --to html 000_preprocess.ipynb

[NbConvertApp] Converting notebook 000_preprocess.ipynb to html
[NbConvertApp] Writing 313924 bytes to 000_preprocess.html
