## Preprocessing data for Mask RCNN

In [1]:
import os
import json
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
import itertools
from tqdm import tqdm
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import warnings
warnings.filterwarnings("ignore")

In [2]:
DATA_DIR = "/media/daitran/Data/Kaggle/VinBigData/"

TRAIN_DIR = os.path.join(DATA_DIR, "train")
TEST_DIR = os.path.join(DATA_DIR, "test")

TRAIN_CSV_DIR = '/home/daitran/Desktop/research/kaggle/VinBigData/train/512_jpg.csv'

SS_CSV_DIR = os.path.join(DATA_DIR, "sample_submission.csv")
# PREPROCESSED_TRAINING_IMAGE_FOLDER = '/home/daitran/Desktop/research/kaggle/VinBigData/train/full_abnormal_2/'

In [3]:
train_df = pd.read_csv(TRAIN_CSV_DIR, converters = {'w_org': eval, 'h_org': eval})
# converters ={'EncodedPixels': eval, 'CategoryId': eval})
# Extract data with abnormalities only
# train_df

**So we have 15000 DICOM files, but 67914 labeled rows**

In [4]:
train_df = train_df.query('class_id != 14')
# train_df = train_df.query("rad_id == 'R9'")
train_df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,w_org,h_org,x_min_resized,y_min_resized,x_max_resized,y_max_resized
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,[2336],[2080],[170.09230769],[301.36986301],[406.89230769],[401.31506849]
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,[2880],[2304],[280.88888889],[132.08888889],[358.],[181.15555556]
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,[3072],[2540],[126.38740157],[59.5],[190.89133858],[72.16666667]
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,[2555],[2285],[301.82231947],[49.09589041],[490.26520788],[434.64892368]
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,[3353],[2568],[111.0529595],[359.14822547],[134.57943925],[379.30450343]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36091,b53d1dd80e99ca6bcef9d592f65d3321,Pleural effusion,10,R9,240.0,1550.0,562.0,2001.0,[2880],[2304],[53.33333333],[275.55555556],[124.88888889],[355.73333333]
36092,26d1d5a0ef2e692c6340e74859ffdc53,Pulmonary fibrosis,13,R10,1163.0,787.0,1338.0,941.0,[3072],[3072],[193.83333333],[131.16666667],[223.],[156.83333333]
36093,22672ab82c290c20b86863291e25ef6c,ILD,5,R9,299.0,664.0,794.0,1508.0,[2500],[2048],[74.75],[135.9872],[198.5],[308.8384]
36094,db169d0be36123bd55b866d6aa73983b,Other lesion,9,R8,6.0,670.0,272.0,1736.0,[2880],[2304],[1.33333333],[119.11111111],[60.44444444],[308.62222222]


**If considering only abnormal cases, we have 36096 cases**

Visualize data distribution

### Helper functions for converting bounding boxes to the right format for Mask RCNN

In [5]:
def get_mask(img_dimensions, x_min, y_min, x_max, y_max):
    img_height, img_width = img_dimensions
    img_mask = np.full((img_height,img_width),0)
    img_mask[y_min:y_max,x_min:x_max] = 255
    return img_mask.astype(np.float32)

def rle_encoding(x):
    dots = np.where(x.T.flatten() == 255)[0]
    run_lengths = []
    prev = -2
    for b in dots:
        if (b>prev+1): run_lengths.extend((b + 1, 0))
        run_lengths[-1] += 1
        prev = b
    return ' '.join([str(x) for x in run_lengths])

### Function convert DICOM data to np.array


In [6]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    dicom = pydicom.read_file(path)
    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    data = data / np.max(data)
    data = (data * 255).astype(np.uint8)

    return data


# Accelarate cv2.imwrite 

In [7]:
import glob
import os
import cv2
import concurrent.futures

In [50]:
# def generate_data(train_df_sample = train_df):
diagnostic_per_image = []

image_size=512
with tqdm(total=len(train_df)) as pbar:
    for idx,row in train_df.iterrows():
        image_id = row.image_id
        image_df = train_df.query("image_id==@image_id")
#         print(image_df)
#         print(image_df['w_org'][0])
#         print(image_df)
#         w_org = int(np.array(image_df['w_org'][0]))
#         h_org = int(np.array(image_df['h_org'][0]))
#         print(image_df)
        
        class_list = []
        RLE_list = []

        for diagnostic_id, diagnostic in image_df.iterrows():
            
            
            w_org = np.array(diagnostic.w_org[0])
            h_org = np.array(diagnostic.h_org[0])
            
            class_list.append(diagnostic.class_id)

            dicom_image = np.zeros((w_org, h_org))
#             dicom_image = dicom2array(TRAIN_DIR + '/' + image_id+".dicom")
            image_dimensions = dicom_image.shape

            resized_img = cv2.resize(dicom_image, (image_size,image_size), interpolation = cv2.INTER_AREA)

# #             os.chdir(PREPROCESSED_TRAINING_IMAGE_FOLDER)
#             # print(PREPROCESSED_TRAINING_IMAGE_FOLDER+ image_id+ ".jpg")

# #             cv2.imwrite(PREPROCESSED_TRAINING_IMAGE_FOLDER+ image_id+ ".jpg", resized_img)

            mask = get_mask(image_dimensions, int(diagnostic.x_min), int(diagnostic.y_min), int(diagnostic.x_max), int(diagnostic.y_max))
            resized_mask = cv2.resize(mask, (image_size,image_size))
            RLE_list.append(rle_encoding(resized_mask))
        diagnostic_per_image.append({"image_id":image_id,
                                     "CategoryId":class_list,
                                     "EncodedPixels":RLE_list})
        pbar.update(1)
#     return diagnostic_per_image

100%|██████████| 36096/36096 [3:39:32<00:00,  2.74it/s]  


In [51]:
samples_df = pd.DataFrame(diagnostic_per_image)

In [54]:
os.getcwd()

'/home/daitran/Desktop/git/chest_x_ray_abnormalities_detection/MaskRCNN_implementation'

In [55]:
samples_df.to_csv('maskrcnn_df_png512.csv', index = False)

In [52]:
samples_df

Unnamed: 0,image_id,CategoryId,EncodedPixels
0,9a5094b2563a1ef3ff50dc5c7ff71345,"[3, 10, 11, 3, 3, 0]",[87342 100 87854 100 88366 100 88878 100 89390...
1,051132a778e61a86eb147c7c6f564dfe,"[0, 0, 3, 3, 11, 0, 3]",[144005 49 144517 49 145029 49 145541 49 14605...
2,1c32170b4af4ce1a3030eb8167753b06,"[11, 13, 11, 13, 13, 13, 11, 13, 13]",[64573 12 65085 12 65597 12 66109 12 66621 12 ...
3,0c7a38f293d5f5e4846aa4ca6db4daf1,"[5, 5, 8, 5, 9]",[154674 386 155186 386 155698 386 156210 386 1...
4,47ed17dcb2cbeec15182ed335a8b5a9e,"[8, 9, 8, 9, 9, 8, 7, 7]",[57192 20 57704 20 58216 20 58728 20 59240 20 ...
...,...,...,...
36091,b53d1dd80e99ca6bcef9d592f65d3321,"[0, 10, 13, 6, 4, 13, 10, 10]",[128138 69 128650 69 129162 69 129674 69 13018...
36092,26d1d5a0ef2e692c6340e74859ffdc53,"[8, 10, 10, 13, 9, 11, 13, 9, 13, 11, 0, 8, 13...",[117317 12 117829 12 118341 12 118853 12 11936...
36093,22672ab82c290c20b86863291e25ef6c,"[5, 11, 10, 9, 10, 9, 11, 5, 13, 13, 11, 13, 1...",[38500 211 39012 211 39524 211 40036 211 40548...
36094,db169d0be36123bd55b866d6aa73983b,"[10, 9, 9, 9, 0, 11, 7, 0, 9, 9, 13, 9, 9, 9, 9]",[5429 49 5941 49 6453 49 6965 49 7477 49 7989 ...


In [56]:
# diagnostic_per_image = generate_data(train_df_sample = train_df)
# SAVING .CSV
# os.chdir(PREPROCESSED_TRAINING_IMAGE_FOLDER)
image_size=512
# samples_df = pd.DataFrame(diagnostic_per_image)
samples_df["Height"] = image_size
samples_df["Width"] = image_size
samples_df.head()
#
samples_df.to_csv('sample_df.csv', index=False)


In [27]:
# SAVING .CSV
os.chdir(PREPROCESSED_TRAINING_IMAGE_FOLDER)
image_size=512
samples_df = pd.DataFrame(diagnostic_per_image)
samples_df["Height"] = image_size
samples_df["Width"] = image_size
samples_df.head()
#
samples_df.to_csv('sample_df.csv', index=False)