## I. Setting
### 1. Packages

In [1]:
# Import packages
import os, sys, shutil, warnings, random, glob, pylab, numpy as np, pandas as pd, time, datetime
from tqdm import tqdm

# packages for visualization
import pydicom, cv2, seaborn as sns, matplotlib.pyplot as plt
from matplotlib import patches
from PIL import Image
from imgaug import augmenters as iaa

# packages for model
import tensorflow as tf, keras
from keras.callbacks import EarlyStopping
from keras.backend.tensorflow_backend import set_session
from keras.models import load_model
from sklearn.model_selection import KFold, train_test_split

Using TensorFlow backend.


In [2]:
#### Package settings

# Display up to 50 columns of a pandas dataframe
pd.set_option('display.max_columns',50)
# ignore warnings
warnings.filterwarnings("ignore")

# Limit GPU usage to 80%
tf_config = tf.ConfigProto()
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.8
set_session(tf.Session(config=tf_config))

### 2. Directory

In [3]:
os.getcwd()

'/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/notebooks'

In [4]:
# The home directory of this project
root_DIR = '/home/ubuntu/healthcare/pneumonia_lungfish'
# Ocean asset directories
# Image data
assets_image_data =('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/')
# Meta data (dataframe and array data)
assets_meta_data =('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/')
# Visualization 
visual_DIR =('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/')


# Sample data
sample_image=('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/')
# Sample image data directories for training using 
sample_train = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_train/')
sample_validate = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_validate/')
sample_test= ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_test/')

# Sample meta data (dataframe and array data)
sample_meta = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/sample_meta/')
# Sample visualization 
sample_visual=('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/sample_visual/')

#sample_dirs = ['sample_train','sample_validate','sample_test']
#for DIR in sample_dirs:
    #os.mkdir(os.path.join(sample_image,DIR))  


print(assets_image_data )
print(assets_meta_data)
print(visual_DIR)
print(sample_image)
print(sample_train)
print(sample_validate)
print(sample_test)
print(sample_meta)
print(sample_visual)
    

/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_train/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_validate/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_test/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/sample_meta/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/sample_visual/


In [5]:
# full data: includes all images 
full_image=('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/')
# full image data directories for training using 
full_train = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_train/')
full_validate = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_validate/')
full_test= ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_test/')

# full meta data (dataframe and array data)
full_meta = ('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/full_meta/')
# full visualization 
full_visual=('/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/full_visual/')

#full_dirs = ['full_train','full_validate','full_test']
#for DIR in full_dirs:
    #os.mkdir(os.path.join(full_image,DIR))  


print(full_image)
print(full_train)
print(full_validate)
print(full_test)
print(full_meta)
print(full_visual)
    

/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_train/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_validate/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/full_image/full_test/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/meta_data/full_meta/
/home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets_visualization/full_visual/


In [7]:
# Model Directory
MASK_RCNN = os.path.join(root_DIR,'src/Mask_RCNN')
COCO_WEIGHTS_PATH = os.path.join(root_DIR,'src/Mask_RCNN_coco_weights/mask_rcnn_coco.h5')
models_working_DIR = os.path.join(root_DIR,'src/models_working')
print(MASK_RCNN)
print(COCO_WEIGHTS_PATH)
print(models_working_DIR)

/home/ubuntu/healthcare/pneumonia_lungfish/src/Mask_RCNN
/home/ubuntu/healthcare/pneumonia_lungfish/src/Mask_RCNN_coco_weights/mask_rcnn_coco.h5
/home/ubuntu/healthcare/pneumonia_lungfish/src/models_working


### Raw data

In [67]:
# Raw data directory
raw_data_DIR = ('/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/')
# Raw Image directory
#train_dicom_DIR = os.path.join(full_image,'stage_2_train_images/')
train_dicom_DIR ='/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/stage_2_train_images/'

print(raw_data_DIR)
print(train_dicom_DIR)

/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/
/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/stage_2_train_images/


## II. Data 

### 1. Overview of data preparation
 * remove replicates
 * split dataframe into train, validation, test in csv
 * split images into folders 
 * prepare a dataframe with ground truth information (including image path, target, segmentation, and bounding boxes)

### 2. Exploratory data analysis

#### 2.1) Prepare a dataframe df_path listing the path for each image 

In [9]:
df_path = pd.DataFrame({'image_path': glob.glob(os.path.join(train_dicom_DIR, '*.dcm'))})
df_path['patientId'] = df_path['image_path'].map(lambda x: os.path.splitext(os.path.basename(x))[0])
print(df_path.head(2))
print(df_path.shape[0], 'images in total')

                                          image_path  \
0  /home/ubuntu/healthcare/pneumonia_lungfish/src...   
1  /home/ubuntu/healthcare/pneumonia_lungfish/src...   

                              patientId  
0  79d6e398-6ee6-4c5a-b924-a082d3c1cce9  
1  77a4deeb-5538-4c93-82e1-ef9c8877aaeb  
26684 images in total


#### 2.2) Prepare a dataframe with both target and path information 

In [96]:
df_labels = pd.read_csv(os.path.join(raw_data_DIR,'stage_2_train_labels.csv'))
print('df_labels dataframe:','\n',df_labels.head(2))
df_target = df_labels[['patientId','Target']].drop_duplicates()

df_target_path= pd.merge(df_target,df_path, on='patientId',how='left')
df_target_path.to_csv(os.path.join(full_meta,'df_target_path.csv'), index = False)
print('\n','\n','In total, there are',df_target_path.shape[0], 'patientIds!')
print('\n','\n','df_target_path dataframe:','\n',df_target_path.head(2)) 

df_labels dataframe: 
                               patientId    x    y  width  height  Target
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6 nan% nan%   nan%    nan%       0
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd nan% nan%   nan%    nan%       0

 
 In total, there are 26684 patientIds!

 
 df_target_path dataframe: 
                               patientId  Target  \
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6       0   
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd       0   

                                          image_path  
0  /home/ubuntu/healthcare/pneumonia_lungfish/src...  
1  /home/ubuntu/healthcare/pneumonia_lungfish/src...  


### In the scope of this project, the 'Target' in df_labels is associated with opacity 
 * Target = 0: 'No Lung Opacity'
 * Target 1: = 'Lung Opacity' 

In [11]:
df_count = df_target_path.groupby(['Target']).size().reset_index(name=' Count')
df_count['Percentage'] =df_count.iloc[:, 1:].apply(lambda x: x / x.sum()*100)
pd.options.display.float_format = '{:.1f}%'.format
df_count

Unnamed: 0,Target,Count,Percentage
0,0,20672,77.5%
1,1,6012,22.5%


### 3. Full dataset preparation for training 
#### N= 26684 images

#### 3.1) Split the dataframe into train, validation, and test

In [12]:
# We wrote a function to split a dataframe into train, validation, test 
def split_df(df,image_path_col,target_col,val_size,test_size, random_state):
    X = df[image_path_col]
    y= df[target_col]
    
    from sklearn.model_selection import train_test_split
    
    X_train, X_val_test, y_train, y_val_test = train_test_split(X, y,
                                                                stratify=y, 
                                                                test_size=float(val_size+test_size),random_state=int(random_state))
    
    X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test,
                                                stratify=y_val_test, 
                                                test_size=float(test_size)/float(val_size+test_size),random_state=int(random_state))
        
    print( 'Train dataset (X_train y_train):',y_train.count())
    print('Validataion dataset(X_val, y_val):',y_val.count())
    print('Test dataset (X_test, y_test):',y_test.count())  
    
    return X_train, y_train, X_val, y_val, X_test, y_test                                           
                                              
    

In [13]:
# Train: Validate: Test = 0.9:0.05:0.05
X_train, y_train, X_val, y_val, X_test, y_test = split_df(df_target_path,'image_path','Target',0.05,0.05,0)

Train dataset (X_train y_train): 24015
Validataion dataset(X_val, y_val): 1334
Test dataset (X_test, y_test): 1335


#### 3.2) Split image files to train, test, validate directories

In [14]:
X_test[:1]

23932    /home/ubuntu/healthcare/pneumonia_lungfish/src...
Name: image_path, dtype: object

In [91]:
# We wrote a function to split images in train, validation directory
def X_split_image(X,source_DIR,dst_DIR):
    import pandas as pd
    import os
    
    '''
    type(X): dataframe series
    '''
     
    # create a sample image_path file
    df = pd.DataFrame()    
    df['image_path']=X.astype(str)
    df['image_dcm'] = df['image_path'].str.replace('/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/stage_2_train_images/', '')
    #print(df.head(2) )
    # Optional: write df into a csv file
    os.chdir(dst_DIR)
    csv_name = str(dst_DIR)+'image_path.csv'
    
    df.to_csv(csv_name,index=False)
    print('csv files are saved in', dst_DIR)
    
    print(df.shape[0],'images are splitted from',source_DIR,'in',dst_DIR)
                            
    names= df['image_dcm'].tolist()
    #print(names)
    
    for filename in os.listdir(source_DIR):
        #print(len(filename))
        for name in names:
            #print(name)
            if filename in name:               
                #print('mathced names ')
                shutil.move(os.path.join(source_DIR,filename), dst_DIR)
                          

In [None]:
X_split_image(X_train,full_image,full_train)
X_split_image(X_test,full_image,full_test)
X_split_image(X_val,full_image,full_validate)

#### 3.3) Prepare a dataframe df_all, which contains target (lable), path, and boxes information 

In [58]:
df_all = pd.merge(df_target_path,df_labels, on=['patientId','Target'],how='left')
print(df_all.head(2))
print('\n','In total,',df_target_path.shape[0],'patients.',df_all.shape[0],'rows. Because one image may have more than one bounding box')
df_all.to_csv(os.path.join(full_meta,'df_target_path_box.csv'),index=False)



                              patientId  Target  \
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6       0   
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd       0   

                                          image_path    x    y  width  height  
0  /home/ubuntu/healthcare/pneumonia_lungfish/src... nan% nan%   nan%    nan%  
1  /home/ubuntu/healthcare/pneumonia_lungfish/src... nan% nan%   nan%    nan%  

 In total, 26684 patients. 30227 rows. Because one image may have more than one bounding box


### 4. A small sample dataset preparation for demo 
#### N= 1000 images. It preserves sample proportions in the full dataset


#### 4.1)  Split the dataframe into rest and sample dataframe

In [65]:
# Split the dataset into rest and sample 
X = df_target_path['image_path']
y = df_target_path['Target']
X_rest, X_sample, y_rest, y_sample = train_test_split(X, y,
                                                stratify=y, 
                                                test_size=0.03747,random_state=0)

#### 4.2) Copy Sample Image Files to assets_image_sample directories

In [73]:
# We wrote a function to copy images from source_DIR to destination directory
def X_copy_image(X,source_DIR,dst_DIR):
    '''
    type(X): dataframe series
    '''

    import pandas as pd
    import os

    # create a sample image_path file
    df = pd.DataFrame()    
    df['image_path']=X.astype(str)
    df['image_dcm'] = df['image_path'].str.replace('/home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/stage_2_train_images/', '')
    print(df.head(2) )
    # Optional: write df into a csv file
    csv_name = 'sample_image_path.csv'  
    os.chdir(sample_meta)
    df.to_csv(csv_name,index=False)
    
    print(df.shape[0],'images are copied from',source_DIR,'in',dst_DIR)
                            

    names= df['image_dcm'].tolist()
    #print(names)
    
    for filename in os.listdir(source_DIR):
        print(filename)
        for name in names:
            #print(name)
            if filename in name:               
                #print('mathced names ')
                shutil.copy(os.path.join(source_DIR,filename), dst_DIR)
                break                       
                          

In [74]:
X_copy_image(X_sample,train_dicom_DIR,sample_image)


                                              image_path  \
18255  /home/ubuntu/healthcare/pneumonia_lungfish/src...   
19445  /home/ubuntu/healthcare/pneumonia_lungfish/src...   

                                      image_dcm  
18255  be89e82c-0f08-4660-82de-c2fe903df1c0.dcm  
19445  ca112a3c-b701-48b8-a94f-725ea65416a7.dcm  
1000 images are copied from /home/ubuntu/healthcare/pneumonia_lungfish/src/data/raw/stage_2_train_images/ in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/


In [76]:
# read the X_sample csv in a pandas dataframe
X_sample_csv = pd.read_csv(os.path.join(sample_meta,'sample_image_path.csv'))
X_sample_csv.head()


Unnamed: 0,image_path,image_dcm
0,/home/ubuntu/healthcare/pneumonia_lungfish/src...,be89e82c-0f08-4660-82de-c2fe903df1c0.dcm
1,/home/ubuntu/healthcare/pneumonia_lungfish/src...,ca112a3c-b701-48b8-a94f-725ea65416a7.dcm
2,/home/ubuntu/healthcare/pneumonia_lungfish/src...,fdff1f9e-15e5-4b5b-acf7-bdd0d584fcf6.dcm
3,/home/ubuntu/healthcare/pneumonia_lungfish/src...,6329ef20-a219-43c4-a437-e15622f8c0ac.dcm
4,/home/ubuntu/healthcare/pneumonia_lungfish/src...,7e2abe5f-c9c3-44e1-9fe2-a343d651b17b.dcm


In [78]:
# merge with target information 
df_sample_target_path=pd.merge(X_sample_csv,df_target_path,on='image_path',how='left')
print(df_sample_target_path.head(2))

df_sample_target_path.to_csv(os.path.join(sample_meta,'df_target_path_1000sample.csv'),index=False)


                                          image_path  \
0  /home/ubuntu/healthcare/pneumonia_lungfish/src...   
1  /home/ubuntu/healthcare/pneumonia_lungfish/src...   

                                  image_dcm  \
0  be89e82c-0f08-4660-82de-c2fe903df1c0.dcm   
1  ca112a3c-b701-48b8-a94f-725ea65416a7.dcm   

                              patientId  Target  
0  be89e82c-0f08-4660-82de-c2fe903df1c0       0  
1  ca112a3c-b701-48b8-a94f-725ea65416a7       0  


#### The sample proportions are preserved

In [79]:
df_sample_count = df_sample_target_path.groupby(['Target']).size().reset_index(name=' Count')
df_sample_count['Percentage'] =df_sample_count.iloc[:, 1:].apply(lambda x: x / x.sum()*100)
pd.options.display.float_format = '{:.1f}%'.format
df_sample_count


Unnamed: 0,Target,Count,Percentage
0,0,775,77.5%
1,1,225,22.5%


#### 4.3) Split image files to train, test, validate

In [89]:
X_train, y_train, X_val, y_val, X_test, y_test = split_df(df_sample_target_path,'image_path','Target',0.05,0.05,0)


Train dataset (X_train y_train): 900
Validataion dataset(X_val, y_val): 50
Test dataset (X_test, y_test): 50


In [93]:
X_split_image(X_train,sample_image,sample_train)
X_split_image(X_test,sample_image,sample_test)
X_split_image(X_val,sample_image,sample_validate)

csv files are saved in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_train/
900 images are splitted from /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/ in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_train/
csv files are saved in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_test/
50 images are splitted from /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/ in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_test/
csv files are saved in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_validate/
50 images are splitted from /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/ in /home/ubuntu/healthcare/pneumonia_lungfish/ocean_assets/image_data/sample_image/sample_validate/


#### 4.4) Prepare a sample dataframe which contains label, path, and boxes information 

In [95]:
df_sample_all= pd.merge(df_sample_target_path,df_labels,on=['patientId','Target'],how='left')
print('\n','In total,',df_sample_target_path.shape[0],'patients.',df_sample_all.shape[0],'rows. Because one image may have more than one bounding box')
df_sample_all.to_csv(os.path.join(sample_meta,'df_target_path_box_1000samples.csv'),index=False)



 In total, 1000 patients. 1133 rows. Because one image may have more than one bounding box


### So far, we have prepared two datasets 
 #### 1) A full dataset 
   * 26684 images   
     * in /ocean_assets/ocean_assets_image_data/       
   * Matched dataframes 
     * df_target_path.csv 
     * df_target_path_box.csv   
     * in /ocean_assets/ocean_assets_meta_data/
     
     
 #### 2) A small sample dataset
   * 1000 images   
     * in /ocean_assets/ocean_assets_image_data/     
   * Matched dataframes     
     * df_target_path_1000sample.csv
     * df_target_path_box_1000samples.csv
     * in /ocean_assets/ocean_assets_meta_data/

### Now, let's head to data visualization notebook