### Generate JMS or V7 H5 files for CXR images and masks 

In [1]:
# In[1]:

import os, sys, shutil
from os import listdir
from os.path import isfile, join
import random
import numpy as np
import cv2
import pandas as pd
import json
import datetime
import csv, h5py


In [2]:
# In[2]:

from MODULES.Generators import get_generator, DataGenerator
from MODULES.Losses import other_metrics_binary_class
from MODULES.Constants import _Params, _Paths
from MODULES.Utils import get_class_threshold, standardize, commonelem_set
import tensorflow as tf 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import model_from_json 
from tensorflow.python.client import device_lib
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing import image
from sklearn.metrics import roc_auc_score, roc_curve
from tensorflow.compat.v1.logging import INFO, set_verbosity
import matplotlib.pyplot as plt
import datetime

%load_ext autoreload
%autoreload 2


In [3]:
# In[3]:

# ### MODEL AND RUN SELECTION
# Unpack the model / run settings returned by _Params(). Unpacking is
# positional, so the order of the names below has to match the order of the
# values in the returned tuple.
(
    HEIGHT, WIDTH, CHANNELS, IMG_COLOR_MODE, MSK_COLOR_MODE, NUM_CLASS,
    KS1, KS2, KS3, DL1, DL2, DL3, NF, NFL, NR1, NR2, DIL_MODE, W_MODE, LS,
    SHIFT_LIMIT, SCALE_LIMIT, ROTATE_LIMIT, ASPECT_LIMIT, U_AUG,
    TRAIN_SIZE, VAL_SIZE, DR1, DR2, CLASSES, IMG_CLASS, MSK_FLOAT, MSK_THRESHOLD,
    MRA, MRALEVEL, MRACHANNELS, WAVELET, WAVEMODE, WST, WST_J, WST_L, WST_FIRST_IMG,
    SCALE_BY_INPUT, SCALE_THRESHOLD,
) = _Params()

# Training / validation image, mask and mask-class locations returned by _Paths().
(
    TRAIN_IMG_PATH, TRAIN_MSK_PATH, TRAIN_MSK_CLASS,
    VAL_IMG_PATH, VAL_MSK_PATH, VAL_MSK_CLASS,
) = _Paths()


In [4]:
# In[4]:

# HFHS set: one index table per split; the first CSV column becomes the
# DataFrame index.
train_df = pd.read_csv("selected_train_020521_index.csv", index_col=0)
valid_df = pd.read_csv("selected_valid_020521_index.csv", index_col=0)

# Number of rows in each split (no test split is loaded in this notebook).
n_train, n_valid = train_df.shape[0], valid_df.shape[0]
print(n_train, n_valid)


2263 1532


In [None]:
# In[5]:

# ### DATASET PREPARATION FOR STANDARDIZED IMAGES as H5 files

# Source directories for images and masks
# HFHS set
IMAGE_DIR = "dataset/selected_COVID_pos4_neg5_image_resized_equalized/"

# Exactly one MASK_DIR must be active; swap the comments to change database.
# For JMS database
# MASK_DIR = "dataset/selected_COVID_pos4_neg5_masks_float_952_3/"
# For V7 database
MASK_DIR = "dataset/selected_COVID_pos4_neg5_masks_float_6395_1/"

# Target directory for H5 file containing both images and masks
# (keep in sync with the MASK_DIR choice above).
# For JMS database
# H5_IMAGE_DIR = "dataset/COVID_standardized_pos4_neg5_image_expand_float_952_3_threshold_H5_CLASSWEIGHT/"
# For V7 database
H5_IMAGE_DIR = "dataset/COVID_standardized_pos4_neg5_image_expand_float_6395_1_threshold_H5_CLASSWEIGHT/"

print(IMAGE_DIR)
print(H5_IMAGE_DIR)

pwd = os.getcwd()
# makedirs(..., exist_ok=True) creates any missing parent directory as well and
# is a no-op when the target directory already exists, so the cell can be
# re-run safely and does not depend on "dataset/" having been created first.
os.makedirs(H5_IMAGE_DIR, exist_ok=True)
    
# Containers for the training and validation sets

# Training set: images and masks are stacked along axis 2 (one slot per
# training sample); labels and class weights have one row per sample.
train_image_mat = np.zeros(shape=(HEIGHT, WIDTH, n_train, 1))
train_mask_mat = np.zeros_like(train_image_mat)
train_label_mat = np.zeros(shape=(n_train, 2))
train_weight_mat = np.zeros(shape=(n_train, 1))
train_df_index = train_df.index.tolist()

# Validation set: only the index labels are collected here; validation images
# are not stacked in memory.
valid_df_index = valid_df.index.tolist()

# Loop over the training set: load every image, store its labels / class weight
# and build its combined mask. The mean and std are computed afterwards, once
# the whole training stack has been filled.
for i in range(n_train):
    print(f'{i},index={train_df.index[i]}')
    # Fetch the row once instead of doing one positional lookup per column
    train_row = train_df.iloc[i]
    train_image_name = train_row['Image']
    train_pos_label = train_row['Positive']
    train_neg_label = train_row['Negative']
    train_weight = train_row['ClassWeight']

    train_image = cv2.imread(IMAGE_DIR + train_image_name, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on a missing / unreadable file, it returns None;
    # fail here with the offending path instead of a confusing error further down.
    if train_image is None:
        raise FileNotFoundError(f'Cannot read training image: {IMAGE_DIR + train_image_name}')

    # Resize and equalize if this was not already done during datasets preparation
    # train_image = cv2.resize(train_image, (WIDTH, HEIGHT), cv2.INTER_LINEAR)
    # train_image = cv2.equalizeHist(train_image)

    # Add a trailing channel axis: (H, W) -> (H, W, 1)
    train_image = np.expand_dims(train_image, axis=-1)

    train_image_mat[:, :, i] = train_image
    train_label_mat[i, :] = [train_pos_label, train_neg_label]
    train_weight_mat[i, :] = train_weight

    # External learned mask of segmented lungs, rescaled from [0, 255] to [0, 1]
    train_learned_mask = cv2.imread(MASK_DIR + train_image_name, cv2.IMREAD_GRAYSCALE)
    if train_learned_mask is None:
        raise FileNotFoundError(f'Cannot read training mask: {MASK_DIR + train_image_name}')
    train_learned_mask = train_learned_mask.astype('float64') / 255
    train_learned_mask = np.expand_dims(train_learned_mask, axis=-1)

    # Internal thresholded mask: 1 everywhere except for very dark (< 6) and
    # very bright (> 225) pixels of the raw (not yet standardized) CXR
    low_ind = train_image < 6
    high_ind = train_image > 225
    train_thresholded_mask = np.ones_like(train_image)
    train_thresholded_mask[low_ind] = 0
    train_thresholded_mask[high_ind] = 0

    # Combine the two masks (element-wise product)
    train_mask_mat[:, :, i] = np.multiply(train_thresholded_mask, train_learned_mask)
    
# IMPORTANT: For best result we standardize the entire set.
# The returned mean / std are kept because the validation images are
# standardized with these same training statistics.
train_image_mat, train_image_mean, train_image_std = standardize(train_image_mat, by_layer=False)

# Write one H5 file per training sample. Each file holds four datasets:
# 'X' (image), 'M' (mask), 'y' (labels) and 'w' (class weight).
for i in range(n_train):
    train_image_name = train_df.iloc[i]['Image']
    # The last four characters of the image name are replaced by '.h5'
    train_h5_path = H5_IMAGE_DIR + train_image_name[:-4] + '.h5'

    with h5py.File(train_h5_path, 'w') as hf:
        # Image and mask are stored the same way: gzip-compressed,
        # declared with shape (HEIGHT, WIDTH, 1)
        for dataset_name, sample_stack in (('X', train_image_mat), ('M', train_mask_mat)):
            hf.create_dataset(
                name=dataset_name,
                data=np.squeeze(sample_stack[:, :, i, :]),
                shape=(HEIGHT, WIDTH, 1),
                maxshape=(HEIGHT, WIDTH, 1),
                compression="gzip",
                compression_opts=9)

        # Labels
        hf.create_dataset(name='y', data=np.squeeze(train_label_mat[i, :]))

        # Class weights
        hf.create_dataset(name='w', data=np.squeeze(train_weight_mat[i, :]))
        

# Loop over the validation set: each image is loaded, masked, standardized with
# the TRAINING mean / std and written straight to its own H5 file.

for i in range(n_valid):
    print(f'{i},index={valid_df.index[i]}')
    # Fetch the row once instead of doing one positional lookup per column
    valid_row = valid_df.iloc[i]
    valid_image_name = valid_row['Image']
    valid_pos_label = valid_row['Positive']
    valid_neg_label = valid_row['Negative']
    valid_weight = valid_row['ClassWeight']

    valid_image = cv2.imread(IMAGE_DIR + valid_image_name, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on a missing / unreadable file, it returns None;
    # fail here with the offending path instead of a confusing error further down.
    if valid_image is None:
        raise FileNotFoundError(f'Cannot read validation image: {IMAGE_DIR + valid_image_name}')

    # Resize or equalize if this was not already done during datasets preparation
    # valid_image = cv2.resize(valid_image, (WIDTH, HEIGHT), cv2.INTER_LINEAR)
    # valid_image = cv2.equalizeHist(valid_image)

    # Add a trailing channel axis: (H, W) -> (H, W, 1)
    valid_image = np.expand_dims(valid_image, axis=-1)

    # External learned mask of segmented lungs, rescaled from [0, 255] to [0, 1]
    valid_learned_mask = cv2.imread(MASK_DIR + valid_image_name, cv2.IMREAD_GRAYSCALE)
    if valid_learned_mask is None:
        raise FileNotFoundError(f'Cannot read validation mask: {MASK_DIR + valid_image_name}')
    valid_learned_mask = valid_learned_mask.astype('float64') / 255
    valid_learned_mask = np.expand_dims(valid_learned_mask, axis=-1)

    # Internal thresholded mask: 1 everywhere except for very dark (< 6) and
    # very bright (> 225) pixels. It must be computed on the raw image, i.e.
    # before the standardization below.
    low_ind = valid_image < 6
    high_ind = valid_image > 225
    valid_thresholded_mask = np.ones_like(valid_image)
    valid_thresholded_mask[low_ind] = 0
    valid_thresholded_mask[high_ind] = 0

    # Combine the two masks (element-wise product)
    valid_mask = np.multiply(valid_thresholded_mask, valid_learned_mask)

    # Standardization with training mean and std
    valid_image = valid_image.astype(np.float64)
    valid_image -= train_image_mean
    valid_image /= train_image_std

    with h5py.File(H5_IMAGE_DIR + valid_image_name[:-4] + '.h5', 'w') as hf:
        # Images
        Xset = hf.create_dataset(
            name='X',
            data=valid_image,
            shape=(HEIGHT, WIDTH, 1),
            maxshape=(HEIGHT, WIDTH, 1),
            compression="gzip",
            compression_opts=9)

        # Masks
        Mset = hf.create_dataset(
            name='M',
            data=valid_mask,
            shape=(HEIGHT, WIDTH, 1),
            maxshape=(HEIGHT, WIDTH, 1),
            compression="gzip",
            compression_opts=9)

        # Labels: stored as float64 so that 'y' has the same dtype as in the
        # training files, whose labels come from a float64 array
        yset = hf.create_dataset(
            name='y',
            data=np.asarray([valid_pos_label, valid_neg_label], dtype=np.float64))

        # Class weights: float64 scalar, same layout as in the training files
        wset = hf.create_dataset(
            name='w',
            data=np.float64(valid_weight))
        
        
# Generate json dictionary with standardization parameters.
# json can only serialize plain Python numbers / lists, so NumPy scalars or
# arrays are converted with .tolist() first (a NumPy scalar or 0-d array becomes
# a Python float, an array becomes a nested list). NOTE(review): the exact
# return types of standardize() are not visible here; this conversion is a
# no-op for the JSON output if they are already float64 scalars.
h5_dict = {
    "mean": np.asarray(train_image_mean).tolist(),
    "std": np.asarray(train_image_std).tolist(),
}

# For JMS database
# with open(H5_IMAGE_DIR + 'standardization_parameters_JMS.json', 'w') as filehandle:
#     json.dump(h5_dict, filehandle)

# For V7 database
with open(H5_IMAGE_DIR + 'standardization_parameters_V7.json', 'w') as filehandle:
    json.dump(h5_dict, filehandle)


In [7]:
# Generate json dictionary (do not regenerate if number of elements was changed by duplicating some)
# It lists, per split, the H5 file name of every sample: the image name with
# its last four characters replaced by '.h5'.

train_h5_name_list = [
    train_df.iloc[row]['Image'][:-4] + '.h5' for row in range(n_train)
]
valid_h5_name_list = [
    valid_df.iloc[row]['Image'][:-4] + '.h5' for row in range(n_valid)
]

h5_dict = dict(train=train_h5_name_list, valid=valid_h5_name_list)

# HFHS set
with open(H5_IMAGE_DIR + 'pos4_neg5_datasets_CLASSWEIGHT_only.json', 'w') as filehandle:
    json.dump(h5_dict, filehandle)


In [None]:
# Check H5 file generation: reload one generated file and display its image
# and mask side by side.

print(IMAGE_DIR)
print(H5_IMAGE_DIR)

# Keep only the .h5 files: the JSON files written by this notebook live in the
# same directory and cannot be opened by h5py.
h5_files = sorted(
    f for f in listdir(H5_IMAGE_DIR)
    if f.endswith('.h5') and isfile(join(H5_IMAGE_DIR, f))
)

# Select one image
with h5py.File(H5_IMAGE_DIR + h5_files[9], 'r') as f:
    # Index access raises KeyError if a dataset is missing, whereas f.get()
    # would silently return None
    X_h5 = np.array(f["X"])
    M_h5 = np.array(f["M"])
    y_h5 = np.array(f["y"])
    w_h5 = np.array(f["w"])

fig = plt.figure(figsize=(20,10))
fig.subplots_adjust(hspace=0.4, wspace=0.2)

ax = fig.add_subplot(1, 2, 1)
ax.imshow(np.squeeze(X_h5), cmap="gray")
ax.set_title("Standardized image (X)")
ax = fig.add_subplot(1, 2, 2)
ax.imshow(np.squeeze(M_h5), cmap="gray")
ax.set_title("Combined mask (M)")

# plt.savefig(working_img_mask_path + name + '_img_and_pred_mask.png') 
# plt.close()
    

In [9]:
# Sorted list of the generated H5 files. The JSON files written into the same
# directory are skipped so that the list really contains H5 files only.
h5_files = sorted(
    f for f in listdir(H5_IMAGE_DIR)
    if f.endswith('.h5') and isfile(join(H5_IMAGE_DIR, f))
)
# print(h5_files)


In [10]:
# Check for duplicate slice number between training and validation or test set
# (the identifiers compared are the H5 file names stored in the split JSON).

# HFHS set
with open(H5_IMAGE_DIR + "pos4_neg5_datasets_CLASSWEIGHT_only.json") as json_file:
    dataset = json.load(json_file)

# Copy the per-split name lists out of the JSON dictionary
train_len = len(dataset['train'])
train_id_list = list(dataset['train'])

valid_len = len(dataset['valid'])
valid_id_list = list(dataset['valid'])

# test_len = len(dataset['test'])
# test_id_list = list(dataset['test'])

# Checking common elements in two lists
common_train_valid_slices = commonelem_set(train_id_list, valid_id_list)
print(f'{common_train_valid_slices}')
# common_train_test_slices = commonelem_set(train_id_list, test_id_list)
# print(f'Common slices between train and test sets: {common_train_test_slices}')
# common_valid_test_slices = commonelem_set(valid_id_list, test_id_list)
# print(f'Common slices between valid and test sets: {common_valid_test_slices}')


There are no common elements between these two sets.
