In [None]:
import os

import pydicom
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
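# Run the notebook that defines the helper functions this notebook depends on.
# The cells below use hu_to_pixels, segment_brain, apply_new_spacing, crop_or_reshape,
# clahe and to_3_channels, as well as np and cv2, none of which are imported here --
# presumably they are all brought into scope by this run (TODO confirm).
# Adjust the path if the notebook is stored elsewhere.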
%run /content/src/image_preprocessing.ipynb

In [None]:
def _preprocess_slice(dicom_slice, dim, window, apply_clahe):
    """
    Run the per-slice pipeline shared by the 2D and the 3D-context paths of img_preprocessing.

    :param dicom_slice: the DICOM dataset of one slice (pixel_array, RescaleIntercept, RescaleSlope and PixelSpacing are read)
    :param dim: a tuple of the desired dimensions of the image
    :param window: a tuple representing the window center and window width
    :param apply_clahe: whether to use CLAHE or not

    :return: the processed single-channel pixel array
    """
    pixel_array = hu_to_pixels(
        dicom_slice.pixel_array,
        dicom_slice.RescaleIntercept,
        dicom_slice.RescaleSlope,
        window[0],
        window[1],
    )
    pixel_array = segment_brain(pixel_array)
    # [1, 1] is the target spacing handed to apply_new_spacing
    pixel_array = apply_new_spacing(pixel_array, np.array(dicom_slice.PixelSpacing), [1, 1])
    pixel_array = crop_or_reshape(pixel_array, dim)
    if apply_clahe:
        pixel_array = clahe(pixel_array)
    return pixel_array


def img_preprocessing(image, b_neigh=None, t_neigh=None, dim=(224,224), window=(40,80), apply_clahe=False):
    """
    Preprocess a CT image using correcting HU values, brain segmentation, new spacing application, reshaping and converting to three channel image.

    When both neighbouring slices are given, each of the three slices is processed separately and
    they are combined with to_3_channels in the order (bottom, image, top).
    Otherwise (also when only one neighbour is given) only `image` is processed and its
    grayscale result is replicated into three channels.

    :param image: the image to preprocess
    :param b_neigh: the bottom neighbouring slice
    :param t_neigh: the top neighbouring slice
    :param dim: a tuple of the desired dimensions of the image
    :param window: a tuple representing the window center and window width
    :param apply_clahe: whether to use CLAHE or not

    :return: the preprocessed image
    """
    if b_neigh is not None and t_neigh is not None:
        bottom, middle, top = (
            _preprocess_slice(dicom_slice, dim, window, apply_clahe)
            for dicom_slice in (b_neigh, image, t_neigh)
        )
        return to_3_channels(bottom, middle, top)

    # no (complete) 3D context: turn the single grayscale slice into a 3-channel image
    pixel_array = _preprocess_slice(image, dim, window, apply_clahe)
    return cv2.cvtColor(pixel_array.astype('uint8'), cv2.COLOR_GRAY2BGR)

In [None]:
def find_neighbors_to_image(image_id, dataframe):
    """
    Finds the IDs of the neighbouring slices of an image within its study.

    The images of the study are ordered by their "Position" value. If the image is the
    lowest (or highest) slice of the study, its own ID is returned as the bottom (or top)
    neighbour. If several images of the study share the same position, only the last of
    them is kept in the position lookup.

    :param image_id: the image ID for which the neighbours will be found
    :param dataframe: the dataframe with image IDs ("ID"), studies ("Study") and vertical positions ("Position")

    :return: (bottom neighbour ID, top neighbour ID)
    :raises IndexError: if image_id is not present in the dataframe
    """
    study_id = dataframe.loc[dataframe['ID'] == image_id, "Study"].values[0]

    # filter the dataframe once and keep the first row of every image of the study,
    # instead of scanning the whole dataframe again for each image
    study_rows = dataframe.loc[dataframe['Study'] == study_id, ["ID", "Position"]].drop_duplicates("ID")
    study_ids = study_rows["ID"].tolist()
    positions = study_rows["Position"].tolist()

    id_by_position = dict(zip(positions, study_ids))
    position_by_id = dict(zip(study_ids, positions))
    sorted_positions = sorted(id_by_position)

    pos_index = sorted_positions.index(position_by_id[image_id])

    # clamp at both ends so that edge slices get themselves as the missing neighbour
    b_neigh = id_by_position[sorted_positions[max(pos_index - 1, 0)]]
    t_neigh = id_by_position[sorted_positions[min(pos_index + 1, len(sorted_positions) - 1)]]
    return b_neigh, t_neigh

In [None]:
def generate_images(dataframe, window, dim, output_dir, img_dir_path, context_3d=False, apply_clahe=False, limited_studies=None):
    """
    Generates preprocessed .jpg images from .dicom CT scans.

    Every image is read from "<img_dir_path>/<ID>.dcm", preprocessed with img_preprocessing
    and written to "<output_dir>/<ID>.jpg". The output directory is created if it is missing.

    :param dataframe: a dataframe with image IDs, studies and their vertical positions
    :param window: a window to apply -> tuple in format (window center, window width)
    :param dim: a tuple with desired image dimensions
    :param output_dir: path to the directory for output files
    :param img_dir_path: path to the directory with .dicom files
    :param context_3d: whether to include 3D context or not
    :param apply_clahe: whether to apply CLAHE or not
    :param limited_studies: in the case of limited runtime, the current subset of studies to convert

    :raises OSError: if a preprocessed image could not be written
    """
    studies = limited_studies
    if studies is None:
        studies = dataframe["Study"].unique()

    # cv2.imwrite does not create missing directories, it only reports the failure
    # through its return value, so make sure the target directory exists up front
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def read_slice(slice_id):
        # os.path.join works whether or not img_dir_path ends with a separator
        return pydicom.filereader.dcmread(os.path.join(img_dir_path, slice_id + ".dcm"))

    for study in tqdm(studies):
        images = dataframe[dataframe["Study"] == study]["ID"].unique()

        for img_id in images:
            img = read_slice(img_id)
            if not context_3d:
                img = img_preprocessing(img, dim=dim, window=window, apply_clahe=apply_clahe)
            else:
                b_neigh, t_neigh = find_neighbors_to_image(img_id, dataframe)
                img = img_preprocessing(img, read_slice(b_neigh), read_slice(t_neigh), dim, window, apply_clahe)

            output_path = os.path.join(output_dir, img_id + ".jpg")
            # a False return value means nothing was written; do not lose images silently
            if not cv2.imwrite(output_path, img):
                raise OSError(f"Could not write image to {output_path}")

In [None]:
TRAIN_IMG_DIR_PATH = "/content/drive/MyDrive/rsna-intracranial-hemorrhage-detection/stage_2_train/"
TEST_IMG_DIR_PATH = "/content/drive/MyDrive/rsna-intracranial-hemorrhage-detection/stage_2_test/"
OUTPUT_TRAIN_DIR = "/content/jpg_data/stage_2_train_jpg/"
OUTPUT_TEST_DIR = "/content/jpg_data/stage_2_test_jpg/"
TRAIN_DATAFRAME_PATH = "/content/dataframes/stage_2_train_mod.csv"
TEST_DATAFRAME_PATH = "/content/dataframes/stage_2_sample_submission_mod.csv"

# create the output directories first (e.g. '! mkdir -p directory_name') if they do not exist yet

WINDOW = (40, 80)   # (window center, window width)
IMG_DIM = (224, 224)
CONTEXT_3D = False
CLAHE = False

######################################################################################################################
# in the case of limited runtime please modify the settings below
# converting all images may take hours

LIMITED_RUNTIME = False   # set to True in the case of limited runtime
DATA_TO_CONVERT = "train"   # use "train" or "test", depending on which images you want to convert now
STUDIES_PER_RUN = 1000
RUNS_FINISHED = 0   # runs finished on current dataframe - if you start to convert test data, set again to 0
######################################################################################################################

# dataset name -> (label used in the progress message, dataframe path, output directory, DICOM directory)
DATASETS = {
    "train": ("training", TRAIN_DATAFRAME_PATH, OUTPUT_TRAIN_DIR, TRAIN_IMG_DIR_PATH),
    "test": ("test", TEST_DATAFRAME_PATH, OUTPUT_TEST_DIR, TEST_IMG_DIR_PATH),
}

# a misspelled DATA_TO_CONVERT would otherwise skip both datasets without any message
if LIMITED_RUNTIME and DATA_TO_CONVERT not in DATASETS:
    raise ValueError(f'DATA_TO_CONVERT must be "train" or "test", got {DATA_TO_CONVERT!r}')

# generate training images first, then test images; with limited runtime only DATA_TO_CONVERT is processed
for dataset_name, (dataset_label, dataset_dataframe_path, dataset_output_dir, dataset_img_dir) in DATASETS.items():
    if LIMITED_RUNTIME and DATA_TO_CONVERT != dataset_name:
        continue

    print(f"Generating {dataset_label} images...")
    dataframe = pd.read_csv(dataset_dataframe_path)

    if not LIMITED_RUNTIME:
        generate_images(dataframe, WINDOW, IMG_DIM, dataset_output_dir, dataset_img_dir, CONTEXT_3D, CLAHE)
        continue

    studies = dataframe['Study'].unique()
    first_study = RUNS_FINISHED * STUDIES_PER_RUN
    # '>=' (not '>'): when the finished runs cover exactly all studies there is nothing left to convert
    if first_study >= len(studies):
        print('No additional data to convert.')
        continue

    # integer ceiling division: total number of runs needed for this dataset
    total_runs = -(-len(studies) // STUDIES_PER_RUN)
    print(f'Run {RUNS_FINISHED + 1}. {total_runs - RUNS_FINISHED - 1} additional runs needed.')
    limited_studies = studies[first_study : first_study + STUDIES_PER_RUN]
    generate_images(dataframe, WINDOW, IMG_DIM, dataset_output_dir, dataset_img_dir, CONTEXT_3D, CLAHE, limited_studies)

In [None]:
# you can also tar and download the data as tar.gz
! tar -cf stage_2_train_jpg.tar.gz $OUTPUT_TRAIN_DIR'*'
! tar -cf stage_2_test_jpg.tar.gz $OUTPUT_test_DIR'*'