In [None]:
#Import required libraries and packages
import os, shutil
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
import gc
import cv2
import cupy as cp
import pydicom
from pydicom.pixel_data_handlers.util import apply_modality_lut
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from glob import glob
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [None]:
class Config:
    """Constants shared by the preprocessing cells of this notebook."""

    #Random seed (not referenced by the cells visible in this notebook)
    SEED = 42
    #Edge length in pixels; used as the default square resize_dim of the resize helpers
    RESIZE_DIM = 256
    IMAGE_SIZE = [256, 256]
    #Label columns: the two binary injury targets, then three levels per organ
    TARGET_COLS = (
        ['bowel_injury', 'extravasation_injury']
        + ['kidney_healthy', 'kidney_low', 'kidney_high']
        + ['liver_healthy', 'liver_low', 'liver_high']
        + ['spleen_healthy', 'spleen_low', 'spleen_high']
    )


#Single shared settings object used by the cells below
config = Config()

# Data Pipeline

## Dataset Overview:

The dataset supplied for this competition is initially in DICOM format, a standard for medical imaging. However, for the purpose of our training pipeline, we have opted to use PNG images extracted from the DICOM format.

**Reasoning:**

1. **Compatibility and Standardization:**
   - **DICOM (Digital Imaging and Communications in Medicine):** DICOM is a widely accepted standard for medical imaging, ensuring compatibility across different medical devices and systems. However, it is not inherently suitable for deep learning frameworks.
   - **PNG (Portable Network Graphics):** PNG, on the other hand, is a format specifically designed for the web but is easily interpretable by various image processing libraries used in machine learning frameworks. Using PNG allows us to leverage the flexibility and ease of integration offered by common image handling tools.

2. **Simplified Preprocessing:**
   - **DICOM:** DICOM files often contain a plethora of metadata and additional information that might not be relevant for our specific machine learning task. Extracting and processing this information can be computationally intensive.
   - **PNG:** By converting DICOM to PNG, we streamline the preprocessing pipeline. PNG images typically retain only the essential visual information, making it easier to handle and reducing the complexity of our data preparation steps.

3. **Community Support:**
   - **PNG:** The machine learning community has developed extensive tooling and resources around common image formats like PNG. This includes pre-trained models, data augmentation techniques, and a wealth of knowledge on best practices. Utilizing PNG aligns our approach with community standards, facilitating collaboration and knowledge exchange.

4. **Visualization and Interpretability:**
   - **PNG:** PNG images are easily viewable with standard image viewers and can be more intuitively interpreted by researchers and medical professionals. This facilitates the collaborative analysis of the dataset and the validation of model outputs.

5. **Storage Efficiency:**
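   - **PNG:** While DICOM is efficient for storage of medical imaging data, for our specific machine learning application, PNG can be more storage-friendly. PNG compression is lossless, so the saved images are stored without further degradation, while potentially reducing storage requirements compared to DICOM. Note, however, that the conversion itself is not lossless: the pipeline below clips intensities to the DICOM window, resizes each image, and quantizes it to 8 bits before saving.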

In summary, the decision to work with PNG images derived from DICOM is driven by a combination of technical compatibility, ease of preprocessing, community support, visualization benefits, and storage considerations tailored to the requirements of our machine learning workflow.

In [None]:
#Competition input folder (CSV files plus the train/test DICOM trees read below)
MAIN_FOLDER = '/kaggle/input/rsna-2023-abdominal-trauma-detection'
#Scratch directory that receives the converted PNGs and the exported CSV files
IMAGE_DIR = '/tmp/dataset/rsna-atd'
#NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed
TRAIN_PATH = "/kaggle/input/rsna-2023-abdominal-trauma-detection/train_images/"
#NOTE(review): not referenced by any cell visible in this notebook - presumably a slice sampling step; confirm or remove
Stride = 10
#List the competition folder to check which files are available
!ls {MAIN_FOLDER}

## Metadata Overview

**The train.csv file provides comprehensive meta-information, including:**

* **patient_id:** A distinctive identification code assigned to each patient.
* **series_id:** A unique identifier for each scan, facilitating traceability.
* **instance_number:** Denotes the image sequence within a scan. Notably, the starting instance number is often above zero due to the initial cropping of scans to the abdomen.
* **[bowel/extravasation]_[healthy/injury]:** Presents binary targets for two distinct injury types, namely bowel and extravasation.
* **[kidney/liver/spleen]_[healthy/low/high]:** Offers a nuanced perspective with three target levels for three injury types: kidney, liver, and spleen.
* **any_injury:** Indicates whether a patient experienced any form of injury during the course of observation.

This structured metadata is instrumental for comprehending and navigating the dataset. The patient_id and series_id enable unique patient and scan identification, while instance_number provides insights into the sequential order of images within scans. The injury-related fields afford a detailed breakdown, categorizing injuries into binary and three-level (ternary) classifications. Lastly, any_injury serves as a concise flag denoting the overall injury status of each patient. This clear delineation of metadata categories enhances the dataset's interpretability, facilitating efficient analysis and model training.

In [None]:
#Load the per-series metadata of the training set
meta_train_df = pd.read_csv(f'{MAIN_FOLDER}/train_series_meta.csv')

#More rows than distinct patient IDs means some patients appear more than once
num_train_rows = len(meta_train_df)
unique_train_patients = meta_train_df.patient_id.nunique()

print(f'{num_train_rows=}', f'{unique_train_patients=}', sep='\n')

In [None]:
#Load the per-series metadata of the test set
meta_test_df = pd.read_csv(f'{MAIN_FOLDER}/test_series_meta.csv')

#Checking if patients are repeated by comparing the row count with the number of
#unique patient IDs. Both figures must come from the TEST metadata (the unique
#count was previously taken from meta_train_df, which reported the train figure).
num_test_rows = meta_test_df.shape[0]
unique_test_patients = meta_test_df['patient_id'].nunique()

print(f'{num_test_rows=}')
print(f'{unique_test_patients=}')

In [None]:
#Read the label table (train.csv) and the image-level label table
train_df = pd.read_csv(os.path.join(MAIN_FOLDER, 'train.csv'))
img_lvl_df = pd.read_csv(os.path.join(MAIN_FOLDER, 'image_level_labels.csv'))

#Right join on patient_id: keep every image-level row and attach the matching train.csv columns
train_df = pd.merge(train_df, img_lvl_df, on='patient_id', how='right')

#Construct image_path as <MAIN_FOLDER>/train_images/<patient_id>/<series_id>/<instance_number>.dcm
train_df['image_path'] = train_df.apply(
    lambda row: os.path.join(MAIN_FOLDER, 'train_images', str(row['patient_id']),
                             str(row['series_id']), f"{row['instance_number']}.dcm"),
    axis=1
)

#Drop exact duplicate rows. drop_duplicates() returns a new DataFrame, so the
#result has to be assigned back (calling it bare left train_df unchanged).
#The index is reset because a later cell concatenates per-image metadata
#column-wise and therefore needs a gap-free 0..n-1 index.
train_df = train_df.drop_duplicates().reset_index(drop=True)

train_df.head()


In [None]:
#Compare the number of rows with the number of distinct patient IDs in the merged training table
num_rows = len(train_df)
unique_patients = train_df.patient_id.nunique()

print(f'{num_rows=}', f'{unique_patients=}', sep='\n')

In [None]:
#Collect every test DICOM file (layout: test_images/<patient>/<series>/<instance>.dcm)
test_paths = glob('/kaggle/input/rsna-2023-abdominal-trauma-detection/test_images/*/*/*dcm')

#One row per file; the three IDs are the trailing path components, parsed as integers
test_df = pd.DataFrame(test_paths, columns=["image_path"]).assign(
    patient_id=lambda df: df['image_path'].str.split('/').str[-3].astype(int),
    series_id=lambda df: df['image_path'].str.split('/').str[-2].astype(int),
    instance_number=lambda df: (
        df['image_path'].str.split('/').str[-1]
        .str.replace('.dcm', '', regex=False)
        .astype(int)
    ),
)

print('Test:', f'# Size: {len(test_df)}', sep='\n')
display(test_df.head())



In [None]:
#Compare the number of rows with the number of distinct patient IDs in the test table
num_rows = len(test_df)
unique_patients = test_df.patient_id.nunique()

print(f'{num_rows=}', f'{unique_patients=}', sep='\n')

## DICOM to PNG pipeline

In [None]:
#Start from a clean output directory. shutil.rmtree(..., ignore_errors=True)
#does nothing when the directory does not exist yet, so a fresh kernel run no
#longer prints the "No such file or directory" error that `rm -r` produced.
shutil.rmtree(IMAGE_DIR, ignore_errors=True)

#Create 'train_images' and 'test_images' directories within IMAGE_DIR
os.makedirs(f'{IMAGE_DIR}/train_images', exist_ok=True)
os.makedirs(f'{IMAGE_DIR}/test_images', exist_ok=True)

In [None]:
def _first_dicom_value(value) -> float:
    """Return a numeric DICOM attribute as float, using its first entry when it is multi-valued."""
    try:
        return float(value)
    except TypeError:
        # Sequence-like (multi-valued) attribute: float() cannot convert it, so take the first entry.
        return float(value[0])


def standardize_pixel_array(dcm: "pydicom.dataset.FileDataset") -> np.ndarray:
    """
    Rescale a DICOM pixel array and clip it to the file's display window.

    Steps:
        1. If PixelRepresentation == 1, shift the raw samples left and back right
           by (BitsAllocated - BitsStored) bits.
        2. Apply the linear rescale: pixel * RescaleSlope + RescaleIntercept.
        3. Clip to [WindowCenter - WindowWidth / 2, WindowCenter + WindowWidth / 2].

    WindowCenter / WindowWidth may hold several values; the first one is used.
    They are read as floats so fractional window values are not truncated.

    Args:
        dcm (pydicom.dataset.FileDataset): DICOM file dataset.

    Returns:
        numpy.ndarray: Rescaled pixel array clipped to the display window.
    """
    # Correct DICOM pixel_array if PixelRepresentation == 1.
    pixel_array = dcm.pixel_array
    if dcm.PixelRepresentation == 1:
        bit_shift = dcm.BitsAllocated - dcm.BitsStored
        dtype = pixel_array.dtype
        # Presumably sign-extends samples stored in fewer bits than allocated;
        # assumes the array dtype is signed - TODO confirm.
        pixel_array = (pixel_array << bit_shift).astype(dtype) >> bit_shift

    intercept = float(dcm.RescaleIntercept)
    slope = float(dcm.RescaleSlope)
    center = _first_dicom_value(dcm.WindowCenter)
    width = _first_dicom_value(dcm.WindowWidth)
    low = center - width / 2
    high = center + width / 2

    pixel_array = (pixel_array * slope) + intercept
    pixel_array = np.clip(pixel_array, low, high)

    return pixel_array

def read_xray(path, fix_monochrome=True):
    """
    Read a DICOM file, window its pixel array, and min-max normalize it to [0, 1].

    Args:
        path (str): File path to the DICOM file.
        fix_monochrome (bool): If True, invert the image when
            PhotometricInterpretation is 'MONOCHROME1'.

    Returns:
        numpy.ndarray: Pixel values scaled to the range [0, 1].
    """
    dicom = pydicom.dcmread(path)
    data = standardize_pixel_array(dicom)

    #Min-max normalization. The denominator must be the value RANGE (max - min):
    #dividing by max alone yields values above 1 (or negative ones) whenever the
    #minimum is not zero, which later overflows the uint8 conversion.
    lowest = np.min(data)
    highest = np.max(data)
    data = (data - lowest) / (highest - lowest + 1e-5)

    #Fix monochrome if needed
    if fix_monochrome and dicom.PhotometricInterpretation == 'MONOCHROME1':
        data = 1.0 - data

    return data

def resize_and_save(file_path, resize_dim=(config.RESIZE_DIM, config.RESIZE_DIM)):
    """
    Read a DICOM file, resize it, convert it to uint8, and save it as a PNG under IMAGE_DIR.

    The output path is the input path from its fifth '/'-separated component
    onwards, with the '.dcm' suffix replaced by '.png', joined onto IMAGE_DIR.

    Args:
        file_path (str): File path to the DICOM file.
        resize_dim (tuple): (width, height) passed to cv2.resize.

    Returns:
        tuple: Patient ID, Study ID, Image ID (all strings taken from the path),
        Original Width, Original Height.
    """
    img = read_xray(file_path)

    # cv2.resize only accepts NumPy arrays, so the image is resized directly
    # instead of being copied to a CuPy array first (which made the call fail).
    # The interpolation flag must be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, not `interpolation`.
    img_resized = cv2.resize(img, resize_dim, interpolation=cv2.INTER_LINEAR)

    # Scale to 8-bit; clip first so out-of-range values cannot wrap around in uint8
    img_resized = np.clip(img_resized * 255, 0, 255).astype(np.uint8)

    # Extract patient, study, and image IDs from the trailing path components
    sub_path = file_path.split("/", 4)[-1].split('.dcm')[0] + '.png'
    infos = sub_path.split('/')
    pid = infos[-3]
    sid = infos[-2]
    iid = infos[-1].replace('.png', '')

    # Create new path
    new_path = os.path.join(IMAGE_DIR, sub_path)
    os.makedirs(os.path.dirname(new_path), exist_ok=True)

    # Save the resized image directly
    cv2.imwrite(new_path, img_resized)

    return pid, sid, iid, img.shape[1], img.shape[0]

def resize_and_save_with_visualization(file_path, resize_dim=(config.RESIZE_DIM, config.RESIZE_DIM), plot_frequency=1000):
    """
    Read a DICOM file, resize it, convert it to uint8, save it as a PNG under
    IMAGE_DIR, and optionally plot the result.

    The output path is the input path from its fifth '/'-separated component
    onwards, with the '.dcm' suffix replaced by '.png', joined onto IMAGE_DIR.

    Args:
        file_path (str): File path to the DICOM file.
        resize_dim (tuple): (width, height) passed to cv2.resize.
        plot_frequency (int): The image is plotted when the number of entries in
            the source file's directory is an exact multiple of this value.

    Returns:
        tuple: Patient ID, Study ID, Image ID (all strings taken from the path),
        Original Width, Original Height.
    """
    dicom = pydicom.dcmread(file_path)

    # Standardize pixel array, then min-max normalize to [0, 1]
    img = standardize_pixel_array(dicom)
    img = (img - img.min()) / (img.max() - img.min() + 1e-6)

    # Flip image if MONOCHROME1
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        img = 1 - img

    h, w = img.shape[:2]

    # Resize on the CPU with OpenCV. The interpolation flag must be passed by
    # keyword: the third positional parameter of cv2.resize is `dst`.
    img_resized = cv2.resize(img, resize_dim, interpolation=cv2.INTER_LINEAR)

    # Scale to 8-bit; clip first so out-of-range values cannot wrap around in uint8
    img_resized = np.clip(img_resized * 255, 0, 255).astype(np.uint8)

    # Extract patient, study, and image IDs from the trailing path components
    sub_path = file_path.split("/", 4)[-1].split('.dcm')[0] + '.png'
    infos = sub_path.split('/')
    pid = infos[-3]
    sid = infos[-2]
    iid = infos[-1].replace('.png', '')

    # Create new path
    new_path = os.path.join(IMAGE_DIR, sub_path)
    os.makedirs(os.path.dirname(new_path), exist_ok=True)

    # Save the resized image directly
    cv2.imwrite(new_path, img_resized)

    # Plotting
    # NOTE(review): this condition depends on how many files the source directory
    # holds, not on how many images have been processed, and it lists that
    # directory on every call. Confirm whether an every-Nth-image counter was
    # intended. Plotting also runs in whichever thread calls this function.
    if not (len(os.listdir(os.path.dirname(file_path))) % plot_frequency):
        plt.figure(figsize=(5, 5))
        plt.imshow(img_resized, cmap="gray")
        plt.title(f"Patient {pid} - Study {sid} - Frame {iid}")
        plt.axis(False)
        plt.show()

    return pid, sid, iid, w, h

def process_images_with_resizing_and_visualization(file_paths, max_workers=4):
    """
    Convert many DICOM files concurrently with resize_and_save_with_visualization.

    Args:
        file_paths (list): List of file paths to DICOM images.
        max_workers (int): Number of worker threads in the pool (default 4).

    Returns:
        list: One tuple per input path, in input order, as returned by
        resize_and_save_with_visualization
        (patient ID, study ID, image ID, original width, original height).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results lazily and in input order; wrapping it in
        # tqdm shows progress, and list() drains it before the pool shuts down.
        image_sizes = list(tqdm(
            executor.map(resize_and_save_with_visualization, file_paths),
            total=len(file_paths),
            leave=True
        ))
    return image_sizes


In [None]:
#Every image path listed in the training table
file_train_paths = train_df['image_path'].to_list()

#Convert all training images with the thread pool and time the whole run
start_time = time.time()
train_img_sizes = process_images_with_resizing_and_visualization(file_train_paths)
end_time = time.time()

print(f"Total time taken: {end_time - start_time} seconds")

#Run a garbage-collection pass now that the conversion is finished
gc.collect()



In [None]:
#Split the per-image result tuples of the training run into one sequence per field
pid, sid, iid, width, height = zip(*train_img_sizes)

#Build the metadata table; the ID fields are strings parsed from the paths, so cast them to int
columns_to_convert = ['patient_id_meta', 'series_id_meta', 'instance_number_meta']
meta_df = pd.DataFrame(
    dict(zip(columns_to_convert + ['width', 'height'], (pid, sid, iid, width, height)))
).astype({column: int for column in columns_to_convert})

#Append the metadata column-wise (aligned on the index), then discard the helper ID columns
train_df = pd.concat([train_df, meta_df], axis=1).drop(columns=columns_to_convert)

train_df.head()


In [None]:
#Every image path listed in the test table
file_test_paths = test_df['image_path'].to_list()

#Convert all test images with the thread pool and time the whole run
start_time = time.time()
test_img_sizes = process_images_with_resizing_and_visualization(file_test_paths)
end_time = time.time()

print(f"Total time taken: {end_time - start_time} seconds")

#Run a garbage-collection pass now that the conversion is finished
gc.collect()


In [None]:
#Split the per-image result tuples of the test run into one sequence per field
pid, sid, iid, width, height = zip(*test_img_sizes)

#Build the metadata table; the ID fields are strings parsed from the paths, so cast them to int
columns_to_convert = ['patient_id_meta', 'series_id_meta', 'instance_number_meta']
meta_df = pd.DataFrame(
    dict(zip(columns_to_convert + ['width', 'height'], (pid, sid, iid, width, height)))
).astype({column: int for column in columns_to_convert})

#Append the metadata column-wise (aligned on the index), then discard the helper ID columns
test_df = pd.concat([test_df, meta_df], axis=1).drop(columns=columns_to_convert)

test_df.head()


In [None]:
#Keep only the training rows whose original image width is below 700 pixels
filtered_train_df = train_df[train_df.width < 700]
#Guard against an empty selection: .iloc[0] on an empty frame raises IndexError
if not filtered_train_df.empty:
    #Map the first DICOM path to the PNG written for it under IMAGE_DIR
    image_path = filtered_train_df.image_path.iloc[0].replace(MAIN_FOLDER, IMAGE_DIR).replace('.dcm', '.png')
    #Read the image using cv2
    img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)

    #Plot the image
    plt.figure(figsize=(10, 10))
    plt.imshow(img, cmap='gray')
    plt.show()
else:
    print("No images found for the specified condition.")

In [None]:
#Keep only the training rows whose original image width is above 700 pixels
filtered_train_df = train_df.loc[train_df['width'] > 700]
if filtered_train_df.empty:
    #Nothing matched, so there is no image to preview
    print("No images found for the specified condition.")
else:
    #Map the first DICOM path to the PNG written for it under IMAGE_DIR
    image_path = (
        filtered_train_df['image_path'].iloc[0]
        .replace(MAIN_FOLDER, IMAGE_DIR)
        .replace('.dcm', '.png')
    )
    #Load the PNG as stored and show it in grayscale
    img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    plt.figure(figsize=(10, 10))
    plt.imshow(img, cmap='gray')
    plt.show()

# Creating CSV

In [None]:
#Write the train/test tables (now including width and height) next to the converted images
train_df.to_csv(os.path.join(IMAGE_DIR, 'train.csv'), index=False)
test_df.to_csv(os.path.join(IMAGE_DIR, 'test.csv'), index=False)

#Copy the competition's metadata files into the output directory as well
shutil.copy(os.path.join(MAIN_FOLDER, 'train_series_meta.csv'), f'{IMAGE_DIR}/')
shutil.copy(os.path.join(MAIN_FOLDER, 'test_series_meta.csv'), f'{IMAGE_DIR}/')
shutil.copy(os.path.join(MAIN_FOLDER, 'sample_submission.csv'), f'{IMAGE_DIR}/')


In [None]:
#List the output directory (long format, including hidden entries) to check what was written
!ls -al {IMAGE_DIR}

# Creating Dataset

In [None]:
#Open the archive in a context manager so it is finalized and closed even if
#adding one of the files raises an error
with ZipFile('/kaggle/working/rsna-acd-processed.zip', 'w') as zipObj:
    #Gather every entry (files and directories) below IMAGE_DIR
    file_paths = glob(f'{IMAGE_DIR}/**/*', recursive=True)

    #Print the total number of entries to be zipped
    print(f'Total Files: {len(file_paths)}')

    #Begin zipping process
    print('Zipping...')
    for file_path in tqdm(file_paths):
        #Store each entry under its path relative to IMAGE_DIR
        zipObj.write(file_path, os.path.relpath(file_path, IMAGE_DIR))

        #Remove the original file after adding it to the zip archive;
        #directories are left in place
        if os.path.isfile(file_path):
            os.remove(file_path)
