In [1]:
# Standard library imports
from pathlib import Path  # To handle and manipulate filesystem paths
import os  # For interacting with the operating system
import glob  # For finding all file paths matching a specified pattern

# Third-party imports
import numpy as np  # For numerical operations and handling arrays
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
from PIL import Image  # For opening, manipulating, and saving many different image file formats

# PyTorch imports
import torch  # Main PyTorch library for building and training neural networks
from torch.utils.data import Dataset, DataLoader  # For handling datasets and data loaders

# PyTorch-I/O extension
import torchio as tio  # For medical image processing in PyTorch

# pydicom imports
import pydicom  # For reading, modifying, and writing DICOM files
from pydicom.data import get_testdata_file  # For accessing test DICOM files
from pydicom.fileset import FileSet  # For working with DICOM FileSets

# Scikit-learn imports
from sklearn.model_selection import train_test_split  # For splitting datasets into training and testing sets


In [2]:
# Location of the MRI image subset. Set the CCP_MRI_DATA_DIR environment
# variable to point the notebook at another copy of the data; when it is not
# set, the original OneDrive folder is used.
DATA_DIR = Path(
    os.environ.get(
        'CCP_MRI_DATA_DIR',
        '/Users/eleanorbolton/OneDrive - University of Leeds/CCP_MRI_IMAGE_SUBSET/',
    )
)
os.chdir(DATA_DIR)
print(os.getcwd())

/Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/CCP_MRI_image_subset


#### Process the DICOM Image
This function reads a DICOM file and returns its pixel data as an 8-bit NumPy array, which reduces its size in memory. The optional resize step (to 256×256) is currently disabled in the code, so the `resize` argument has no effect.


In [3]:
def process_dicom_image(path: str, resize=True) -> np.ndarray:
    """Load a DICOM file and return its pixel data as an 8-bit image.

    Intensities are min-max rescaled to the 0-255 range before converting to
    ``np.uint8``. Casting the shifted array directly would wrap every value
    above 255 (modulo 256), which corrupts images stored with more than
    8 bits per pixel. The arithmetic is done in float64 so that subtracting
    the minimum cannot overflow a signed integer dtype either.

    Args:
        path: Path to the DICOM file.
        resize: Accepted for backward compatibility but currently ignored;
            the 256x256 resize step is disabled, so the image keeps its
            stored dimensions.

    Returns:
        A ``np.uint8`` array with the same shape as the stored pixel data.
        An image whose pixels are all equal is returned as all zeros.
    """
    dicom_file = pydicom.dcmread(path)

    # Work on a float copy so the decoded pixel data is left untouched.
    image = dicom_file.pixel_array.astype(np.float64)
    image -= image.min()

    peak = image.max()
    if peak > 0:
        # Divide first: peak / peak is exactly 1.0, so the maximum maps to 255.
        image = np.rint(image / peak * 255.0)

    # Resizing (Image.fromarray(image).resize((256, 256))) is currently disabled.
    return image.astype(np.uint8)

#### Get Sequence Image
This function returns the images from a specified MRI sequence subfolder, sorted by their DICOM `InstanceNumber`. Note that it does not currently exclude images that are entirely black.


In [5]:
def get_sequence_images(path: str) -> list:
    """Return the pixel arrays of the DICOM files in ``path``, ordered by InstanceNumber.

    The folder is scanned twice: a header-only pass collects each file's
    ``InstanceNumber`` (pixel data is not loaded), then the pixel data is
    decoded in sorted order. Files that fail in either pass are reported with
    ``print`` and skipped.

    Args:
        path: Directory containing the DICOM files of one sequence.

    Returns:
        A list of pixel arrays in ascending ``InstanceNumber`` order. Files
        whose ``InstanceNumber`` is ``None`` are placed last, and files that
        share an instance number keep their path order. All-black images are
        not filtered out.
    """
    # Header pass: stop_before_pixels avoids loading pixel data just to read
    # the instance number. Sorting the paths makes ties deterministic.
    indexed_paths = []
    for image_path in sorted(glob.glob(os.path.join(path, '*'))):
        try:
            header = pydicom.dcmread(image_path, stop_before_pixels=True)
            indexed_paths.append((header.InstanceNumber, image_path))
        except Exception as e:
            print(f"Error reading {image_path}: {e}")

    # None cannot be compared with an int, so push such entries to the end
    # instead of letting the sort raise TypeError.
    indexed_paths.sort(
        key=lambda item: (item[0] is None, 0 if item[0] is None else item[0])
    )

    # Pixel pass: decode the images in sorted order.
    images = []
    for _, image_path in indexed_paths:
        try:
            images.append(pydicom.dcmread(image_path).pixel_array)
        except Exception as e:
            print(f"Error reading pixel data from {image_path}: {e}")

    return images

### Defining the central slice
The anatomical "middle" of the MR image will be different in each subject. We therefore need to decide on the best way to define the central slice.

#### Get the middle slice

This function returns the middle image in a sequence of MRI images, excluding images that are entirely black.


In [6]:
def get_middle_image(path: str) -> np.ndarray:
    """Return the middle non-black image of the MRI sequence stored in ``path``.

    The images are loaded with ``get_sequence_images`` (sorted by
    ``InstanceNumber``; unreadable files are reported and skipped there),
    images whose pixels are all zero are discarded, and the element at index
    ``len(remaining) // 2`` is returned.

    Args:
        path: Directory containing the DICOM files of one sequence.

    Returns:
        The pixel array of the middle remaining image. For an even number of
        images this is the later of the two central ones.

    Raises:
        ValueError: If no image with at least one non-zero pixel is found.
    """
    images = get_sequence_images(path)

    # Entirely black slices carry no anatomy and would shift the midpoint.
    visible = [image for image in images if np.any(image)]
    if not visible:
        raise ValueError(f"No non-black DICOM images found in {path}")

    return visible[len(visible) // 2]

#### Get the best slice

This function selects the slice whose pixel values have the highest total sum.

In [7]:
def find_best_slice(dicom_files):
    """Return the slice whose pixel intensities have the highest sum.

    Args:
        dicom_files: Iterable of ``(dicom_file, image_path)`` pairs, where
            ``dicom_file`` exposes a ``pixel_array`` attribute.

    Returns:
        The ``(dicom_file, image_path)`` pair with the largest pixel sum (the
        first one wins on ties), or ``None`` if the input is empty or no
        pixel data could be read. Slices whose pixel data cannot be read are
        reported with ``print`` and skipped.
    """
    # None marks "nothing seen yet". A numeric sentinel such as -1 would
    # reject every slice of a series whose sums are all negative, which can
    # happen with signed pixel data.
    best_sum = None
    best_slice = None

    for dicom_file, image_path in dicom_files:
        try:
            image_sum = np.sum(dicom_file.pixel_array)
        except Exception as e:
            print(f"Error reading {image_path}: {e}")
            continue

        if best_sum is None or image_sum > best_sum:
            best_sum = image_sum
            best_slice = (dicom_file, image_path)

    return best_slice

#### Find the slice with the fewest zero values

In [8]:
def find_nonzero_slice(dicom_files):
    """Pick the slice containing the most non-zero pixels (i.e. the least black).

    Takes an iterable of ``(dicom_file, image_path)`` pairs and returns the
    winning pair as a tuple; the earliest slice wins a tie. Slices whose pixel
    data cannot be read are reported with ``print`` and skipped. ``None`` is
    returned when nothing could be evaluated.
    """
    winner = None
    winner_count = -1

    for entry in dicom_files:
        dataset, image_path = entry
        try:
            filled_pixels = np.count_nonzero(dataset.pixel_array)
        except Exception as e:
            print(f"Error reading {image_path}: {e}")
            continue

        # Strict comparison keeps the first slice that reaches the maximum.
        if filled_pixels > winner_count:
            winner_count, winner = filled_pixels, (dataset, image_path)

    return winner

### Removing Duplicate Images
For some subjects, several images exist at the same position because processed versions are also present. This function removes the less informative duplicates, keeping for each instance number the image with the highest sum of pixel intensities.

In [9]:
def remove_duplicates(dicom_files):
    """Keep a single slice per InstanceNumber.

    When several slices share an instance number, the one chosen by
    ``find_best_slice`` (highest sum of pixel intensities) is kept.

    Args:
        dicom_files: Iterable of ``(dicom_file, image_path)`` pairs, where
            ``dicom_file`` exposes an ``InstanceNumber`` attribute.

    Returns:
        A list of ``(dicom_file, image_path)`` pairs with one entry per
        instance number, in order of first appearance. A group of duplicates
        for which ``find_best_slice`` returns ``None`` contributes no entry.
    """
    # A plain dict keeps insertion order and avoids relying on
    # collections.defaultdict, which the notebook never imports.
    slices_by_instance = {}
    for dicom_file, image_path in dicom_files:
        slices_by_instance.setdefault(dicom_file.InstanceNumber, []).append(
            (dicom_file, image_path)
        )

    unique_dicom_files = []
    for candidates in slices_by_instance.values():
        if len(candidates) == 1:
            unique_dicom_files.append(candidates[0])
            continue

        best_slice = find_best_slice(candidates)
        # Never append None: callers unpack every entry as a pair.
        if best_slice is not None:
            unique_dicom_files.append(best_slice)

    return unique_dicom_files