In [1]:
import os
import glob
import json
import pandas as pd  # For data manipulation and analysis
import pydicom  # For reading, modifying, and writing DICOM files
from pydicom.data import get_testdata_file  # For accessing test DICOM files
from pydicom.fileset import FileSet  # For working with DICOM FileSets
import numpy as np


In [2]:
# Reading the CSV file
training_data_dir = "/Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset/" 
csv_path = os.path.join(training_data_dir, 'training_labels_subset.csv')
labels_df = pd.read_csv(csv_path)

In [3]:
import json
import os
import glob
import numpy as np
import pydicom
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from tqdm import tqdm  # Add tqdm for progress tracking

class HandScanDataset3:
    def __init__(self, labels_df, data_dir):
        self.labels_df = labels_df
        self.data_dir = data_dir
        self.patient_ids = self.labels_df['patient ID'].astype(str).str.zfill(5).tolist()
        self.labels = self.labels_df['progression'].apply(lambda x: 1 if x == 'y' else 0).tolist()

    @lru_cache(maxsize=128)
    def get_best_patient_images(self, base_path):
        seq_len = 2
        selected_image_paths = []

        # Search for the 't1_vibe_we' directory recursively within the patient directory
        t1_vibe_we_path = None
        for root, dirs, files in os.walk(base_path):
            if 't1_vibe_we' in dirs:
                t1_vibe_we_path = os.path.join(root, 't1_vibe_we')
                break

        if not t1_vibe_we_path:
            print(f"Directory 't1_vibe_we' not found under {base_path}.")
            return selected_image_paths

        dicom_files = []
        for image_path in glob.glob(os.path.join(t1_vibe_we_path, '*')):
            try:
                dicom_file = pydicom.dcmread(image_path)
                dicom_files.append((dicom_file, image_path))
            except Exception as e:
                print(f"Error reading {image_path}: {e}")

        dicom_files.sort(key=lambda x: x[0].InstanceNumber)
        dicom_files = self.remove_duplicates(dicom_files)

        if dicom_files:
            max_sum = -1
            best_dicom_file, best_image_path = None, None
            for dicom_file, image_path in dicom_files:
                image = dicom_file.pixel_array
                image_sum = np.sum(image)
                if image_sum > max_sum:
                    max_sum = image_sum
                    best_dicom_file, best_image_path = dicom_file, image_path

            if best_dicom_file is not None:
                best_instance_number = best_dicom_file.InstanceNumber
                start_index = max(0, best_instance_number - (seq_len // 2))
                end_index = start_index + seq_len
                selected_slices = dicom_files[start_index:end_index]

                selected_image_paths = [image_path for _, image_path in selected_slices]

        return selected_image_paths

    def remove_duplicates(self, dicom_files):
        instance_dict = defaultdict(list)

        for dicom_file, image_path in dicom_files:
            instance_number = dicom_file.InstanceNumber
            instance_dict[instance_number].append((dicom_file, image_path))

        unique_dicom_files = []
        for instance_number, files in instance_dict.items():
            if len(files) > 1:
                best_slice = self.find_best_slice(files)
                unique_dicom_files.append(best_slice)
            else:
                unique_dicom_files.append(files[0])

        return unique_dicom_files

    def find_best_slice(self, dicom_files):
        for dicom_file, image_path in dicom_files:
            if hasattr(dicom_file, 'ContrastBolusAgent') and dicom_file.ContrastBolusAgent == 'DOTAREM':
                return dicom_file, image_path
        return dicom_files[0]

    def collect_image_paths(self):
        dataset_dict = {"training": []}

        start_time = time.time()  # Start timing

        with ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(self._process_patient, zip(self.patient_ids, self.labels)), 
                                total=len(self.patient_ids), 
                                desc="Processing patients"))

        for result in results:
            dataset_dict["training"].extend(result)

        end_time = time.time()  # End timing
        print(f"Time taken: {end_time - start_time:.2f} seconds")

        return dataset_dict

    def _process_patient(self, patient_info):
        patient_id, label = patient_info
        patient_dir = os.path.join(self.data_dir, patient_id)
        image_paths = self.get_best_patient_images(patient_dir)

        if not image_paths:
            print(f"No image paths found for patient ID {patient_id} in {patient_dir}.")
        
        return [{"image": image_path, "label": label} for image_path in image_paths]

    def save_dataset_to_json(self, output_json_path):
        dataset_dict = self.collect_image_paths()
        with open(output_json_path, 'w') as f:
            json.dump(dataset_dict, f, indent=4)
        print(f"Dataset paths saved to {output_json_path}")

# Example usage:
# labels_df should be your dataframe with 'patient ID' and 'progression' columns.
dataset = HandScanDataset3(labels_df=labels_df, data_dir=training_data_dir)
dataset.save_dataset_to_json('training_data.json')


Processing patients:  14%|█▍        | 30/216 [1:28:18<18:17:31, 354.04s/it]

Error reading /Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset/CCP_229/758a731c5e/t1_vibe_we/1.3.12.2.1107.5.2.36.40258.2014062613541035044351949.DCM: [Errno 89] Operation canceled
Error reading /Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset/CCP_181/983862e0ae/t1_vibe_we/1.3.12.2.1107.5.2.36.40258.201309201112039112924598.DCM: [Errno 89] Operation canceled
Error reading /Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset/CCP_191/d85a6b7dc2/t1_vibe_we/1.3.12.2.1107.5.2.36.40258.2013121615001176086548823.DCM: [Errno 89] Operation canceled
Error reading /Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset/CCP_245/3f1c6efdcb/t1_vibe_we/1.3.12.2.1107.5.2.36.40258.2014100310134739398519057.DCM: [Errno 89] Operation canceled
Error reading /Users/eleanorbolton/Library/CloudStorage/OneDrive-UniversityofLeeds/t1_vibe_we_hand_subset

Processing patients:  18%|█▊        | 38/216 [4:47:36<22:27:12, 454.11s/it]


In [None]:
    def get_best_patient_images(self, base_path):
        """ 
        Process all images in the 't1_vibe_we' subfolder of each subject.
        Sort images by Instance Number and return a sequence of a fixed length.
        """
        seq_len = 2
        all_images = []
        img_shape = (512, 384)  # Set a default image shape

        for root, dirs, files in os.walk(base_path):
            if 't1_vibe_we' in dirs:
                t1_vibe_we_path = os.path.join(root, 't1_vibe_we')
                
                # Get the images in the 't1_vibe_we' sequence
                dicom_files = []
                for image_path in glob.glob(os.path.join(t1_vibe_we_path, '*')):
                    try:
                        dicom_file = pydicom.dcmread(image_path)
                        dicom_files.append((dicom_file, image_path))
                    except Exception as e:
                        print(f"Error reading {image_path}: {e}")

                # Sort the files by Instance Number
                dicom_files.sort(key=lambda x: x[0].InstanceNumber)
                
                # Remove duplicates
                dicom_files = self.remove_duplicates(dicom_files)

                # Find the best slice
                if dicom_files:
                    # Find the slice with the highest intensity
                    max_sum = -1
                    best_dicom_file, best_image_path = None, None
                    for dicom_file, image_path in dicom_files:
                        image = dicom_file.pixel_array
                        image_sum = np.sum(image)
                        if image_sum > max_sum:
                            max_sum = image_sum
                            best_dicom_file, best_image_path = dicom_file, image_path

                    if best_dicom_file is not None:
                        best_instance_number = best_dicom_file.InstanceNumber

                        # Calculate the start and end indices for the selected sequence
                        start_index = max(0, best_instance_number - (seq_len // 2))
                        end_index = start_index + seq_len

                        # Select the slices around the best slice
                        selected_slices = dicom_files[start_index:end_index]

                        images = []
                        for dicom_file, image_path in selected_slices:
                            try:
                                image = self.process_dicom_image(image_path)
                                images.append(image)
                            except Exception as e:
                                print(f"Error processing image {image_path}: {e}")

                        # Determine the original image dimensions
                        if images:
                            img_shape = images[0].shape  # Set img_shape based on the first image

                        if len(images) < seq_len:
                            # Pad with zero images of the same shape as the original images
                            diff = seq_len - len(images)
                            images.extend([np.zeros(img_shape, dtype=np.uint8) for _ in range(diff)])

                        all_images.extend(images)

        return np.array(all_images)


    def remove_duplicates(self, dicom_files):
        """ Remove duplicate instance numbers, keeping only the slice with the highest sum of intensities. """
        instance_dict = defaultdict(list)

        for dicom_file, image_path in dicom_files:
            instance_number = dicom_file.InstanceNumber
            instance_dict[instance_number].append((dicom_file, image_path))

        # Compare DICOM files with the same Instance Number
        unique_dicom_files = []
        for instance_number, files in instance_dict.items():


            if len(files) > 1:

                # Optionally, still choose the best slice based on your criteria, but here we're just showing the differences
                best_slice = self.find_best_slice(files)
                unique_dicom_files.append(best_slice)

            else:
                unique_dicom_files.append(files[0])


        return unique_dicom_files


    def find_best_slice(self, dicom_files):
        """ Find the slice with the 'DOTAREM' ContrastBolusAgent or, as a fallback, return the first available slice. """
        best_slice = None

        # Check for the slice with 'DOTAREM'
        for dicom_file, image_path in dicom_files:
            if hasattr(dicom_file, 'ContrastBolusAgent') and dicom_file.ContrastBolusAgent == 'DOTAREM':
                best_slice = (dicom_file, image_path)
                break  # Stop searching once we find the 'DOTAREM' slice

        # Fallback: If no slice with 'DOTAREM' is found, return the first slice
        if best_slice is None:
            best_slice = dicom_files[0]

        return best_slice


    def process_dicom_image(self, path: str, resize=True) -> np.ndarray:
            dicom_file = pydicom.dcmread(path)
            image = dicom_file.pixel_array.astype(np.float32)
            
            # If the image has any zero-sized dimensions, return a placeholder or skip processing
            if 0 in image.shape:
                print(f"Skipping image due to invalid shape: {image.shape}")
                return np.zeros((512, 384), dtype=np.uint8) 
            
            # Normalize the image: Zero mean and unit variance
            mean = np.mean(image)
            std = np.std(image)
            image = (image - mean) / (std + 1e-7)  # Add a small epsilon to prevent division by zero

            # Apply 95% clipping
            lower_bound = np.percentile(image, 2.5)
            upper_bound = np.percentile(image, 97.5)
            image = np.clip(image, lower_bound, upper_bound)

            # Normalize again after clipping
            mean = np.mean(image)
            std = np.std(image)
            image = (image - mean) / (std + 1e-7)

            # Convert back to uint8 for further processing
            image = (image * 255).astype(np.uint8)

            if resize:
                image = Image.fromarray(image)
                image = image.resize((384, 512))  # Resize the image to 512x384
                image = np.array(image)

            return image


    def get_sequence_images(self, path: str) -> list:
            images = []
            
            # Get a list of all DICOM files in the directory
            image_path_list = glob.glob(os.path.join(path, '*'))
            
            # Read the DICOM files and store them with their instance numbers
            dicom_files = []
            for image_path in image_path_list:
                try:
                    dicom_file = pydicom.dcmread(image_path)
                    instance_number = dicom_file.InstanceNumber
                    dicom_files.append((instance_number, image_path))
                except Exception as e:
                    print(f"Error reading {image_path}: {e}")
            
            # Sort the files by instance number
            dicom_files.sort(key=lambda x: x[0])
            
            # Read the pixel data in sorted order
            for _, image_path in dicom_files:
                try:
                    dicom_file = pydicom.dcmread(image_path)
                    image = dicom_file.pixel_array
                    images.append(image)
                except Exception as e:
                    print(f"Error reading pixel data from {image_path}: {e}")
            
            return images

In [12]:
def collect_image_paths(labels_df, data_dir, seq_len=2):
    dataset_dict = {"training": [], "validation": []}

    for _, row in labels_df.iterrows():
        patient_id = str(row['patient ID']).zfill(5)  # Adjust column name if necessary
        label = 1 if row['progression'] == 'y' else 0  # Adjust column name if necessary

        patient_dir = os.path.join(data_dir, patient_id)
        t1_vibe_we_path = os.path.join(patient_dir, 't1_vibe_we')

        if os.path.exists(t1_vibe_we_path):
            dicom_files = []
            for image_path in glob.glob(os.path.join(t1_vibe_we_path, '*')):
                try:
                    dicom_file = pydicom.dcmread(image_path)
                    dicom_files.append((dicom_file, image_path))
                except Exception as e:
                    print(f"Error reading {image_path}: {e}")

            # Sort the files by Instance Number
            dicom_files.sort(key=lambda x: x[0].InstanceNumber)

            # Find the best slice and collect paths
            if dicom_files:
                max_sum = -1
                best_dicom_file, best_image_path = None, None
                for dicom_file, image_path in dicom_files:
                    image = dicom_file.pixel_array
                    image_sum = np.sum(image)
                    if image_sum > max_sum:
                        max_sum = image_sum
                        best_dicom_file, best_image_path = dicom_file, image_path

                if best_dicom_file is not None:
                    best_instance_number = best_dicom_file.InstanceNumber
                    start_index = max(0, best_instance_number - (seq_len // 2))
                    end_index = start_index + seq_len
                    selected_slices = dicom_files[start_index:end_index]

                    for dicom_file, image_path in selected_slices:
                        entry = {
                            "image": image_path,
                            "label": label
                        }
                        dataset_dict["training"].append(entry)

    return dataset_dict


In [13]:
def save_dataset_to_json(dataset_dict, output_json_path):
    with open(output_json_path, 'w') as f:
        json.dump(dataset_dict, f, indent=4)

# Example usage:
dataset_dict = collect_image_paths(labels_df=training_data_dir, data_dir=training_data_dir)
output_json_path = 'training_data.json'
save_dataset_to_json(dataset_dict, output_json_path)


AttributeError: 'str' object has no attribute 'iterrows'