In [1]:
import torch
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import os

# --------------------
#Image Preprocessing (DICOM)
# --------------------

In [8]:
import pydicom
import os
from PIL import Image
from torchvision import transforms
import torch

# Base directory containing the DICOM files
base_dir = '/Users/ellaquan/Project/Extracted_DICOM_500'  # Update this path

# Define image transformations
image_preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resizing the image to 224x224
    transforms.ToTensor(),  # Convert the image to a PyTorch tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet normalization
])

# Augmentation transformations for training (optional)
augmentation_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.RandomResizedCrop(224),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to convert DICOM to PIL image
def dicom_to_pil(dicom_path):
    # Load the DICOM file
    dicom_img = pydicom.dcmread(dicom_path)
    
    # Extract pixel array (this will be in grayscale)
    pixel_array = dicom_img.pixel_array
    
    # Normalize to range 0-255 and convert to uint8 type for image compatibility
    pixel_array = (pixel_array - pixel_array.min()) / (pixel_array.max() - pixel_array.min()) * 255
    pixel_array = pixel_array.astype('uint8')
    
    # Convert to PIL image (grayscale 'L' mode)
    pil_image = Image.fromarray(pixel_array).convert('RGB')  # Convert grayscale to RGB for preprocessing
    return pil_image

# Function to preprocess a DICOM image (with optional augmentation)
def preprocess_dicom_image(dicom_path, augment=False):
    pil_image = dicom_to_pil(dicom_path)  # Convert DICOM to PIL image
    
    # Apply augmentation or standard preprocessing
    if augment:
        preprocessed_image = augmentation_transforms(pil_image)
    else:
        preprocessed_image = image_preprocess(pil_image)
    
    return preprocessed_image

# Function to iterate over a folder of DICOM files and preprocess each image
def process_dicom_folder(base_dir, augment=False):
    preprocessed_images = []  # List to store the preprocessed images
    for root, dirs, files in os.walk(base_dir):
        for file_name in files:
            if file_name.endswith('.dcm'):  # Check for DICOM files
                dicom_file_path = os.path.join(root, file_name)
                
                try:
                    print(f"Processing: {dicom_file_path}")  # Debugging line

                    # Preprocess the DICOM image
                    preprocessed_image = preprocess_dicom_image(dicom_file_path, augment=augment)
                    
                    # Append preprocessed image to list (for later use)
                    preprocessed_images.append(preprocessed_image)
                    
                except Exception as e:
                    print(f"Error processing {dicom_file_path}: {e}")
    
    return preprocessed_images
# Function to save preprocessed images
def save_preprocessed_images(base_dir, output_dir, augment=False):
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
    for root, dirs, files in os.walk(base_dir):
        for file_name in files:
            if file_name.endswith('.dcm'):  # Check for DICOM files
                dicom_file_path = os.path.join(root, file_name)
                
                try:
                    print(f"Processing: {dicom_file_path}")  # Debugging line

                    # Preprocess the DICOM image
                    preprocessed_image = preprocess_dicom_image(dicom_file_path, augment=augment)
                    
                    # Convert the preprocessed tensor back to PIL Image and save
                    preprocessed_image_pil = transforms.ToPILImage()(preprocessed_image)  # Convert tensor back to PIL
                    output_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.png")
                    preprocessed_image_pil.save(output_path)
                    print(f"Saved preprocessed image to: {output_path}")
                    
                except Exception as e:
                    print(f"Error processing {dicom_file_path}: {e}")#

# Example usage: Save preprocessed images to an output folder
#output_directory = '/Users/ellaquan/Project/Preprocessed_Images'
#save_preprocessed_images(base_dir, output_directory, augment=True)

# Example usage:
preprocessed_images = process_dicom_folder(base_dir, augment=True)

# Optionally, you can stack preprocessed images into a batch tensor for model input
#if preprocessed_images:
#    batched_tensor = torch.stack(preprocessed_images)
#    print(f"Batched tensor shape: {batched_tensor.shape}")

# Function to save a tensor to a file
def save_batched_tensor(batched_tensor, file_path):
    torch.save(batched_tensor, file_path)
    print(f"Saved batched tensor to: {file_path}")


# Stack preprocessed images into a batch tensor for model input
if preprocessed_images:
    batched_tensor = torch.stack(preprocessed_images)
    print(f"Batched tensor shape: {batched_tensor.shape}")
    
    # Save the batched tensor to a file
    output_tensor_file = '/Users/ellaquan/Project/batched_tensor.pt'  # Path to save the batched tensor
    save_batched_tensor(batched_tensor, output_tensor_file)

Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0070_1-3.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0084_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0046_1-1.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-0297_1-1.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0047_1-4.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-0255_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-1523_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0025_1-4.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-1178_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0414_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-1548_1-1.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-0607_1-1.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D1-0631_1-2.dcm
Processing: /Users/ellaquan/Project/Extracted_DICOM_500/D2-0423_

# --------------------
# Clinical Data Preprocessing
# --------------------

In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import numpy as np
import pandas as pd

# Load clinical data from Excel
clinical_data_path = '/Users/ellaquan/Project/CMMD-data/CMMD_clinicaldata_revision.xlsx'
clinical_df = pd.read_excel(clinical_data_path) 

# Select the first 500 unique IDs
clinical_df = clinical_df.drop_duplicates(subset='ID1').head(500)

# Select relevant features (columns)
clinical_features = clinical_df[['Age', 'abnormality', 'classification', 'subtype']]

# Normalize continuous variables
scaler = StandardScaler()
continuous_features = ['Age']  # Ensure 'Age' matches the column name in your dataset
clinical_df[continuous_features] = scaler.fit_transform(clinical_df[continuous_features])

# One-hot encode categorical variables
categorical_features = ['abnormality', 'classification', 'subtype']
encoder = OneHotEncoder(sparse_output=False)  # Updated parameter name
encoded_categorical = encoder.fit_transform(clinical_df[categorical_features])

# Combine the normalized continuous features and encoded categorical features
preprocessed_clinical_data = np.hstack((clinical_df[continuous_features].values, encoded_categorical))

# Convert the combined clinical data to a PyTorch tensor
preprocessed_clinical_data_tensor = torch.tensor(preprocessed_clinical_data, dtype=torch.float32)

# --------------------
# Splitting Data
# --------------------

In [None]:

# Split data into training, validation, and test sets (for both image and clinical data)
def split_data(image_paths, clinical_data, test_size=0.2, val_size=0.1):
    # First split into training+validation and test sets
    train_image_paths, test_image_paths, train_clinical_data, test_clinical_data = train_test_split(
        image_paths, clinical_data, test_size=test_size, random_state=42
    )
    
    # Then split the training set into training and validation sets
    train_image_paths, val_image_paths, train_clinical_data, val_clinical_data = train_test_split(
        train_image_paths, train_clinical_data, test_size=val_size, random_state=42
    )
    
    return (train_image_paths, val_image_paths, test_image_paths,
            train_clinical_data, val_clinical_data, test_clinical_data)

# Example of splitting data
image_paths = ['/path/to/image1.png', '/path/to/image2.png', '/path/to/image3.png']  # List of image paths
preprocessed_clinical_data = preprocessed_clinical_data_tensor.numpy()  # Convert to numpy for splitting

train_img, val_img, test_img, train_clinical, val_clinical, test_clinical = split_data(image_paths, preprocessed_clinical_data)

print(f"Training Set: {len(train_img)} images, {len(train_clinical)} clinical records")
print(f"Validation Set: {len(val_img)} images, {len(val_clinical)} clinical records")
print(f"Test Set: {len(test_img)} images, {len(test_clinical)} clinical records")