In [26]:
import os
import shutil
import tempfile

import matplotlib.pyplot as plt
from tqdm import tqdm

from monai.losses import DiceCELoss
from monai.inferers import sliding_window_inference
from monai.transforms import (
    AsDiscrete,
    Compose,
    CropForegroundd,
    LoadImaged,
    Orientationd,
    RandFlipd,
    RandCropByPosNegLabeld,
    RandShiftIntensityd,
    ScaleIntensityRanged,
    Spacingd,
    RandRotate90d,
    EnsureTyped,
    EnsureChannelFirstd,
    DeleteItemsd,  # Remove the metadata
)

from monai.config import print_config
from monai.metrics import DiceMetric
from monai.networks.nets import SwinUNETR

from monai.data import (
    ThreadDataLoader,
    CacheDataset,
    load_decathlon_datalist,
    decollate_batch,
    set_track_meta,
)


import torch

import multiprocessing

#multiprocessing.set_start_method('spawn', force=True)


print_config()

MONAI version: 1.2.0+95.ga4e4894d
Numpy version: 1.24.4
Pytorch version: 2.0.1+cu117
MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False
MONAI rev id: a4e4894dca25f5e87b9306abfc472805f92b69da
MONAI __file__: /usr/local/lib/python3.8/dist-packages/monai/__init__.py

Optional dependencies:
Pytorch Ignite version: NOT INSTALLED or UNKNOWN VERSION.
ITK version: 5.3.0
Nibabel version: 5.1.0
scikit-image version: 0.21.0
scipy version: 1.10.1
Pillow version: 10.0.0
Tensorboard version: NOT INSTALLED or UNKNOWN VERSION.
gdown version: 4.7.1
TorchVision version: 0.15.2+cu117
tqdm version: 4.66.1
lmdb version: 1.4.1
psutil version: 5.9.5
pandas version: 2.0.3
einops version: 0.6.1
transformers version: 4.32.1
mlflow version: 2.6.0
pynrrd version: NOT INSTALLED or UNKNOWN VERSION.
clearml version: NOT INSTALLED or UNKNOWN VERSION.

For details about installing the optional dependencies, please visit:
    https://docs.monai.io/en/latest/installation.html#installing-the-recomm

In [2]:
directory = os.environ.get("MONAI_DATA_DIRECTORY")
root_dir = tempfile.mkdtemp() if directory is None else directory
print(root_dir)

/tmp/tmpc8aczhn3


In [None]:
num_samples = 1

import os
import torch

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"  # Use the 3rd and 4th GPU. Indexing starts from 0.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [5]:
import os
import pandas as pd
from tqdm import tqdm

# Get the current working directory
current_directory = os.getcwd()

# Construct the path to the train_images directory located one level up
train_base_directory = os.path.join(current_directory, 'train_images')

segmentation_directory = os.path.join(current_directory, 'segmentations')

# Collect all the DICOM file paths along with patient IDs and series IDs
dicom_data = []
for patient_id in tqdm(os.listdir(train_base_directory), desc="Processing Patients"):
    patient_dir = os.path.join(train_base_directory, patient_id)
    for series_id in os.listdir(patient_dir):
        series_dir = os.path.join(patient_dir, series_id)
        dicom_data.append({'series_dir': series_dir, 'patient_id': patient_id, 'series_id': series_id})

# Create a DataFrame using the collected data
dicom_df = pd.DataFrame(dicom_data)

# Gather all the series IDs from the segmentation folder
segmentation_series_ids = [os.path.splitext(f)[0] for f in os.listdir(segmentation_directory)]  # Assuming .nii extension

# Filter the DataFrame to only include rows with series IDs that are present in the segmentation folder
filtered_df = dicom_df[dicom_df['series_id'].isin(segmentation_series_ids)]

# Print the resulting DataFrame
print(filtered_df)


Processing Patients: 100%|███████████████████████| 3147/3147 [00:00<00:00, 121930.60it/s]

                                           series_dir patient_id series_id
55    /workspace/0728tot/ATD/train_images/64194/25349      64194     25349
56    /workspace/0728tot/ATD/train_images/64194/34232      64194     34232
109    /workspace/0728tot/ATD/train_images/50518/1201      50518      1201
110   /workspace/0728tot/ATD/train_images/54183/33526      54183     33526
115   /workspace/0728tot/ATD/train_images/42008/63418      42008     63418
...                                               ...        ...       ...
4597  /workspace/0728tot/ATD/train_images/37551/62680      37551     62680
4613  /workspace/0728tot/ATD/train_images/44507/21282      44507     21282
4617  /workspace/0728tot/ATD/train_images/37436/59325      37436     59325
4618  /workspace/0728tot/ATD/train_images/37436/50434      37436     50434
4679     /workspace/0728tot/ATD/train_images/7642/778       7642       778

[206 rows x 3 columns]





In [5]:
import numpy as np
import os
import pydicom as dicom
import nibabel as nib
from tqdm import tqdm
from skimage.transform import resize
import matplotlib.pyplot as plt
import pandas as pd
import cv2

def sample_patient_volume(folder, output_path, depth_downsample_rate = None, lw_downsample_rate = None):
    """
    Standardize the pixel array from DICOM metadata.
    """
    
    filenames = sorted([int(filename.split('.')[0]) for filename in os.listdir(folder)])
    filenames = [str(filename) + '.dcm' for filename in filenames]
    
    if depth_downsample_rate:
        filenames = filenames[::depth_downsample_rate]

    volume = []
    for filename in filenames:
        filepath = os.path.join(folder, filename)
        dcm = dicom.dcmread(filepath)
        
        pixel_array = dcm.pixel_array
        if dcm.PixelRepresentation == 1:
            bit_shift = dcm.BitsAllocated - dcm.BitsStored
            dtype = pixel_array.dtype 
            pixel_array = (pixel_array << bit_shift).astype(dtype) >> bit_shift

        intercept = float(dcm.RescaleIntercept)
        slope = float(dcm.RescaleSlope)
        center = int(dcm.WindowCenter)
        width = int(dcm.WindowWidth)
        low = center - width / 2
        high = center + width / 2

        pixel_array = (pixel_array * slope) + intercept
        pixel_array = np.clip(pixel_array, low, high)

        pixel_array -= np.min(pixel_array)

        pixel_array = (pixel_array / np.max(pixel_array) * 255).astype(np.int16)
        
        if lw_downsample_rate:
            pixel_array = pixel_array[::lw_downsample_rate, ::lw_downsample_rate]
        
        volume.append(pixel_array)
        
    volume = np.array(volume)
    
    # Save the volume as .nii.gz using nibabel
    nifti_img = nib.Nifti1Image(volume, np.eye(4))
    nib.save(nifti_img, output_path)


In [24]:
for i in tqdm(range(len(filtered_df))):

    samp = str(filtered_df.series_id.iloc[i])+'.nii.gz'

    output_directory = os.path.join(current_directory, 'train_seg', samp)

    sample_patient_volume(filtered_df.series_dir.iloc[i], output_directory)

100%|██████████████████████████████████████████████████| 206/206 [21:03<00:00,  6.13s/it]


In [15]:
# Define the directories
volume_dir = os.path.join(current_directory, 'train_seg')
segmentation_dir = os.path.join(current_directory, 'segmentations')

# List the files in each directory
volume_files = sorted([f for f in os.listdir(volume_dir) if f.endswith('.nii.gz')])
segmentation_files = sorted([f for f in os.listdir(segmentation_dir) if f.endswith('.nii')])

In [11]:
segmentation_alt_dir = os.path.join(current_directory, 'segmentations_alt')

for i in tqdm(range(len(segmentation_files))):
    seg_file = segmentation_files[i]
    
    segmentation_path = os.path.join(segmentation_dir, seg_file)
    
    segmentation_nii = nib.load(segmentation_path)
    
    segmentation_data = segmentation_nii.get_fdata()
    
    segmentation_data = np.transpose(segmentation_data, (2, 0, 1))
    
    segmentation_data = np.rot90(np.swapaxes(segmentation_data, 1, 2), 2)

    nifti_img = nib.Nifti1Image(segmentation_data, np.eye(4))

    new_segmentation_path = os.path.join(segmentation_alt_dir, seg_file)
    nib.save(nifti_img, new_segmentation_path)

100%|██████████████████████████████████████████████████| 206/206 [17:16<00:00,  5.03s/it]


In [6]:
# Define the directories
volume_dir = os.path.join(current_directory, 'train_seg')
segmentation_alt_dir = os.path.join(current_directory, 'segmentations_alt')

# List the files in each directory
volume_files = sorted([f for f in os.listdir(volume_dir) if f.endswith('.nii.gz')])
segmentation_alt_files = sorted([f for f in os.listdir(segmentation_alt_dir) if f.endswith('.nii')])

In [7]:
bbox_df = pd.read_csv('bbox_data.csv')
bbox_series = []

for i in range(len(bbox_df)):
    bbox_series.append(bbox_df.series_id.iloc[i].split('/')[-1])

bbox_df['series_index'] = bbox_series

bbox_df = bbox_df[bbox_df.series_index.isin(filtered_df.series_id)]
bbox_df = bbox_df.sort_values(by='series_index').reset_index(drop=True)

In [74]:
i = 1
# Remove unwanted characters and split string into list of strings
data_list = bbox_df['bbox_x'][i].replace("[", "").replace("]", "").split("\n")
bounding_boxes = [list(map(int, item.strip().split())) for item in data_list]
bounding_boxes = bounding_boxes[int(len(bounding_boxes)*0.05):int(len(bounding_boxes)*.95)]
# bounding_box = bounding_boxes[bbox_df['max_area_index'][i]] max area

mean_bbox = np.mean(bounding_boxes, axis=0)
distances = np.linalg.norm(bounding_boxes - mean_bbox, axis=1)
closest_index = np.argmin(distances)

bounding_box = bounding_boxes[closest_index]

vol_file = volume_files[i]
seg_file = segmentation_alt_files[i]

volume_path = os.path.join(volume_dir, vol_file)

segmentation_alt_path = os.path.join(segmentation_alt_dir, seg_file)

volume_nii = nib.load(volume_path)
segmentation_alt_nii = nib.load(segmentation_alt_path)

volume_data = volume_nii.get_fdata()
segmentation_alt_data = segmentation_alt_nii.get_fdata()

In [78]:
bad_cases = []
for i in tqdm(range(len(volume_files))):
    vol_file = volume_files[i]
    seg_file = segmentation_alt_files[i]
    
    volume_path = os.path.join(volume_dir, vol_file)
    
    segmentation_alt_path = os.path.join(segmentation_alt_dir, seg_file)
    
    volume_nii = nib.load(volume_path)
    segmentation_alt_nii = nib.load(segmentation_alt_path)
    
    volume_data = volume_nii.get_fdata()
    segmentation_alt_data = segmentation_alt_nii.get_fdata()
    
    if volume_data.shape!=segmentation_alt_data.shape:
        bad_cases.append(i)

100%|██████████████████████████████████████████████████| 206/206 [01:57<00:00,  1.76it/s]


In [82]:
volume_files[bad_cases[0]]

'10109.nii.gz'