In [3]:
import os
import numpy as np
from PIL import Image
import concurrent.futures
from tqdm import tqdm
from collections import Counter
import unicodedata
import monai.transforms as mtf
from multiprocessing import Pool
from unidecode import unidecode


In [4]:
# Dataset locations. The defaults are the original machine-specific paths; set
# the M3D_CAP_INPUT_DIR / M3D_CAP_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
input_dir = os.environ.get(
    "M3D_CAP_INPUT_DIR",
    "/Users/rohith/Desktop/M3D copy/Data/data/test_data/M3D_Cap/ct_quizze/",
)
output_dir = os.environ.get(
    "M3D_CAP_OUTPUT_DIR",
    "/Users/rohith/Desktop/M3D copy/Data/data/test_data_preprocess/M3D_Cap/ct_quizze/",
)

# Get all subfolders inside input_dir (e.g., patient IDs).
# os.listdir returns entries in arbitrary order, so sort them: the processing
# order (and therefore the progress-bar position of any failure) is then the
# same on every run.
subfolders = sorted(
    entry
    for entry in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, entry))
)


In [5]:
# Preprocessing pipeline applied to every stacked volume: a foreground crop
# followed by a resize to a fixed 32 x 256 x 256 spatial size.
_volume_steps = [
    mtf.CropForeground(),
    mtf.Resize(spatial_size=[32, 256, 256], mode="bilinear"),
]
transform = mtf.Compose(_volume_steps)




In [6]:
def _extract_report_text(text_content):
    """Return the findings part of a report, flattened onto a single line.

    Everything after the first "study_findings:" marker is kept; if the marker
    is absent the whole text is used. When that leaves fewer than 5 non-space
    characters, the text after the first "discussion:" marker is used instead
    (if that marker exists).
    """
    def _after(marker):
        position = text_content.find(marker)
        if position == -1:
            return None
        return text_content[position + len(marker):].replace("\n", " ").strip()

    filtered_text = _after("study_findings:")
    if filtered_text is None:
        filtered_text = text_content.replace("\n", " ").strip()

    if len(filtered_text.replace(" ", "")) < 5:
        discussion_text = _after("discussion:")
        if discussion_text is not None:
            filtered_text = discussion_text

    return filtered_text


def _slice_sort_key(file_name):
    """Sort key for slice files: numeric stems in numeric order, others after.

    A stem that is not an integer used to raise ValueError and abort the whole
    run; such files are now ordered by name after the numbered slices.
    """
    stem = os.path.splitext(file_name)[0]
    try:
        return (0, int(stem), file_name)
    except ValueError:
        return (1, 0, file_name)


def _load_slices(folder_path, image_files):
    """Load each image as a (1, H, W) grayscale array scaled by 1/255.

    Files that cannot be read are reported and skipped, so one bad slice does
    not discard the rest of the series.
    """
    slices = []
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        try:
            # The context manager closes the file handle once the pixels
            # have been copied into the array.
            with Image.open(image_path) as img:
                img_array = np.array(img.convert("L")) / 255.0  # Normalize
        except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
            print(f"Error processing image: {image_path} ({exc!r})")
            continue
        slices.append(img_array[None])
    return slices


def _build_volume(slices):
    """Stack same-shaped slices into a (1, D, H, W) volume rescaled to [0, 1].

    Slices whose shape differs from the most common shape are dropped so the
    stack is rectangular.
    """
    most_common_shape = Counter(s.shape for s in slices).most_common(1)[0][0]
    kept = [s for s in slices if s.shape == most_common_shape]
    volume = np.vstack(kept)[np.newaxis, ...]

    # Min-max normalisation. The maximum must be taken AFTER the minimum has
    # been subtracted; otherwise the result never reaches 1 when min > 0.
    volume = volume - volume.min()
    return volume / np.clip(volume.max(), 1e-8, None)


def process_subfolder(subfolder):
    """Preprocess one case folder from ``input_dir`` into ``output_dir``.

    For every entry of ``input_dir/subfolder``:

    * a sub-directory is treated as an image series: its ``.jpeg``/``.png``
      files are stacked, normalised, passed through the module-level
      ``transform`` and saved as ``<name>.npy`` (name passed through
      ``unidecode``);
    * a ``.txt`` file is reduced to its findings text and written under the
      same file name in the output folder.

    Per-file and per-series failures are printed and skipped; nothing is
    returned.

    Args:
        subfolder: Name of the case folder, relative to ``input_dir``.
    """
    output_id_folder = os.path.join(output_dir, subfolder)
    input_id_folder = os.path.join(input_dir, subfolder)

    os.makedirs(output_id_folder, exist_ok=True)

    # Sorted so that the result is deterministic even if two series names
    # collapse to the same unidecoded output name.
    for entry in sorted(os.listdir(input_id_folder)):
        entry_path = os.path.join(input_id_folder, entry)

        if not os.path.isdir(entry_path):
            if entry.endswith('.txt'):
                # Explicit encoding: the locale default differs per machine.
                # Undecodable bytes are replaced instead of aborting the run.
                with open(entry_path, 'r', encoding='utf-8', errors='replace') as file:
                    text_content = file.read()

                new_text_path = os.path.join(output_id_folder, entry)
                with open(new_text_path, 'w', encoding='utf-8') as new_file:
                    new_file.write(_extract_report_text(text_content))
            continue

        series_name = unidecode(entry)  # Remove special characters
        output_path = os.path.join(output_id_folder, f'{series_name}.npy')

        image_files = sorted(
            (name for name in os.listdir(entry_path) if name.endswith(('.jpeg', '.png'))),
            key=_slice_sort_key,
        )
        if len(image_files) == 0:
            continue

        images_3d = _load_slices(entry_path, image_files)
        if not images_3d:
            # Every slice failed to load; there is nothing to stack.
            print(f"No readable images in folder: {entry_path}")
            continue

        try:
            final_3d_image = _build_volume(images_3d)
            img_trans = transform(final_3d_image)
            np.save(output_path, img_trans)
        except Exception as exc:  # keep going with the next series, but say why
            print(f"VStack error in folder: {output_path} ({exc!r})")


In [7]:
# Run the preprocessing over every case folder, one at a time, with a
# progress bar labelled "Processing".
progress = tqdm(subfolders, desc="Processing")
for subfolder in progress:
    process_subfolder(subfolder)


Processing:   6%|▌         | 116/2000 [01:33<12:21,  2.54it/s] 

Error processing image: /Users/rohith/Desktop/M3D copy/Data/data/test_data/M3D_Cap/ct_quizze/009659/Axial_C__portal_venous_phase/152.png


Processing:  34%|███▍      | 675/2000 [09:56<8:04:06, 21.92s/it]

: 