# Extract and Output Nodule Images

In [4]:
import os
import csv
import xml.etree.ElementTree as ET
import pydicom
import cv2

# Define paths
base_path = "C:/Users/darte/Documents/Projects/lung-cancer-detection/Data/LIDC-IDRI"
csv_path = "C:/Users/darte/Documents/Projects/lung-cancer-detection/Assets/malignancy_label.csv"

# Function to load patient IDs based on malignancy CSV file
def load_patient_ids(csv_path):
    benign_patients = []
    malignant_patients = []
    with open(csv_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if row['Category'] == 'Benign':
                benign_patients.append(int(row['PatientID']))
            elif row['Category'] == 'Malignant':
                malignant_patients.append(int(row['PatientID']))
    return benign_patients, malignant_patients

# Load patient IDs
benign_patients, malignant_patients = load_patient_ids(csv_path)

# Function to extract and save nodule images for a given patient
def extract_nodule_images(patient_id, patient_type):
    patient_id_str = f"{patient_id:04d}"  # Format patient ID as 4-digit string
    patient_folder = os.path.join(base_path, f"LIDC-IDRI-{patient_id_str}")
    xml_file = None

    # Find the XML file in the patient's folder
    for root, _, files in os.walk(patient_folder):
        for file in files:
            if file.endswith(".xml"):
                xml_file = os.path.join(root, file)
                break
        if xml_file:
            break

    if not xml_file:
        print(f"No XML file found for Patient {patient_id}.")
        return

    print(f"Found XML file for Patient {patient_id}: {xml_file}")

    # Parse the XML file
    namespace = {"ns": "http://www.nih.gov"}
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Dictionary to store nodule information
    nodule_slices = {}

    # Iterate through all unblindedReadNodule elements
    for nodule in root.findall(".//ns:unblindedReadNodule", namespace):
        nodule_id = nodule.find("ns:noduleID", namespace).text
        nodule_slices[nodule_id] = []

        # Iterate through all ROI elements
        for roi in nodule.findall("ns:roi", namespace):
            sop_uid = roi.find("ns:imageSOP_UID", namespace)
            if sop_uid is not None:
                # Extract edgeMap coordinates
                x_coords = []
                y_coords = []
                for edge_map in roi.findall("ns:edgeMap", namespace):
                    x_coord = edge_map.find("ns:xCoord", namespace)
                    y_coord = edge_map.find("ns:yCoord", namespace)
                    if x_coord is not None and y_coord is not None:
                        x_coords.append(int(x_coord.text))
                        y_coords.append(int(y_coord.text))

                # Calculate bounding box if coordinates are available
                if x_coords and y_coords:
                    x_min = min(x_coords)
                    y_min = min(y_coords)
                    width = max(x_coords) - x_min
                    height = max(y_coords) - y_min
                    nodule_slices[nodule_id].append((sop_uid.text, x_min, y_min, width, height))
                else:
                    print(f"Missing edgeMap coordinates for Nodule {nodule_id}, SOP_UID {sop_uid.text}. Skipping this ROI.")

    # Search for DICOM files in all subdirectories of the patient folder
    dicom_files = []
    for root, _, files in os.walk(patient_folder):
        for file in files:
            if file.endswith(".dcm"):
                dicom_files.append(os.path.join(root, file))

    if not dicom_files:
        print(f"No DICOM files found for Patient {patient_id}.")
        return

    # Match SOP_UIDs to DICOM files and save extracted nodule images
    for nodule_id, rois in nodule_slices.items():
        for roi in rois:
            sop_uid, x, y, width, height = roi
            for dicom_file in dicom_files:
                dicom_data = pydicom.dcmread(dicom_file)
                if dicom_data.SOPInstanceUID == sop_uid:
                    # Load the DICOM image
                    image = dicom_data.pixel_array

                    # Add padding to the ROI
                    padding = 10  # Add 10 pixels of padding
                    x = max(0, x - padding)
                    y = max(0, y - padding)
                    width = min(image.shape[1] - x, width + 2 * padding)
                    height = min(image.shape[0] - y, height + 2 * padding)

                    # Ensure ROI is within image bounds
                    x = max(0, x)
                    y = max(0, y)
                    width = min(image.shape[1] - x, width)
                    height = min(image.shape[0] - y, height)

                    # Extract the ROI
                    nodule_image = image[y:y+height, x:x+width]

                    # Check if the extracted image is empty
                    if nodule_image.size == 0:
                        print(f"Empty ROI for Patient {patient_id}, Nodule {nodule_id}, SOP_UID {sop_uid}. Skipping.")
                        continue

                    # Normalize the image
                    normalized_image = cv2.normalize(nodule_image, None, 0, 255, cv2.NORM_MINMAX)

                    # Save the extracted nodule image
                    output_dir = f"output/{patient_type}"
                    os.makedirs(output_dir, exist_ok=True)
                    output_path = os.path.join(output_dir, f"patient_{patient_id}_nodule_{nodule_id}_{sop_uid}.png")
                    cv2.imwrite(output_path, normalized_image)
                    print(f"Saved nodule image for Patient {patient_id}, Nodule {nodule_id}, SOP_UID {sop_uid}")

# Process all benign patients
print("Processing benign patients...")
for patient_id in benign_patients:
    extract_nodule_images(patient_id, "benign")

# Process all malignant patients
print("Processing malignant patients...")
for patient_id in malignant_patients:
    extract_nodule_images(patient_id, "malignant")

Processing benign patients...
Found XML file for Patient 4: C:/Users/darte/Documents/Projects/lung-cancer-detection/Data/LIDC-IDRI\LIDC-IDRI-0004\01-01-2000-NA-NA-91780\3000534.000000-NA-58228\074.xml
Saved nodule image for Patient 4, Nodule Nodule 001, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.153759531653726148364082207422
Saved nodule image for Patient 4, Nodule Nodule 001, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.175159594489113303104755002435
Saved nodule image for Patient 4, Nodule Nodule 001, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.226269353909047545554945460400
Saved nodule image for Patient 4, Nodule Nodule 001, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.203766013809021049942391947414
Saved nodule image for Patient 4, Nodule Nodule 002, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.128981269311076397646444355539
Saved nodule image for Patient 4, Nodule Nodule 003, SOP_UID 1.3.6.1.4.1.14519.5.2.1.6279.6001.671676418477816170164014522564
Saved nodule image for Patient 4, Nodule No

In [5]:
def count_saved_images(patient_type):
    output_dir = f"output/{patient_type}"
    if not os.path.exists(output_dir):
        return 0
    return len([file for file in os.listdir(output_dir) if file.endswith(".png")])

# Count images for benign and malignant patients
benign_image_count = count_saved_images("benign")
malignant_image_count = count_saved_images("malignant")

print(f"Number of images saved for benign patients: {benign_image_count}")
print(f"Number of images saved for malignant patients: {malignant_image_count}")

Number of images saved for benign patients: 395
Number of images saved for malignant patients: 1122


In [6]:
import os
import cv2
import numpy as np
import shutil

# Define paths
output_dir = "output"
disregarded_dir = "output/disregarded"

# Function to check if an image is mostly black
def is_black_image(image_path, threshold=10, black_ratio=0.95):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False  # Skip if the image cannot be read
    # Count pixels below the threshold
    black_pixels = np.sum(image < threshold)
    total_pixels = image.size
    return (black_pixels / total_pixels) >= black_ratio

# Process all images in the output directory
def filter_black_images(output_dir, disregarded_dir):
    os.makedirs(disregarded_dir, exist_ok=True)
    for root, _, files in os.walk(output_dir):
        for file in files:
            if file.endswith(".png"):
                image_path = os.path.join(root, file)
                if is_black_image(image_path):
                    # Move the black image to the disregarded folder
                    relative_path = os.path.relpath(root, output_dir)
                    target_dir = os.path.join(disregarded_dir, relative_path)
                    os.makedirs(target_dir, exist_ok=True)
                    shutil.move(image_path, os.path.join(target_dir, file))
                    print(f"Moved black image: {image_path} -> {target_dir}")

# Run the filter
filter_black_images(output_dir, disregarded_dir)

Moved black image: output\benign\patient_996_nodule_0_1.3.6.1.4.1.14519.5.2.1.6279.6001.221062229553402782303348171302.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_13371_1.3.6.1.4.1.14519.5.2.1.6279.6001.146411744667174192771089241419.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_13374_1.3.6.1.4.1.14519.5.2.1.6279.6001.272009228124873088425546685312.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_13376_1.3.6.1.4.1.14519.5.2.1.6279.6001.221062229553402782303348171302.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_3_1.3.6.1.4.1.14519.5.2.1.6279.6001.272009228124873088425546685312.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_54841_1.3.6.1.4.1.14519.5.2.1.6279.6001.272009228124873088425546685312.png -> output/disregarded\benign
Moved black image: output\benign\patient_996_nodule_54843_1.3.6.1.4.1.14

In [7]:
def recount_images(output_dir, disregarded_dir, patient_type):
    patient_dir = os.path.join(output_dir, patient_type)
    disregarded_patient_dir = os.path.join(disregarded_dir, patient_type)
    if not os.path.exists(patient_dir):
        return 0
    total_images = len([file for file in os.listdir(patient_dir) if file.endswith(".png")])
    disregarded_images = 0
    if os.path.exists(disregarded_patient_dir):
        disregarded_images = len([file for file in os.listdir(disregarded_patient_dir) if file.endswith(".png")])
    return total_images - disregarded_images

# Recount images for benign and malignant patients
benign_non_disregarded_count = recount_images(output_dir, disregarded_dir, "benign")
malignant_non_disregarded_count = recount_images(output_dir, disregarded_dir, "malignant")

print(f"Number of non-disregarded images for benign patients: {benign_non_disregarded_count}")
print(f"Number of non-disregarded images for malignant patients: {malignant_non_disregarded_count}")

Number of non-disregarded images for benign patients: 315
Number of non-disregarded images for malignant patients: 624
