In [None]:
import os
import xml.etree.ElementTree as ET
import pydicom

# This script parses a patient's XML annotation file to extract, for each nodule,
# the SOP UIDs of the slices it appears on, then matches those UIDs against the
# patient's DICOM files so that each nodule annotation can be tied to its image files.

# Define paths
base_path = "C:/Users/darte/Documents/Projects/lung-cancer-detection/Data/LIDC-IDRI"
patient_id = "0001"  # Patient 1
patient_folder = os.path.join(base_path, f"LIDC-IDRI-{patient_id}")

# Find the first XML file anywhere under the patient's folder.
# next() stops the directory walk as soon as one match is produced; the
# fallback of None means "nothing found" (also covers a missing folder,
# for which os.walk simply yields nothing).
xml_file = next(
    (
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(patient_folder)
        for filename in filenames
        if filename.endswith(".xml")
    ),
    None,
)

if xml_file is None:
    # Fail here with a clear message: carrying on would only crash later,
    # when the XML parser is handed None instead of a path.
    raise FileNotFoundError(f"No XML file found for Patient {patient_id}.")

print(f"Found XML file: {xml_file}")

# Parse the XML file
namespace = {"ns": "http://www.nih.gov"}
tree = ET.parse(xml_file)
root = tree.getroot()

# Maps each noduleID to the list of image SOP UIDs referenced by its ROIs.
nodule_slices = {}

# Iterate through all unblindedReadNodule elements, at any depth in the tree
for nodule in root.findall(".//ns:unblindedReadNodule", namespace):
    # findtext() returns None when <noduleID> is missing (and "" when it is
    # empty), so a malformed entry is skipped instead of raising AttributeError.
    nodule_id = nodule.findtext("ns:noduleID", default=None, namespaces=namespace)
    if not nodule_id:
        continue

    # setdefault() keeps UIDs already collected if the same noduleID shows up
    # in more than one element; plain assignment would silently discard them.
    slice_uids = nodule_slices.setdefault(nodule_id, [])

    # Collect the slice UID of every ROI belonging to this nodule
    for roi in nodule.findall("ns:roi", namespace):
        sop_uid = roi.findtext("ns:imageSOP_UID", default="", namespaces=namespace).strip()
        if sop_uid:  # ignore ROIs whose UID element is absent or empty
            slice_uids.append(sop_uid)

# Report the slice UIDs gathered for each nodule: a header line, then one
# "Nodule <id>:" line per nodule followed by its indented SOP UIDs.
report_lines = ["\nNodule to Slice Mapping:"]
for name, uids in nodule_slices.items():
    report_lines.append(f"Nodule {name}:")
    report_lines.extend(f"  SOP_UID: {uid}" for uid in uids)
print("\n".join(report_lines))

# Invert the nodule -> SOP UID mapping once, so each DICOM file needs a single
# dictionary lookup instead of a scan over every nodule's UID list.
# dict.fromkeys() drops repeated UIDs within one nodule (keeping order), so a
# nodule is reported at most once per file, as with a membership test.
uid_to_nodules = {}
for nodule_id, sop_uids in nodule_slices.items():
    for uid in dict.fromkeys(sop_uids):
        uid_to_nodules.setdefault(uid, []).append(nodule_id)

# Search for DICOM files in all subdirectories of the patient folder.
# The extension check is case-insensitive so ".DCM" files are not missed, and
# the loop variables avoid the name `root`, which holds the parsed XML root.
dicom_files = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(patient_folder)
    for filename in filenames
    if filename.lower().endswith(".dcm")
]

if not dicom_files:
    print("No DICOM files found.")
else:
    print("\nMatching SOP_UIDs to DICOM files:")
    for dicom_file in dicom_files:
        # Only the header is needed here; skipping the pixel data avoids
        # loading every image into memory just to read one UID.
        dicom_data = pydicom.dcmread(dicom_file, stop_before_pixels=True)
        sop_instance_uid = dicom_data.SOPInstanceUID
        for nodule_id in uid_to_nodules.get(sop_instance_uid, ()):
            print(f"DICOM file {dicom_file} matches Nodule {nodule_id} (SOP_UID: {sop_instance_uid})")

Found XML file: C:/Users/darte/Documents/Projects/lung-cancer-detection/Data/LIDC-IDRI\LIDC-IDRI-0001\01-01-2000-NA-NA-30178\3000566.000000-NA-03192\069.xml

Nodule to Slice Mapping:
Nodule Nodule 001:
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.110383487652933113465768208719
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.499837844441581448374672853475
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.299410838455281419536742634793
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.824843590991776411530080688091
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.297813206491522913194774892711
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.261151233960269013402330853013
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.202709423777326615340853838834
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.281416679065036634264586513142
Nodule Nodule 002:
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.313544823773855097029348077255
Nodule Nodule 003:
  SOP_UID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.315606855383999143703852453142
