# Malignancy Classification

This file uses the CT scan DICOM image metadata together with the radiologists' XML annotations to extract each patient's malignancy scores and categorize the patient as benign, malignant, or uncertain.

Malignancy scores are as follows:
- 1: Highly Unlikely to be malignant (benign)
- 2: Unlikely to be malignant
- 3: Indeterminate
- 4: Likely to be malignant
- 5: Highly Likely to be malignant

## Malignancy Mean Scoring

After extracting every malignancy score recorded for a patient, I applied an average-voting approach: each patient's score is the mean of all of that patient's extracted malignancy scores.

In [4]:
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

# Define the base path to the LIDC-IDRI dataset
base_path = "C:/Users/darte/Documents/Projects/lung-cancer-detection/Data/LIDC-IDRI"

# Namespace used in the XML files
namespace = {"ns": "http://www.nih.gov"}

# Dictionary to store the mean malignancy score of every processed patient
# (patient number -> float mean, or None when the XML held no usable scores)
all_patient_scores = {}


def find_annotation_xml(patient_folder):
    """Return the path of the first ``.xml`` file found under *patient_folder*.

    The folder is searched recursively with ``os.walk``; None is returned when
    the folder does not exist or contains no XML file.
    """
    for dirpath, _, filenames in os.walk(patient_folder):
        for filename in filenames:
            if filename.endswith(".xml"):
                return os.path.join(dirpath, filename)
    return None


def read_malignancy(nodule):
    """Return the malignancy rating of an ``unblindedReadNodule`` as an int.

    Returns None when the nodule has no ``characteristics``/``malignancy``
    element, or when that element is empty or not an integer, so a single bad
    annotation cannot abort the whole run.
    """
    characteristics = nodule.find("ns:characteristics", namespace)
    if characteristics is None:
        return None
    malignancy = characteristics.find("ns:malignancy", namespace)
    if malignancy is None:
        return None
    try:
        return int(malignancy.text)
    except (TypeError, ValueError):
        # .text is None for an empty element; int() rejects non-numeric text
        return None


def collect_dicom_scores(xml_path):
    """Map each annotated DICOM SOP UID in *xml_path* to its malignancy scores.

    Every ROI of a rated nodule contributes that nodule's rating once, so a
    rating is recorded once per annotated slice.
    """
    dicom_scores = defaultdict(list)
    annotation_root = ET.parse(xml_path).getroot()
    for nodule in annotation_root.findall(".//ns:unblindedReadNodule", namespace):
        malignancy_score = read_malignancy(nodule)
        if malignancy_score is None:
            continue
        for roi in nodule.findall("ns:roi", namespace):
            image_sop_uid = roi.find("ns:imageSOP_UID", namespace)
            if image_sop_uid is not None:
                dicom_scores[image_sop_uid.text].append(malignancy_score)
    return dicom_scores


def mean_malignancy_score(dicom_scores):
    """Return the mean of all scores in *dicom_scores*, or None if there are none."""
    all_scores = [score for scores in dicom_scores.values() for score in scores]
    if not all_scores:
        return None
    return sum(all_scores) / len(all_scores)


# Iterate through the folders of patients 1 to 70
for patient_id in range(1, 71):
    patient_folder = os.path.join(base_path, f"LIDC-IDRI-{str(patient_id).zfill(4)}")

    xml_file = find_annotation_xml(patient_folder)
    if not xml_file:
        # Patients without an annotation file are not recorded at all
        print(f"  No XML file found for Patient {patient_id}. Skipping...")
        continue

    all_patient_scores[patient_id] = mean_malignancy_score(collect_dicom_scores(xml_file))

# Print the results
print("\nMean Malignancy Scores for All Patients:")
for patient_id, mean_score in all_patient_scores.items():
    if mean_score is not None:
        print(f"  Patient {patient_id}: Mean Malignancy Score = {mean_score:.2f}")
    else:
        print(f"  Patient {patient_id}: No malignancy scores available.")

  No XML file found for Patient 69. Skipping...

Mean Malignancy Scores for All Patients:
  Patient 1: Mean Malignancy Score = 4.72
  Patient 2: Mean Malignancy Score = 4.57
  Patient 3: Mean Malignancy Score = 3.51
  Patient 4: Mean Malignancy Score = 1.25
  Patient 5: Mean Malignancy Score = 2.63
  Patient 6: Mean Malignancy Score = 2.61
  Patient 7: Mean Malignancy Score = 4.83
  Patient 8: Mean Malignancy Score = 2.67
  Patient 9: Mean Malignancy Score = 2.00
  Patient 10: Mean Malignancy Score = 2.65
  Patient 11: Mean Malignancy Score = 2.37
  Patient 12: Mean Malignancy Score = 2.71
  Patient 13: Mean Malignancy Score = 4.12
  Patient 14: Mean Malignancy Score = 3.46
  Patient 15: Mean Malignancy Score = 4.20
  Patient 16: Mean Malignancy Score = 3.75
  Patient 17: Mean Malignancy Score = 2.53
  Patient 18: Mean Malignancy Score = 3.45
  Patient 19: Mean Malignancy Score = 2.91
  Patient 20: Mean Malignancy Score = 3.66
  Patient 21: Mean Malignancy Score = 1.62
  Patient 22: Me

In [5]:
# Iterate through patient folders for patients 973 to 1012 and add their mean
# malignancy scores to all_patient_scores
for patient_id in range(973, 1013):
    patient_folder = os.path.join(base_path, f"LIDC-IDRI-{str(patient_id).zfill(4)}")

    # Find the first XML file anywhere under the patient's folder
    xml_file = None
    for dirpath, _, filenames in os.walk(patient_folder):
        xml_names = [name for name in filenames if name.endswith(".xml")]
        if xml_names:
            xml_file = os.path.join(dirpath, xml_names[0])
            break

    if not xml_file:
        print(f"  No XML file found for Patient {patient_id}. Skipping...")
        continue

    # Parse the XML file (kept under its own name so it does not reuse the
    # directory variable of the os.walk loop above)
    annotation_root = ET.parse(xml_file).getroot()

    # DICOM SOP UID -> list of malignancy scores for this patient
    dicom_scores = defaultdict(list)

    # Iterate through all unblindedReadNodule elements
    for nodule in annotation_root.findall(".//ns:unblindedReadNodule", namespace):
        # Nodules without a malignancy rating are ignored
        characteristics = nodule.find("ns:characteristics", namespace)
        if characteristics is None:
            continue
        malignancy = characteristics.find("ns:malignancy", namespace)
        if malignancy is None:
            continue
        try:
            malignancy_score = int(malignancy.text)
        except (TypeError, ValueError):
            # Empty element (.text is None) or non-numeric text: skip this
            # annotation instead of aborting the whole run
            continue

        # Record the rating once for every ROI (annotated slice) of the nodule
        for roi in nodule.findall("ns:roi", namespace):
            image_sop_uid = roi.find("ns:imageSOP_UID", namespace)
            if image_sop_uid is not None:
                dicom_scores[image_sop_uid.text].append(malignancy_score)

    # Calculate the mean malignancy score for this patient
    all_scores = [score for scores in dicom_scores.values() for score in scores]
    if all_scores:
        all_patient_scores[patient_id] = sum(all_scores) / len(all_scores)
    else:
        all_patient_scores[patient_id] = None

# Print the results; patients skipped above (no XML file) are absent from
# all_patient_scores and therefore also reported as having no scores
print("\nMean Malignancy Scores for Patients 973 to 1012:")
for patient_id in range(973, 1013):
    mean_score = all_patient_scores.get(patient_id)
    if mean_score is not None:
        print(f"  Patient {patient_id}: Mean Malignancy Score = {mean_score:.2f}")
    else:
        print(f"  Patient {patient_id}: No malignancy scores available.")

  No XML file found for Patient 975. Skipping...
  No XML file found for Patient 988. Skipping...
  No XML file found for Patient 993. Skipping...
  No XML file found for Patient 1008. Skipping...

Mean Malignancy Scores for Patients 973 to 1012:
  Patient 973: Mean Malignancy Score = 3.17
  Patient 974: Mean Malignancy Score = 2.75
  Patient 975: No malignancy scores available.
  Patient 976: Mean Malignancy Score = 4.28
  Patient 977: Mean Malignancy Score = 3.00
  Patient 978: Mean Malignancy Score = 3.67
  Patient 979: No malignancy scores available.
  Patient 980: Mean Malignancy Score = 3.37
  Patient 981: Mean Malignancy Score = 3.50
  Patient 982: Mean Malignancy Score = 2.68
  Patient 983: Mean Malignancy Score = 3.00
  Patient 984: Mean Malignancy Score = 3.36
  Patient 985: Mean Malignancy Score = 2.61
  Patient 986: Mean Malignancy Score = 3.25
  Patient 987: Mean Malignancy Score = 1.89
  Patient 988: No malignancy scores available.
  Patient 989: Mean Malignancy Score = 3

## Malignancy Categorizing

Used the malignancy mean scores to categorize patients

Mean Scoring:
- 1 <= x <= 2: Benign
- 2 < x < 4: Uncertain
- 4 <= x <= 5: Malignant

In [7]:
# Label every patient from their mean malignancy score.
# Means in [1, 2] are benign and means in [4, 5] are malignant; everything
# else, including patients with no score (None), is labelled uncertain.
patient_categories = {}
for patient_id, mean_score in all_patient_scores.items():
    category = "uncertain"
    if mean_score is not None:
        if 1 <= mean_score <= 2:
            category = "benign"
        elif 4 <= mean_score <= 5:
            category = "malignant"
    patient_categories[patient_id] = category

# Show the label assigned to each patient
print("Patient Categories:")
for patient_id in patient_categories:
    print(f"  Patient {patient_id}: {patient_categories[patient_id]}")

Patient Categories:
  Patient 1: malignant
  Patient 2: malignant
  Patient 3: uncertain
  Patient 4: benign
  Patient 5: uncertain
  Patient 6: uncertain
  Patient 7: malignant
  Patient 8: uncertain
  Patient 9: benign
  Patient 10: uncertain
  Patient 11: uncertain
  Patient 12: uncertain
  Patient 13: malignant
  Patient 14: uncertain
  Patient 15: malignant
  Patient 16: uncertain
  Patient 17: uncertain
  Patient 18: uncertain
  Patient 19: uncertain
  Patient 20: uncertain
  Patient 21: benign
  Patient 22: benign
  Patient 23: malignant
  Patient 24: uncertain
  Patient 25: benign
  Patient 26: uncertain
  Patient 27: uncertain
  Patient 28: uncertain
  Patient 29: malignant
  Patient 30: uncertain
  Patient 31: uncertain
  Patient 32: uncertain
  Patient 33: uncertain
  Patient 34: benign
  Patient 35: uncertain
  Patient 36: uncertain
  Patient 37: uncertain
  Patient 38: uncertain
  Patient 39: benign
  Patient 40: uncertain
  Patient 41: uncertain
  Patient 42: uncertain
  

In [8]:
# Group patient IDs by category in a single pass, keeping dictionary order
ids_by_category = {"malignant": [], "benign": [], "uncertain": []}
for pid, label in patient_categories.items():
    if label in ids_by_category:
        ids_by_category[label].append(pid)

malignant_patients = ids_by_category["malignant"]
benign_patients = ids_by_category["benign"]
uncertain_patients = ids_by_category["uncertain"]

# Print the lists
print("Malignant Patients:", malignant_patients)
print("Benign Patients:", benign_patients)
print("Uncertain Patients:", uncertain_patients)

Malignant Patients: [1, 2, 7, 13, 15, 23, 29, 47, 52, 57, 58, 59, 60, 66, 976, 1007]
Benign Patients: [4, 9, 21, 22, 25, 34, 39, 987, 996, 997, 1009, 1012]
Uncertain Patients: [3, 5, 6, 8, 10, 11, 12, 14, 16, 17, 18, 19, 20, 24, 26, 27, 28, 30, 31, 32, 33, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 53, 54, 55, 56, 61, 62, 63, 64, 65, 67, 68, 70, 973, 974, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 989, 990, 991, 992, 994, 995, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1010, 1011]


In [9]:
import csv

# Name of the label file to produce
output_file = "malignancy_label.csv"

# One row per labelled patient: malignant patients first, then benign ones
# (uncertain patients are not exported)
label_rows = [["Malignant", pid] for pid in malignant_patients]
label_rows += [["Benign", pid] for pid in benign_patients]

# Write the header followed by all label rows
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Category", "PatientID"])
    writer.writerows(label_rows)

print(f"CSV file '{output_file}' created successfully.")

CSV file 'malignancy_label.csv' created successfully.


In [2]:
import csv

# Export per-DICOM malignancy scores to malignancy_scores.csv.
#
# This cell expects all_patient_scores to map each patient ID to a mapping of
# DICOM UID -> score (a list of scores is written as one row per score).
# NOTE(review): the extraction cells in this notebook store a single mean
# (float) or None per patient instead; presumably this cell was run against an
# earlier per-DICOM version of all_patient_scores - confirm before relying on it.
#
# Rows are built and validated before the file is opened, so a wrong input
# shape no longer truncates an existing CSV to a header-only file.
score_rows = []
for patient_id, scores in all_patient_scores.items():
    if scores is None:
        # Patient without any usable malignancy score: nothing to export
        continue
    if not hasattr(scores, "items"):
        raise TypeError(
            f"all_patient_scores[{patient_id!r}] must map DICOM UID -> score, "
            f"got {type(scores).__name__}"
        )
    for dicom_uid, score in scores.items():
        if isinstance(score, (list, tuple)):
            score_rows.extend([patient_id, dicom_uid, single] for single in score)
        else:
            score_rows.append([patient_id, dicom_uid, score])

with open("malignancy_scores.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["PatientID", "DICOM_UID", "Malignancy_Score"])
    writer.writerows(score_rows)

In [3]:
import pandas as pd

# Read the exported per-DICOM malignancy scores into a DataFrame
csv_file = "c:/Users/darte/Documents/Projects/lung-cancer-detection/Notebooks/malignancy_scores.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

# Function to categorize patients
def categorize_patient(scores):
    """Label a patient from their individual malignancy scores.

    Args:
        scores: Iterable of numeric malignancy scores on the 1-5 scale.

    Returns:
        "malignant" if at least one score lies in the malignant band
        (4 <= score <= 5), otherwise "benign" (also for an empty iterable).
    """
    # A range check, unlike membership in [4, 5], also recognises
    # non-integer scores such as 4.5 as malignant.
    if any(4 <= score <= 5 for score in scores):
        return "malignant"
    return "benign"

# Collect each patient's scores and derive one label per patient
scores_by_patient = df.groupby("PatientID")["Malignancy_Score"]
patient_categories = scores_by_patient.apply(categorize_patient)

# Show the resulting labels
print("Patient Categories:")
print(patient_categories)

# Persist the labels as a two-column table (PatientID, Category)
output_file = "patient_categories.csv"
category_table = patient_categories.reset_index(name="Category")
category_table.to_csv(output_file, index=False)
print(f"\nPatient categories saved to {output_file}")
print(f"\nPatient categories saved to {output_file}")

Patient Categories:
PatientID
1     malignant
2     malignant
3     malignant
4        benign
5        benign
6     malignant
7     malignant
8        benign
9        benign
10    malignant
11    malignant
12    malignant
13    malignant
14    malignant
15    malignant
16    malignant
17       benign
18    malignant
19    malignant
20    malignant
Name: Malignancy_Score, dtype: object

Patient categories saved to patient_categories.csv
