# Mammogram Dataset Sorting and Preprocessing

This notebook/script prepares the **CBIS-DDSM** mammography dataset for training deep learning models. It performs the following steps:

- Converts categorical `pathology` labels into binary format: `MALIGNANT` becomes `1`, and every other value (e.g. `BENIGN`, `BENIGN_WITHOUT_CALLBACK`) becomes `0`
- Standardises and renames image file paths to ensure consistency
- Detects images that are listed more than once with conflicting pathology labels and excludes them from the output
- Organises images into a structured folder hierarchy (`train/benign`, `train/malignant`, `test/benign`, `test/malignant`)
- Exports CSV files with image paths and binary labels for future training

The data is split into training and test sets based on pre-labeled metadata files provided with CBIS-DDSM.

## Notes

- The CBIS-DDSM dataset must be downloaded separately from [The Cancer Imaging Archive (TCIA)](https://www.cancerimagingarchive.net/)
- This script assumes the original DICOM images have already been converted to PNG format (see `dicom_to_png_converter.ipynb`)


In [102]:
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [103]:
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def replace_pathology(pathology_df):
    """
    Converts 'pathology' column entries from strings to binary labels.
    'MALIGNANT' becomes 1, all others become 0.

    The whole column is rewritten with a single vectorised assignment, so the
    function does not depend on the DataFrame having the default 0..n-1 index
    and also works on an empty DataFrame. After the call the column holds
    integers only.

    Args:
        pathology_df (pd.DataFrame): DataFrame containing a 'pathology' column to be modified in place.

    Returns:
        None: The DataFrame is modified in place.
    """
    # Exact, case-sensitive match: anything that is not literally "MALIGNANT"
    # (including missing values) is labelled 0.
    is_malignant = pathology_df['pathology'] == "MALIGNANT"
    pathology_df['pathology'] = is_malignant.astype(int)

# Rename the 'image file path' entries so that they match the actual PNG image file names
def rename_filepath(filepath_df):
    """
    Standardises image file paths to ensure consistent naming for use with image loaders.

    Each entry is reduced to its first '/'-separated component. If that
    component already contains "FULL", ".png" is appended; otherwise
    "_FULL.png" is appended.

    The new values are computed by iterating over the column values rather
    than over index labels, so the function works for any index.

    Args:
        filepath_df (pd.DataFrame): DataFrame with an 'image file path' column to be renamed in place.

    Returns:
        None: The DataFrame is modified in place.
    """
    # Only the leading path component is kept.
    stems = [raw_path.split('/')[0] for raw_path in filepath_df['image file path']]

    # Assigning a list is positional, so row order is preserved whatever the index.
    filepath_df['image file path'] = [
        stem + (".png" if "FULL" in stem else "_FULL.png") for stem in stems
    ]


In [None]:
def check_different_pathologies(df, file_name):
    """
    Checks if a given image file path has conflicting pathology labels in the DataFrame.

    The matching rows are selected with a boolean mask instead of a Python-level
    row loop, so a single call is one vectorised pass over the column.

    Args:
        df (pd.DataFrame): DataFrame containing 'image file path' and 'pathology' columns.
        file_name (str): Filename to check for conflicting labels.

    Returns:
        bool or int: True if multiple different labels exist, False if only one label,
                     -1 if the image is not found in the DataFrame.
                     Note that -1 is truthy: callers that must tell "not found"
                     apart from "conflicting" have to compare against -1 explicitly.
    """
    # Pathology labels of every row that refers to this file.
    labels = df.loc[df['image file path'] == file_name, 'pathology']

    # dropna=False so that a missing label still counts as a distinct value.
    distinct_count = labels.nunique(dropna=False)

    if distinct_count == 0:
        # No row mentions this file at all.
        return -1
    # Exactly one distinct label -> consistent; two or more -> conflicting.
    return bool(distinct_count > 1)


In [None]:
def sort_data(pp_df, original_path, csv_filename):
    """
    Organises images and generates labels into a structured directory for training/testing.

    Images are moved from 'original_path' into 'original_path/images', and a
    label file is written to 'original_path/labels/<csv_filename>'. The CSV has
    no header row; each line is '<row number>,<img_name>,<img_label>'.

    Rules applied per file name:
      * A file listed with more than one distinct pathology is ignored entirely.
      * A file listed several times with the same pathology is recorded once.
      * A file that is already inside the images directory (for example because
        this function was run before) is recorded without being moved again,
        so re-running does not empty the label file.
      * A file found in neither location is reported and skipped.

    Args:
        pp_df (pd.DataFrame): DataFrame containing cleaned 'pathology' and 'image file path' columns.
        original_path (str): Path to the directory containing source PNG images.
        csv_filename (str): Name for the CSV file to save the label info.

    Returns:
        set: A set of filenames ignored due to conflicting pathology labels.
    """
    images_new_path = os.path.join(original_path, "images")
    labels_new_path = os.path.join(original_path, "labels")

    # Create the output directories if they don't exist yet.
    os.makedirs(images_new_path, exist_ok=True)
    os.makedirs(labels_new_path, exist_ok=True)

    # Find every file with conflicting labels in one pass instead of
    # re-scanning the whole DataFrame for each row.
    labels_per_file = pp_df.groupby('image file path')['pathology'].nunique(dropna=False)
    ignored_images_set = set(labels_per_file.index[labels_per_file > 1])

    label_rows = []          # (img_name, img_label) tuples, in DataFrame order
    recorded_names = set()   # file names that already have a label row

    for file_name, pathology in zip(pp_df['image file path'], pp_df['pathology']):
        # Conflicting annotations: skip. Repeated identical annotation: already handled.
        if file_name in ignored_images_set or file_name in recorded_names:
            continue

        source_file = os.path.join(original_path, file_name)
        # shutil.move into a directory keeps the base name of the source.
        moved_file = os.path.join(images_new_path, os.path.basename(file_name))

        if os.path.isfile(moved_file):
            # Already sorted earlier; keep its label but leave the file where it is.
            print("File", file_name, " is already in the images directory")
        elif os.path.isfile(source_file):
            shutil.move(source_file, images_new_path)
            print("File", file_name, " exists, and was moved")
        else:
            print("File ", file_name, " not found!")
            continue

        recorded_names.add(file_name)
        label_rows.append((file_name, pathology))

    # Build the label table once rather than growing a DataFrame row by row.
    labels_df = pd.DataFrame(label_rows, columns=["img_name", "img_label"])
    labels_df.to_csv(os.path.join(labels_new_path, csv_filename), header=False)
    return ignored_images_set
    

In [None]:
# Case-description CSV files, one per image set. The order here must match the
# order of the names unpacked below: calc test, calc train, mass test, mass train.
description_csv_paths = (
    "mammogram-ai-project/Data/csv/calc_case_description_test_set.csv",
    "mammogram-ai-project/Data/csv/calc_case_description_train_set.csv",
    "mammogram-ai-project/Data/csv/mass_case_description_test_set.csv",
    "mammogram-ai-project/Data/csv/mass_case_description_train_set.csv",
)

# Load each file into its own DataFrame.
calc_test_df, calc_train_df, mass_test_df, mass_train_df = map(
    pd.read_csv, description_csv_paths
)


In [124]:
# Keep only the columns needed for sorting, then convert 'pathology' to binary
# labels and standardise 'image file path' in each subset.
kept_columns = ['patient_id', 'pathology', 'image file path']

# .copy() gives every subset its own data, so the in-place edits below are
# unambiguous and never depend on the chained-assignment warning being silenced.
new_calc_test_df = calc_test_df[kept_columns].copy()
new_calc_train_df = calc_train_df[kept_columns].copy()
new_mass_test_df = mass_test_df[kept_columns].copy()
new_mass_train_df = mass_train_df[kept_columns].copy()

# Both helpers modify the DataFrame they are given in place.
for subset_df in (new_calc_test_df, new_calc_train_df, new_mass_test_df, new_mass_train_df):
    replace_pathology(subset_df)
    rename_filepath(subset_df)

In [None]:
# Directories that hold the converted PNG images, one per image set.
png_root = "mammogram-ai-project/Data/Data png"

calc_test_png_path = png_root + "/Calc-Test-png"
calc_train_png_path = png_root + "/Calc-Training-png"

mass_test_png_path = png_root + "/Mass-Test-png"
mass_train_png_path = png_root + "/Mass-Training-png"


In [None]:
# Calcification cases, test set: sort the images, then show which files were
# ignored because of conflicting pathology labels.
ignored_calc_test_images = sort_data(
    pp_df=new_calc_test_df,
    original_path=calc_test_png_path,
    csv_filename="calc-test_labels.csv",
)
print(ignored_calc_test_images)

# Calcification cases, training set.
ignored_calc_train_images = sort_data(
    pp_df=new_calc_train_df,
    original_path=calc_train_png_path,
    csv_filename="calc-train_labels.csv",
)
print(ignored_calc_train_images)

# Mass cases, test set.
ignored_mass_test_images = sort_data(
    pp_df=new_mass_test_df,
    original_path=mass_test_png_path,
    csv_filename="mass-test_labels.csv",
)
print(ignored_mass_test_images)

# Mass cases, training set.
ignored_mass_train_images = sort_data(
    pp_df=new_mass_train_df,
    original_path=mass_train_png_path,
    csv_filename="mass-train_labels.csv",
)
print(ignored_mass_train_images)