Creation of PandaDF is heavily inspired from https://www.kaggle.com/code/princelubisi/breast-cancer-classification-googlenet

In [1]:
import os
import shutil
import random

from timeit import default_timer as timer

import numpy as np
import pandas as pd

from PIL import Image
import glob 
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns 
import cv2

from sklearn.model_selection import train_test_split

In [2]:
# Paths, note that you use r"path" for absolute paths and '.' for relative
dir_path = "./dataset/breast-cancer"
csv_path = '/'.join([dir_path, "csv"])
organized_dir = "./dataset/organized_mammograms"
train_dir = "./dataset/organized_mammograms/train"
test_dir = "./dataset/organized_mammograms/test"
imdir = './dataset/breast-cancer/jpeg'
dir_path, train_dir, test_dir, csv_path

('./dataset/breast-cancer',
 './dataset/organized_mammograms/train',
 './dataset/organized_mammograms/test',
 './dataset/breast-cancer/csv')

In [3]:
df_meta= pd.read_csv('/'.join([csv_path, "meta.csv"]))
df_dicom = pd.read_csv('/'.join([csv_path, "dicom_info.csv"]))

In [4]:
cropped_images = df_dicom[df_dicom.SeriesDescription=='cropped images'].image_path
full_mammo = df_dicom[df_dicom.SeriesDescription=='full mammogram images'].image_path
roi_img = df_dicom[df_dicom.SeriesDescription=='ROI mask images'].image_path

In [5]:
cropped_images = cropped_images.replace('CBIS-DDSM/jpeg', imdir, regex=True)
full_mammo = full_mammo.replace('CBIS-DDSM/jpeg', imdir, regex=True)
roi_img = roi_img.replace('CBIS-DDSM/jpeg', imdir, regex=True)

In [6]:
full_mammo_dict = dict()
cropped_images_dict = dict()
roi_img_dict = dict()

for dicom in full_mammo:
    key = dicom.split("/")[4]
    full_mammo_dict[key] = dicom
for dicom in cropped_images:
    key = dicom.split("/")[4]
    cropped_images_dict[key] = dicom
for dicom in roi_img:
    key = dicom.split("/")[4]
    roi_img_dict[key] = dicom

In [7]:
mass_train = pd.read_csv('/'.join([csv_path, 'mass_case_description_train_set.csv']))
mass_test = pd.read_csv('/'.join([csv_path, 'mass_case_description_test_set.csv']))

In [8]:
mass_train

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,Mass-Training_P_00004_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1313,P_02033,2,RIGHT,MLO,1,mass,IRREGULAR,ILL_DEFINED,3,MALIGNANT,4,Mass-Training_P_02033_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....
1314,P_02079,2,RIGHT,CC,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...
1315,P_02079,2,RIGHT,MLO,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....
1316,P_02092,2,LEFT,CC,1,mass,IRREGULAR,SPICULATED,3,MALIGNANT,2,Mass-Training_P_02092_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...


In [9]:
mass_test

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,Mass-Test_P_00032_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,P_01825,2,RIGHT,MLO,1,mass,LOBULATED,MICROLOBULATED,3,BENIGN_WITHOUT_CALLBACK,3,Mass-Test_P_01825_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Mass-Test_P_01825_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Mass-Test_P_01825_RIGHT_MLO_1/1.3.6.1.4.1.9590...
374,P_01833,2,RIGHT,MLO,1,mass,IRREGULAR,ILL_DEFINED,5,MALIGNANT,5,Mass-Test_P_01833_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Mass-Test_P_01833_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Mass-Test_P_01833_RIGHT_MLO_1/1.3.6.1.4.1.9590...
375,P_01865,2,LEFT,MLO,1,mass,IRREGULAR,ILL_DEFINED,4,MALIGNANT,2,Mass-Test_P_01865_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_01865_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_01865_LEFT_MLO_1/1.3.6.1.4.1.9590....
376,P_01912,3,RIGHT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,4,Mass-Test_P_01912_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_01912_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_01912_RIGHT_CC_1/1.3.6.1.4.1.9590....


In [10]:
def fix_image_path(data):
    """correct dicom paths to correct image paths"""
    for index, img in enumerate(data.values):
        img_name = img[11].split("/")[2]
        data.iloc[index,11] = full_mammo_dict[img_name]
        img_name = img[12].split("/")[2]
        data.iloc[index,12] = cropped_images_dict[img_name]
        img_name = img[13].split("/")[2]
        data.iloc[index,13] = roi_img_dict[img_name]
        
fix_image_path(mass_train)
fix_image_path(mass_test)

In [11]:
mass_train = mass_train.rename(columns={'left or right breast': 'left_or_right_breast',
                                           'image view': 'image_view',
                                           'abnormality id': 'abnormality_id',
                                           'abnormality type': 'abnormality_type',
                                           'mass shape': 'mass_shape',
                                           'mass margins': 'mass_margins',
                                           'image file path': 'image_file_path',
                                           'cropped image file path': 'cropped_image_file_path',
                                           'ROI mask file path': 'ROI_mask_file_path'})

mass_test = mass_test.rename(columns={'left or right breast': 'left_or_right_breast',
                                           'image view': 'image_view',
                                           'abnormality id': 'abnormality_id',
                                           'abnormality type': 'abnormality_type',
                                           'mass shape': 'mass_shape',
                                           'mass margins': 'mass_margins',
                                           'image file path': 'image_file_path',
                                           'cropped image file path': 'cropped_image_file_path',
                                           'ROI mask file path': 'ROI_mask_file_path'})

In [12]:
mass_train['mass_shape'] = mass_train['mass_shape'].bfill()
mass_train['mass_margins'] = mass_train['mass_margins'].bfill()

mass_test['mass_shape'] = mass_test['mass_shape'].bfill()
mass_test['mass_margins'] = mass_test['mass_margins'].bfill()

In [13]:
mass_total = pd.concat([mass_train, mass_test], axis=0)

In [14]:
print(f"Original number of rows: {len(mass_total)}")

mass_total_sorted = mass_total.sort_values(
    by='pathology',
    ascending=False 
)


mass_cleaned = mass_total_sorted.drop_duplicates(
    subset=['patient_id', 'left_or_right_breast', 'image_view'],
    keep='first'
)

full_mass_final = mass_cleaned.sort_values(
    by=['patient_id', 'left_or_right_breast', 'image_view', 'pathology'],
    ascending=[True, True, True, False]
)


print(f"New Mammogram Count: {len(mass_cleaned)}")
print("\nImages Removed:", len(mass_total_sorted) - len(mass_cleaned))
print("\nPrev Distribution:")
print(mass_total_sorted['pathology'].value_counts())
print("\nPost Distribution:")
print(mass_cleaned['pathology'].value_counts())

Original number of rows: 1696
New Mammogram Count: 1592

Images Removed: 104

Prev Distribution:
pathology
MALIGNANT                  784
BENIGN                     771
BENIGN_WITHOUT_CALLBACK    141
Name: count, dtype: int64

Post Distribution:
pathology
MALIGNANT                  747
BENIGN                     726
BENIGN_WITHOUT_CALLBACK    119
Name: count, dtype: int64


In [15]:
train_size = int(0.8 * len(mass_cleaned)) #80%
test_size = len(mass_cleaned) - train_size  #20%
mass_cleaned['strat_column'] = mass_cleaned['pathology'] + '_' + mass_cleaned['image_view']

train_data, test_data = train_test_split(
    mass_cleaned,
    test_size=0.2,
    random_state=42, 
    stratify=mass_cleaned['strat_column']  
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mass_cleaned['strat_column'] = mass_cleaned['pathology'] + '_' + mass_cleaned['image_view']


In [17]:
def organize_mammogram_dataset(train_data, test_data, output_dir, clear_existing=True):  
    if clear_existing and os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    
    for split in ['train', 'test']:
        for label in ['benign', 'malignant']:
            os.makedirs(os.path.join(output_dir, split, label), exist_ok=True)
            
    def name(row):
        unique_id = f"{row['patient_id']}_{row['left_or_right_breast']}_{row['image_view']}"
        _, tag = os.path.splitext(os.path.basename(row['image_file_path']))
        return f"{unique_id}{tag}"
     
    def determine_breast_side(image_path): # Flips based on whichever side has more white pixels
        img = cv2.imread(image_path, 0)  
        if img is None:
            print(f"Error loading image: {image_path}")
            return False
        
        height, width = img.shape
        left_half = img[:, :width//2]
        right_half = img[:, width//2:]
        
        left_intensity = np.mean(left_half)
        right_intensity = np.mean(right_half)
        
        return left_intensity > right_intensity
    
    def process(src_path, dst_path):
        try:
            img = cv2.imread(src_path)
            if img is None:
                print(f"Warning: Could not read image - {src_path}")
                return False
                
            # if determine_breast_side(src_path):
            #     img = cv2.flip(img, 1)  # Flip horizontally
                
            cv2.imwrite(dst_path, img)
            return True
            
        except Exception as e:
            print(f"Error processing image {src_path}: {e}")
            return False
    
    def copy_images(df, split):
        benign_count = 0
        malignant_count = 0
        error_count = 0
        
        for idx, row in df.iterrows():
            src_path = row['image_file_path']
            
            is_benign = row['pathology'] in ['BENIGN', 'BENIGN_WITHOUT_CALLBACK']
            label = 'benign' if is_benign else 'malignant'
            
            unique_filename = name(row)
            dst_path = os.path.join(output_dir, split, label, unique_filename)
            
            temp = process(src_path, dst_path)
            if temp:
                if is_benign:
                    benign_count += 1
                else:
                    malignant_count += 1
        
        final_benign = len(os.listdir(os.path.join(output_dir, split, 'benign')))
        final_malignant = len(os.listdir(os.path.join(output_dir, split, 'malignant')))
        print(f"Benign Count: {final_benign} images")
        print(f"Malignant Count: {final_malignant} images")
        
        return benign_count, malignant_count

    train_stats = copy_images(train_data, 'train')
    test_stats = copy_images(test_data, 'test')

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
    
output_dir = "./dataset/test17"
organize_mammogram_dataset(train_data, test_data, output_dir, clear_existing=True)

Train data shape: (1273, 15)
Test data shape: (319, 15)


KeyboardInterrupt: 