In [77]:
import sys, os, shutil, PIL
from pathlib import Path
import pandas as pd
from PIL import Image
import numpy as np

In [78]:
basedir = Path(os.path.abspath("../data/cbis-ddsm/"))

In [79]:
calc_test = pd.read_csv(basedir / "calc_case_description_test_set.csv")
mass_test = pd.read_csv(basedir / "mass_case_description_test_set.csv")
meta = pd.read_csv(basedir / "manifest-ZkhPvrLo5216730872708713142" / "metadata2.csv")

In [80]:
# Align their columns
calc_test = calc_test.rename(columns={
    "breast density":"breast_density",
    })

# Concat test cases
all_test = pd.concat([calc_test, mass_test])

In [81]:
all_test['patient_id'].nunique()

349

In [82]:
def ambigious_labels(df, pathology_variable):
    # Remove cases with ambiguous labels, as those correspond to ROI-level labels (rather than whole image)
    counts = df.groupby(["patient_id"])[pathology_variable].nunique().reset_index(name='count') \
                                .sort_values(['count'], ascending=False)
    print(counts)
    #multi_label_list = list(counts[counts["count"] > 1]["image file path"])
    #df = df[~df["image file path"].isin(multi_label_list)]

In [83]:
ambigious_labels(calc_test, "pathology_fixed")
ambigious_labels(mass_test, "pathology_fixed")

KeyError: 'Column not found: pathology_fixed'

In [84]:
def clean_dupli(df):
    df = df.sort_values(by=['pathology'], ascending=False)
    df.drop_duplicates(subset=['image file path'], inplace=True)
    return df

In [85]:
calc_test2 = clean_dupli(calc_test)
mass_test2 = clean_dupli(mass_test)

In [93]:
def get_specs(df):
    print("Unique ROIs")
    print(df['ROI mask file path'].nunique())
    df["pathology_fixed"] = "BENIGN"
    df.loc[(df["pathology"] == "MALIGNANT"), "pathology_fixed"] = "MALIGNANT"
    pathology_counts = df.groupby("pathology_fixed").nunique()[["ROI mask file path"]]
    print(pathology_counts)

In [94]:
# Calc test set
get_specs(calc_test)

Unique ROIs
326
                 ROI mask file path
pathology_fixed                    
BENIGN                          197
MALIGNANT                       129


In [95]:
# Mass test set
get_specs(mass_test)

Unique ROIs
378
                 ROI mask file path
pathology_fixed                    
BENIGN                          231
MALIGNANT                       147


In [92]:
mass_test

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path,pathology_fixed
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,MALIGNANT
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,MALIGNANT
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,MALIGNANT
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,MALIGNANT
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,Mass-Test_P_00032_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,P_01825,2,RIGHT,MLO,1,mass,LOBULATED,MICROLOBULATED,3,BENIGN_WITHOUT_CALLBACK,3,Mass-Test_P_01825_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Mass-Test_P_01825_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Mass-Test_P_01825_RIGHT_MLO_1/1.3.6.1.4.1.9590...,BENIGN
374,P_01833,2,RIGHT,MLO,1,mass,IRREGULAR,ILL_DEFINED,5,MALIGNANT,5,Mass-Test_P_01833_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Mass-Test_P_01833_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Mass-Test_P_01833_RIGHT_MLO_1/1.3.6.1.4.1.9590...,MALIGNANT
375,P_01865,2,LEFT,MLO,1,mass,IRREGULAR,ILL_DEFINED,4,MALIGNANT,2,Mass-Test_P_01865_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_01865_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_01865_LEFT_MLO_1/1.3.6.1.4.1.9590....,MALIGNANT
376,P_01912,3,RIGHT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,4,Mass-Test_P_01912_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_01912_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_01912_RIGHT_CC_1/1.3.6.1.4.1.9590....,MALIGNANT


In [42]:
meta

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,Unnamed: 14,File Size,File Location,Download Timestamp
0,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.161465562211359959230...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,06 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC_1\08-29-...,2022-02-28T21:13:14.487
1,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.291121996131431385353...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,62 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO_1\08-29...,2022-02-28T21:13:28.105
2,1.3.6.1.4.1.9590.100.1.2.374115997511889073021...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.859354343102033567126...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,27,84 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC\08-29-20...,2022-02-28T21:13:35.311
3,1.3.6.1.4.1.9590.100.1.2.174390361112646747718...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.384159464510350889125...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,28,97 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO\08-29-2...,2022-02-28T21:13:36.483
4,1.3.6.1.4.1.9590.100.1.2.244876997513875090239...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.200764632211227648028...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,13,41 MB,.\CBIS-DDSM\Calc-Test_P_00038_RIGHT_CC_1\08-29...,2022-02-28T21:13:46.913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6770,1.3.6.1.4.1.9590.100.1.2.321062807811123845106...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.195655760513031195523...,,07-20-2016,cropped images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,379,24 KB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\0...,2022-03-01T21:24:32.04
6771,1.3.6.1.4.1.9590.100.1.2.203989029910964209440...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.222512969612930058132...,,07-21-2016,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,14,14 MB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\0...,2022-03-01T21:24:42.228
6772,1.3.6.1.4.1.9590.100.1.2.290251769212905477734...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.322851018411213611731...,,07-20-2016,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,37,35 MB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO\07-...,2022-03-01T21:25:04.66
6773,1.3.6.1.4.1.9590.100.1.2.412558050811722685411...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02079_RIGHT_MLO,1.3.6.1.4.1.9590.100.1.2.791687062125620465047...,,07-20-2016,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,36,04 MB,.\CBIS-DDSM\Mass-Training_P_02079_RIGHT_MLO\07...,2022-03-01T21:25:15.016


In [96]:
meta

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,Unnamed: 14,File Size,File Location,Download Timestamp
0,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.161465562211359959230...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,06 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC_1\08-29-...,2022-02-28T21:13:14.487
1,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.291121996131431385353...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,62 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO_1\08-29...,2022-02-28T21:13:28.105
2,1.3.6.1.4.1.9590.100.1.2.374115997511889073021...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.859354343102033567126...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,27,84 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC\08-29-20...,2022-02-28T21:13:35.311
3,1.3.6.1.4.1.9590.100.1.2.174390361112646747718...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.384159464510350889125...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,28,97 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO\08-29-2...,2022-02-28T21:13:36.483
4,1.3.6.1.4.1.9590.100.1.2.244876997513875090239...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.200764632211227648028...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,13,41 MB,.\CBIS-DDSM\Calc-Test_P_00038_RIGHT_CC_1\08-29...,2022-02-28T21:13:46.913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6770,1.3.6.1.4.1.9590.100.1.2.321062807811123845106...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.195655760513031195523...,,07-20-2016,cropped images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,379,24 KB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\0...,2022-03-01T21:24:32.04
6771,1.3.6.1.4.1.9590.100.1.2.203989029910964209440...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.222512969612930058132...,,07-21-2016,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,14,14 MB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO_1\0...,2022-03-01T21:24:42.228
6772,1.3.6.1.4.1.9590.100.1.2.290251769212905477734...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02092_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.322851018411213611731...,,07-20-2016,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,37,35 MB,.\CBIS-DDSM\Mass-Training_P_02092_LEFT_MLO\07-...,2022-03-01T21:25:04.66
6773,1.3.6.1.4.1.9590.100.1.2.412558050811722685411...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_02079_RIGHT_MLO,1.3.6.1.4.1.9590.100.1.2.791687062125620465047...,,07-20-2016,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,36,04 MB,.\CBIS-DDSM\Mass-Training_P_02079_RIGHT_MLO\07...,2022-03-01T21:25:15.016
