In [1]:
import json
import numpy as np
import pandas as pd
from itertools import chain
from collections import defaultdict, Counter
from IPython.display import display
import json

In [7]:
GLOBAL_PATH = "..\\metadata\\breast-level_annotations.csv"
LOCAL_PATH = "..\\metadata\\finding_annotations.csv"
birads_LESIONS = {
    "Mass",
    "Suspicious Calcification",
    "Architectural Distortion",
    "Focal Asymmetry",
    "Global Asymmetry",
    "Asymmetry",
}
NO_BIRADS = {
    "Suspicious Lymph Node",
    "Skin Thickening",
    "Skin Retraction",
    "Nipple Retraction",
    "No Finding",
}
BIRADS345 = ["BI-RADS 3", "BI-RADS 4", "BI-RADS 5"]
ALL_LESIONS = [
    "Suspicious Lymph Node",
    "Mass",
    "Suspicious Calcification",
    "Asymmetry",
    "Focal Asymmetry",
    "Global Asymmetry",
    "Architectural Distortion",
    "Skin Thickening",
    "Skin Retraction",
    "Nipple Retraction",
    "No Finding",
]


def show_df(df):
    with pd.option_context(
        "display.max_rows",
        None,
        "display.max_columns",
        None,
        "display.max_colwidth",
        None,
    ):  # more options can be specified also
        display(df)


def count_birads_densities(df):
    """
    count birads density at breast level
    """
    counter = defaultdict(lambda: 0)
    den_counter = defaultdict(lambda: 0)
    for (study_id, side), rows in df.groupby(["study_id", "laterality"]):
        birads = rows.breast_birads.values[0]
        counter[birads] += 1
        density = rows.breast_density.values[0]
        den_counter[density] += 1

    total = sum(counter.values())
    total2 = sum(den_counter.values())
    assert total == total2
    percent = {k: f"{100.*v/total:.2f}" for k, v in counter.items()}
    counter["Total"] = total
    stats = pd.DataFrame.from_records({"No. breast": counter, "percent": percent})
    stats.index.name = "BI-RADS"
    stats = stats.sort_index()

    den_percent = {k: f"{100.*v/total:.2f}" for k, v in den_counter.items()}
    den_counter["Total"] = total
    den_stats = pd.DataFrame({"No. breast": den_counter, "percent": den_percent})
    den_stats.index.name = "DENSITY"
    den_stats = den_stats.sort_index()
    return stats, den_stats


def count_box_birads(df):
    """ """
    counter = defaultdict(lambda: defaultdict(lambda: 0))
    df.finding_birads = df.finding_birads.fillna("")
    all_birads = sorted(df.finding_birads.unique().tolist())
    for _, row in df.iterrows():
        for clas in row.finding_categories:
            counter[clas]["Total"] += 1
            counter[clas][row.finding_birads] += 1
    for k, v in counter.items():
        v["Lesion"] = k
    df = pd.DataFrame.from_records(
        list(counter.values()), columns=["Lesion", "Total"] + all_birads
    )
    lesion = df["Lesion"].values
    df = df.set_index("Lesion")
    df = df.reindex(ALL_LESIONS)

    df = df.fillna(0)
    df.loc["All lesions"] = df.sum()
    df = df.astype("int32")
    return df


def count_box_label(df):
    box_label = list(chain(*df.box_label.tolist()))
    return Counter(box_label)


def df_counts(df):
    print("no. studies", len(df.study_id.unique()))
    print("no. images", len(df.image_id.unique()))

In [8]:
local_df = pd.read_csv(LOCAL_PATH)
local_df["finding_categories"] = local_df["finding_categories"].apply(
    lambda x: json.loads(x.replace("'", '"'))
)
local_df.head()

Unnamed: 0,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,finding_categories,finding_birads,xmin,ymin,xmax,ymax,split
0,48575a27b7c992427041a82fa750d3fa,26de4993fa6b8ae50a91c8baf49b92b0,4e3a578fe535ea4f5258d3f7f4419db8,R,CC,3518,2800,BI-RADS 4,DENSITY C,[Mass],BI-RADS 4,2355.139893,1731.640015,2482.97998,1852.75,training
1,48575a27b7c992427041a82fa750d3fa,26de4993fa6b8ae50a91c8baf49b92b0,dac39351b0f3a8c670b7f8dc88029364,R,MLO,3518,2800,BI-RADS 4,DENSITY C,[Mass],BI-RADS 4,2386.679932,1240.609985,2501.800049,1354.040039,training
2,75e8e48933289d70b407379a564f8594,853b70e7e6f39133497909d9ca4c756d,c83f780904f25eacb44e9030f32c66e1,R,CC,3518,2800,BI-RADS 3,DENSITY C,[Global Asymmetry],BI-RADS 3,2279.179932,1166.51001,2704.439941,2184.26001,training
3,75e8e48933289d70b407379a564f8594,853b70e7e6f39133497909d9ca4c756d,893528bc38a0362928a89364f1b692fd,R,MLO,3518,2800,BI-RADS 3,DENSITY C,[Global Asymmetry],BI-RADS 3,1954.27002,1443.640015,2589.76001,2193.810059,training
4,c3487424fee1bdd4515b72dc3fd69813,77619c914263eae44e9099f1ce07192c,318264c881bf12f2c1efe5f93920cc37,R,CC,3518,2800,BI-RADS 4,DENSITY C,[Architectural Distortion],BI-RADS 4,2172.300049,1967.410034,2388.699951,2147.159912,training


In [9]:
global_df = pd.read_csv(GLOBAL_PATH)
global_df.head()

Unnamed: 0,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,split
0,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,d8125545210c08e1b1793a5af6458ee2,L,CC,3518,2800,BI-RADS 2,DENSITY C,training
1,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,290c658f4e75a3f83ec78a847414297c,L,MLO,3518,2800,BI-RADS 2,DENSITY C,training
2,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,cd0fc7bc53ac632a11643ac4cc91002a,R,CC,3518,2800,BI-RADS 2,DENSITY C,training
3,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,71638b1e853799f227492bfb08a01491,R,MLO,3518,2800,BI-RADS 2,DENSITY C,training
4,8269f5971eaca3e5d3772d1796e6bd7a,d931832a0815df082c085b6e09d20aac,dd9ce3288c0773e006a294188aadba8e,L,CC,3518,2800,BI-RADS 1,DENSITY C,training


In [8]:
# create attributes list for each study to stratify
split_col = [f"BI-RADS {i}" for i in range(1,6)]
split_col = split_col + [f"DENSITY {x}" for x in "ABCD"]
split_col.extend(list(NO_BIRADS))
split_col = split_col + [f"{box_name}_{box_birads}" for box_name in birads_LESIONS for box_birads in BIRADS345]
split_col

['BI-RADS 1',
 'BI-RADS 2',
 'BI-RADS 3',
 'BI-RADS 4',
 'BI-RADS 5',
 'DENSITY A',
 'DENSITY B',
 'DENSITY C',
 'DENSITY D',
 'Suspicious Lymph Node',
 'Skin Retraction',
 'No Finding',
 'Nipple Retraction',
 'Skin Thickening',
 'Asymmetry_BI-RADS 3',
 'Asymmetry_BI-RADS 4',
 'Asymmetry_BI-RADS 5',
 'Focal Asymmetry_BI-RADS 3',
 'Focal Asymmetry_BI-RADS 4',
 'Focal Asymmetry_BI-RADS 5',
 'Suspicious Calcification_BI-RADS 3',
 'Suspicious Calcification_BI-RADS 4',
 'Suspicious Calcification_BI-RADS 5',
 'Mass_BI-RADS 3',
 'Mass_BI-RADS 4',
 'Mass_BI-RADS 5',
 'Global Asymmetry_BI-RADS 3',
 'Global Asymmetry_BI-RADS 4',
 'Global Asymmetry_BI-RADS 5',
 'Architectural Distortion_BI-RADS 3',
 'Architectural Distortion_BI-RADS 4',
 'Architectural Distortion_BI-RADS 5']

In [10]:
# count number of instances for each attribute of the study
# e.g for breast-level annotations it is number of images in the study
# for finding annotations it is number of bounding box in the study
study_ids = sorted(global_df.study_id.unique().tolist())
labels_ar = np.zeros((len(study_ids), len(split_col)), dtype=np.int32)
for (study_id, lat), rows in global_df.groupby(["study_id", "laterality"]):
    birads = rows.breast_birads.values[0]
    density = rows.breast_density.values[0]
    labels_ar[study_ids.index(study_id),split_col.index(birads)] += 1
    labels_ar[study_ids.index(study_id),split_col.index(density)] += 1
for _, x in local_df.iterrows():
    birads = x["finding_birads"]
    for label in x["finding_categories"]:
        if label in birads_LESIONS:
            labels_ar[
                study_ids.index(x["study_id"]),
                split_col.index(f"{label}_{birads}"),
            ] += 1
        else:
            labels_ar[
                study_ids.index(x["study_id"]),
                split_col.index(label),
            ] += 1
total = labels_ar.sum(axis=0)
for name,v in zip(split_col, total):
    print(name,v)

BI-RADS 1 6703
BI-RADS 2 2338
BI-RADS 3 465
BI-RADS 4 381
BI-RADS 5 113
DENSITY A 50
DENSITY B 954
DENSITY C 7646
DENSITY D 1350
Suspicious Lymph Node 57
Skin Retraction 18
No Finding 18232
Nipple Retraction 37
Skin Thickening 57
Asymmetry_BI-RADS 3 83
Asymmetry_BI-RADS 4 13
Asymmetry_BI-RADS 5 1
Focal Asymmetry_BI-RADS 3 138
Focal Asymmetry_BI-RADS 4 120
Focal Asymmetry_BI-RADS 5 11
Suspicious Calcification_BI-RADS 3 72
Suspicious Calcification_BI-RADS 4 347
Suspicious Calcification_BI-RADS 5 124
Mass_BI-RADS 3 568
Mass_BI-RADS 4 496
Mass_BI-RADS 5 162
Global Asymmetry_BI-RADS 3 20
Global Asymmetry_BI-RADS 4 6
Global Asymmetry_BI-RADS 5 0
Architectural Distortion_BI-RADS 3 20
Architectural Distortion_BI-RADS 4 80
Architectural Distortion_BI-RADS 5 19


In [11]:
from utils.stratification import IterativeStratification
SEED = 1999
SPLITS = np.array([0.8, 0.2])
stratifier = IterativeStratification(SEED)
fold_ids = stratifier.stratify(labels_ar, SPLITS)

In [12]:
global_df['fold'] = ""
local_df['fold'] = ""
fold_name = ["training", "test"]
for k in range(2):
    fold_idx = np.where(fold_ids==k)[0]
#     print(fold_idx)
    study_uids = [study_ids[i] for i in fold_idx]
#     print(study_uids[:5])
    global_df.loc[global_df.study_id.isin(study_uids), 'fold'] = fold_name[k]
    local_df.loc[local_df.study_id.isin(study_uids), 'fold'] = fold_name[k]
    print(global_df[global_df.study_id.isin(study_uids)].shape)
    print(local_df[local_df.study_id.isin(study_uids)].shape)

(16000, 11)
(16404, 17)
(4000, 11)
(4082, 17)


In [13]:
show_df(count_box_birads(local_df[local_df.fold == "training"]))
show_df(count_box_birads(local_df[local_df.fold == "test"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.finding_birads = df.finding_birads.fillna("")


Unnamed: 0_level_0,Total,Unnamed: 2_level_0,BI-RADS 3,BI-RADS 4,BI-RADS 5
Lesion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Suspicious Lymph Node,46,46,0,0,0
Mass,1001,0,459,412,130
Suspicious Calcification,435,0,58,277,100
Asymmetry,77,0,66,10,1
Focal Asymmetry,215,0,110,96,9
Global Asymmetry,20,0,16,4,0
Architectural Distortion,97,0,17,65,15
Skin Thickening,46,39,3,4,0
Skin Retraction,14,10,0,0,4
Nipple Retraction,29,18,0,5,6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.finding_birads = df.finding_birads.fillna("")


Unnamed: 0_level_0,Total,Unnamed: 2_level_0,BI-RADS 3,BI-RADS 4,BI-RADS 5
Lesion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Suspicious Lymph Node,11,11,0,0,0
Mass,225,0,109,84,32
Suspicious Calcification,108,0,14,70,24
Asymmetry,20,0,17,3,0
Focal Asymmetry,54,0,28,24,2
Global Asymmetry,6,0,4,2,0
Architectural Distortion,22,0,3,15,4
Skin Thickening,11,8,1,2,0
Skin Retraction,4,2,0,0,2
Nipple Retraction,8,2,0,2,4


In [14]:
print("Whole dataset:")
bi, den = count_birads_densities(global_df)
show_df(bi)
show_df(den)

print("Training split:")
bi, den = count_birads_densities(global_df[global_df.fold == "training"])
show_df(bi)
show_df(den)

print("Test split:")
bi, den = count_birads_densities(global_df[global_df.fold == "test"])
show_df(bi)
show_df(den)

Whole dataset:


Unnamed: 0_level_0,No. breast,percent
BI-RADS,Unnamed: 1_level_1,Unnamed: 2_level_1
BI-RADS 1,6703,67.03
BI-RADS 2,2338,23.38
BI-RADS 3,465,4.65
BI-RADS 4,381,3.81
BI-RADS 5,113,1.13
Total,10000,


Unnamed: 0_level_0,No. breast,percent
DENSITY,Unnamed: 1_level_1,Unnamed: 2_level_1
DENSITY A,50,0.5
DENSITY B,954,9.54
DENSITY C,7646,76.46
DENSITY D,1350,13.5
Total,10000,


Training split:


Unnamed: 0_level_0,No. breast,percent
BI-RADS,Unnamed: 1_level_1,Unnamed: 2_level_1
BI-RADS 1,5363,67.04
BI-RADS 2,1870,23.38
BI-RADS 3,372,4.65
BI-RADS 4,305,3.81
BI-RADS 5,90,1.12
Total,8000,


Unnamed: 0_level_0,No. breast,percent
DENSITY,Unnamed: 1_level_1,Unnamed: 2_level_1
DENSITY A,40,0.5
DENSITY B,764,9.55
DENSITY C,6116,76.45
DENSITY D,1080,13.5
Total,8000,


Test split:


Unnamed: 0_level_0,No. breast,percent
BI-RADS,Unnamed: 1_level_1,Unnamed: 2_level_1
BI-RADS 1,1340,67.0
BI-RADS 2,468,23.4
BI-RADS 3,93,4.65
BI-RADS 4,76,3.8
BI-RADS 5,23,1.15
Total,2000,


Unnamed: 0_level_0,No. breast,percent
DENSITY,Unnamed: 1_level_1,Unnamed: 2_level_1
DENSITY A,10,0.5
DENSITY B,190,9.5
DENSITY C,1530,76.5
DENSITY D,270,13.5
Total,2000,
