# DATASET CURATION - MASKED ROI PROJECT


**Objectives**: 

To create the following groups:
1. **Positive group**: BIRADS 0 that became BIRADS 3, 4, 5, 6 in the subsequent diagnostic study
2. **Negative group**: BIRADS 1, 2 and BIRADS 0 that became BIRADS 1, 2 in the subsequent diagnostic study


## 1. Prep

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from IPython.display import display

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)

In [2]:
def get_stats(df, suffix=None):
    """Provides a quick summary of a dataframe."""
    try:
        print(f"DF shape: {df.shape}")
        print(f"# Patients: {df.empi_anon.nunique()}")
        print(f"# Cases: {df.acc_anon.nunique()}\n")
        print(f"# Images: {df.png_path.nunique()}\n")
    except Exception as e:
        print(e)

In [3]:
# EMBED
metadata_full = pd.read_csv("/home/szelesteya/projects/EMBED_Open_Data/tables/EMBED_OpenData_metadata.csv", dtype=str)
magview_full = pd.read_csv("/home/szelesteya/projects/EMBED_Open_Data/tables/EMBED_OpenData_clinical.csv", dtype=str)

In [4]:
# Selecting the following columns

meta_cols = [
    "empi_anon",
    "acc_anon",
    "ImageLateralityFinal",
    "ViewPosition",
    "study_date_anon",
    "FinalImageType",
    "png_path",
    "StudyDescription",
    "match_level",
    "num_roi",
    "ROI_coords"
]

mag_cols = [
    "empi_anon",
    "acc_anon",
    "study_date_anon",
    "desc",
    "side",
    "asses",
    "path_severity",
    "bside",
    'procdate_anon',
    'pdate_anon',
]

In [5]:
metadata = metadata_full[meta_cols].copy()
magview = magview_full[mag_cols].copy()

In [6]:
metadata.study_date_anon = pd.to_datetime(metadata.study_date_anon)
magview.study_date_anon = pd.to_datetime(magview.study_date_anon)

In [7]:
metadata.num_roi = metadata.num_roi.astype(int)

## 2. METADATA: 2D MLO & CC

In [12]:
# EMBED 2D (MLO and CC)
meta_2d = metadata.loc[(metadata.FinalImageType=="2D") & (metadata.ViewPosition.isin(["MLO", "CC"]))]
get_stats(meta_2d)

DF shape: (328961, 11)
# Patients: 22455
# Cases: 70861

# Images: 328961



In [13]:
def get_image_stats(df):
    """Provides a quick summary of the number of unique images and the ROIs."""
    temp_df = pd.merge(df, meta_2d, on=["empi_anon", "acc_anon"], how="left")
    temp_df = temp_df.loc[
        (temp_df.side==temp_df.ImageLateralityFinal)
    ]
    temp_df.drop_duplicates(subset="png_path", inplace=True)
    print(f"# PNG PATH: {int(temp_df.png_path.nunique())}")
    print(f"# ROI: {int(temp_df.num_roi.sum())}")
    print(f"{temp_df.num_roi.value_counts()}")
    del temp_df

## 3. Screening

In [14]:
# SCREENING
screening_magview = magview.loc[magview.desc.str.contains("screen", case=False)].copy()
get_stats(screening_magview)

DF shape: (58888, 10)
# Patients: 20460
# Cases: 55956

'DataFrame' object has no attribute 'png_path'


### 3.1. Creating entries for the negative contralateral breast in bilateral examinations

```
MAGVIEW only has entries if a finding exists.

This means that if an exam is a bilateral exam and only one of the breast has a finding, the contralateral breast (negative) won't have an entry.

This would be problematic at the time when we need to merge with METADATA, because the contralateral breast would be excluded.

Therefore, we would need to create rows for the negative contralateral breast.
```

In [15]:
def get_exam_laterality(row):
    """A convenient function to get the exam laterality to be used with DF.apply() instead of iterating over each row."""
    if ("bilat" in row.desc.lower()):
        return "B"
    elif ("left" in row.desc.lower()):
        return "L"
    elif ("right" in row.desc.lower()):
        return "R"
    else:
        return None

In [16]:
# Applying the get_exam_laterality function
screening_magview["exam_laterality"] = screening_magview.apply(get_exam_laterality, axis=1)

In [17]:
screening_magview.exam_laterality.value_counts(dropna=False)

exam_laterality
B    56558
L     1180
R     1150
Name: count, dtype: int64

In [18]:
screening_magview.side.value_counts(dropna=False)

side
NaN    39661
L       8264
R       8082
B       2881
Name: count, dtype: int64

In [19]:
# side == nan --> B
screening_magview.side = screening_magview.side.fillna("B")

In [20]:
# create copy for assigning B to R
screening_magview_r = screening_magview.loc[screening_magview.side=="B"].copy()
screening_magview_r.side = screening_magview.side.str.replace("B", "R")

# assigning B to L
screening_magview.side = screening_magview.side.str.replace("B", "L")

# appending R and L
screening_magview = pd.concat([screening_magview, screening_magview_r])

In [21]:
print(screening_magview.side.value_counts(dropna=False))
print(screening_magview.shape)

side
L    50806
R    50624
Name: count, dtype: int64
(101430, 11)


In [22]:
screening_magview = screening_magview.sort_values(["empi_anon", "acc_anon", "study_date_anon"]).drop_duplicates()
screening_magview

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
20474,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,A,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
21119,99996622,9655172659462321,2016-06-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
31493,99999564,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B


In [23]:
exam_lat_b = screening_magview.loc[screening_magview.exam_laterality=="B"]
exam_lat_b.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
63065,92551711,5144930862304707,2017-10-08,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
77011,88606862,7861744225198264,2019-09-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B


In [24]:
# We want to aggregate all the sides for each bilateral exam so that we can filter those having only a single side.
exam_lat_b_agg = exam_lat_b.groupby('acc_anon')['side'].apply(''.join).reset_index()
exam_lat_b_agg.sample(2)

Unnamed: 0,acc_anon,side
37217,7219285840077245,R
16661,3757998867335947,LR


In [25]:
exam_lat_b_agg.side.value_counts()

side
LR        42769
L          4938
R          4882
RL          615
LL          123
RR          114
LLR          75
LRR          43
RLR          25
LLRR         20
RRL          16
RLL          12
LLL          11
RLLR          5
LRL           5
RRR           4
LRLR          4
RRLL          3
RRLR          2
LLLRRR        2
LRRR          1
RRRR          1
LLLLLR        1
LLRRR         1
LLLRR         1
LLLR          1
RRRLLL        1
LRRL          1
Name: count, dtype: int64

In [26]:
exam_lat_b_side_r = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("L"))].copy()
exam_lat_b_side_l = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("R"))].copy()

In [27]:
screening_magview_right_to_left = screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_r.acc_anon)].copy().drop_duplicates()
screening_magview_left_to_right = screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_l.acc_anon)].copy().drop_duplicates()

In [28]:
# Creating the negative Left side
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="R", "side"] = "L"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="L", "asses"] = "N"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="L", "path_severity"] = np.nan

screening_magview_right_to_left

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
37325,10033806,1069386741434572,2019-10-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
64534,10043985,1960584382049532,2018-04-18,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
59088,10043985,3613575521057039,2017-03-01,MG Screening Bilateral,L,N,,,,,B
42847,10043985,9492972692582499,2014-05-14,MG Screening Bilateral,L,N,,,,,B
8607,10065082,6346759651734606,2015-03-03,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
11436,99853035,2905584160156737,2015-02-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
33776,99853035,6677454260490853,2019-02-07,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
67642,99860105,6470240272862407,2018-03-19,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
12820,99871644,5176060292067455,2015-06-26,MG Screening Bilateral,L,N,,,,,B


In [29]:
# Creating the negative Right side
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="L", "side"] = "R"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="R", "asses"] = "N"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="R", "path_severity"] = np.nan

screening_magview_left_to_right

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
20474,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
25070,10023113,5135241747022662,2016-10-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
63407,10029585,3189592535497441,2017-06-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
62320,10042753,1955284757719450,2017-06-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
37004,10044241,3993319361430024,2019-07-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
80934,99881569,1140879824262422,2021-01-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
43901,99881569,3921887412575009,2013-11-02,MG Screening Bilateral w/CAD,R,N,,,,,B
9588,99908618,9288525074493489,2014-08-14,MG Screening Bilateral w/CAD,R,N,,L,2014-08-26,2014-08-27 00:00:00,B
6519,99957941,2224428804635608,2014-06-03,MG Screening Bilateral,R,N,,,,,B


In [30]:
# Merging the original and the two negative contralaterals
screening_magview_with_contralat = pd.concat([screening_magview, screening_magview_left_to_right, screening_magview_right_to_left]).sort_values(["empi_anon", "acc_anon", "study_date_anon"]).drop_duplicates()
screening_magview_with_contralat.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
64187,95391222,2645893262086235,2017-04-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
18388,68630789,3761982580693787,2016-03-10,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B


In [31]:
get_image_stats(screening_magview_with_contralat)

# PNG PATH: 258269
# ROI: 5255
num_roi
0.0    253498
1.0      4330
2.0       399
3.0        41
4.0         1
Name: count, dtype: int64


### 3.2. BIRADS 0

In [32]:
b0 = screening_magview_with_contralat.loc[screening_magview_with_contralat.asses.isin(["A"])]

get_stats(b0)
get_image_stats(b0)

DF shape: (10876, 11)
# Patients: 7747
# Cases: 8829

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 24387
# ROI: 5131
num_roi
0.0    19725
1.0     4232
2.0      392
3.0       37
4.0        1
Name: count, dtype: int64


### 3.3. BIRADS 1, 2

In [33]:
b12 = screening_magview_with_contralat.loc[screening_magview_with_contralat.asses.isin(["B", "N"])]

get_stats(b12)
get_image_stats(b12)

DF shape: (99482, 11)
# Patients: 19665
# Cases: 54081

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 234362
# ROI: 192
num_roi
0.0    234192
1.0       153
2.0        12
3.0         5
Name: count, dtype: int64


## 4. Diagnostic

In [35]:
diag_magview = magview.loc[magview.desc.str.contains('diag', case=False)]

get_stats(diag_magview)
print()
print(f"Asses Counts:\n{diag_magview.asses.value_counts()}")

DF shape: (22888, 10)
# Patients: 9656
# Cases: 16814

'DataFrame' object has no attribute 'png_path'

Asses Counts:
asses
B    8794
P    5563
N    4193
S    3063
A     580
K     386
M     284
X      25
Name: count, dtype: int64


## 5. Screening BIRADS 0 and Diagnostic

In [36]:
b0_dx = pd.merge(b0, diag_magview, on='empi_anon', suffixes=[None, "_dx"])
b0_dx = b0_dx.loc[
    (b0_dx.side==b0_dx.side_dx)
    | (b0_dx.side_dx=="B")
    | (b0_dx.side_dx.isna())
]

In [37]:
# Getting only subsequent diagnostic studies within 3 months
b0_dx["delta_date_dx"] = (b0_dx.study_date_anon_dx - b0_dx.study_date_anon).dt.days
b0_dx_3mo = b0_dx.loc[b0_dx.delta_date_dx.isin(range(0, 91))]
b0_dx_3mo.sample(1)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx
13888,70300497,4868622619390927,2015-12-07,MG Screening Bilateral,R,A,,,,,B,5362144262639097,2016-02-11,MG Diagnostic Mammo Bilateral,R,B,,,,,66


### 5.1. BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)

In [38]:
b0_12dx = b0_dx_3mo.loc[b0_dx_3mo.asses_dx.isin(["N", "B"])].copy()
get_stats(b0_12dx)
get_image_stats(b0_12dx)

DF shape: (3755, 21)
# Patients: 2924
# Cases: 3169

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 7695
# ROI: 1676
num_roi
0.0    6212
1.0    1306
2.0     162
3.0      14
4.0       1
Name: count, dtype: int64


### 5.2. BIRADS 0 (Screening) --> BIRADS 3, 4, 5, 6 (Diagnostic)

In [39]:
b0_3456dx = b0_dx_3mo.loc[b0_dx_3mo.asses_dx.isin(["P", "S", "M", "K"])].copy()
get_stats(b0_3456dx)
get_image_stats(b0_3456dx)

DF shape: (3491, 21)
# Patients: 2100
# Cases: 2187

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 5543
# ROI: 1416
num_roi
0.0    4268
1.0    1150
2.0     109
3.0      16
Name: count, dtype: int64


## 6. Negative group

In [40]:
# Negative group = BIRADS_12 + BIRADS_0_12dx
neg_group = pd.concat([b12, b0_12dx])
neg_group.drop_duplicates(inplace=True)

get_stats(neg_group)
get_image_stats(neg_group)

DF shape: (103023, 21)
# Patients: 19877
# Cases: 54639

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 241885
# ROI: 1860
num_roi
0.0    240240
1.0      1451
2.0       174
3.0        19
4.0         1
Name: count, dtype: int64


In [41]:
# Include only ones with negative follow-up after 1 year
neg_group_b12 = pd.merge(neg_group, b12, on=["empi_anon"], suffixes=(None, "_1yrfu"))

neg_group_b12 = neg_group_b12.loc[
    (neg_group_b12.side==neg_group_b12.side_1yrfu)
]

neg_group_b12["delta_date_1yrfu"] = (neg_group_b12.study_date_anon_1yrfu - neg_group_b12.study_date_anon).dt.days

get_stats(neg_group_b12)
get_image_stats(neg_group_b12)

neg_group_b12.sample(2)

DF shape: (419008, 32)
# Patients: 19665
# Cases: 54427

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 239895
# ROI: 1287
num_roi
0.0    238741
1.0      1034
2.0       108
3.0        11
4.0         1
Name: count, dtype: int64


Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu
668586,82719304,9575884414073277,2016-01-19,MG Screening Bilateral,R,N,,,,,B,,NaT,,,,,,,,,3954966127387819,2017-01-21,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,368
775008,94630072,4186561795774382,2016-02-14,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,7378930085211253,2017-02-19,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,371


In [42]:
neg_group_1yrfu = neg_group_b12.loc[
    (neg_group_b12.delta_date_1yrfu > 360)
]
get_stats(neg_group_1yrfu)
get_image_stats(neg_group_1yrfu)

DF shape: (158091, 32)
# Patients: 11590
# Cases: 34180

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 150887
# ROI: 690
num_roi
0.0    150266
1.0       563
2.0        48
3.0         9
4.0         1
Name: count, dtype: int64


In [43]:
neg_group_1yrfu_first_study = neg_group_1yrfu.sort_values(["empi_anon", "acc_anon", "study_date_anon_1yrfu"]).drop_duplicates(subset=["acc_anon", "side"]) # to only get the first followup study
get_stats(neg_group_1yrfu_first_study)
get_image_stats(neg_group_1yrfu_first_study)

DF shape: (63835, 32)
# Patients: 11590
# Cases: 34180

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 150887
# ROI: 690
num_roi
0.0    150266
1.0       563
2.0        48
3.0         9
4.0         1
Name: count, dtype: int64


In [44]:
neg_group_1yrfu_first_study.path_severity.value_counts()

path_severity
4.0    25
2.0     7
0.0     4
3.0     1
Name: count, dtype: int64

In [45]:
# Exclude any patient with any biopsy result
neg_group_1yrfu_first_study_no_biopsy = neg_group_1yrfu_first_study.loc[neg_group_1yrfu_first_study.path_severity.isna()].copy()

In [46]:
# Merging with METADATA to get the images
neg_group_1yrfu_first_study_no_biopsy_images = pd.merge(neg_group_1yrfu_first_study_no_biopsy, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"])
neg_group_1yrfu_first_study_no_biopsy_images = neg_group_1yrfu_first_study_no_biopsy_images.loc[
    (neg_group_1yrfu_first_study_no_biopsy_images.side == neg_group_1yrfu_first_study_no_biopsy_images.ImageLateralityFinal)
]
neg_group_1yrfu_first_study_no_biopsy_images.drop_duplicates(subset="png_path", inplace=True)
get_stats(neg_group_1yrfu_first_study_no_biopsy_images)

DF shape: (149727, 40)
# Patients: 11243
# Cases: 33180

# Images: 149727



In [47]:
print(f"ROIs = {neg_group_1yrfu_first_study_no_biopsy_images.num_roi.sum()}")
print(neg_group_1yrfu_first_study_no_biopsy_images.num_roi.value_counts())

ROIs = 678
num_roi
0    149116
1       555
2        46
3         9
4         1
Name: count, dtype: int64


## 7. Positive Group

In [48]:
pos_group_images = pd.merge(b0_3456dx, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"])
pos_group_images = pos_group_images.loc[
    (pos_group_images.side == pos_group_images.ImageLateralityFinal)
]
pos_group_images.drop_duplicates(subset="png_path", inplace=True)
get_stats(pos_group_images)

DF shape: (5521, 29)
# Patients: 2050
# Cases: 2135

# Images: 5521



In [49]:
print(f"ROIs  = {pos_group_images.num_roi.sum()}")
print(pos_group_images.num_roi.value_counts())

ROIs  = 1414
num_roi
0    4248
1    1148
2     109
3      16
Name: count, dtype: int64


## 8. Excluding Images from the Negative Group that are found in the Positive Group using acc_anon and side

In [50]:
# Merge negatives and positive groups
neg_pos = pd.merge(neg_group_1yrfu_first_study_no_biopsy_images, pos_group_images, on=["empi_anon", "acc_anon", "side"], suffixes=["_neg", "_pos"])
neg_pos.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon_neg,desc_neg,side,asses_neg,path_severity_neg,bside_neg,procdate_anon_neg,pdate_anon_neg,exam_laterality_neg,acc_anon_dx_neg,study_date_anon_dx_neg,desc_dx_neg,side_dx_neg,asses_dx_neg,path_severity_dx_neg,bside_dx_neg,procdate_anon_dx_neg,pdate_anon_dx_neg,delta_date_dx_neg,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal_neg,ViewPosition_neg,FinalImageType_neg,png_path_neg,StudyDescription_neg,match_level_neg,num_roi_neg,ROI_coords_neg,study_date_anon_pos,desc_pos,asses_pos,path_severity_pos,bside_pos,procdate_anon_pos,pdate_anon_pos,exam_laterality_pos,acc_anon_dx_pos,study_date_anon_dx_pos,desc_dx_pos,side_dx_pos,asses_dx_pos,path_severity_dx_pos,bside_dx_pos,procdate_anon_dx_pos,pdate_anon_dx_pos,delta_date_dx_pos,ImageLateralityFinal_pos,ViewPosition_pos,FinalImageType_pos,png_path_pos,StudyDescription_pos,match_level_pos,num_roi_pos,ROI_coords_pos
484,90739748,7155394177428117,2015-11-23,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,A,,,,,B,8225301671906659,2015-12-09,MG Diagnostic Left w/CAD,L,N,,,,,16.0,6279142675661006,2019-10-21,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,1428,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[1],1,"((1752, 623, 1983, 985),)",2015-11-23,MG Screen Bilat w/Tomo/CAD Stnd Protocol,A,,,,,B,8225301671906659,2015-12-09,MG Diagnostic Left w/CAD,L,P,,,,,16,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[1],1,"((1752, 623, 1983, 985),)"
410,86343112,2369418586248410,2014-06-29,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,A,,,,,B,7394215681214742,2014-07-03,MG Diagnostic Right w/Tomosynthesis,R,N,,,,,4.0,8490502153918576,2016-10-08,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,832,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),2014-06-29,MG Screen Bilat w/Tomo/CAD Stnd Protocol,A,,,,,B,7394215681214742,2014-07-03,MG Diagnostic Right w/Tomosynthesis,R,P,,,,,4,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,()


In [51]:
# Create new KeyID of acc_anon + side on negative group and negative+positive group
neg_pos["acc_anon_side"] = neg_pos.acc_anon + neg_pos.side
neg_pos.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon_neg,desc_neg,side,asses_neg,path_severity_neg,bside_neg,procdate_anon_neg,pdate_anon_neg,exam_laterality_neg,acc_anon_dx_neg,study_date_anon_dx_neg,desc_dx_neg,side_dx_neg,asses_dx_neg,path_severity_dx_neg,bside_dx_neg,procdate_anon_dx_neg,pdate_anon_dx_neg,delta_date_dx_neg,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal_neg,ViewPosition_neg,FinalImageType_neg,png_path_neg,StudyDescription_neg,match_level_neg,num_roi_neg,ROI_coords_neg,study_date_anon_pos,desc_pos,asses_pos,path_severity_pos,bside_pos,procdate_anon_pos,pdate_anon_pos,exam_laterality_pos,acc_anon_dx_pos,study_date_anon_dx_pos,desc_dx_pos,side_dx_pos,asses_dx_pos,path_severity_dx_pos,bside_dx_pos,procdate_anon_dx_pos,pdate_anon_dx_pos,delta_date_dx_pos,ImageLateralityFinal_pos,ViewPosition_pos,FinalImageType_pos,png_path_pos,StudyDescription_pos,match_level_pos,num_roi_pos,ROI_coords_pos,acc_anon_side
75,23403355,2178194762485773,2013-11-15,MG Screening Bilateral w/CAD,L,B,,,,,B,,NaT,,,,,,,,,5145328297721299,2015-01-02,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,413,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,[],0,(),2013-11-15,MG Screening Bilateral w/CAD,A,,,,,B,3539141438628064,2013-11-29,MG Diagnostic Left,L,S,4.0,L,2013-12-14,2013-12-18 00:00:00,14,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,[],0,(),2178194762485773L
284,69943636,9006977453526071,2013-11-09,MG Screening Bilateral w/CAD,L,A,,,,,B,1753412361881724.0,2013-11-23,MG Diagnostic Bilateral w/Tomosynthesis,L,B,,,,,14.0,9653903342984476,2019-10-10,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,2161,L,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,[],0,(),2013-11-09,MG Screening Bilateral w/CAD,A,,,,,B,1753412361881724,2013-11-23,MG Diagnostic Bilateral w/Tomosynthesis,L,P,,,,,14,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,[],0,(),9006977453526071L


In [52]:
neg_group_1yrfu_first_study_no_biopsy_images["acc_anon_side"] = neg_group_1yrfu_first_study_no_biopsy_images.acc_anon + neg_group_1yrfu_first_study_no_biopsy_images.side
neg_group_1yrfu_first_study_no_biopsy_images.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,png_path,StudyDescription,match_level,num_roi,ROI_coords,acc_anon_side
2849,10908410,8705088827027165,2017-05-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,4104556289608579,2018-06-06,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,378,R,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),8705088827027165R
268612,91746780,5521629479489499,2016-09-08,MG Screening Bilateral,L,N,,,,,B,,NaT,,,,,,,,,7461032943488451,2017-09-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,372,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral,[],0,(),5521629479489499L


In [53]:
# Removing any images that are found in the positive group from the negative group using the created KeyID (acc_anon+side)
neg_group_final = neg_group_1yrfu_first_study_no_biopsy_images.loc[~neg_group_1yrfu_first_study_no_biopsy_images.acc_anon_side.isin(neg_pos.acc_anon_side)]
neg_group_final.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,png_path,StudyDescription,match_level,num_roi,ROI_coords,acc_anon_side
144673,54158646,4724842923697642,2018-07-18,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,9458387876781172,2019-12-26,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,526,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),4724842923697642R
155457,57120502,9590770378094025,2013-11-04,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,2295083017858248,2014-12-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,413,R,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,[],0,(),9590770378094025R


In [54]:
get_stats(neg_group_final)

print(f"ROIs  = {neg_group_final.num_roi.sum()}")
print(neg_group_final.num_roi.value_counts())

DF shape: (149523, 41)
# Patients: 11238
# Cases: 33162

# Images: 149523

ROIs  = 644
num_roi
0    148940
1       532
2        42
3         8
4         1
Name: count, dtype: int64


## 9. Saving and Exporting

In [55]:
columns_to_save = [
    'empi_anon',
    'acc_anon',
    'desc',
    'asses',
    'asses_dx',
    'path_severity',
    'study_date_anon',
    'study_date_anon_dx',
    'side',
    'ImageLateralityFinal',
    'bside',
    'ViewPosition',
    'match_level',
    'num_roi',
    'ROI_coords',
    'png_path',
]

In [57]:
neg_group_final[columns_to_save].to_csv("NEGATIVE_GROUP.csv", index=False)

In [58]:
pos_group_images[columns_to_save].to_csv("POSITIVE_GROUP.csv", index=False)

# END