In [1]:
import os
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm
from pydicom import dcmread
import shutil
import subprocess as sp

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)

In [2]:
def get_dicom_meta_from_file(dicom_root_dir, patient_id, img_id, extension = 'dcm'):
    dicom_path = os.path.join(dicom_root_dir, f'{patient_id}/{img_id}.{extension}')
    return dcmread(dicom_path, stop_before_pixels=True)


def get_dicom_meta(dicom_root_dir, dicom_df, extension = 'dcm'):
    dicom_data = dict()
    keywords = set()
    dicom_df = dicom_df[["patient_id", "image_id"]].copy()

    for index, row in tqdm(dicom_df.iterrows()):
        patient_id = row["patient_id"]
        image_id = row["image_id"]
        dicom = get_dicom_meta_from_file(dicom_root_dir, patient_id, image_id, extension)
        assert dicom.get('ModalityLUTSequence', None) is None
        if patient_id not in dicom_data:
            dicom_data[patient_id] = dict()
        if image_id not in dicom_data[patient_id]:
            dicom_data[patient_id][image_id] = dict()
        for feature in dicom.iterall():
            dicom_data[patient_id][image_id][feature.keyword] = feature.value
            keywords.add(feature.keyword)
    for keyword in keywords:
        dicom_df[keyword] = dicom_df[["patient_id", "image_id"]].apply(lambda x: np.nan if keyword not in dicom_data[x.patient_id][x.image_id] else dicom_data[x.patient_id][x.image_id][keyword], axis=1)
    print(": Keywords extracted from dicom files:")
    for keyword in keywords:
        print("--> {}".format(keyword))
    return dicom_df

### Competition data 

In [None]:
comp_df = pd.read_csv('/home/dangnh36/datasets/.comp/rsna/cv/v1/train_fold_0.csv')
print(comp_df.columns)
comp_df

### Vindr

In [None]:
ROOT_DIR = '/home/dangnh36/datasets/.comp/rsna/external/vindr/'
DCM_ROOT_DIR = os.path.join(ROOT_DIR, 'images')

In [None]:
df = pd.read_csv(os.path.join(ROOT_DIR, 'breast-level_annotations.csv'))
df

In [None]:
df.columns

In [None]:
df.rename(columns = {'study_id': 'patient_id', 'view_position': 'view',
                     'breast_birads': 'BIRADS', 'breast_density': 'density'}, inplace = True)
df.BIRADS = df.BIRADS.apply(lambda x: int(x.split()[-1]))
df.density = df.density.apply(lambda x: x.split()[-1] if type(x) == str else x)
df

In [None]:
df.density.value_counts()

In [None]:
df.BIRADS.value_counts()

In [None]:
meta_df = get_dicom_meta(DCM_ROOT_DIR, df, extension = 'dicom')

In [None]:
meta_df

In [None]:
meta_df[~meta_df.VOILUTSequence.isna()]

In [None]:
arr = np.array(meta_df.at[19965, 'LUTData'])
arr

In [None]:
arr.max(), arr.mean(), arr.min()

In [None]:
list(set(arr))

In [None]:
arr[2048]

In [None]:
meta_df.to_csv(os.path.join(ROOT_DIR, 'dicom_meta_only.csv'), index = False)

In [None]:
meta_df.rename(columns = {name:'__' + name for name in meta_df.columns if name != 'image_id'}, inplace = True)
merged = pd.merge(
    df,
    meta_df,
    how="inner",
    on='image_id',
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("??", "__"),
    copy=True,
    indicator=False,
    validate='1:1',
)
merged

In [None]:
merged.at[0, '__VOILUTSequence'][1].get('LUTData', None)

In [None]:
merged.at[0, '__VOILUTSequence'][1].get('LUTData', None)

In [None]:
merged.at[0, '__VOILUTSequence'][0].get('LUTData', None)

In [None]:
merged.at[0, '__VOILUTSequence'][2]

In [None]:
merged.columns

In [None]:
merged.split.value_counts()

In [None]:
merged['__hasVOILUTSequence'] = merged['__VOILUTSequence'].isna()
merged['__hasLUTDescriptor'] = merged['__LUTDescriptor'].isna()
merged['__hasLUTData'] = merged['__LUTData'].isna()

In [None]:
merged.rename(columns = {'__PatientAge': 'age'}, inplace = True)

In [None]:
merged['__PhotometricInterpretation'].value_counts()

In [None]:
merged['__LUTExplanation'].value_counts()

In [None]:
merged = merged[['patient_id', 'series_id', 'image_id', 'laterality', 'view', 'age',
       'BIRADS', 'density', 'split', '__SmallestImagePixelValue',
       '__PixelRepresentation', '__SOPInstanceUID', '__PatientWeight', '__WindowWidth',
       '__PixelPaddingValue', '__BodyPartExamined',
       '__StudyInstanceUID', '__ViewPosition', '__LargestImagePixelValue',
       '__PixelSpacing', '__PixelPaddingRangeLimit',
       '__SeriesInstanceUID', '__SamplesPerPixel', '__PresentationLUTShape',
                 '__WindowCenterWidthExplanation', '__WindowCenter',
       '__ImagerPixelSpacing', '__VOILUTFunction', '__RescaleIntercept', '__RescaleSlope',
        '__PhotometricInterpretation', '__Rows', '__Columns',
                 '__hasVOILUTSequence', '__hasLUTDescriptor', '__hasLUTData' ]]
merged

In [None]:
merged.BIRADS.value_counts()

In [None]:
merged['cancer'] = merged.BIRADS.apply(lambda x: 1 if x == 5 else 0)

In [None]:
merged.cancer.value_counts()

In [None]:
merged.to_csv(os.path.join(ROOT_DIR, 'all_labels.csv'), index = False)

In [None]:
import dicomsdl

In [None]:
merged[merged['__hasVOILUTSequence'] == True]

In [None]:
dcm = dicomsdl.open('/home/dangnh36/datasets/.comp/rsna/external/vindr/images/b8d273e8601f348d3664778dae0e7e0b/d8125545210c08e1b1793a5af6458ee2.dicom')

In [None]:
dcm.VOILUTSequence[0]

### MiniDDSM

In [None]:
ROOT_DIR = '/home/dangnh36/datasets/.comp/rsna/external/miniddsm/'
DCM_ROOT_DIR = os.path.join(ROOT_DIR, 'MINI-DDSM-Complete-PNG-16')

In [None]:
df = pd.read_excel(os.path.join(ROOT_DIR, 'Data-MoreThanTwoMasks', 'Data-MoreThanTwoMasks.xlsx'))
df

In [None]:
df['patient_id'] = df['fileName'].apply(lambda x: x.split('.')[0])
df['image_id'] = df['patient_id'] + '@' + df['Side'] + '@' + df['View']
df

In [None]:
df.columns

In [None]:
df.patient_id.nunique()

In [None]:
df.image_id.nunique()

In [None]:
for patient_id, sub_df in tqdm(df.groupby('patient_id')):
    assert sub_df.Status.nunique() == 1

In [None]:
for patient_id, sub_df in tqdm(df.groupby(['patient_id'])):
    sub_df = sub_df.reset_index(drop = True)
    assert sub_df.Status.nunique() == 1
    status = sub_df.at[0, 'Status']
    no_annotation = True
    for j in range(len(sub_df)):
        if type(sub_df.at[j, 'Tumour_Contour']) == str:
            if len(sub_df.at[j, 'Tumour_Contour']) > 5:
                no_annotation = False
    if status == 'Normal':
        if not no_annotation:
            display(sub_df)
            break
    else:
        if no_annotation:
            display(sub_df)
            break

In [None]:
cancer_dict = {}
for patient_id, sub_df in tqdm(df.groupby(['patient_id', 'Side'])):
    breast = '@'.join(patient_id)
    sub_df = sub_df.reset_index(drop = True)
    assert sub_df.Status.nunique() == 1
    status = sub_df.at[0, 'Status']
    no_annotation = True
    for j in range(len(sub_df)):
        if type(sub_df.at[j, 'Tumour_Contour']) == str:
            if len(sub_df.at[j, 'Tumour_Contour']) > 5:
                no_annotation = False
    if status == 'Normal':
        if not no_annotation:
            raise AssertionError()
        else:
            cancer_dict[breast] = 0
    elif status == 'Benign':
        cancer_dict[breast] = 0
    elif status == 'Cancer':
        if no_annotation:
            cancer_dict[breast] = 0
        else:
            cancer_dict[breast] = 1
    else:
        raise AssertionError()

In [None]:
cancer_dict

In [None]:
df['cancer'] = -1

In [None]:
for i in tqdm(range(len(df))):
    breast = df.at[i, 'patient_id'] + '@' + df.at[i, 'Side']
    df.at[i, 'cancer'] = cancer_dict[breast]
df

In [None]:
df[df.cancer == 1].Status.value_counts()

In [None]:
df[df.cancer == 1]['Tumour_Contour'].isna().any()

In [None]:
(df[(df.cancer == 0) & (df.Status=='Cancer')]['Tumour_Contour'].apply(lambda x: len(x)) > 1).any()

In [None]:
df.cancer.value_counts()

In [None]:
df.columns

In [None]:
src_dir = os.path.join(ROOT_DIR, 'MINI-DDSM-Complete-PNG-16')
dst_dir = os.path.join(ROOT_DIR, 'png16_v2')
for i in tqdm(range(len(df))):
    status = df.at[i, 'Status']
    patient_id = df.at[i, 'patient_id']
    patient_idx = patient_id.split('_')[1]
    image_id = df.at[i, 'image_id']
    name = df.at[i, 'fileName']
    src_path = os.path.join(src_dir, status, patient_idx, name)
    dst_path = os.path.join(dst_dir, patient_id, f'{image_id}.png')
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    shutil.copy2(src_path, dst_path)

In [None]:
df.Density.value_counts()

In [None]:
# LEFT, RIGHT --> L, R
df['Side'] = df['Side'].apply(lambda x: x[0])

density_map = {1: 'A', 2: 'B', 3: 'C', 4: 'D', 0: 'A'}
df['Density'] = df['Density'].apply(lambda x: density_map[x])

In [None]:
df.drop(columns = ['fullPath', 'fileName', 'Tumour_Contour', 'Tumour_Contour2',
                   'Tumour_Contour3', 'Tumour_Contour4', 'Tumour_Contour5',
                   'Tumour_Contour6'], inplace = True)
df.rename(columns = {'View': 'view', 'Side': 'laterality',
                    'Status': 'ddsm_ori_status', 'Age': 'age', 'Density': 'density',
                    }, inplace = True)

In [None]:
df = df[['patient_id', 'image_id', 'view', 'laterality', 'density', 'age', 'ddsm_ori_status', 'cancer']]
df

In [None]:
df.to_csv(os.path.join(ROOT_DIR, 'label_v2.csv'), index = False)

### CMMD

In [29]:
ROOT_DIR = '/home/dangnh36/datasets/.comp/rsna/external/cmmd'
DCM_ROOT_DIR = os.path.join(ROOT_DIR, 'CMMD')

In [30]:
df = pd.read_csv(os.path.join(ROOT_DIR, 'CMMD_label.csv'))
df

Unnamed: 0,ID1,LeftRight,Age,number,abnormality,classification,subtype
0,D1-0001,R,44,2,calcification,Benign,
1,D1-0002,L,40,2,calcification,Benign,
...,...,...,...,...,...,...,...
1870,D2-0748,R,29,2,mass,Malignant,Luminal B
1871,D2-0749,L,42,2,mass,Malignant,Luminal B


In [31]:
df.classification.value_counts()

Malignant    1316
Benign        556
Name: classification, dtype: int64

In [32]:
df[df.classification.isna()]

Unnamed: 0,ID1,LeftRight,Age,number,abnormality,classification,subtype


In [7]:
src_dir = DCM_ROOT_DIR
dst_dir = os.path.join(ROOT_DIR, 'dicoms_v2')

for patient_id, sub_df in tqdm(df.groupby('ID1')):
    tmp_dir = os.path.join(src_dir, patient_id)
    tmp2_dirs = os.listdir(tmp_dir)
    assert len(tmp2_dirs) == 1
    tmp3_dirs = os.listdir(os.path.join(tmp_dir, tmp2_dirs[0]))
    assert len(tmp3_dirs) == 1
    src_img_dir = os.path.join(tmp_dir, tmp2_dirs[0], tmp3_dirs[0])
    img_names = os.listdir(src_img_dir)
#     if len(img_names) != sub_df.number.sum():
#         print(patient_id, img_names)
#         print(sub_df.number)

#     for img_name in img_names:
#         src_img_path = os.path.join(src_img_dir, img_name)
#         dst_img_path = os.path.join(dst_dir, patient_id, f'{patient_id}_{img_name}')
#         os.makedirs(os.path.dirname(dst_img_path), exist_ok=True)
#         shutil.copy2(src_img_path, dst_img_path)
    

100%|███████████████████████████████████████| 1775/1775 [00:01<00:00, 1155.52it/s]


In [8]:
all_patient_ids = []
all_image_ids = []

patients = os.listdir(dst_dir)
for patient in patients:
    patient_dir = os.path.join(dst_dir, patient)
    image_names = os.listdir(patient_dir)
    for image_name in image_names:
        all_patient_ids.append(patient)
        all_image_ids.append(image_name.split('.dcm')[0])
dicom_df = pd.DataFrame({
    'patient_id': all_patient_ids,
    'image_id': all_image_ids
})
dicom_df

Unnamed: 0,patient_id,image_id
0,D1-0830,D1-0830_1-1
1,D1-0830,D1-0830_1-4
...,...,...
5200,D2-0448,D2-0448_1-4
5201,D2-0448,D2-0448_1-3


In [9]:
dicom_df = get_dicom_meta(dst_dir, dicom_df, extension = 'dcm')
dicom_df

5202it [01:28, 58.78it/s]


: Keywords extracted from dicom files:
--> 
--> HighBit
--> PixelIntensityRelationshipSign
--> AcquisitionContextSequence
--> StudyDate
--> WindowCenter
--> WindowWidth
--> PositionerType
--> InstanceCreatorUID
--> DetectorType
--> RescaleSlope
--> ImageType
--> AccessionNumber
--> RescaleType
--> DeidentificationMethodCodeSequence
--> PixelRepresentation
--> PatientSex
--> ImagerPixelSpacing
--> PatientName
--> PatientBirthDate
--> DeidentificationMethod
--> AcquisitionDate
--> PatientAge
--> SeriesNumber
--> PhotometricInterpretation
--> SeriesInstanceUID
--> ContentTime
--> InstanceCreationTime
--> StudyID
--> BurnedInAnnotation
--> BitsStored
--> CodeValue
--> CodeMeaning
--> AnatomicRegionSequence
--> SOPClassUID
--> ViewCodeSequence
--> SamplesPerPixel
--> PatientOrientation
--> InstanceNumber
--> PixelIntensityRelationship
--> CodingSchemeDesignator
--> LossyImageCompression
--> OrganExposed
--> LongitudinalTemporalInformationModified
--> StudyInstanceUID
--> SOPInstanceUID
--> 

Unnamed: 0,patient_id,image_id,Unnamed: 3,HighBit,PixelIntensityRelationshipSign,AcquisitionContextSequence,StudyDate,WindowCenter,WindowWidth,PositionerType,InstanceCreatorUID,DetectorType,RescaleSlope,ImageType,AccessionNumber,RescaleType,DeidentificationMethodCodeSequence,PixelRepresentation,PatientSex,ImagerPixelSpacing,PatientName,PatientBirthDate,DeidentificationMethod,AcquisitionDate,PatientAge,SeriesNumber,PhotometricInterpretation,SeriesInstanceUID,ContentTime,InstanceCreationTime,StudyID,BurnedInAnnotation,BitsStored,CodeValue,CodeMeaning,AnatomicRegionSequence,SOPClassUID,ViewCodeSequence,SamplesPerPixel,PatientOrientation,InstanceNumber,PixelIntensityRelationship,CodingSchemeDesignator,LossyImageCompression,OrganExposed,LongitudinalTemporalInformationModified,StudyInstanceUID,SOPInstanceUID,ContentDate,ImageLaterality,Modality,PatientID,InstanceCreationDate,PresentationLUTShape,PresentationIntentType,RescaleIntercept,SeriesDate,BodyPartExamined,ReferringPhysicianName,SeriesTime,Manufacturer,BitsAllocated,PatientIdentityRemoved,Columns,SpecificCharacterSet,WindowCenterWidthExplanation,VOILUTFunction,AcquisitionTime,ViewModifierCodeSequence,Rows,StudyTime
0,D1-0830,D1-0830_1-1,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120428,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, R]",1,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.244359496749...,20100718,L,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
1,D1-0830,D1-0830_1-4,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120430,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, FL]",4,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.159807368290...,20100718,R,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,D2-0448,D2-0448_1-4,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124254,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, FL]",4,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.704200754105...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
5201,D2-0448,D2-0448_1-3,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124253,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, L]",3,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.170184140124...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000


In [33]:
df.rename(columns = {'ID1': 'patient_id', 'LeftRight': 'laterality'}, inplace = True)

In [11]:
dicom_df.rename(columns = {'ImageLaterality': 'laterality'}, inplace = True)

In [12]:
dicom_df.rename(columns = {name:'__' + name for name in dicom_df.columns if name not in ['patient_id', 'laterality']}, inplace = True)

In [13]:
dicom_df

Unnamed: 0,patient_id,__image_id,__,__HighBit,__PixelIntensityRelationshipSign,__AcquisitionContextSequence,__StudyDate,__WindowCenter,__WindowWidth,__PositionerType,__InstanceCreatorUID,__DetectorType,__RescaleSlope,__ImageType,__AccessionNumber,__RescaleType,__DeidentificationMethodCodeSequence,__PixelRepresentation,__PatientSex,__ImagerPixelSpacing,__PatientName,__PatientBirthDate,__DeidentificationMethod,__AcquisitionDate,__PatientAge,__SeriesNumber,__PhotometricInterpretation,__SeriesInstanceUID,__ContentTime,__InstanceCreationTime,__StudyID,__BurnedInAnnotation,__BitsStored,__CodeValue,__CodeMeaning,__AnatomicRegionSequence,__SOPClassUID,__ViewCodeSequence,__SamplesPerPixel,__PatientOrientation,__InstanceNumber,__PixelIntensityRelationship,__CodingSchemeDesignator,__LossyImageCompression,__OrganExposed,__LongitudinalTemporalInformationModified,__StudyInstanceUID,__SOPInstanceUID,__ContentDate,laterality,__Modality,__PatientID,__InstanceCreationDate,__PresentationLUTShape,__PresentationIntentType,__RescaleIntercept,__SeriesDate,__BodyPartExamined,__ReferringPhysicianName,__SeriesTime,__Manufacturer,__BitsAllocated,__PatientIdentityRemoved,__Columns,__SpecificCharacterSet,__WindowCenterWidthExplanation,__VOILUTFunction,__AcquisitionTime,__ViewModifierCodeSequence,__Rows,__StudyTime
0,D1-0830,D1-0830_1-1,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120428,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, R]",1,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.244359496749...,20100718,L,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
1,D1-0830,D1-0830_1-4,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120430,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, FL]",4,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.159807368290...,20100718,R,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,D2-0448,D2-0448_1-4,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124254,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, FL]",4,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.704200754105...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000
5201,D2-0448,D2-0448_1-3,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124253,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, L]",3,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.170184140124...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000


In [14]:
dicom_df.groupby(['patient_id', 'laterality']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,__image_id,__,__HighBit,__PixelIntensityRelationshipSign,__AcquisitionContextSequence,__StudyDate,__WindowCenter,__WindowWidth,__PositionerType,__InstanceCreatorUID,__DetectorType,__RescaleSlope,__ImageType,__AccessionNumber,__RescaleType,__DeidentificationMethodCodeSequence,__PixelRepresentation,__PatientSex,__ImagerPixelSpacing,__PatientName,__PatientBirthDate,__DeidentificationMethod,__AcquisitionDate,__PatientAge,__SeriesNumber,__PhotometricInterpretation,__SeriesInstanceUID,__ContentTime,__InstanceCreationTime,__StudyID,__BurnedInAnnotation,__BitsStored,__CodeValue,__CodeMeaning,__AnatomicRegionSequence,__SOPClassUID,__ViewCodeSequence,__SamplesPerPixel,__PatientOrientation,__InstanceNumber,__PixelIntensityRelationship,__CodingSchemeDesignator,__LossyImageCompression,__OrganExposed,__LongitudinalTemporalInformationModified,__StudyInstanceUID,__SOPInstanceUID,__ContentDate,__Modality,__PatientID,__InstanceCreationDate,__PresentationLUTShape,__PresentationIntentType,__RescaleIntercept,__SeriesDate,__BodyPartExamined,__ReferringPhysicianName,__SeriesTime,__Manufacturer,__BitsAllocated,__PatientIdentityRemoved,__Columns,__SpecificCharacterSet,__WindowCenterWidthExplanation,__VOILUTFunction,__AcquisitionTime,__ViewModifierCodeSequence,__Rows,__StudyTime
patient_id,laterality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
D1-0001,R,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
D1-0002,L,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D2-0749,L,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
D2-0749,R,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [26]:
dicom_df.laterality.value_counts()

L    2682
R    2520
Name: laterality, dtype: int64

In [15]:
dicom_df.patient_id.nunique()

1775

In [16]:
df.patient_id.nunique()

1775

In [17]:
df

Unnamed: 0,patient_id,laterality,Age,number,abnormality,classification,subtype
0,D1-0001,R,44,2,calcification,Benign,
1,D1-0002,L,40,2,calcification,Benign,
...,...,...,...,...,...,...,...
1870,D2-0748,R,29,2,mass,Malignant,Luminal B
1871,D2-0749,L,42,2,mass,Malignant,Luminal B


In [18]:
merged = pd.merge(
    dicom_df,
    df,
    how="outer",
    on=['patient_id', 'laterality'],
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("__", ""),
    copy=True,
    indicator=False,
#     validate='1:1',
)
merged

Unnamed: 0,patient_id,__image_id,__,__HighBit,__PixelIntensityRelationshipSign,__AcquisitionContextSequence,__StudyDate,__WindowCenter,__WindowWidth,__PositionerType,__InstanceCreatorUID,__DetectorType,__RescaleSlope,__ImageType,__AccessionNumber,__RescaleType,__DeidentificationMethodCodeSequence,__PixelRepresentation,__PatientSex,__ImagerPixelSpacing,__PatientName,__PatientBirthDate,__DeidentificationMethod,__AcquisitionDate,__PatientAge,__SeriesNumber,__PhotometricInterpretation,__SeriesInstanceUID,__ContentTime,__InstanceCreationTime,__StudyID,__BurnedInAnnotation,__BitsStored,__CodeValue,__CodeMeaning,__AnatomicRegionSequence,__SOPClassUID,__ViewCodeSequence,__SamplesPerPixel,__PatientOrientation,__InstanceNumber,__PixelIntensityRelationship,__CodingSchemeDesignator,__LossyImageCompression,__OrganExposed,__LongitudinalTemporalInformationModified,__StudyInstanceUID,__SOPInstanceUID,__ContentDate,laterality,__Modality,__PatientID,__InstanceCreationDate,__PresentationLUTShape,__PresentationIntentType,__RescaleIntercept,__SeriesDate,__BodyPartExamined,__ReferringPhysicianName,__SeriesTime,__Manufacturer,__BitsAllocated,__PatientIdentityRemoved,__Columns,__SpecificCharacterSet,__WindowCenterWidthExplanation,__VOILUTFunction,__AcquisitionTime,__ViewModifierCodeSequence,__Rows,__StudyTime,Age,number,abnormality,classification,subtype
0,D1-0830,D1-0830_1-1,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120428,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, R]",1,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.244359496749...,20100718,L,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000,63.0,2.0,mass,Benign,
1,D1-0830,D1-0830_1-2,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 1, -, 0, 8, 3, 0)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,063Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,000000,120429,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, FR]",2,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,1.3.6.1.4.1.14519.5.2.1.1239.1759.320677583371...,20100718,L,MG,D1-0830,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000,63.0,2.0,mass,Benign,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,D2-0448,D2-0448_1-4,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124254,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, FL]",4,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.704200754105...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000,,,,,
5201,D2-0448,D2-0448_1-3,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 4, 4, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,053Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,000000,124253,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[P, L]",3,LOG,SCT,00,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,1.3.6.1.4.1.14519.5.2.1.1239.1759.170184140124...,20100718,R,MG,D2-0448,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),000000,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,000000,[],2294,000000,,,,,


In [19]:
merged.at[0, '__ViewCodeSequence']

<Sequence, length 1>

In [20]:
merged.columns

Index(['patient_id', '__image_id', '__', '__HighBit',
       '__PixelIntensityRelationshipSign', '__AcquisitionContextSequence',
       '__StudyDate', '__WindowCenter', '__WindowWidth', '__PositionerType',
       '__InstanceCreatorUID', '__DetectorType', '__RescaleSlope',
       '__ImageType', '__AccessionNumber', '__RescaleType',
       '__DeidentificationMethodCodeSequence', '__PixelRepresentation',
       '__PatientSex', '__ImagerPixelSpacing', '__PatientName',
       '__PatientBirthDate', '__DeidentificationMethod', '__AcquisitionDate',
       '__PatientAge', '__SeriesNumber', '__PhotometricInterpretation',
       '__SeriesInstanceUID', '__ContentTime', '__InstanceCreationTime',
       '__StudyID', '__BurnedInAnnotation', '__BitsStored', '__CodeValue',
       '__CodeMeaning', '__AnatomicRegionSequence', '__SOPClassUID',
       '__ViewCodeSequence', '__SamplesPerPixel', '__PatientOrientation',
       '__InstanceNumber', '__PixelIntensityRelationship',
       '__CodingSchemeDesignato

In [21]:
merged = merged[
    ['patient_id', '__image_id', 'laterality', 'Age', 'number', 'abnormality', 'classification', 'subtype',
     '__VOILUTFunction', '__WindowCenter', '__WindowWidth', '__WindowCenterWidthExplanation',
     '__RescaleSlope', '__RescaleIntercept', '__Columns', '__Rows',
       '__SOPInstanceUID', '__LossyImageCompression',
       '__CodeMeaning', '__StudyID', '__PatientOrientation',
       '__SpecificCharacterSet', '__InstanceCreatorUID', '__PositionerType',
       '__PhotometricInterpretation', '__BitsAllocated',
       '__StudyDate', '__PatientName', '__ContentTime',
       '__PixelIntensityRelationship', '__Modality',
       '__PresentationIntentType', '__AcquisitionTime', '__BodyPartExamined', '__HighBit',
       '__PatientIdentityRemoved', '__BitsStored', '__InstanceCreationTime',
       '__StudyTime', '__SeriesTime', '__PatientBirthDate', '__ImageType',
       '__RescaleType', '__SeriesInstanceUID', '__ReferringPhysicianName',
       '__ContentDate', '__InstanceCreationDate',
       '__PixelIntensityRelationshipSign', '__SeriesNumber',
       '__StudyInstanceUID', '__PatientID', '__DetectorType',
       '__SamplesPerPixel', '__SOPClassUID', '__PixelRepresentation', '__CodeValue',
       '__OrganExposed', '__InstanceNumber', '__AccessionNumber',
       '__SeriesDate', '__AcquisitionDate',
       '__PatientAge',
       '__BurnedInAnnotation', '__PresentationLUTShape',
       '__Manufacturer', '__DeidentificationMethod', '__ImagerPixelSpacing',
       '__PatientSex', ]
]
merged

Unnamed: 0,patient_id,__image_id,laterality,Age,number,abnormality,classification,subtype,__VOILUTFunction,__WindowCenter,__WindowWidth,__WindowCenterWidthExplanation,__RescaleSlope,__RescaleIntercept,__Columns,__Rows,__SOPInstanceUID,__LossyImageCompression,__CodeMeaning,__StudyID,__PatientOrientation,__SpecificCharacterSet,__InstanceCreatorUID,__PositionerType,__PhotometricInterpretation,__BitsAllocated,__StudyDate,__PatientName,__ContentTime,__PixelIntensityRelationship,__Modality,__PresentationIntentType,__AcquisitionTime,__BodyPartExamined,__HighBit,__PatientIdentityRemoved,__BitsStored,__InstanceCreationTime,__StudyTime,__SeriesTime,__PatientBirthDate,__ImageType,__RescaleType,__SeriesInstanceUID,__ReferringPhysicianName,__ContentDate,__InstanceCreationDate,__PixelIntensityRelationshipSign,__SeriesNumber,__StudyInstanceUID,__PatientID,__DetectorType,__SamplesPerPixel,__SOPClassUID,__PixelRepresentation,__CodeValue,__OrganExposed,__InstanceNumber,__AccessionNumber,__SeriesDate,__AcquisitionDate,__PatientAge,__BurnedInAnnotation,__PresentationLUTShape,__Manufacturer,__DeidentificationMethod,__ImagerPixelSpacing,__PatientSex
0,D1-0830,D1-0830_1-1,L,63.0,2.0,mass,Benign,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.244359496749...,00,cranio-caudal,,"[A, R]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 1, -, 0, 8, 3, 0)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,120428,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,D1-0830,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399162004,BREAST,1,,20100718,20100718,063Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
1,D1-0830,D1-0830_1-2,L,63.0,2.0,mass,Benign,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.320677583371...,00,medio-lateral oblique,,"[A, FR]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 1, -, 0, 8, 3, 0)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,120429,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.320712920134...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.160257839898...,D1-0830,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399368009,BREAST,2,,20100718,20100718,063Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,D2-0448,D2-0448_1-4,R,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.704200754105...,00,medio-lateral oblique,,"[P, FL]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 4, 4, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,124254,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,D2-0448,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399368009,BREAST,4,,20100718,20100718,053Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
5201,D2-0448,D2-0448_1-3,R,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.170184140124...,00,cranio-caudal,,"[P, L]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 4, 4, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,124253,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,D2-0448,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399162004,BREAST,3,,20100718,20100718,053Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F


In [22]:
merged[merged['classification'].isna()]

Unnamed: 0,patient_id,__image_id,laterality,Age,number,abnormality,classification,subtype,__VOILUTFunction,__WindowCenter,__WindowWidth,__WindowCenterWidthExplanation,__RescaleSlope,__RescaleIntercept,__Columns,__Rows,__SOPInstanceUID,__LossyImageCompression,__CodeMeaning,__StudyID,__PatientOrientation,__SpecificCharacterSet,__InstanceCreatorUID,__PositionerType,__PhotometricInterpretation,__BitsAllocated,__StudyDate,__PatientName,__ContentTime,__PixelIntensityRelationship,__Modality,__PresentationIntentType,__AcquisitionTime,__BodyPartExamined,__HighBit,__PatientIdentityRemoved,__BitsStored,__InstanceCreationTime,__StudyTime,__SeriesTime,__PatientBirthDate,__ImageType,__RescaleType,__SeriesInstanceUID,__ReferringPhysicianName,__ContentDate,__InstanceCreationDate,__PixelIntensityRelationshipSign,__SeriesNumber,__StudyInstanceUID,__PatientID,__DetectorType,__SamplesPerPixel,__SOPClassUID,__PixelRepresentation,__CodeValue,__OrganExposed,__InstanceNumber,__AccessionNumber,__SeriesDate,__AcquisitionDate,__PatientAge,__BurnedInAnnotation,__PresentationLUTShape,__Manufacturer,__DeidentificationMethod,__ImagerPixelSpacing,__PatientSex
8,D2-0138,D2-0138_1-1,L,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.208719161957...,00,cranio-caudal,,"[A, R]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 1, 3, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,123206,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.160968165824...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.291802729061...,D2-0138,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399162004,BREAST,1,,20100718,20100718,060Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
9,D2-0138,D2-0138_1-2,L,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.146858404676...,00,medio-lateral oblique,,"[A, FR]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 1, 3, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,123207,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.160968165824...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.291802729061...,D2-0138,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399368009,BREAST,2,,20100718,20100718,060Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,D2-0448,D2-0448_1-4,R,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.704200754105...,00,medio-lateral oblique,,"[P, FL]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 4, 4, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,124254,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,D2-0448,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399368009,BREAST,4,,20100718,20100718,053Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F
5201,D2-0448,D2-0448_1-3,R,,,,,,SIGMOID,128.0,256.0,Full width of 8 bit data,1.0,0.0,1914,2294,1.3.6.1.4.1.14519.5.2.1.1239.1759.170184140124...,00,cranio-caudal,,"[P, L]",ISO_IR 100,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,MAMMOGRAPHIC,MONOCHROME2,8,20100718,"(D, 2, -, 0, 4, 4, 8)",000000,LOG,MG,FOR PRESENTATION,000000,BREAST,7,YES,8,124253,000000,000000,,"[DERIVED, PRIMARY]",US,1.3.6.1.4.1.14519.5.2.1.1239.1759.378084371300...,(),20100718,20170818,-1,1,1.3.6.1.4.1.14519.5.2.1.1239.1759.744773683160...,D2-0448,SCINTILLATOR,1,1.2.840.10008.5.1.4.1.1.1.2,0,399162004,BREAST,3,,20100718,20100718,053Y,NO,IDENTITY,,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064","[0.094090909, 0.094090909]",F


In [27]:
dicom_df[(dicom_df.patient_id == 'D2-0138') & (dicom_df.laterality == 'L')]

Unnamed: 0,patient_id,__image_id,__,__HighBit,__PixelIntensityRelationshipSign,__AcquisitionContextSequence,__StudyDate,__WindowCenter,__WindowWidth,__PositionerType,__InstanceCreatorUID,__DetectorType,__RescaleSlope,__ImageType,__AccessionNumber,__RescaleType,__DeidentificationMethodCodeSequence,__PixelRepresentation,__PatientSex,__ImagerPixelSpacing,__PatientName,__PatientBirthDate,__DeidentificationMethod,__AcquisitionDate,__PatientAge,__SeriesNumber,__PhotometricInterpretation,__SeriesInstanceUID,__ContentTime,__InstanceCreationTime,__StudyID,__BurnedInAnnotation,__BitsStored,__CodeValue,__CodeMeaning,__AnatomicRegionSequence,__SOPClassUID,__ViewCodeSequence,__SamplesPerPixel,__PatientOrientation,__InstanceNumber,__PixelIntensityRelationship,__CodingSchemeDesignator,__LossyImageCompression,__OrganExposed,__LongitudinalTemporalInformationModified,__StudyInstanceUID,__SOPInstanceUID,__ContentDate,laterality,__Modality,__PatientID,__InstanceCreationDate,__PresentationLUTShape,__PresentationIntentType,__RescaleIntercept,__SeriesDate,__BodyPartExamined,__ReferringPhysicianName,__SeriesTime,__Manufacturer,__BitsAllocated,__PatientIdentityRemoved,__Columns,__SpecificCharacterSet,__WindowCenterWidthExplanation,__VOILUTFunction,__AcquisitionTime,__ViewModifierCodeSequence,__Rows,__StudyTime
8,D2-0138,D2-0138_1-1,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 1, 3, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,060Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.160968165824...,0,123206,,NO,8,399162004,cranio-caudal,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, R]",1,LOG,SCT,0,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.291802729061...,1.3.6.1.4.1.14519.5.2.1.1239.1759.208719161957...,20100718,L,MG,D2-0138,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),0,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,0,[],2294,0
11,D2-0138,D2-0138_1-2,12391759,7,-1,[],20100718,128.0,256.0,MAMMOGRAPHIC,1.3.6.1.4.1.14519.5.2.1.1239.1759.256693145395...,SCINTILLATOR,1.0,"[DERIVED, PRIMARY]",,US,"[[(0008, 0100) Code Value ...",0,F,"[0.094090909, 0.094090909]","(D, 2, -, 0, 1, 3, 8)",,"Per DICOM PS 3.15 AnnexE. Details in 0012,0064",20100718,060Y,1,MONOCHROME2,1.3.6.1.4.1.14519.5.2.1.1239.1759.160968165824...,0,123207,,NO,8,399368009,medio-lateral oblique,"[[(0008, 0100) Code Value ...",1.2.840.10008.5.1.4.1.1.1.2,"[[(0008, 0100) Code Value ...",1,"[A, FR]",2,LOG,SCT,0,BREAST,MODIFIED,1.3.6.1.4.1.14519.5.2.1.1239.1759.291802729061...,1.3.6.1.4.1.14519.5.2.1.1239.1759.146858404676...,20100718,L,MG,D2-0138,20170818,IDENTITY,FOR PRESENTATION,0.0,20100718,BREAST,(),0,,8,YES,1914,ISO_IR 100,Full width of 8 bit data,SIGMOID,0,[],2294,0


In [35]:
df[(df.patient_id == 'D2-0138') & (df.laterality == 'R')]

Unnamed: 0,patient_id,laterality,Age,number,abnormality,classification,subtype
1250,D2-0138,R,60,2,both,Malignant,HER2-enriched


In [None]:
merged.classification.fillna('Normal', inplace = True)

In [None]:
merged[merged['classification'].isna()]

In [None]:
merged.columns

In [None]:
merged.classification.hist()

In [None]:
merged['cancer'] = merged.classification.apply(lambda x: 1 if x=='Malignant' else 0)

In [None]:
merged.columns

In [None]:
merged = merged[['patient_id', '__image_id', 'laterality', 'Age', 'cancer', 'number',
       'abnormality', 'classification', 'subtype', '__VOILUTFunction',
       '__WindowCenter', '__WindowWidth', '__WindowCenterWidthExplanation',
       '__RescaleSlope', '__RescaleIntercept', '__Columns', '__Rows',
       '__SOPInstanceUID', '__LossyImageCompression', '__CodeMeaning',
       '__StudyID', '__PatientOrientation', '__SpecificCharacterSet',
       '__InstanceCreatorUID', '__PositionerType',
       '__PhotometricInterpretation', '__BitsAllocated', '__StudyDate',
       '__PatientName', '__ContentTime', '__PixelIntensityRelationship',
       '__Modality', '__PresentationIntentType', '__AcquisitionTime',
       '__BodyPartExamined', '__HighBit', '__PatientIdentityRemoved',
       '__BitsStored', '__InstanceCreationTime', '__StudyTime', '__SeriesTime',
       '__PatientBirthDate', '__ImageType', '__RescaleType',
       '__SeriesInstanceUID', '__ReferringPhysicianName', '__ContentDate',
       '__InstanceCreationDate', '__PixelIntensityRelationshipSign',
       '__SeriesNumber', '__StudyInstanceUID', '__PatientID', '__DetectorType',
       '__SamplesPerPixel', '__SOPClassUID', '__PixelRepresentation',
       '__CodeValue', '__OrganExposed', '__InstanceNumber',
       '__AccessionNumber', '__SeriesDate', '__AcquisitionDate',
       '__PatientAge', '__BurnedInAnnotation', '__PresentationLUTShape',
       '__Manufacturer', '__DeidentificationMethod', '__ImagerPixelSpacing',
       '__PatientSex']]

merged.rename(columns = {
    '__image_id': 'image_id',
    'Age': 'age',
    'number': '_num_images',
}, inplace = True)
merged

In [None]:
merged.cancer.hist()

In [None]:
merged.to_csv(os.path.join(ROOT_DIR, 'label_v2.csv'), index = False)

In [None]:
merged

In [None]:
# for _name, sub_df in merged.groupby(['patient_id']):
#     assert sub_df.cancer.nunique() == 1

for _name, sub_df in merged.groupby(['patient_id', 'laterality']):
    assert sub_df.cancer.nunique() == 1

### BMCD

In [40]:
ROOT_DIR = '/home/dangnh36/datasets/.comp/rsna/external/bmcd/'
DCM_ROOT_DIR = os.path.join(ROOT_DIR, 'dicoms')

In [41]:
df = pd.read_csv(os.path.join(ROOT_DIR, 'label.csv'))
df.biopsy_result.fillna('NAN', inplace = True)
df

Unnamed: 0,dir,laterality,age,density,BIRADS,biopsy_result,folder_type
0,1,Right,57,b,4a,BENIGN,suspicious
1,2,Left,64,c,4a,BENIGN,suspicious
...,...,...,...,...,...,...,...
98,49,Left,75,a,2,NAN,normal
99,50,Right,56,b,2,NAN,normal


In [42]:
df.biopsy_result.value_counts()

NAN              50
BENIGN           21
NOT AVAILABLE    18
DCIS              6
MALIGNANT         5
Name: biopsy_result, dtype: int64

In [None]:
df = df[df.biopsy_result.isin(['NAN', 'BENIGN', 'DCIS', 'MALIGNANT'])].reset_index(drop = True)
df

In [None]:
src_root_dir = DCM_ROOT_DIR
dst_root_dir = os.path.join(ROOT_DIR, 'dicoms_v2')

df['view'] = None
df['image_id'] = None
all_series = []
for i in tqdm(range(len(df))):
    dir_type = df.at[i, 'folder_type']
    dir_name = df.at[i, 'dir']
    dir_parent = 'Suspicious_cases'
    if dir_type == 'normal':
        dir_parent = 'Normal_cases'
    
    src_dir = os.path.join(src_root_dir, str(dir_parent), str(dir_name))
    dst_dir = os.path.join(dst_root_dir, dir_type + '_' + str(dir_name))
    os.makedirs(dst_dir, exist_ok=True)
    names = os.listdir(src_dir)
#     for name in names:
    s = df.loc[i]
    names = [name for name in names if '.dcm' in name.lower()]
    priors = [name for name in names if 'prior' in name]
    recents = [name for name in names if 'recent' in name]
    assert len(priors) == 2 or len(priors) == 0, names
    assert len(recents) == 2, f'{names} and {dir_parent} -{dir_name}'
    for ori_name in names:
        name = ori_name.split('.')[0]
        new_s = s.copy()
        if new_s['biopsy_result'] == 'DCIS' or new_s['biopsy_result'] == 'MALIGNANT':
            if 'prior' in name:
                new_s['biopsy_result'] = 'BENIGN/NORMAL'
        new_name = str(new_s['folder_type']) + '_' + str(new_s['dir']) + name
        new_s['image_id'] =  new_name
        src_path = os.path.join(src_dir, ori_name)
        dst_path = os.path.join(dst_dir, new_name + '.dcm')
        shutil.copy2(src_path, dst_path)
        
        new_s['view'] = name.split('_')[0]
        all_series.append(new_s)

In [None]:
df = pd.DataFrame(all_series)
df

In [None]:
df.biopsy_result.value_counts()

In [None]:
print(df[df.biopsy_result.isin(['MALIGNANT', 'DCIS'])]['image_id'].tolist())

In [None]:
df.columns

In [None]:
df['cancer'] = df.biopsy_result.apply(lambda x: 1 if x in ['MALIGNANT', 'DCIS'] else 0)
df.cancer.value_counts()

In [None]:
df.rename(columns = {
    'dir': 'patient_id',
}, inplace = True)
df

In [None]:
df['patient_id'] = df['folder_type'] + '_' +  df['patient_id'].apply(str)

In [None]:
dicom_df = get_dicom_meta(dst_root_dir, df, extension = 'dcm')

In [None]:
dicom_df.dropna(axis=1, how='all', inplace = True)

In [None]:
dicom_df

In [None]:
dicom_df.rename(columns = {name:'__' + name for name in dicom_df.columns if name != 'image_id'}, inplace = True)
merged = pd.merge(
    df,
    dicom_df,
    how="inner",
    on='image_id',
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("", "__"),
    copy=True,
    indicator=False,
    validate='1:1',
)
merged

In [None]:
merged

In [None]:
print(list(merged.columns))

In [None]:
merged = merged[['patient_id', 'laterality', 'age', 'density', 'BIRADS', 'biopsy_result', 'folder_type', 'view', 'image_id', 'cancer', '__WindowWidth', '__WindowCenter', '__WindowCenterWidthExplanation', '__RescaleType', '__RescaleSlope', '__RescaleIntercept', '__Rows', '__Columns', '__BitsAllocated', '__BitsStored']]
merged

In [None]:
merged.to_csv(os.path.join(ROOT_DIR, 'label_v2.csv'), index = False)

### CDD-CESM

In [36]:
ROOT_DIR = '/home/dangnh36/datasets/.comp/rsna/external/cdd_cesm/'
DCM_ROOT_DIR = os.path.join(ROOT_DIR, 'low_energy')

In [38]:
df = pd.read_csv(os.path.join(ROOT_DIR, 'cdd_cesm.csv'))
df

Unnamed: 0,Image_name,Patient_ID,Side,Type,Age,Breast density (ACR),BIRADS,Findings,View,Tags,Machine,Pathology Classification/ Follow up
0,P1_L_DM_MLO,1,L,DM,46,C,4,Architectural distortion,MLO,"suspicious, distortion",1,Malignant
1,P1_L_CM_MLO,1,L,CESM,46,_,4,Heterogenous non mass enhancement,MLO,"suspicious, non mass",1,Malignant
...,...,...,...,...,...,...,...,...,...,...,...,...
2004,P326_L_CM_CC,326,L,CESM,51,_,4,Segmental non mass enhancement,CC,"suspicious non mass, segmental",1,Malignant
2005,P326_L_CM_MLO,326,L,CESM,51,_,4,Segmental non mass enhancement,MLO,"suspicious non mass, segmental",1,Malignant


In [None]:
df.Type.value_counts()

In [None]:
df['image_id'] = df['Image_name'].apply(lambda x: x.replace('DM_', '').replace('CM_', ''))
# df['patient_id'] = df['Image_name'].apply(lambda x: x.split('_')[0])
df

In [39]:
df['Pathology Classification/ Follow up'].value_counts()

Normal       757
Malignant    662
Benign       587
Name: Pathology Classification/ Follow up, dtype: int64

In [None]:
for image_name, sub_df in df.groupby('image_id'):
    sub_df = sub_df.reset_index(drop = True)
    if sub_df.at[0, 'Pathology Classification/ Follow up'] == 'Malignant':
        assert sub_df['Pathology Classification/ Follow up'].nunique() == 1, sub_df

In [None]:
df = df[df.Type == 'DM'].reset_index(drop = True)
df

In [None]:
df.columns

In [None]:
df.rename(columns = {
    'Image_name': 'image_name',
    'Patient_ID': 'patient_id',
    'Side': 'laterality',
    'Age': 'age',
    'Breast density (ACR)': 'density',
    'Findings': 'findings',
    'View': 'view',
    'Tags': 'tags',
    'Machine': 'machine_id',
    'Pathology Classification/ Follow up': 'classification',
}, inplace = True)
df.columns

In [None]:
df = df[['patient_id', 'image_id', 'laterality', 'view', 'image_name', 'age', 'density', 'BIRADS',
       'findings', 'tags', 'machine_id', 'classification', 'Type']]
df

In [None]:
df.classification.value_counts()

In [None]:
df['cancer'] = df.classification.apply(lambda x: 1 if x == 'Malignant' else 0)
df

In [None]:
df.image_id.nunique()

In [None]:
df.cancer.value_counts()

In [8]:
df.to_csv(os.path.join(ROOT_DIR, 'label_v2.csv'), index = False)

In [6]:
df = pd.read_csv(os.path.join(ROOT_DIR, 'label_v2.csv'))
df

Unnamed: 0,patient_id,image_id,laterality,view,image_name,age,density,BIRADS,findings,tags,machine_id,classification,Type,cancer
0,1,P1_L_MLO,L,MLO,P1_L_DM_MLO,46,C,4,Architectural distortion,"suspicious, distortion",1,Malignant,DM,1
1,2,P2_R_CC,R,CC,P2_R_DM_CC,31,C,3$2,Multiple equal density masses$Benign macrocal...,"benign, masses, macrocalcification",2,Benign,DM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,326,P326_L_CC,L,CC,P326_L_DM_CC,51,C,3,Asymmetrical increased density,asymmetry,1,Malignant,DM,1
1002,326,P326_L_MLO,L,MLO,P326_L_DM_MLO,51,C,3,Asymmetrical increased density,asymmetry,1,Malignant,DM,1


In [7]:
df['image_id']  = df['image_id'].apply(lambda x: x.strip())