In [1]:
import numpy as np
import pandas as pd
import os
import re
import shutil

In [58]:
## rename the files to include the patient id so we can match the image up with the labels
## also copy the images to a single directory so we have them all in one place

def _first_subdirectory(folder):
    """Return the path of the alphabetically first sub-directory of ``folder``, or None."""
    for entry in sorted(os.listdir(folder)):
        candidate = os.path.join(folder, entry)
        if os.path.isdir(candidate):
            return candidate
    return None


def _free_target(folder, stem, current):
    """Return ``<stem>.jpg`` (or ``<stem>_<n>.jpg``) in ``folder`` without clobbering another file."""
    target = os.path.join(folder, stem + '.jpg')
    suffix = 1
    # a target that is the file itself is fine (re-run); any other existing
    # file must not be overwritten, so move on to the next numbered name
    while target != current and os.path.exists(target):
        suffix += 1
        target = os.path.join(folder, stem + '_' + str(suffix) + '.jpg')
    return target


def rename_and_copy_files(path, sourcedir="JPEG512", destdir="AllJPEGS"):
    """Rename each image after its patient/view/side and copy it into one folder.

    Every case directory found in ``<path>/<sourcedir>`` must have a name that
    contains ``_P_<digits>_``, ``_LEFT_`` or ``_RIGHT_``, and ``CC`` or ``MLO``.
    The images are taken from the first sub-directory of the first
    sub-directory of the case directory, renamed in place to
    ``<patient>_<view>_<side>.jpg`` and then copied into ``<path>/<destdir>``.

    Parameters
    ----------
    path : str
        Dataset root that contains ``sourcedir``. A trailing separator is
        accepted but no longer required.
    sourcedir : str
        Name of the folder holding one directory per case.
    destdir : str
        Name of the folder that receives the copies; created if missing.

    Notes
    -----
    * Entries whose name does not carry all three tags, or that have no nested
      image folder, are skipped instead of raising ``IndexError``.
    * When a folder holds several images, the first (alphabetically) gets the
      plain name and the others get a ``_2``, ``_3``, ... suffix, so no file is
      silently overwritten during the rename.
    * Images from different case directories that resolve to the same name
      still replace each other in the destination folder.
    * Running the function again on already-renamed folders leaves the names
      untouched.
    """
    source_path = os.path.join(path, sourcedir)
    destination_path = os.path.join(path, destdir)

    # list the source first so a missing source fails before anything is created
    directories = sorted(os.listdir(source_path))

    # make sure the destination directory exists (once, not once per file)
    os.makedirs(destination_path, exist_ok=True)

    for directory in directories:
        # get the patient number, breast side and image view from the directory name
        patient_match = re.search(r"_(P_\d+)_", directory)
        side_match = re.search(r"_(LEFT|RIGHT)_", directory)
        view_match = re.search(r"(CC|MLO)", directory)
        if not (patient_match and side_match and view_match):
            continue

        case_dir = os.path.join(source_path, directory)
        if not os.path.isdir(case_dir):
            continue

        # the images live two directory levels below the case directory
        study_dir = _first_subdirectory(case_dir)
        series_dir = _first_subdirectory(study_dir) if study_dir else None
        if series_dir is None:
            continue

        stem = patient_match.group(1) + '_' + view_match.group(1) + '_' + side_match.group(1)

        for file in sorted(os.listdir(series_dir)):
            old_name = os.path.join(series_dir, file)
            if not os.path.isfile(old_name):
                continue

            # rename the file so we know where it came from
            new_name = _free_target(series_dir, stem, old_name)
            if new_name != old_name:
                os.rename(old_name, new_name)

            # copy the files so they are all in one directory
            shutil.copy(new_name, destination_path)

In [59]:
# run the rename/copy step over both mass image sets, test first then train
for dataset_root in ("data/new/Mass Test/", "data/new/Mass Train/"):
    rename_and_copy_files(dataset_root)

In [101]:
# create labels for images

# column names assigned by position after the three file-path columns are
# dropped and image_name is appended; shared by the mass and calc frames so
# they can be concatenated
LABEL_COLUMNS = ["PATIENT_ID","BREAST_DENSITY","SIDE","VIEW","ABNORMALITY_ID","ABNORMALITY_TYPE","MASS_SHAPE","MASS_MARGINS","ASSESSMENT","PATHOLOGY", "SUBLETY","IMAGE_NAME"]


def _load_case_labels(csv_name):
    """Read one case-description CSV from data/new and shape it into a label frame.

    Adds an ``image_name`` column built as ``<patient_id>_<image view>_<left or
    right breast>.jpg``, drops the three file-path columns, renames the
    remaining columns by position to ``LABEL_COLUMNS`` and appends a ``CLASS``
    column equal to ``<PATHOLOGY>_<ABNORMALITY_TYPE>``.

    Parameters
    ----------
    csv_name : str
        File name of the CSV inside ``data/new``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row with the ``LABEL_COLUMNS`` columns plus ``CLASS``.
    """
    cases = pd.read_csv(os.path.join("data", "new", csv_name))
    cases['image_name'] = cases['patient_id'] + '_' + cases['image view'] + '_' + cases['left or right breast'] + '.jpg'
    cases = cases.drop(columns=["image file path","cropped image file path","ROI mask file path"])
    # positional rename: raises ValueError if the CSV layout does not yield 12 columns
    cases.columns = list(LABEL_COLUMNS)
    cases['CLASS'] = cases['PATHOLOGY'] + '_' + cases['ABNORMALITY_TYPE']
    return cases


mass_test_data = _load_case_labels("mass_case_description_test_set.csv")
mass_train_data = _load_case_labels("mass_case_description_train_set.csv")
calc_test_data = _load_case_labels("calc_case_description_test_set.csv")
calc_train_data = _load_case_labels("calc_case_description_train_set.csv")

# stack mass and calc cases, then set the image name to be the index
train_labels = pd.concat([mass_train_data, calc_train_data], axis=0).set_index("IMAGE_NAME")
test_labels = pd.concat([mass_test_data, calc_test_data], axis=0).set_index("IMAGE_NAME")

# save the data
train_labels.to_pickle(os.path.join("data","train_labels.pkl"))
test_labels.to_pickle(os.path.join("data","test_labels.pkl"))

In [102]:
# spot-check: look up the label entry for one training image by its file name
# (train_labels is indexed by IMAGE_NAME)
train_labels.loc['P_01888_MLO_RIGHT.jpg']

PATIENT_ID                 P_01888
BREAST_DENSITY                   2
SIDE                         RIGHT
VIEW                           MLO
ABNORMALITY_ID                   1
ABNORMALITY_TYPE              mass
MASS_SHAPE               IRREGULAR
MASS_MARGINS            SPICULATED
ASSESSMENT                       5
PATHOLOGY                MALIGNANT
SUBLETY                          5
CLASS               MALIGNANT_mass
Name: P_01888_MLO_RIGHT.jpg, dtype: object

In [106]:
# report (rows, columns) of each label frame, test set first
for caption, label_frame in (("Test labels:", test_labels), ("Train labels:", train_labels)):
    print(caption, label_frame.shape)

Test labels: (704, 12)
Train labels: (2864, 12)
