In [1]:
import numpy as np
import pandas as pd
import os
import re
import shutil

In [26]:
## rename the files to include the patient id so we can match the image up with the labels
## also copy the images to a single directory so we have them all in one place
def rename_and_copy_files(path, sourcedir="JPEG512", destdir="AllJPEGS512"):
    directories = os.listdir(path+sourcedir)
    source_path = path + sourcedir + "/"
    destination_path = path + destdir + "/"
    
    # keep a counter so each file has a unique name
    i = 1
    
    # loop through the directories
    for directory in directories:
        # get the patient number and image type from the directory name
        patient_id = str(re.findall("_(P_[\d]+)_", directory))
        if len(patient_id) > 0:
            patient_id = patient_id[0]
        else:
            continue
            
        image_side = str(re.findall("_(LEFT|RIGHT)_", directory))
        
        if len(image_side) > 0:
            image_side = image_side[0]
        else:
            continue
        
        image_type = str(re.findall("(CC|MLO)", directory))
        if len(image_type) > 0:
            image_type = image_type[0]
        else:
            continue
        
        if not patient_id:
            continue
            
        # get the subdirectories
        subdir = os.listdir(source_path+directory)

        # get the next level of subdirectories
        subsubdir = os.listdir(source_path+directory+'/'+subdir[0])

        # get the files 
        files = os.listdir(source_path+directory+'/'+subdir[0]+'/'+subsubdir[0])
        path = source_path+directory+'/'+subdir[0]+'/'+subsubdir[0]

        for file in files:
            # rename the file so we know where it came from
            # some of the data is not properly labeled, if that is the case skip it since we won't be able to label it
            try:
                new_name = path+'/'+patient_id+'_'+image_side+'_'+image_type+'.jpg'
                os.rename(path+'/'+file, new_name)
            except:
                continue
                
            # make sure the destination directory exists
            try:
                os.stat(destination_path)
            except:
                os.mkdir(destination_path)  
            
            # copy the files so they are all in one directory
            shutil.copy(new_name, destination_path)

        i += 1

In [7]:
rename_and_copy_files("data/new/Mass Test/", sourcedir="JPEG299", destdir="AllJPEGS299")
rename_and_copy_files("data/new/Mass Train/", sourcedir="JPEG299", destdir="AllJPEGS299")

In [60]:
rename_and_copy_files("data/new/Mass Test/", sourcedir="JPEG512", destdir="AllJPEGS512")
rename_and_copy_files("data/new/Mass Train/", sourcedir="JPEG512", destdir="AllJPEGS512")

In [27]:
rename_and_copy_files("data/new/Calc Test/", sourcedir="JPEG299", destdir="AllJPEGS299")
rename_and_copy_files("data/new/Calc Train/", sourcedir="JPEG299", destdir="AllJPEGS299")

In [2]:
# create labels for images
mass_test_data = pd.read_csv(os.path.join("data", "new", "mass_case_description_test_set.csv"))
mass_test_data['image_name'] = mass_test_data.patient_id + '_' + mass_test_data['left or right breast'] + '_' + mass_test_data['image view'] + '.jpg'
mass_test_data.drop(["image file path","cropped image file path","ROI mask file path"], axis=1, inplace=True)
mass_test_data.columns = ["PATIENT_ID","BREAST_DENSITY","SIDE","VIEW","ABNORMALITY_ID","ABNORMALITY_TYPE","MASS_SHAPE","MASS_MARGINS","ASSESSMENT","PATHOLOGY", "SUBLETY","IMAGE_NAME"]

mass_train_data = pd.read_csv(os.path.join("data", "new", "mass_case_description_train_set.csv"))
mass_train_data['image_name'] = mass_train_data.patient_id + '_' + mass_train_data['left or right breast'] + '_' + mass_train_data['image view'] + '.jpg'
mass_train_data.drop(["image file path","cropped image file path","ROI mask file path"], axis=1, inplace=True)
mass_train_data.columns = ["PATIENT_ID","BREAST_DENSITY","SIDE","VIEW","ABNORMALITY_ID","ABNORMALITY_TYPE","MASS_SHAPE","MASS_MARGINS","ASSESSMENT","PATHOLOGY", "SUBLETY","IMAGE_NAME"]

calc_test_data = pd.read_csv(os.path.join("data", "new", "calc_case_description_test_set.csv"))
calc_test_data['image_name'] = calc_test_data.patient_id + '_' + calc_test_data['image view'] + '_' + calc_test_data['left or right breast'] + '.jpg'
calc_test_data.drop(["image file path","cropped image file path","ROI mask file path"], axis=1, inplace=True)
calc_test_data.columns = ["PATIENT_ID","BREAST_DENSITY","SIDE","VIEW","ABNORMALITY_ID","ABNORMALITY_TYPE","MASS_SHAPE","MASS_MARGINS","ASSESSMENT","PATHOLOGY", "SUBLETY","IMAGE_NAME"]

calc_train_data = pd.read_csv(os.path.join("data", "new", "calc_case_description_train_set.csv"))
calc_train_data['image_name'] = calc_train_data.patient_id + '_' + calc_train_data['image view'] + '_' + calc_train_data['left or right breast'] + '.jpg'
calc_train_data.drop(["image file path","cropped image file path","ROI mask file path"], axis=1, inplace=True)
calc_train_data.columns = ["PATIENT_ID","BREAST_DENSITY","SIDE","VIEW","ABNORMALITY_ID","ABNORMALITY_TYPE","MASS_SHAPE","MASS_MARGINS","ASSESSMENT","PATHOLOGY", "SUBLETY","IMAGE_NAME"]

train_labels = pd.concat([mass_train_data, calc_train_data], axis=0)
# replace BENIGN_WITHOUT_CALLBACK with BENIGN for easier classification
train_labels['PATHOLOGY'][train_labels['PATHOLOGY'] == 'BENIGN_WITHOUT_CALLBACK'] = 'BENIGN'

train_labels['CLASS'] = train_labels['PATHOLOGY'] + '_' + train_labels['ABNORMALITY_TYPE']

test_labels = pd.concat([mass_test_data, calc_test_data], axis=0)
# replace BENIGN_WITHOUT_CALLBACK with BENIGN for easier classification
test_labels['PATHOLOGY'][test_labels['PATHOLOGY'] == 'BENIGN_WITHOUT_CALLBACK'] = 'BENIGN'
test_labels['CLASS'] = test_labels['PATHOLOGY'] + '_' + test_labels['ABNORMALITY_TYPE']

# set the image name to be the index
train_labels.set_index("IMAGE_NAME", inplace=True)
test_labels.set_index("IMAGE_NAME", inplace=True)

# save the data
train_labels.to_pickle(os.path.join("data","train_labels.pkl"))
test_labels.to_pickle(os.path.join("data","test_labels.pkl"))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
