# Data Pickler and Seperator

In [9]:
import pickle
import dicom
import os
import numpy as np
from sklearn.utils import shuffle

path = './sample_images/'
dcmFiles = []
for dirName, subDirList, fileList in os.walk(path):
    for fileName in fileList:
        if '.dcm' in fileName.lower():
            dcmFiles.append(os.path.join(dirName,fileName))
            
id_labels = dict()
with open('stage1_labels.csv', 'r') as labelFile:
    for i,line in enumerate(labelFile):
        if i == 0: continue
        p_id, cancer = line.strip().split(',')
        id_labels[p_id] = int(cancer)


In [10]:
def convert_images(id_labels, dFiles):
    labeless_images = []
    labeled_images = []
    labels = []
    for file in dFiles:
        ref = dicom.read_file(file)
        p_id = ref.PatientID
        if p_id in id_labels:
            labeled_images.append(ref.pixel_array)
            labels.append(id_labels[p_id])
        else:
            labeless_images.append(ref.pixel_array)
    return np.array(labeled_images, dtype=np.float32), np.array(labels), np.array(labeless_images, dtype=np.float32)

In [11]:
labeled_data, data_labels, labeless_data = convert_images(id_labels, dcmFiles)
print("Labeled Data Shape (n_samples,rowpix,colpix): " + str(labeled_data.shape))
print("Unlabeled Data Shape (n_samples,rowpix,colpix): " + str(labeless_data.shape))


Labeled Data Shape (n_samples,rowpix,colpix): (3408, 512, 512)
Unlabeled Data Shape (n_samples,rowpix,colpix): (196, 512, 512)


In [13]:
labeled_data, data_labels = shuffle(labeled_data, data_labels)

train_index = int(labeled_data.shape[0]*0.75)
valid_index = int(labeled_data.shape[0]*0.85)

datas = [labeled_data[:train_index], labeled_data[train_index:valid_index], labeled_data[valid_index:]]
labels = [data_labels[:train_index], data_labels[train_index:valid_index], data_labels[valid_index:]]

dictionaries = [dict() for x in range(3)]
for i,d in enumerate(dictionaries):    
    d['features'] = datas[i]
    d['labels'] = labels[i]


print("Training Shape: " + str(datas[0].shape))
print("\t Training Cancer Ratio: " + str(np.sum(labels[0])/labels[0].shape[0]))
print("Validation Shape: " + str(datas[1].shape))
print("\t Validation Cancer Ratio: " + str(np.sum(labels[1])/labels[1].shape[0]))
print("Test Shape: " + str(datas[2].shape))
print("\t Test Cancer Ratio: " + str(np.sum(labels[2])/labels[2].shape[0]))

Training Shape: (2556, 512, 512)
	 Training Cancer Ratio: 0.319248826291
Validation Shape: (340, 512, 512)
	 Validation Cancer Ratio: 0.276470588235
Test Shape: (512, 512, 512)
	 Test Cancer Ratio: 0.294921875


In [14]:
## Dictionary testing
print("Training Shape: " + str(dictionaries[0]['features'].shape))
print("\t Training Cancer Ratio: " + str(np.sum(dictionaries[0]['labels'])/dictionaries[0]['labels'].shape[0]))
print("Validation Shape: " + str(dictionaries[1]['features'].shape))
print("\t Validation Cancer Ratio: " + str(np.sum(dictionaries[1]['labels'])/dictionaries[1]['labels'].shape[0]))
print("Test Shape: " + str(dictionaries[2]['features'].shape))
print("\t Test Cancer Ratio: " + str(np.sum(dictionaries[2]['labels'])/dictionaries[2]['labels'].shape[0]))

Training Shape: (2556, 512, 512)
	 Training Cancer Ratio: 0.319248826291
Validation Shape: (340, 512, 512)
	 Validation Cancer Ratio: 0.276470588235
Test Shape: (512, 512, 512)
	 Test Cancer Ratio: 0.294921875


In [21]:
# names = ['train', 'validation', 'test']
# for name,d in zip(names,dictionaries):
#     print(name)
#     pickle.dump(d, open(name+'.p', 'wb'))

pickle.dump(dictionaries[0], open('foo.p', 'wb'))

OSError: [Errno 22] Invalid argument