# Simple MNIST-like model for detection

Based on the Keras MNIST CNN example:
https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py

In [7]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

from os import listdir
from os.path import isfile, join
from pathlib import Path

import gzip
import pickle
import gc
from random import shuffle

import numpy as np

# Directory that holds the gzip-compressed ".pickle" sample files read by this
# notebook; the generated train/validation/test batch files are written here too.
PATH_TO_INTERMEDIATE = "../data/intermediate"

In [8]:
# Delete every pickle in the intermediate directory whose name starts with a
# split prefix; such files would otherwise match the ".pickle" listing of
# that same directory.
for split_prefix in ("train", "test", "validation"):
    for stale_file in Path(PATH_TO_INTERMEDIATE).glob(split_prefix + "*.pickle"):
        stale_file.unlink()

In [9]:
# Collect the path of every regular file in the intermediate directory whose
# name ends with ".pickle".
onlyfiles = []
for entry_name in listdir(PATH_TO_INTERMEDIATE):
    entry_path = join(PATH_TO_INTERMEDIATE, entry_name)
    if isfile(entry_path) and entry_name.endswith(".pickle"):
        onlyfiles.append(entry_path)

# Randomise the order in place (no seed is set in the visible code, so the
# order presumably differs between runs).
shuffle(onlyfiles)
print("Original number of files:", len(onlyfiles))

Original number of files: 350


In [10]:
# Number of source sample files assigned to each split.
train_files_size = 20
validation_files_size = 5
test_files_size = 5
# Number of source files merged into one generated batch file; the notebook
# checks that each split size above is a multiple of it.
batch_size = 5

In [11]:
# Sanity-check the split configuration against the files that were found.
number_of_files_to_use = sum((train_files_size, validation_files_size, test_files_size))
number_of_files_available = len(onlyfiles)

if number_of_files_available < number_of_files_to_use:
    raise ValueError(
        "Total files to use {} is greater than the number of available files {}  ".format(
            number_of_files_to_use, number_of_files_available
        )
    )

# Every split has to divide evenly into groups of batch_size files.
for split_label, split_size in (
    ("training", train_files_size),
    ("validation", validation_files_size),
    ("test", test_files_size),
):
    if split_size % batch_size:
        raise ValueError(
            "The number of {} files need to be a multiple of the batch size".format(split_label)
        )

In [12]:
# Cut three consecutive slices out of the shuffled file list: first the
# training files, then the validation files, then the test files.
validation_start = train_files_size
test_start = validation_start + validation_files_size
test_end = test_start + test_files_size

train_files = onlyfiles[:validation_start]
validation_files = onlyfiles[validation_start:test_start]
test_files = onlyfiles[test_start:test_end]

# Report how many files ended up in each split.
for split_label, split_files in (
    ("train_files", train_files),
    ("validation_files", validation_files),
    ("test_files", test_files),
):
    print('length of {}:'.format(split_label), len(split_files))

length of train_files: 20
length of validation_files: 5
length of test_files: 5


In [13]:
# Build the training batch files: each group of batch_size source files is
# merged into one pair of gzip-compressed pickles, 'train.batch.<n>.x.pickle'
# (the 'slice' values) and 'train.batch.<n>.y.pickle' (the 'is_nuclei' values).
# NOTE(review): pickle.load is only safe on trusted files; the inputs are
# assumed to be locally generated -- confirm.
gc.collect()
# Floor division keeps the batch count an exact integer (no float round-trip).
for current_batch in range(train_files_size // batch_size):
    train_x = []
    train_y = []
    batch_start = current_batch * batch_size
    for current_file in train_files[batch_start:batch_start + batch_size]:
        with gzip.open(current_file, 'rb') as f:
            sample = pickle.load(f)
        for current_sample in sample['slices']:
            train_x.append(current_sample['slice'])
            train_y.append(current_sample['is_nuclei'])
        # Drop the reference so this file's payload is not kept alive while
        # the next file is being unpickled (keeps peak memory lower).
        del sample
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'train.batch.{}.x.pickle'.format(current_batch)), 'wb') as f:
        pickle.dump(train_x, f, protocol=pickle.HIGHEST_PROTOCOL)
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'train.batch.{}.y.pickle'.format(current_batch)), 'wb') as f:
        pickle.dump(train_y, f, protocol=pickle.HIGHEST_PROTOCOL)
    gc.collect()

In [14]:
gc.collect()
for current_batch in range(int(validation_files_size/batch_size)):
    validation_x = []
    validation_y = []
    for current_file in validation_files[current_batch * batch_size : (current_batch + 1)  * batch_size]:
        sample = []
        with gzip.open(current_file,'rb') as f:
            sample = pickle.load(f)
        for current_sample in sample['slices']:
            validation_x.append(current_sample['slice'])
            validation_y.append(current_sample['is_nuclei'])
    validation_x = np.array(validation_x)
    validation_y = np.array(validation_y)
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'validation.batch.{}.x.pickle'.format(current_batch)),'wb') as f:
        pickle.dump(validation_x, f, protocol=pickle.HIGHEST_PROTOCOL)  
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'validation.batch.{}.y.pickle'.format(current_batch)),'wb') as f:
        pickle.dump(validation_y, f, protocol=pickle.HIGHEST_PROTOCOL)  
    gc.collect()

In [15]:
gc.collect()
for current_batch in range(int(test_files_size/batch_size)):
    test_x = []
    test_y = []
    for current_file in test_files[current_batch * batch_size : (current_batch + 1)  * batch_size]:
        sample = []
        with gzip.open(current_file,'rb') as f:
            sample = pickle.load(f)
        for current_sample in sample['slices']:
            test_x.append(current_sample['slice'])
            test_y.append(current_sample['is_nuclei'])
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'test.batch.{}.x.pickle'.format(current_batch)),'wb') as f:
        pickle.dump(test_x, f, protocol=pickle.HIGHEST_PROTOCOL)  
    with gzip.open(join(PATH_TO_INTERMEDIATE, 'test.batch.{}.y.pickle'.format(current_batch)),'wb') as f:
        pickle.dump(test_y, f, protocol=pickle.HIGHEST_PROTOCOL)  
    gc.collect()

MemoryError: 