# Simple MNIST-like model for detection

Based on the Keras MNIST CNN example:
https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py

In [None]:
from os import listdir
from multiprocessing import Pool, TimeoutError
from os.path import isfile, join
from pathlib import Path

import gzip
import pickle
import gc
import keras
from random import shuffle

import numpy as np

PATH_TO_INTERMEDIATE = "../../data/segmentation.pickle/0"

In [None]:
# Delete batch pickles left behind by an earlier run, in the same order as
# before: train, then test, then validation.
# NOTE(review): presumably this keeps stale outputs from being mistaken for
# input files, since outputs are written into this same directory - confirm.
for stale_pattern in ("train*.pickle", "test*.pickle", "validation*.pickle"):
    for p in Path(PATH_TO_INTERMEDIATE).glob(stale_pattern):
        p.unlink()

In [None]:
# Collect the full path of every regular ``*.pickle`` file in the
# intermediate directory, then randomise the order in place.
onlyfiles = []
for entry_name in listdir(PATH_TO_INTERMEDIATE):
    entry_path = join(PATH_TO_INTERMEDIATE, entry_name)
    if isfile(entry_path) and entry_name.endswith(".pickle"):
        onlyfiles.append(entry_path)
shuffle(onlyfiles)
print("Original number of files:", len(onlyfiles))

In [None]:
# Side length a slice must have (height and width) to be kept; also the
# spatial size the sample arrays are reshaped to.
kernel_size = 16
# Number of label classes passed to keras.utils.to_categorical.
num_classes = 2

# How many of the shuffled input files go to each split.
train_files_size = 400
validation_files_size = 116
test_files_size = 0

# How many input files are merged into one output batch per split.
# Each split size above must be a multiple of its batch size (checked below).
batch_size_train = 400
batch_size_validation = 116
batch_size_test = 10

In [None]:
# Fail early if the configured split sizes cannot be satisfied.
number_of_files_available = len(onlyfiles)
number_of_files_to_use = train_files_size + validation_files_size + test_files_size

if number_of_files_available < number_of_files_to_use:
    raise ValueError(
        "Total files to use {} is greater than the number of available files {}  ".format(
            number_of_files_to_use, number_of_files_available
        )
    )

# Each split must divide evenly into batches; checked in train, validation,
# test order so the first offending split is the one reported.
for files_size, batch_size, message in (
    (train_files_size, batch_size_train,
     "The number of training files need to be a multiple of the batch size"),
    (validation_files_size, batch_size_validation,
     "The number of validation files need to be a multiple of the batch size"),
    (test_files_size, batch_size_test,
     "The number of test files need to be a multiple of the batch size"),
):
    if files_size % batch_size:
        raise ValueError(message)

In [None]:
# Carve the shuffled file list into three consecutive, non-overlapping runs:
# training first, then validation, then test.
train_end = train_files_size
validation_end = train_end + validation_files_size
test_end = validation_end + test_files_size

train_files = onlyfiles[:train_end]
validation_files = onlyfiles[train_end:validation_end]
test_files = onlyfiles[validation_end:test_end]

print('length of train_files:', len(train_files))
print('length of validation_files:', len(validation_files))
print('length of test_files:', len(test_files))

In [None]:
gc.collect()


def create_nparray_train(current_batch, negatives_per_positive=12):
    """Build one training batch and write it out as two pickled arrays.

    Reads the gzip-pickled files in the ``current_batch``-th run of
    ``batch_size_train`` entries of ``train_files``. Only slices whose first
    two dimensions equal ``kernel_size`` are kept. Negatives are randomly
    subsampled to at most ``negatives_per_positive`` per positive, the
    combined samples are shuffled, and the result is written to
    ``train.batch.<current_batch>.x.pickle`` and
    ``train.batch.<current_batch>.y.pickle`` in ``PATH_TO_INTERMEDIATE``.

    Args:
        current_batch: Zero-based index of the batch to build.
        negatives_per_positive: Maximum number of ``is_nuclei != 1`` samples
            kept for each ``is_nuclei == 1`` sample.

    Returns:
        None. The arrays are only written to disk.
    """
    positives = []
    negatives = []
    first_file = current_batch * batch_size_train
    for current_file in train_files[first_file:first_file + batch_size_train]:
        print("Processing:{}".format(current_file[:40]))
        # NOTE(review): pickle.load can execute arbitrary code; only feed this
        # files produced by the project's own preprocessing step.
        with gzip.open(current_file, 'rb') as f:
            sample = pickle.load(f)
        for current_sample in sample['slices']:
            patch = current_sample['slice']
            if patch.shape[0] != kernel_size or patch.shape[1] != kernel_size:
                continue
            label = current_sample['is_nuclei']
            if label == 1:
                positives.append((patch, label))
            else:
                negatives.append((patch, label))

    print("Sampled")
    # Shuffle BEFORE truncating: otherwise the kept negatives all come from
    # the first files read, while the positives come from every file.
    shuffle(negatives)
    negatives = negatives[:len(positives) * negatives_per_positive]
    sample_tuples = negatives + positives
    gc.collect()

    # A single shuffle already yields a uniformly random order.
    shuffle(sample_tuples)

    print("Shuffed")

    train_x = np.array([patch for patch, _ in sample_tuples]).astype('float16')
    train_y = np.array([label for _, label in sample_tuples]).astype('float16')

    # Add a trailing channel axis of size 1 to the samples.
    train_x = train_x.reshape(train_x.shape[0], kernel_size, kernel_size, 1)
    train_y = train_y.reshape(train_y.shape[0])
    train_y = keras.utils.to_categorical(train_y, num_classes)

    print("Added Batch:{}".format(current_batch))
    with open(join(PATH_TO_INTERMEDIATE, 'train.batch.{}.x.pickle'.format(current_batch)), 'wb') as f:
        pickle.dump(train_x, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(join(PATH_TO_INTERMEDIATE, 'train.batch.{}.y.pickle'.format(current_batch)), 'wb') as f:
        pickle.dump(train_y, f, protocol=pickle.HIGHEST_PROTOCOL)


# Pool.map blocks until every batch has been written; leaving the ``with``
# block then shuts the workers down, including when a batch raises.
with Pool(processes=2) as pool:
    pool.map(create_nparray_train, range(train_files_size // batch_size_train))
# For a quick smoke test, map over range(1) instead of the full range.


In [None]:
gc.collect()
def create_nparray_validation(current_batch):
    """Build one validation batch and write it out as two pickled arrays.

    Reads the gzip-pickled files in the ``current_batch``-th run of
    ``batch_size_validation`` entries of ``validation_files``. Every slice
    whose first two dimensions equal ``kernel_size`` is kept, with no class
    subsampling. The samples are shuffled and written to
    ``validation.batch.<current_batch>.x.pickle`` and
    ``validation.batch.<current_batch>.y.pickle`` in ``PATH_TO_INTERMEDIATE``.

    Args:
        current_batch: Zero-based index of the batch to build.

    Returns:
        None. The arrays are only written to disk.
    """
    lower = current_batch * batch_size_validation
    upper = (current_batch + 1) * batch_size_validation

    nuclei_pairs = []
    other_pairs = []
    for path in validation_files[lower:upper]:
        print("Processing:{}".format(path[:40]))
        with gzip.open(path, 'rb') as handle:
            loaded = pickle.load(handle)
        for entry in loaded['slices']:
            shape = entry['slice'].shape
            # Skip anything that is not kernel_size x kernel_size.
            if shape[0] != kernel_size or shape[1] != kernel_size:
                continue
            pair = (entry['slice'], entry['is_nuclei'])
            if entry['is_nuclei'] == 1:
                nuclei_pairs.append(pair)
            else:
                other_pairs.append(pair)

    print("Sampling")
    pairs = other_pairs + nuclei_pairs
    gc.collect()

    for _round in range(5):
        shuffle(pairs)

    print("Shuffling")

    validation_x = np.array([pair[0] for pair in pairs]).astype('float16')
    validation_y = np.array([pair[1] for pair in pairs]).astype('float16')

    # Add a trailing channel axis of size 1 to the samples.
    validation_x = validation_x.reshape(validation_x.shape[0], kernel_size, kernel_size, 1)
    validation_y = keras.utils.to_categorical(
        validation_y.reshape(validation_y.shape[0]), num_classes
    )

    x_path = join(PATH_TO_INTERMEDIATE, 'validation.batch.{}.x.pickle'.format(current_batch))
    with open(x_path, 'wb') as out:
        pickle.dump(validation_x, out, protocol=pickle.HIGHEST_PROTOCOL)
    y_path = join(PATH_TO_INTERMEDIATE, 'validation.batch.{}.y.pickle'.format(current_batch))
    with open(y_path, 'wb') as out:
        pickle.dump(validation_y, out, protocol=pickle.HIGHEST_PROTOCOL)

    # Drop the large references before the explicit collection.
    validation_x = validation_y = pairs = None
    gc.collect()

pool = Pool(2)
pool.map(
    create_nparray_validation,
    range(int(validation_files_size / batch_size_validation)),
)
pool.close()
pool.terminate()
    

In [None]:
gc.collect()
def create_nparray_test(current_batch):
    """Build one test batch and write it out as two pickled arrays.

    Reads the gzip-pickled files in the ``current_batch``-th run of
    ``batch_size_test`` entries of ``test_files``. Only slices whose first
    two dimensions equal ``kernel_size`` are kept. Negatives are shuffled and
    capped at four per positive. The result is written to
    ``test.batch.<current_batch>.x.pickle`` and
    ``test.batch.<current_batch>.y.pickle`` in ``PATH_TO_INTERMEDIATE``.
    The combined samples are not shuffled: negatives come first, then
    positives.

    Args:
        current_batch: Zero-based index of the batch to build.

    Returns:
        None. The arrays are only written to disk.
    """
    start = current_batch * batch_size_test
    stop = (current_batch + 1) * batch_size_test

    with_nuclei = []
    without_nuclei = []
    for path in test_files[start:stop]:
        with gzip.open(path, 'rb') as handle:
            loaded = pickle.load(handle)
        for entry in loaded['slices']:
            shape = entry['slice'].shape
            # Skip anything that is not kernel_size x kernel_size.
            if shape[0] != kernel_size or shape[1] != kernel_size:
                continue
            pair = (entry['slice'], entry['is_nuclei'])
            if entry['is_nuclei'] == 1:
                with_nuclei.append(pair)
            else:
                without_nuclei.append(pair)

    for _round in range(5):
        shuffle(without_nuclei)

    # Keep at most four negatives for every positive.
    negative_cap = len(with_nuclei) * 4
    without_nuclei = without_nuclei[:negative_cap]
    pairs = without_nuclei + with_nuclei
    gc.collect()

    test_x = np.array([pair[0] for pair in pairs]).astype('float16')
    test_y = np.array([pair[1] for pair in pairs]).astype('float16')

    # Add a trailing channel axis of size 1 to the samples.
    test_x = test_x.reshape(test_x.shape[0], kernel_size, kernel_size, 1)
    test_y = keras.utils.to_categorical(test_y.reshape(test_y.shape[0]), num_classes)

    x_path = join(PATH_TO_INTERMEDIATE, 'test.batch.{}.x.pickle'.format(current_batch))
    with open(x_path, 'wb') as out:
        pickle.dump(test_x, out, protocol=pickle.HIGHEST_PROTOCOL)
    y_path = join(PATH_TO_INTERMEDIATE, 'test.batch.{}.y.pickle'.format(current_batch))
    with open(y_path, 'wb') as out:
        pickle.dump(test_y, out, protocol=pickle.HIGHEST_PROTOCOL)

    # Drop the large references before the explicit collection.
    test_x = test_y = pairs = None
    gc.collect()

pool = Pool(1)
pool.map(
    create_nparray_test,
    range(int(test_files_size / batch_size_test)),
)
pool.close()
pool.terminate()
    
    