### Settings

In [None]:
from IPython.core.display import display, HTML
import sys,cv2,gc
sys.path.append('../')
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas
from Utils.utils import *
from ipywidgets import interact
import deepdish as dd
%matplotlib inline
# Let the notebook use the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Reload imported modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

from jupyterthemes import jtplot

# Plot styling; "context" is one of: paper, notebook, talk, poster.
jtplot.style(
    theme='grade3',
    context='talk',
    fscale=2.5,
    spines=True,
    gridlines='-',
    ticks=True,
    grid=True,
    figsize=(6, 4.5),
)
plotcolor = (0, 0.6, 1.0)

# Directory that holds the CSV files, the image folders and the generated .h5 files.
data_folder = 'D:/data/HPA/all/'

print('Done.')

### Create HDF5 files for training data

In [None]:
print("Collecting garbage...")
gc.collect()

#Read Labels
label_csv = pandas.read_csv(data_folder+'train.csv')
samplecount = label_csv['Id'].size
buffer_size = 5000 # How big should the buffers be?
idx = 0
buffer_idx = 0
partNr = 0

print(label_csv.head(5))


def _save_train_part(part_nr, poi, all_channels, part_labels):
    """Write one part to data_folder as poi_<part_nr>.h5 and all_channel_<part_nr>.h5.

    Both files store the images under 'X' and the label matrix under 'labels'.
    """
    dd.io.save(data_folder+'poi_'+str(part_nr)+'.h5', {'X': poi, 'labels': part_labels},compression=('blosc', 8))
    dd.io.save(data_folder+'all_channel_'+str(part_nr)+'.h5', {'X': all_channels, 'labels': part_labels},compression=('blosc', 8))


def _read_channel(path):
    """Read one image as single-channel grayscale (flag 0).

    Raises FileNotFoundError when OpenCV cannot load the file, so a missing
    image is reported by name before anything is written into the buffers.
    """
    img = cv2.imread(path, 0)
    if img is None:
        raise FileNotFoundError("Error: File not found: " + path)
    return img


# Never allocate more rows than there are samples, otherwise a data set that is
# smaller than one buffer would be saved with trailing all-zero images.
current_size = min(buffer_size, samplecount)
buffer_poi_channel = np.zeros([current_size,512,512,1],dtype = np.uint8) #protein of interest channel only
buffer_all_channels = np.zeros([current_size,512,512,4],dtype = np.uint8) #all channels
labels = np.zeros([current_size,28],dtype = bool) # plain bool: the np.bool alias no longer exists in current NumPy

for image_id, target in zip(label_csv['Id'], label_csv['Target']):

    if buffer_idx == current_size:
        # Buffer is full: write it out as one part before handling this row.
        _save_train_part(partNr, buffer_poi_channel, buffer_all_channels, labels)

        next_size = min(buffer_size, samplecount - idx)
        if next_size != current_size:
            # Last part is smaller: shrink the image buffers to the exact size.
            buffer_poi_channel = np.zeros([next_size,512,512,1],dtype = np.uint8)
            buffer_all_channels = np.zeros([next_size,512,512,4],dtype = np.uint8)
            current_size = next_size
        # The image buffers are fully overwritten by the next part, but labels
        # are only ever set to True, so they must start from a clean matrix.
        # Reusing the old one would carry the previous part's classes over.
        labels = np.zeros([current_size,28],dtype = bool)

        #move on
        buffer_idx = 0
        partNr += 1

    fn = data_folder+'train/'+image_id

    if idx % 25 == 0:
        printProgressBar (idx, samplecount, prefix = 'Creating HDF5...', suffix = '(' + str(idx) + '/' + str(samplecount) + ')')

    blue = _read_channel(fn+'_blue.png')
    green = _read_channel(fn+'_green.png')
    red = _read_channel(fn+'_red.png')
    yellow = _read_channel(fn+'_yellow.png')

    buffer_poi_channel[buffer_idx] = np.expand_dims(green,axis=2)
    buffer_all_channels[buffer_idx,:,:,0] = green
    buffer_all_channels[buffer_idx,:,:,1] = red
    buffer_all_channels[buffer_idx,:,:,2] = blue
    buffer_all_channels[buffer_idx,:,:,3] = yellow

    # 'Target' holds whitespace-separated class indices; str() also covers the
    # case where pandas parsed a single-label column as integers.
    labelNr = [int(token) for token in str(target).split()]

    labels[buffer_idx,labelNr] = True #Convert labels to bool, where entry is true if class is present

    idx += 1
    buffer_idx +=1

#save last buffer (only the rows that were actually filled)
if buffer_idx > 0:
    _save_train_part(partNr, buffer_poi_channel[:buffer_idx], buffer_all_channels[:buffer_idx], labels[:buffer_idx])

#free memory
buffer_poi_channel = None
buffer_all_channels = None

print()
print("Done.")

### Create HDF5 for test data

In [None]:
print("Collecting garbage...")
gc.collect()

#Read the ids of the test images
fn_csv = pandas.read_csv(data_folder+'sample_submission.csv')
samplecount = fn_csv['Id'].size
# One slot per listed image. A fixed length would overflow the arrays if the
# CSV had more rows, or leave all-zero images at the end if it had fewer.
buffer_size = samplecount
idx = 0

print(fn_csv.head(5))

poi_channel = np.zeros([buffer_size,512,512,1],dtype = np.uint8) #protein of interest channel only
all_channels = np.zeros([buffer_size,512,512,4],dtype = np.uint8) #all channels

for image_id in fn_csv['Id']:
    fn = data_folder+'test/'+image_id

    if idx % 25 == 0:
        printProgressBar (idx, samplecount, prefix = 'Creating HDF5...', suffix = '(' + str(idx) + '/' + str(samplecount) + ')')

    # Load every channel as grayscale (flag 0) and verify it BEFORE it is
    # written into the arrays; cv2.imread signals an unreadable file with None.
    loaded = {}
    for color in ('blue', 'green', 'red', 'yellow'):
        path = fn+'_'+color+'.png'
        img = cv2.imread(path,0)
        if img is None:
            raise FileNotFoundError("Error: File not found: " + path)
        loaded[color] = img

    poi_channel[idx] = np.expand_dims(loaded['green'],axis=2)
    all_channels[idx,:,:,0] = loaded['green']
    all_channels[idx,:,:,1] = loaded['red']
    all_channels[idx,:,:,2] = loaded['blue']
    all_channels[idx,:,:,3] = loaded['yellow']

    idx += 1

#save
data = {'X': poi_channel}
dd.io.save(data_folder+'test_poi.h5', data,compression=('blosc', 8))

data = {'X': all_channels}
dd.io.save(data_folder+'test_all_channel.h5', data,compression=('blosc', 8))

data = None #free memory
poi_channel = None
all_channels = None

print()
print("Done.")


### Create training sets with size 224x224

In [None]:
%%time
from skimage import io, transform

# Each training part exists in two flavours that are shrunk the same way:
# (file name stem, number of image channels).
train_variants = (('poi_', 1), ('all_channel_', 4))

for partNr in range(7):
    print("Resizing part ", str(partNr))

    for stem, n_channels in train_variants:
        gc.collect()
        source_path = data_folder + stem + str(partNr) + '.h5'
        print("Resizing ", source_path)

        d = dd.io.load(source_path)
        X = d['X']
        y = d['labels']
        n_images = X.shape[0]
        X_small = np.zeros((n_images, 224, 224, n_channels), dtype=np.uint8)

        for i in range(n_images):
            if i % 25 == 0:
                printProgressBar (i, n_images, prefix = 'Resizing images...', suffix = '(' + str(i) + '/' + str(n_images) + ')')
            # resize returns floats; storing them in the uint8 array casts them.
            shrunk = transform.resize(X[i].squeeze(), (224, 224), preserve_range=True)
            # Restore the trailing channel axis that squeeze() drops for 1-channel images.
            X_small[i] = shrunk.reshape(224, 224, n_channels)

        if n_channels == 1:
            # The shape is echoed for the single-channel variant only.
            print(X_small.shape)

        data = {'X': X_small, 'labels': y}
        print()
        print("Writing HDF5...")
        dd.io.save(data_folder + stem + str(partNr) + '_small.h5', data,compression=('blosc', 8))

print("Done.")

### Convert test data to 224x224

In [None]:
%%time
from skimage import io, transform

# The test set is stored in two files that are shrunk the same way:
# (file name stem, number of image channels).
test_variants = (('test_poi', 1), ('test_all_channel', 4))

for stem, n_channels in test_variants:
    gc.collect()
    print("Resizing ", data_folder + stem + '.h5')

    d = dd.io.load(data_folder + stem + '.h5')
    X = d['X']
    n_images = X.shape[0]

    X_small = np.zeros((n_images, 224, 224, n_channels), dtype=np.uint8)

    for i in range(n_images):
        if i % 25 == 0:
            printProgressBar (i, n_images, prefix = 'Resizing images...', suffix = '(' + str(i) + '/' + str(n_images) + ')')
        # resize returns floats; storing them in the uint8 array casts them.
        shrunk = transform.resize(X[i].squeeze(), (224, 224), preserve_range=True)
        # Restore the trailing channel axis that squeeze() drops for 1-channel images.
        X_small[i] = shrunk.reshape(224, 224, n_channels)

    if n_channels == 1:
        # The shape is echoed for the single-channel variant only.
        print(X_small.shape)

    data = {'X': X_small}
    print()
    print("Writing HDF5...")
    dd.io.save(data_folder + stem + '_small.h5', data,compression=('blosc', 8))

print("Done.")

### Show sample data

In [None]:
# Load the first downsized single-channel training part for inspection.
sample_path = data_folder + 'poi_0_small.h5'
d = dd.io.load(sample_path)
print("Done.")

In [None]:
# Pull the image stack out of the loaded part and report its dimensions.
X = d['X']
print(X.shape)

# Display one example image in grayscale, without its singleton channel axis.
sample_image = X[42].squeeze()
plt.imshow(sample_image, cmap='gray')