# Parsing training and validation sets + saving test set

https://github.com/conor-horgan/DeepeR has a link to a dataset of spectra. 

The database is composed of paired examples (noisy and high SNR versions of the same spectra)

Here, we parse this dataset to train our unsupervised spectral denoising framework.

In [1]:
# .mat files are not in hdf5 format for some reason, so we use scipy.io to read them...
import numpy as np
import scipy.io
import random

# Fractions of the provided training data assigned to our training and
# validation subsets; together they must not exceed the whole set.
train_split = 0.8
validation_split = 0.2
# Raise explicitly rather than assert: assert statements are removed when
# Python runs with -O, which would silently skip this sanity check.
if train_split + validation_split > 1:
    raise ValueError(
        'train_split + validation_split must not exceed 1, got {} + {}'.format(
            train_split, validation_split
        )
    )
#use this if we'd like to split across the combined set formed by their train/test files
#test_split = 1 - (train_split + validation_split)

# scipy.io.loadmat returns a dict; the array of interest is stored under the
# key matching the file name. "Inputs" files are bound to the hn_* names and
# "Outputs" files to the ln_* names (presumably high-noise / low-noise
# versions of the same spectra -- confirm against the dataset description).
hn_train = scipy.io.loadmat('Train_Inputs.mat')['Train_Inputs']
ln_train = scipy.io.loadmat('Train_Outputs.mat')['Train_Outputs']

hn_test = scipy.io.loadmat('Test_Inputs.mat')['Test_Inputs']
ln_test = scipy.io.loadmat('Test_Outputs.mat')['Test_Outputs']

In [2]:
#use this if we'd like to split across the combined set formed by their train/test files
#hn_all = np.concatenate((hn_train,hn_test), axis=0)
#ln_all = np.concatenate((ln_train,ln_test), axis=0)

In [3]:
# Only the provided training arrays form the pool that gets split into our
# training/validation subsets (the combined train+test pool is the
# commented-out alternative in the previous cell).
hn_all, ln_all = hn_train, ln_train

In [4]:
# Number of spectra available for splitting (rows along axis 0 of hn_all).
n_spectra = np.shape(hn_all)[0]
# The indices built here are later used to index both hn_all and ln_all
# along axis 0, so the two arrays must have the same number of rows.
if np.shape(ln_all)[0] != n_spectra:
    raise ValueError(
        'hn_all and ln_all must contain the same number of spectra, got {} and {}'.format(
            n_spectra, np.shape(ln_all)[0]
        )
    )

# randomise the order of spectra by drawing every index exactly once, in random order
randomised_indices = random.sample(range(n_spectra), n_spectra)

# split these indices to define the training and validation examples;
# the boundaries are computed once so both slices agree on where training ends
n_train = int(np.round(train_split * n_spectra))
n_valid = int(np.round(validation_split * n_spectra))
train_indices = randomised_indices[:n_train]
valid_indices = randomised_indices[n_train:n_train + n_valid]
#use this if we'd like to split across the combined set formed by their train/test files
#test_indices = randomised_indices[n_train + n_valid:]


In [5]:
# Build the training and validation arrays from the shuffled index lists.
# Midpoints of each index list: everything before the midpoint selects hn
# spectra, everything from the midpoint onwards selects ln spectra.
train_mid = int(round(len(train_indices) / 2))
valid_mid = int(round(len(valid_indices) / 2))

# The training set is unpaired, so each spectra type only gets one half of
# the training indices (the two halves do not share any index).
hn_train_set = hn_all[train_indices[:train_mid]]
ln_train_set = ln_all[train_indices[train_mid:]]

# Unpaired validation set, split the same way as the training set.
hn_valid_set = hn_all[valid_indices[:valid_mid]]
ln_valid_set = ln_all[valid_indices[valid_mid:]]

# Paired (supervised) validation set: both arrays are taken at the same
# indices as hn_valid_set, so a supervised validation loss can be compared
# with the unsupervised validation loss.
hn_valid_set_sup = hn_all[valid_indices[:valid_mid]]
ln_valid_set_sup = ln_all[valid_indices[:valid_mid]]


#hn_test_set = hn_all[test_indices]
#ln_test_set = ln_all[test_indices]

In [6]:
# The original test arrays are reused unchanged as our test set.
hn_test_set, ln_test_set = hn_test, ln_test

In [7]:
# Write every split to its own .npy file in the current working directory,
# in the order: training, validation, test, supervised (paired) validation.
for npy_name, spectra in (
    ('hn_train_set.npy', hn_train_set),
    ('ln_train_set.npy', ln_train_set),
    ('hn_valid_set.npy', hn_valid_set),
    ('ln_valid_set.npy', ln_valid_set),
    ('hn_test_set.npy', hn_test_set),
    ('ln_test_set.npy', ln_test_set),
    ('hn_valid_set_sup.npy', hn_valid_set_sup),
    ('ln_valid_set_sup.npy', ln_valid_set_sup),
):
    np.save(npy_name, spectra)



In [8]:
# Test whether any indices are shared between the training and validation
# index lists; prints False when the two lists are disjoint.
# np.isin replaces np.in1d (deprecated as of NumPy 2.0), and np.any also
# handles empty index lists, where np.max on the mask would raise.
print(np.any(np.isin(train_indices, valid_indices)))

False


In [9]:
# First half of the shuffled training indices (the half used to select the
# hn training spectra).
hn_stop = int(round(len(train_indices) / 2))
hn = train_indices[:hn_stop]

In [10]:
# Second half of the shuffled training indices (the half used to select the
# ln training spectra).
ln_start = int(round(len(train_indices) / 2))
ln = train_indices[ln_start:]

In [11]:
# Check that the hn and ln index lists share no index; prints False when
# they are disjoint. np.isin replaces np.in1d (deprecated as of NumPy 2.0),
# and np.any also handles empty lists, where np.max on the mask would raise.
print(np.any(np.isin(hn, ln)))

False
