# PREPROCESSING

I want to avoid the problem I had while training the autoencoder: I had to re-shuffle the training and validation sets every time I closed the notebook. Instead, I will shuffle once here and save the shuffled data and labels as .npy files.

In [58]:
import numpy as np
import random
import os

# Load redshift data labels

In [59]:
# Show what is in the current working directory (the .npy inputs are expected here).
os.listdir(".")

['preprocessing.ipynb',
 'CNN1_fixed_esc_frac_0070_redshift_train_with_autoencoder.ipynb',
 'esc_frac_0070_labels.npy',
 'CNN1_fixed_esc_frac_0070_redshift_train_no_encoder.ipynb',
 'esc_frac_0070_data.npy',
 '.ipynb_checkpoints']

In [60]:
# Read the image stack and its labels from the .npy files in the working directory.
images = np.load("esc_frac_0070_data.npy")
labels_all = np.load("esc_frac_0070_labels.npy")

# Report both shapes, images first.
print(images.shape, labels_all.shape, sep="\n")

# Convert both to plain Python lists (one entry per image / per label).
images, labels_all = list(images), list(labels_all)

(3400, 200, 200)
(3400,)


# Shuffle

In [61]:
# Shuffle images and labels together using ONE shared random ordering, so every
# image stays paired with its own label.
#
# The ordering comes from a dedicated, seeded random.Random instance: running
# the notebook from a fresh kernel therefore always produces the same shuffle
# (and hence the same saved train/val/test files). Change SHUFFLE_SEED to draw
# a different shuffle.
SHUFFLE_SEED = 0

# np.asarray accepts the lists built by the loading cell as well as arrays left
# over from an earlier run of this cell, so re-running the cell does not fail.
images = np.asarray(images)
labels_all = np.asarray(labels_all)

if len(images) != len(labels_all):
    raise ValueError(
        "images and labels must have the same length, got "
        f"{len(images)} images and {len(labels_all)} labels"
    )

# Build a random ordering of the sample indices and apply it to both arrays
# at once (integer-array indexing returns new, reordered arrays).
order = list(range(len(images)))
random.Random(SHUFFLE_SEED).shuffle(order)

images = images[order]
labels_all = labels_all[order]

In [62]:
# Sanity check: report the shape of the image array after shuffling.
print(np.shape(images))

(3400, 200, 200)


### While I'm here, I might as well format the labels so that the model can read them, and do the preprocessing

In [63]:
# normalizes images
images_n = images / 70.0 # n for normalized
print("max : ", np.max(images_n))
print("min : ", np.min(images_n))
images.shape

max :  0.9531142
min :  0.0


(3400, 200, 200)

# Reshape the images for feeding through the encoder

In [64]:
# Copy the normalized images and insert a new axis of length 1 at position 3.
im_reshape = np.array(images_n, copy=True)[:, :, :, np.newaxis]

In [65]:
# Re-wrap the reshaped data in a fresh array (this makes another copy),
# then report its shape.
im_reshape = np.array(im_reshape, copy=True)
print("this is the shape", np.shape(im_reshape))

this is the shape (3400, 200, 200, 1)


In [66]:
# Peek at the first 15 labels after shuffling, one per line.
print(*(labels_all[position] for position in range(15)), sep="\n")

6.5
8.5
13.0
8.5
10.5
12.0
11.5
6.0
6.5
8.0
6.5
7.5
10.5
6.0
8.0


## Format the labels so the network can read them

In [67]:
# display the labels
l = []
for i in labels_all:
    if i not in l: l.append(i)
l.sort()
print(l, "\nThere are", len(l), "labels. So the want",
     len(l), "classes")

[5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0, 12.5, 13.0] 
There are 17 labels. So the want 17 classes


In [68]:
# define the train labels in len 17 arrays
labels=[]
for i in labels_all:
    arr = np.zeros(len(l))
    index = l.index(i)
    arr[index] = 1
    labels.append(arr[:])
    
labels = np.array(labels)
print("train_labels shape =", labels.shape,
     "\n\nFirst 3 entries:")
print(labels[:3])

train_labels shape = (3400, 17) 

First 3 entries:
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## normalize

In [70]:
# normalize images
images_n = images / 70.0 # for some reason this one is 40...
# it's the data not me
print("max :", np.max(images_n))
print("min :", np.min(images_n))

max : 0.9531142
min : 0.0


## add a trailing dimension

In [71]:
# Copy the normalized images, insert a new axis of length 1 at position 3,
# copy once more into a fresh array, and report the final shape.
im_reshape = np.array(images_n, copy=True)
im_reshape = np.array(im_reshape[:, :, :, np.newaxis], copy=True)
print("this is the shape", np.shape(im_reshape))

this is the shape (3400, 200, 200, 1)


## train / validation / test split

In [76]:
# Cumulative split points: the first 75% of the samples go to training, the
# next 10% (up to 85%) to validation, and the remainder to testing.
ratios = [0.75, 0.85]
n_samples = len(im_reshape)
cut1, cut2 = (int(n_samples * ratio) for ratio in ratios)

# Cut images and labels at the same two indices so they stay aligned.
images_train, images_val, images_test = np.split(im_reshape, [cut1, cut2])
labels_train, labels_val, labels_test = np.split(labels, [cut1, cut2])

In [75]:
# Display the shape of the training images as the cell's output.
np.shape(images_train)

(2550, 200, 200, 1)

# save them

In [77]:
# Write every split to its own .npy file in the current working directory:
# the three image arrays first, then the three matching label arrays.
for filename, array in (
    ("esc0070_data_train.npy", images_train),
    ("esc0070_data_val.npy", images_val),
    ("esc0070_data_test.npy", images_test),
    ("esc0070_labels_train.npy", labels_train),
    ("esc0070_labels_val.npy", labels_val),
    ("esc0070_labels_test.npy", labels_test),
):
    np.save(filename, array)