# PREPROCESSING

I want to avoid the problem I had while training the autoencoder: I had to re-shuffle the training and validation sets every time I closed the notebook. Instead, I will shuffle once here and save the shuffled data and labels as .npy files.

In [8]:
import numpy as np
import random

# Load redshift data and labels

In [10]:
# Read the image stack and the matching per-image labels from disk.
images = np.load("redshift0750_data.npy")
labels_all = np.load("redshift0750_labels.npy")

# Show both shapes so a sample-count mismatch is visible right away.
print(images.shape, labels_all.shape, sep="\n")

# Convert both arrays to Python lists holding one entry per sample.
images = [sample for sample in images]
labels_all = [label for label in labels_all]

(2400, 200, 200)
(2400,)


# Shuffle

In [11]:
# Shuffle images and labels together with one seeded permutation, so the
# image/label pairing is preserved and the same ordering (and therefore the
# same train/test split) is produced on every run of the notebook.
SHUFFLE_SEED = 0  # change to draw a different, but still reproducible, ordering

assert len(images) == len(labels_all), "images and labels must have the same length"

shuffle_order = list(range(len(images)))
# A private Random instance leaves the global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(shuffle_order)

# Index both arrays with the same permutation in a single pass, instead of
# deleting from the middle of a Python list once per sample (quadratic work).
shuffled_images = np.asarray(images)[shuffle_order]
shuffled_labels_all = np.asarray(labels_all)[shuffle_order]

images = shuffled_images
labels_all = shuffled_labels_all

In [12]:
# Report the shapes of the image array and the label array, one per line.
print(images.shape, labels_all.shape, sep="\n")

(2400, 200, 200)
(2400,)


### While I'm here, I might as well format the labels so that the model can read them, and do the rest of the preprocessing

In [13]:
# Scale the pixel values by a fixed divisor (the "_n" suffix means "normalized").
# NOTE(review): `images_n` is recomputed further down with a divisor of 40.0,
# so this 70.0 version is only used by the reshape cells directly below.
# Confirm which divisor is intended.
images_n = np.true_divide(images, 70.0)
print("max : ", images_n.max())
print("min : ", images_n.min())
# Last expression of the cell: displays the (unchanged) shape of the raw images.
images.shape

max :  0.5033855
min :  0.0


(2400, 200, 200)

# Reshape the images for feeding through the encoder

In [14]:
# Copy the normalized stack and insert a length-1 axis at position 3,
# e.g. (2400, 200, 200) -> (2400, 200, 200, 1).
im_reshape = np.array(images_n)[:, :, :, np.newaxis]

In [15]:
# Materialize the reshaped data as its own array, then report its shape.
im_reshape = np.array(im_reshape, copy=True)
print("this is the shape", im_reshape.shape)

this is the shape (2400, 200, 200, 1)


In [16]:
# Peek at the first 15 labels, one per line.
print(*(labels_all[k] for k in range(15)), sep="\n")

0.07
0.08
0.108
0.104
0.098
0.098
0.092
0.098
0.13
0.106
0.098
0.098
0.106
0.104
0.07


# format the labels so the network can read them

In [17]:
# Collect the distinct label values in ascending order; each distinct value
# is treated as one class by the encoding cell below.
l = sorted(set(labels_all))
print(l, "\nThere are", len(l), "labels. So we want",
      len(l), "classes")

[0.07, 0.08, 0.092, 0.094, 0.096, 0.098, 0.102, 0.104, 0.106, 0.108, 0.12, 0.13] 
There are 12 labels. So we want 12 classes


In [18]:
# One-hot encode every label: row k gets a single 1 in the column given by
# the position of labels_all[k] inside the sorted class list `l`.
class_ids = [l.index(value) for value in labels_all]
labels = np.zeros((len(class_ids), len(l)))
labels[np.arange(len(class_ids)), class_ids] = 1

print("train_labels shape =", labels.shape,
      "\n\nFirst 3 entries:")
print(labels[:3])

train_labels shape = (2400, 12) 

First 3 entries:
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


## normalize

In [19]:
# Rescale the pixel values with a divisor of 40.0 for this dataset; the result
# replaces any `images_n` computed earlier in the notebook.
images_n = np.divide(images, 40.0)
print("max :", images_n.max())
print("min :", images_n.min())

max : 0.8809246
min : 0.0


# augment dimension

In [None]:
# Copy the normalized images, insert a length-1 axis at position 3, and store
# the result as a fresh array before reporting its shape.
im_reshape = np.array(np.array(images_n)[:, :, :, np.newaxis])
print("this is the shape", im_reshape.shape)

# train test split

In [20]:
# Split into train/test by position. The data was shuffled earlier in the
# notebook, so a contiguous cut yields a random split.
ratio = 0.9  # fraction of samples assigned to the training set

# Images and labels are sliced with the same cut index, so they must line up.
# This guards against a stale `im_reshape` or `labels` left over from running
# cells out of order.
assert len(im_reshape) == len(labels), (
    "im_reshape and labels have different lengths; re-run the cells above in order"
)

cut = int(len(im_reshape) * ratio)
images_train, images_test = im_reshape[:cut], im_reshape[cut:]
labels_train, labels_test = labels[:cut], labels[cut:]
print("the shape of the arrays are", images_train.shape, "modify the nn appropriately")

the shape of the arrays are (2160, 200, 200) modify the nn appropriately


# save them

In [21]:
# Write the four split arrays to .npy files so the shuffled split can be
# reloaded later without reshuffling.
split_files = (
    ("redshift0750_data_train.npy", images_train),
    ("redshift0750_data_test.npy", images_test),
    ("redshift0750_labels_train.npy", labels_train),
    ("redshift0750_labels_test.npy", labels_test),
)
for file_name, split_array in split_files:
    np.save(file_name, split_array)