# Preprocessing

In [7]:
import numpy as np
import random
import os

# Load the images and their labels (escape fraction, redshift)

In [8]:
# Show what is in the current working directory (the .npy inputs are
# loaded from here by the next cell).
os.listdir(os.curdir)

['CNN4_classify_esc_frac_direct_cnn.ipynb',
 'preprocessing.ipynb',
 'CNN4_classify_esc_frac_with_autoencoder.ipynb',
 'CNN5_classify_redshift_direct_cnn.ipynb',
 'big_mix_labels_6528.npy',
 '.ipynb_checkpoints',
 'big_mix_data_6528.npy',
 'CNN3_classify_redshift_with_autoencoder.ipynb']

In [9]:
# Read the label table and the image stack from the .npy files in the
# working directory.
labels_all = np.load("big_mix_labels_6528.npy")
images = np.load("big_mix_data_6528.npy")

# Report both shapes, images first and labels second.
print(images.shape, labels_all.shape, sep="\n")

# Turn each array into a Python list with one entry per sample.
images, labels_all = list(images), list(labels_all)

(6528, 200, 200)
(6528, 2)


# Shuffle

In [10]:
# Shuffle images and labels together with ONE shared permutation, so that
# every image keeps its own label row.
#
# Why this replaces the old "pick a random index, append, del" loop:
#   * it is seeded, so Restart & Run All reproduces the same order and
#     therefore the same train/val/test split;
#   * deleting from the middle of a list is O(n) per step, i.e. O(n^2)
#     overall; a single fancy-index is O(n);
#   * the old loop tested `while images:`, which raises ValueError once
#     `images` has been rebound to an ndarray, so the cell could not be
#     re-run. np.asarray accepts both the list and the ndarray form.
SHUFFLE_SEED = 0

images = np.asarray(images)
labels_all = np.asarray(labels_all)
assert len(images) == len(labels_all), "images and labels must have the same length"

# A dedicated Random instance keeps the seed local and leaves the global
# `random` state untouched.
shuffle_order = list(range(len(images)))
random.Random(SHUFFLE_SEED).shuffle(shuffle_order)

# Apply the same order to both arrays (fancy indexing returns new arrays).
images = images[shuffle_order]
labels_all = labels_all[shuffle_order]
print(images.shape)

(6528, 200, 200)


## Normalise the images and make the labels machine readable

In [11]:
# Scale every pixel by the fixed divisor 70.0; the "_n" suffix marks the
# normalised copy, `images` itself is left untouched.
images_n = np.divide(images, 70.0)
print("max : ", images_n.max())
print("min : ", images_n.min())
# Last expression: display the shape of the un-normalised stack.
images.shape

max :  0.9526767
min :  0.0


(6528, 200, 200)

In [13]:
# Append a trailing axis of length 1 to the normalised image stack
# (axis=3, so a 3-D stack becomes 4-D with a final dimension of 1).
#
# The earlier version wrapped this in two extra np.array(...) calls, each
# of which duplicated the whole image stack in memory without changing
# shape or values. np.expand_dims alone is enough; note that it returns a
# view, so `im_reshape` shares its data with `images_n`.
im_reshape = np.expand_dims(images_n, axis=3)
print("this is the shape", im_reshape.shape)

this is the shape (6528, 200, 200, 1)


In [17]:
# Preview the first rows of the label table. Slicing (instead of indexing
# 0..14 one by one) prints the same rows but does not raise IndexError
# when fewer than 15 samples are present.
label_table = np.asarray(labels_all)
for label_row in label_table[:15]:
    print(label_row)

# Separate the two label columns with plain column slicing rather than a
# Python-level loop over every row:
#   column 0 -> escape fraction, column 1 -> redshift.
# .copy() gives independent, contiguous 1-D arrays.
labels_all_esc = label_table[:, 0].copy()
labels_all_redshift = label_table[:, 1].copy()

[0.108 6.   ]
[0.092 6.5  ]
[0.08 5.  ]
[ 0.094 12.5  ]
[0.092 5.5  ]
[ 0.108 10.   ]
[0.08 5.5 ]
[ 0.106 10.5  ]
[0.08 7.  ]
[ 0.094 13.   ]
[0.108 7.   ]
[0.094 9.5  ]
[0.102 9.   ]
[ 0.106 12.5  ]
[0.106 8.   ]


#### Escape frac labels

In [18]:
# Gather the distinct escape-fraction values in ascending order; each
# distinct value is treated as one class.
l = sorted(set(labels_all_esc))
print(l, "\nThere are", len(l), "labels. So we want",
      len(l), "classes")

[0.07, 0.08, 0.092, 0.094, 0.096, 0.098, 0.102, 0.104, 0.106, 0.108, 0.12, 0.13] 
There are 12 labels. So we want 12 classes


In [19]:
# One-hot encode the escape-fraction labels: one row per sample, one
# column per distinct escape-fraction value (ascending order).
#
# The class list is derived here from the labels themselves instead of
# being read from the scratch variable `l`. `l` is reassigned by the
# redshift section, so running cells out of order made `l.index(...)`
# raise ValueError. The comparison below is also vectorised, replacing
# the per-sample Python loop.
esc_values = np.asarray(labels_all_esc)
esc_classes = np.unique(esc_values)  # sorted distinct values

# Broadcasting (N, 1) == (K,) yields an (N, K) boolean mask with a single
# True per row; casting to float gives the same 0./1. rows as before.
labels_esc = (esc_values[:, np.newaxis] == esc_classes).astype(float)
assert (labels_esc.sum(axis=1) == 1).all(), "every sample must map to exactly one class"

print("labels_esc shape =", labels_esc.shape,
      "\n\nFirst 3 entries:")
print(labels_esc[:3])

labels_esc shape = (6528, 12) 

First 3 entries:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


#### redshift labels

In [20]:
# Gather the distinct redshift values in ascending order; each distinct
# value is treated as one class.
l = sorted(set(labels_all_redshift))
print(l, "\nThere are", len(l), "labels. So we want",
      len(l), "classes")

[5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0, 12.5, 13.0] 
There are 17 labels. So we want 17 classes


In [28]:
# One-hot encode the redshift labels: one row per sample, one column per
# distinct redshift value (ascending order).
#
# The class list is derived here from the labels themselves instead of
# being read from the scratch variable `l`. `l` is also assigned by the
# escape-fraction section, so running cells out of order made
# `l.index(...)` raise ValueError. The comparison below is also
# vectorised, replacing the per-sample Python loop.
redshift_values = np.asarray(labels_all_redshift)
redshift_classes = np.unique(redshift_values)  # sorted distinct values

# Broadcasting (N, 1) == (K,) yields an (N, K) boolean mask with a single
# True per row; casting to float gives the same 0./1. rows as before.
labels_red = (redshift_values[:, np.newaxis] == redshift_classes).astype(float)
assert (labels_red.sum(axis=1) == 1).all(), "every sample must map to exactly one class"

print("labels_red shape =", labels_red.shape,
      "\n\nFirst 3 entries:")
print(labels_red[:3])
# Sanity check: both encodings must cover the same number of samples.
print(len(labels_red) == len(labels_esc))

labels_red shape = (6528, 17) 

First 3 entries:
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
True


### recombine the labels into one array

In [29]:
# Pair each sample's escape-fraction one-hot vector with its redshift
# one-hot vector in a single (N, 2) object array:
#   labels[i, 0] -> escape-fraction vector, labels[i, 1] -> redshift vector.
#
# The two vectors have different lengths, so np.array([[esc, red], ...])
# is a ragged construction: older NumPy only warned about it, and
# NumPy >= 1.24 raises ValueError. Allocating the object array explicitly
# and filling it element by element produces the same (N, 2) layout on
# every NumPy version.
l1 = len(labels_red)
assert len(labels_esc) == l1, "escape-fraction and redshift labels must have the same length"

labels = np.empty((l1, 2), dtype=object)
for row, (esc_vec, red_vec) in enumerate(zip(labels_esc, labels_red)):
    labels[row, 0] = esc_vec
    labels[row, 1] = red_vec

## Normalise

In [30]:
# Scale every pixel by the fixed divisor 70.0 and report the resulting
# value range.
images_n = np.divide(images, 70.0)
print("max :", images_n.max())
print("min :", images_n.min())

max : 0.9526767
min : 0.0


## Add a trailing dimension (shape becomes N × 200 × 200 × 1)

In [31]:
# Append a trailing axis of length 1 to the normalised image stack
# (axis=3, so a 3-D stack becomes 4-D with a final dimension of 1).
#
# The earlier version wrapped this in two extra np.array(...) calls, each
# of which duplicated the whole image stack in memory without changing
# shape or values. np.expand_dims alone is enough; note that it returns a
# view, so `im_reshape` shares its data with `images_n`.
im_reshape = np.expand_dims(images_n, axis=3)
print("this is the shape", im_reshape.shape)

this is the shape (6528, 200, 200, 1)


## Train / validation / test split

In [32]:
# Cumulative split points as fractions of the sample count:
# [0, 0.75) -> train, [0.75, 0.85) -> validation, [0.85, 1] -> test.
ratios = [0.75,0.85]
cut1, cut2 = (int(len(im_reshape) * fraction) for fraction in ratios)

# One slice per subset, applied identically to images and labels so the
# two stay aligned.
train_part = slice(None, cut1)
val_part = slice(cut1, cut2)
test_part = slice(cut2, None)

images_train, labels_train = im_reshape[train_part], labels[train_part]
images_val, labels_val = im_reshape[val_part], labels[val_part]
images_test, labels_test = im_reshape[test_part], labels[test_part]

## Save them

In [34]:
# Write every subset to its own .npy file in the working directory,
# images first and labels second.
for filename, split_array in (
    ("data_train.npy", images_train),
    ("data_val.npy", images_val),
    ("data_test.npy", images_test),
    ("labels_train.npy", labels_train),
    ("labels_val.npy", labels_val),
    ("labels_test.npy", labels_test),
):
    np.save(filename, split_array)