In [None]:
# Import library
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
import tensorflow as tf
from tensorflow.keras import utils
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import time
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

%matplotlib inline
%config InlineBackend.figure_format='retina'

# These libraries are for importing the data
from astroNN.models import Galaxy10CNN
from astroNN.datasets import galaxy10
from astroNN.datasets.galaxy10 import galaxy10cls_lookup, galaxy10_confusion

In [None]:
# Load in Data
# galaxy10.load_data() returns a pair, unpacked here into the image array and
# its matching label array.
# NOTE(review): presumably downloads/caches the Galaxy10 dataset on first use —
# confirm against the astroNN documentation.
images, labels = galaxy10.load_data()

## Data Manipulation

In [None]:
# One-hot encode the labels into 10 classes and store them as float32
labels_cat = to_categorical(labels, num_classes=10).astype(np.float32)

# Images are cast to float32 as well so both arrays share one dtype
images = images.astype(np.float32)

## Data Visualization

In [None]:
# Report how many images there are and how the image array is laid out
print(f'Image Length: {len(images)}')
print(f'Image Shape: {images.shape}')

# The label count should match the image count
print(f'Label Length: {len(labels)}')

In [None]:
# Print the image array, the raw labels, and the one-hot labels in turn
for array in (images, labels, labels_cat):
    print(array)

In [None]:
# Slider to browse the first `size` galaxy images
def browse_images(images, labels, categories, size):
    """Display an ipywidgets slider that steps through the first `size` images.

    Parameters
    ----------
    images : array-like
        Image array; only the first `size` entries are shown. Each image is
        cast to uint8 for display.
    labels : array-like
        Labels aligned with `images`. Each entry may be a scalar class id or a
        one-hot row (in which case the arg-max is used as the class id).
    categories : array-like
        Unused; kept so existing calls keep working.
    size : int
        Number of leading images to make browsable.
    """
    images = images[:size]
    labels = labels[:size]
    n = len(images)

    def view_image(i):
        # Read the label from the `labels` argument instead of the notebook
        # global `labels_cat`, so the function works on whatever it is given.
        label = labels[i]
        class_id = int(np.argmax(label)) if np.ndim(label) else int(label)
        plt.imshow(images[i].astype('uint8'), cmap=plt.cm.gray_r, interpolation='nearest')
        # Report the real number of browsable images rather than a fixed "10".
        plt.title('Class {}: {} \n Random Demo images {} of {}'.format(class_id, galaxy10cls_lookup(class_id), i+1, n))
        plt.axis('off')
        plt.show()

    interact(view_image, i=(0, n-1))

unique_cat = np.unique(labels)

In [None]:
# Browse the first ten images with an interactive slider
browse_images(images=images, labels=labels, categories=unique_cat, size=10)

In [None]:
# Count Bar Plot: number of images per class label
plt.figure(figsize=(14, 3))

y_unique = np.unique(labels)
counts = [(labels == i).sum() for i in y_unique]

plt.xticks(y_unique,  unique_cat[y_unique])
# Bind the tick Text objects to their own name. Assigning them to `labels`
# would overwrite the dataset label array and break any cell that is (re-)run
# afterwards and still expects `labels` to hold the class labels.
locs, tick_labels = plt.xticks()
plt.setp(tick_labels, size=20)
plt.bar(y_unique, counts)
plt.title('Category Count Bar Plot')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()

## Model Building

In [None]:
# Split images and one-hot labels into training (75%) and testing (25%) sets.
# A fixed random_state makes the split identical on every Restart & Run All,
# so downstream accuracy numbers are comparable between runs.
train_vectors, test_vectors, train_labels, test_labels = train_test_split(
    images, labels_cat, test_size=0.25, random_state=42
)

In [None]:
# Shapes of the training split: the feature array first, then the label array
for split_array in (train_vectors, train_labels):
    print(split_array.shape)

In [None]:
# Flatten every image into a single feature row and divide pixel values by 255
# (assumes pixel values are in the 0-255 range — TODO confirm).
# The ndim guard keeps this cell idempotent: once the arrays are 2-D they have
# already been flattened and scaled, so re-running the cell must not divide by
# 255 a second time.
if train_vectors.ndim > 2:
    train_vectors = train_vectors.reshape(train_vectors.shape[0], -1) / 255
if test_vectors.ndim > 2:
    test_vectors = test_vectors.reshape(test_vectors.shape[0], -1) / 255

In [None]:
# New shape of the training vectors after flattening
# (bare expression so the notebook displays it as the cell output)
train_vectors.shape

In [None]:
# Creating the Sequential Neural Network:
# one hidden ReLU layer of 10 units followed by a 10-way softmax output.
# The input width is read from the flattened training data instead of being
# hard-coded, so the model still builds if the image size changes.
n_features = int(train_vectors.shape[1])

network = Sequential()
network.add(Dense(10, input_dim=n_features, activation='relu'))
network.add(Dense(10, activation='softmax'))
# categorical_crossentropy matches the one-hot encoded labels
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [None]:
# Train the network for 10 epochs, holding out 10% of the training data for
# validation, and report the wall-clock time taken
start = time.time()
history = network.fit(
    train_vectors,
    train_labels,
    epochs=10,
    validation_split=0.1,
)
end = time.time()
print("Runtime", end - start)

In [None]:
# Create a subset of the training data containing (at most) 200 observations.
# The population size comes from the data rather than a hard-coded row count,
# and a seeded generator makes the subset reproducible between runs.
n_train = train_vectors.shape[0]
rng = np.random.default_rng(42)
random_indices = rng.choice(n_train, size=min(200, n_train), replace=False)

tmp_vectors = train_vectors[random_indices, :]
# Column 0 of the one-hot labels: a binary target, 1 for class 0 and 0 otherwise
tmp_labels = train_labels[random_indices,0]

In [None]:
# GridSearch the subset of data to determine the best regularisation strength C
start = time.time()

# Only C is searched. `gamma` is a coefficient of the rbf/poly/sigmoid kernels
# and has no effect with kernel='linear', so including it in the grid only
# multiplied the number of identical fits without changing the selected model.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5]}
clf = GridSearchCV(SVC(kernel='linear', class_weight='balanced'), param_grid)

clf = clf.fit(tmp_vectors, tmp_labels)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

end = time.time()
print("Runtime",end - start)

In [None]:
# Create temporary training data (at most 1000 observations) to use on the model.
# The population size comes from the data rather than a hard-coded row count,
# and a seeded generator makes the subset reproducible between runs.
n_train = train_vectors.shape[0]
rng = np.random.default_rng(42)
random_indices = rng.choice(n_train, size=min(1000, n_train), replace=False)

act_tmp_vectors = train_vectors[random_indices,:]
# Column 0 of the one-hot labels: a binary target, 1 for class 0 and 0 otherwise
act_tmp_labels = train_labels[random_indices,0]

In [None]:
# Fit the SVM on the sampled training subset (act_tmp_vectors / act_tmp_labels),
# not on the entire training set.
start = time.time()

# C is fixed at 1000.0 here. class_weight='balanced' matches the estimator
# configuration used in the grid search, so the final model is the one that was
# actually tuned. `gamma` is omitted because it has no effect with a linear kernel.
clf = SVC(kernel='linear', C=1000.0, class_weight='balanced')
clf.fit(act_tmp_vectors, act_tmp_labels)

end = time.time()
print("Runtime",end - start)

In [None]:
# Predict the classes of a random sample (at most 333 rows) of the testing data.
# The population size comes from the data rather than a hard-coded row count,
# and a seeded generator makes the sample reproducible between runs.
n_test = test_vectors.shape[0]
rng = np.random.default_rng(42)
random_indices = rng.choice(n_test, size=min(333, n_test), replace=False)

predict_vectors = test_vectors[random_indices,:]
# Column 0 of the one-hot labels: a binary target, 1 for class 0 and 0 otherwise
true_labels = test_labels[random_indices,0]

pred_labels = clf.predict(predict_vectors)

## Overall Model Performance

In [None]:
# Neural Network model: training vs. validation loss and accuracy per epoch
plt.clf()   # clear figure

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss') # creating plots to show loss with increased epochs
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

# The history key for metrics=['accuracy'] is 'accuracy' in TensorFlow 2 and
# 'acc' in older Keras/TF1 releases; pick whichever exists so the cell does not
# fail with a KeyError.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc') # creating plots to show accuracy with increased epochs
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
# Score the trained network on the held-out test split and report its accuracy
test_loss, test_acc = network.evaluate(x=test_vectors, y=test_labels)
print('test_acc:', test_acc)

In [None]:
# Support Vector Model (SVM)

print("ACC:",accuracy_score(true_labels, pred_labels)) # we want to know if this model is more accurate than our artificial neural network
print('Confusion Matrix: \n', confusion_matrix(true_labels, pred_labels)) # we want to see how many false positives and false negatives there are

# ROC/AUC need a continuous score per sample so the decision threshold can be
# swept. Feeding the hard 0/1 predictions collapses the curve to a single
# operating point, so use the SVM's signed distance to the separating
# hyperplane instead.
decision_scores = clf.decision_function(predict_vectors)

fpr, tpr, thresholds = roc_curve(true_labels, decision_scores)
plt.plot(fpr, tpr, "x-")
plt.plot([0,1],[0,1],"k-")  # chance-level diagonal for reference
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("SVM ROC curve")

print("AUC", roc_auc_score(true_labels, decision_scores)) # we want to see the AUC score of the model to see how predictive it is
plt.show()