Galaxy morphology classifier
Data from Galaxy Zoo

In [145]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, LeaveOneOut, LeavePOut
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from skimage.io import ImageCollection
import matplotlib.pyplot as plt
from glob import glob

# Random seed; used as the default random_state of the SGD classifier built in sgd_increment
seed = 21

Read in all the images from the training set

In [157]:
# Side length (in pixels) of the central square kept from each image
cropped_size = 200
image_dir = "images_training_rev1"

# Sort the file names so the image order is deterministic.
# NOTE(review): assumes the sorted file order matches the row order of the
# training solutions CSV — TODO confirm
image_files = np.sort(glob(image_dir + "/*"))

# Read the first image as an example; image_dim is the length of its first axis
image_ex = plt.imread(image_files[0])
image_dim = len(image_ex)

def crop(im, crop_size):
    """Crop an image to its central ``crop_size`` x ``crop_size`` region and flatten it.

    Parameters
    ----------
    im : numpy.ndarray
        Image array whose first two axes are rows and columns. Any further
        axes (e.g. colour channels) are kept whole.
    crop_size : int
        Side length of the central square to keep. If it exceeds an image
        dimension, that dimension is kept in full.

    Returns
    -------
    numpy.ndarray
        1-D array holding the cropped pixels in row-major order.
    """
    rows, cols = im.shape[0], im.shape[1]
    # Compute each axis separately so non-square images stay centred, and
    # clamp at 0 so an oversized crop never produces a negative (wrapping) index.
    row_start = max((rows - crop_size) // 2, 0)
    col_start = max((cols - crop_size) // 2, 0)
    # start + crop_size (rather than centre + crop_size // 2) keeps exactly
    # crop_size pixels even when crop_size is odd.
    window = im[row_start:row_start + crop_size, col_start:col_start + crop_size]
    return window.flatten()

def load_crop(f):
    """Read one image file and return its central crop.

    ``f`` is the file name of a single image. The image is read with
    ``plt.imread`` and passed to ``crop`` together with the notebook-level
    ``cropped_size``; whatever ``crop`` returns is handed back unchanged.
    """
    image = plt.imread(f)
    return crop(image, cropped_size)

# Image collection over the sorted file list; every image is read through
# load_crop, so each item is the cropped, flattened pixel array.
# conserve_memory=True is passed so that skimage does not keep every loaded
# image cached (see the ImageCollection documentation).
image_coll = ImageCollection(load_pattern=image_files, conserve_memory=True, load_func=load_crop)

numpy.ndarray

Read in and process the classifications of the training set

In [116]:
# Load the training solutions table and print a summary of its columns and dtypes
data = pd.read_csv("training_solutions_rev1.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61578 entries, 0 to 61577
Data columns (total 38 columns):
GalaxyID     61578 non-null int64
Class1.1     61578 non-null float64
Class1.2     61578 non-null float64
Class1.3     61578 non-null float64
Class2.1     61578 non-null float64
Class2.2     61578 non-null float64
Class3.1     61578 non-null float64
Class3.2     61578 non-null float64
Class4.1     61578 non-null float64
Class4.2     61578 non-null float64
Class5.1     61578 non-null float64
Class5.2     61578 non-null float64
Class5.3     61578 non-null float64
Class5.4     61578 non-null float64
Class6.1     61578 non-null float64
Class6.2     61578 non-null float64
Class7.1     61578 non-null float64
Class7.2     61578 non-null float64
Class7.3     61578 non-null float64
Class8.1     61578 non-null float64
Class8.2     61578 non-null float64
Class8.3     61578 non-null float64
Class8.4     61578 non-null float64
Class8.5     61578 non-null float64
Class8.6     61578 non-null f

In [117]:
# Display the first rows of the solutions table as a quick sanity check
data.head()

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,100008,0.383147,0.616853,0.0,0.0,0.616853,0.038452,0.578401,0.418398,0.198455,...,0.0,0.279952,0.138445,0.0,0.0,0.092886,0.0,0.0,0.0,0.325512
1,100023,0.327001,0.663777,0.009222,0.031178,0.632599,0.46737,0.165229,0.591328,0.041271,...,0.018764,0.0,0.131378,0.45995,0.0,0.591328,0.0,0.0,0.0,0.0
2,100053,0.765717,0.177352,0.056931,0.0,0.177352,0.0,0.177352,0.0,0.177352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100078,0.693377,0.238564,0.068059,0.0,0.238564,0.109493,0.129071,0.189098,0.049466,...,0.0,0.094549,0.0,0.094549,0.189098,0.0,0.0,0.0,0.0,0.0
4,100090,0.933839,0.0,0.066161,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now do the simplest possible binary classifier. Is it smooth? (Class 1.1)

In [118]:
# Columns Class1.1, Class1.2 and Class1.3 of the solutions table as separate arrays.
# NOTE(review): going by the variable names these are the smooth / disk /
# artifact answers — confirm against the Galaxy Zoo decision tree.
smooth_train = data["Class1.1"].to_numpy()
disk_train = data["Class1.2"].to_numpy()
artifact_train = data["Class1.3"].to_numpy()

def is_smooth(d):
    """Report whether entry 1 of a solutions row strictly beats entries 2 and 3.

    ``d`` is one row of the solutions table converted to an array. The
    function returns True only when ``d[1]`` is strictly greater than both
    ``d[2]`` and ``d[3]``; a tie with either of them gives False.
    """
    candidate = d[1]
    # Guard clause: losing (or tying) against entry 2 settles the answer.
    if not candidate > d[2]:
        return False
    return bool(candidate > d[3])

# Convert the whole table to a 2-D array. GalaxyID is column 0, so
# Class1.1-Class1.3 sit at positions 1-3, which is what is_smooth indexes.
data_arr = data.to_numpy()
# One boolean label per galaxy, obtained by applying is_smooth to every row
train_targets = np.apply_along_axis(is_smooth, 1, data_arr)

Build the classifier. The dataset is too large to fit in a single call, so train an SGD classifier incrementally, calling partial_fit on consecutive batches of the data.

In [127]:
def sgd_increment(data, targets, batch_size, classes, state=seed):
    """Build an SGDClassifier by incremental training on consecutive batches.

    Also used by the cross-validation routine further down the notebook.

    Parameters
    ----------
    data : sliceable sequence of samples
        Training samples; ``data[start:end]`` is passed to ``partial_fit``.
    targets : sliceable sequence
        One label per sample. Its length determines how many batches are run.
    batch_size : int
        Number of samples per ``partial_fit`` call; must be positive.
    classes : list
        All possible class labels, forwarded to every ``partial_fit`` call.
    state : int, optional
        ``random_state`` of the classifier (defaults to the notebook seed).

    Returns
    -------
    SGDClassifier
        The classifier after all batches have been processed. If ``targets``
        is empty no batch is run and the classifier is returned unfitted.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (the loop could never advance).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    sgd_clf = SGDClassifier(random_state=state)
    for start in range(0, len(targets), batch_size):
        # Progress indicator: index of the first sample in the current batch
        print(start)
        end = start + batch_size
        sgd_clf.partial_fit(data[start:end], targets[start:end], classes=classes)
    return sgd_clf

# Train on the full image collection in batches of 5000 images, with boolean class labels
clf_full = sgd_increment(image_coll, train_targets, 5000, [True, False])

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000


In [151]:
# Do the cross validation. A custom cross-validation function is needed so that each fold can be trained incrementally; the cross_val_score call below is kept commented out.
#cross_val_score(sgd_clf, image_coll, train_targets, cv=3, scoring="accuracy")

def cross_val_increment(data, targets, cv, batch_size, classes):
    """Cross-validated predictions using incrementally trained SGD classifiers.

    The samples are split into ``cv`` contiguous folds. For each fold a fresh
    classifier is trained with ``sgd_increment`` on all other folds and then
    used to predict the held-out fold.

    NOTE: every fold is materialised as an in-memory array, so this may not be
    feasible for the full image collection.

    Parameters
    ----------
    data : sliceable sequence of samples
        Samples; each slice is converted with ``np.array``.
    targets : sliceable sequence
        One label per sample, in the same order as ``data``.
    cv : int
        Number of folds; must be at least 2.
    batch_size : int
        Batch size forwarded to ``sgd_increment``.
    classes : list
        All possible class labels, forwarded to ``sgd_increment``.

    Returns
    -------
    numpy.ndarray
        Predictions for every sample, in the original sample order.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 2 (there would be no training folds).
    """
    if cv < 2:
        raise ValueError("cv must be at least 2")

    length = len(data)
    # Fold i covers [bounds[i], bounds[i + 1]); integer division spreads any
    # remainder across the folds.
    bounds = [i * length // cv for i in range(cv + 1)]
    data_split = [np.array(data[lo:hi]) for lo, hi in zip(bounds[:-1], bounds[1:])]
    targets_split = [np.array(targets[lo:hi]) for lo, hi in zip(bounds[:-1], bounds[1:])]

    pred = []
    for i in range(cv):
        # Concatenate along the sample axis so the (samples, features) layout
        # is preserved; np.append without an axis would flatten everything.
        fold_train_data = np.concatenate(data_split[:i] + data_split[i + 1:])
        fold_train_targets = np.concatenate(targets_split[:i] + targets_split[i + 1:])
        clf = sgd_increment(fold_train_data, fold_train_targets, batch_size, classes)
        # Predict the held-out fold (the classifier must not be re-fitted here).
        pred.extend(clf.predict(data_split[i]))
    return np.array(pred)

#pred = cross_val_increment(image_coll, train_targets, 3, 5000, [True, False])
