# Galaxy Classification using Supervised Learning with Deep Convolutional Neural Networks for Multi-Class Image Classification
## Group 7 AI Class Final Projects
Members:
- Abi
- Gavin
- Rasyid
- Hikmal

---

### Import Libraries

In [None]:
#Change/add any additional imports here
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

### Data Wrangling & Preprocessing

In [5]:
#Process the labels first
def major_class(t):
    t = str(t).strip()
    if t.startswith("E"):
        return "E"
    elif t.startswith("S0"):
        return "E" #S0 is closer to E (Lenticular)
    elif t.startswith("SAB"):
        return "SB" #SAB is closer to SB (Barred Spiral)
    elif t.startswith("SB"):
        return "SB"
    elif t.startswith("S"):
        return "S"
    else:
        return "Other"  #Catchall for irregular/unknown types

df = pd.read_csv('efigiuse/label.csv')
print(df.head())
print("\nunique classes:")
print(df["type"].unique()) #Checking the unique classes
df["major_class"] = df["type"].apply(major_class)
print(df["major_class"].value_counts()) #Checking the number of major classes

#Create new dataframe with only PGC_name and major_class columns
df_major = df[["PGC_name", "major_class"]]
print(df_major.head(15)) #df_major at a glance

#Create a filtered version of df_major ("Other" class removed)
df_use = df_major[df_major["major_class"] != "Other"].reset_index(drop=True)
print(df_use.head(15)) #df_use at a glance

#Label processing complete

     PGC_name  PGC_no     vrad  e_vrad     vopt  e_vopt        v   e_v  \
0  PGC0000212   212.0  11230.4     4.5  11110.0    47.0  11229.3   9.0   
1  PGC0000218   218.0   1050.3     4.8   1027.4    25.0   1049.5   4.5   
2  PGC0000243   243.0  -9999.0 -9999.0   8914.3    16.3   8914.3  16.3   
3  PGC0000255   255.0    878.1     4.1  -9999.0 -9999.0    878.1   4.1   
4  PGC0000281   281.0  -9999.0 -9999.0  11490.7    16.4  11490.7  16.4   

      vvir      zvir     z_err type    objname            hl_names  
0  11287.9  0.037650  0.000030  Sab     IC5381           PGC000212  
1   1109.0  0.003699  0.000015  Sab    NGC7814          PGC1501809  
2   8841.8  0.029490  0.000054   S0    NGC7808  6dFJ0003321-104441  
3    932.9  0.003112  0.000014   Sm   UGC00017           PGC000255  
4  11416.0  0.038080  0.000055   Sc  PGC000281       MCG-02-01-015  

unique classes:
['Sab' 'S0' 'Sm' 'Sc' 'Sa' 'Sd' 'SBa' 'IB' 'Sb' 'SABc' 'SBc' 'SBb' 'Scd'
 'S0-a' 'I' 'SBbc' 'SBd' 'E' 'SBm' 'SABa' 'SABb' '|

In [6]:
#Image processing
image_dir = "efigiuse/png/"
image_size = (255, 255)
X, y = [], [] #Initialize empty lists for images and labels
for idx, row in df_use.iterrows():
    image_path = os.path.join(image_dir, f"{row['PGC_name']}.png")
    if os.path.exists(image_path):
        img = load_img(image_path, target_size=image_size)
        img_array = img_to_array(img) / 255.0  #Normalization to [0, 1]
        X.append(img_array) #X is for the images
        y.append(row["major_class"]) #y is for the labels
    else:
        print(f"Missing image: {image_path}")
#No images should be missing. If any are, check efigiuse/png/ and redownload the images from: 
#https://www.astromatic.net/download/efigi/efigi_png_gri-1.6.tgz
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y) #Label encoding
print("Classes: " + str(encoder.classes_)) #Checking the classes, should be ["E", "S", "SB"]

#Train, validation, and test split
X = np.array(X) #Convert X & y to numpy arrays
y_encoded = np.array(y_encoded)
#Train (70%) - Rest (30%) split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded, test_size=0.30, random_state=42, stratify=y_encoded)
#Validation (15%) - Test (15%) split
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

Classes: ['E' 'S' 'SB']


### CNN Architecture Building

In [7]:
num_classes = len(df_use["major_class"].unique()); print("Number of classes: " + str(num_classes)) #Checking the number of classes
#Current arrays are integer coded (0,1,2). Convert to one-hot encoding
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(255, 255, 3)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Prevents overfitting
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, callbacks=[early_stop])

#WARNING: MAKE SURE TO HAVE A GENUINELY GOOD COMPUTING SETUP WITH MAXIMUM COOLING AND POWER BEFORE EXECUTING
#DO NOT LET YOUR MACHINE GET OVERHEATED


Number of classes: 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 3s/step - accuracy: 0.4591 - loss: 1.2324 - val_accuracy: 0.5682 - val_loss: 0.9011
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 3s/step - accuracy: 0.6006 - loss: 0.8452 - val_accuracy: 0.5860 - val_loss: 0.8032
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 3s/step - accuracy: 0.6027 - loss: 0.8043 - val_accuracy: 0.6315 - val_loss: 0.7743
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 3s/step - accuracy: 0.6191 - loss: 0.7739 - val_accuracy: 0.6218 - val_loss: 0.7492
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 3s/step - accuracy: 0.6254 - loss: 0.7323 - val_accuracy: 0.6185 - val_loss: 0.7886
Epoch 6/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 3s/step - accuracy: 0.6666 - loss: 0.7177 - val_accuracy: 0.6299 - val_loss: 0.7324
Epoch 7/30
[1m90/90[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1347f5750>

In [11]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.4f}")

from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

print(classification_report(y_true, y_pred_classes))
print(confusion_matrix(y_true, y_pred_classes))

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 764ms/step - accuracy: 0.6440 - loss: 0.7528
Test Accuracy: 0.6402
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 721ms/step
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       165
           1       0.59      0.87      0.71       279
           2       0.52      0.07      0.12       173

    accuracy                           0.64       617
   macro avg       0.62      0.60      0.54       617
weighted avg       0.62      0.64      0.57       617

[[139  23   3]
 [ 27 244   8]
 [ 17 144  12]]
