In [None]:
if False:
    from google.colab import drive
    drive.mount('/content/drive')

#### 1. Preprocessing images

For images, it is necessary, at least as the bare minimum, to all have the same size. If the model underperforms, more pre processing steps are to be added 

In [None]:
import os
import numpy as np
from PIL import Image
from tqdm.auto import tqdm

home = "/home/philipm/Downloads/"


train_folder_img = home + 'images_training_rev1/'
test_folder_img = home + 'images_test_rev1/'

train_folder_proc = home + 'images_training_fin/'
test_folder_proc = home + 'images_test_fin/'

preprocessing = True #toggle this to enable preprocessing
width = 224 #typical for architectures like 

if preprocessing:
    for name in tqdm(os.listdir(train_folder_img), desc="Loading Train Files"): #managing a lot of files. added progress bars for tracking time
        img_path = train_folder_img + name

        with Image.open(img_path) as img:
            w, h = img.size            
            height = round(h * (width / w))
            resized = img.resize((width, height), Image.Resampling.LANCZOS) #resampling, suggested by the library
            resized.save(train_folder_proc + name)
    
    # for name in tqdm(os.listdir(test_folder_img), desc="Loading Test Files"):
    #     img_path = test_folder_img + name

    #     with Image.open(img_path) as img:
    #         w, h = img.size
    #         height = round(h * (width / w))
    #         resized = img.resize((width, height), Image.Resampling.LANCZOS)
    #         resized.save(test_folder_proc + name)
        
    

Loading Train Files: 100%|███████████████| 61578/61578 [01:43<00:00, 597.40it/s]


Making a mendatory check that every image is 227x227. This part is also necessary for creating the datasets.

In [30]:
test_image_names = []
train_image_names = []

for name in tqdm(os.listdir(train_folder_proc), desc="Validating Train Files"): #managing a lot of files. added progress bars for tracking time
    img_path = train_folder_proc + name
    train_image_names.append(name)

    with Image.open(img_path) as img:
        w, h = img.size
        if w != 224 or h != 224:
            print ("Not matching size")
            break

# #comment this for fast checks
# for name in tqdm(os.listdir(test_folder_proc), desc="Validating Test Files"):
#     img_path = test_folder_proc + name
#     test_image_names.append(name)

#     with Image.open(img_path) as img:
#         w, h = img.size
#         if w != 227 or h != 227:
#             print ("Not matching size")
#             break

Validating Train Files: 100%|██████████| 61578/61578 [00:03<00:00, 18705.22it/s]


#### 2. Loading labels 

In [31]:
import pandas as pd

train_solutions = home + 'training_solutions_rev1/training_solutions_rev1.csv'

train_sols = pd.read_csv(train_solutions)
train_sols.head()

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,100008,0.383147,0.616853,0.0,0.0,0.616853,0.038452,0.578401,0.418398,0.198455,...,0.0,0.279952,0.138445,0.0,0.0,0.092886,0.0,0.0,0.0,0.325512
1,100023,0.327001,0.663777,0.009222,0.031178,0.632599,0.46737,0.165229,0.591328,0.041271,...,0.018764,0.0,0.131378,0.45995,0.0,0.591328,0.0,0.0,0.0,0.0
2,100053,0.765717,0.177352,0.056931,0.0,0.177352,0.0,0.177352,0.0,0.177352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100078,0.693377,0.238564,0.068059,0.0,0.238564,0.109493,0.129071,0.189098,0.049466,...,0.0,0.094549,0.0,0.094549,0.189098,0.0,0.0,0.0,0.0,0.0
4,100090,0.933839,0.0,0.066161,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
#again, sanity check, seeing whether all labels are here
train_sols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61578 entries, 0 to 61577
Data columns (total 38 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   GalaxyID   61578 non-null  int64  
 1   Class1.1   61578 non-null  float64
 2   Class1.2   61578 non-null  float64
 3   Class1.3   61578 non-null  float64
 4   Class2.1   61578 non-null  float64
 5   Class2.2   61578 non-null  float64
 6   Class3.1   61578 non-null  float64
 7   Class3.2   61578 non-null  float64
 8   Class4.1   61578 non-null  float64
 9   Class4.2   61578 non-null  float64
 10  Class5.1   61578 non-null  float64
 11  Class5.2   61578 non-null  float64
 12  Class5.3   61578 non-null  float64
 13  Class5.4   61578 non-null  float64
 14  Class6.1   61578 non-null  float64
 15  Class6.2   61578 non-null  float64
 16  Class7.1   61578 non-null  float64
 17  Class7.2   61578 non-null  float64
 18  Class7.3   61578 non-null  float64
 19  Class8.1   61578 non-null  float64
 20  Class8

In [33]:
#the number of labels matches the number of training examples. It is also important to remove the Galaxy index coumn
train_sols = train_sols.drop("GalaxyID", axis=1)
train_sols.head()

Unnamed: 0,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,Class5.1,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,0.383147,0.616853,0.0,0.0,0.616853,0.038452,0.578401,0.418398,0.198455,0.0,...,0.0,0.279952,0.138445,0.0,0.0,0.092886,0.0,0.0,0.0,0.325512
1,0.327001,0.663777,0.009222,0.031178,0.632599,0.46737,0.165229,0.591328,0.041271,0.0,...,0.018764,0.0,0.131378,0.45995,0.0,0.591328,0.0,0.0,0.0,0.0
2,0.765717,0.177352,0.056931,0.0,0.177352,0.0,0.177352,0.0,0.177352,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.693377,0.238564,0.068059,0.0,0.238564,0.109493,0.129071,0.189098,0.049466,0.0,...,0.0,0.094549,0.0,0.094549,0.189098,0.0,0.0,0.0,0.0,0.0
4,0.933839,0.0,0.066161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 3. Create dataloaders

A problem encountered here is, we cannot just load tens of thousands of images on RAM (neither on CPU nor GPU memory). Among the possible solutions, it was decided to create a dataloader that yields batches of training examples.

In [34]:
import tensorflow as tf

class GalaxyDataset:
    def __init__(self, images_path, images_names, labels, batch_size = 128):
        self.images_path = images_path
        self.images_names = images_names
        self.batch_size = batch_size
        self.labels = labels

    def generator(self):
        image_batch = []
        labels_batch = []
        counter = 0
        
        while True:
            if counter == len(self.images_names):
                break

            image_path = self.images_path + self.images_names[counter]
            with Image.open(image_path) as img:
                img = img.convert("RGB")
                image_batch.append(np.array(img).astype(np.float32))
            
            labels_batch.append(self.labels[counter])

            counter = counter + 1

            if (counter%self.batch_size == 0) or (counter == len(self.images_names)):
                #print (counter)
                X = np.stack(image_batch, axis=0).astype(np.float32)
                y = np.array(labels_batch).astype(np.float32)

                image_batch = []
                labels_batch = []

                yield X, y 
    
    def get_tf_dataset(self):
        output_signature = (
            tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 37), dtype=tf.float32)
        )

        dataset = tf.data.Dataset.from_generator(
                    self.generator,
                    output_signature=output_signature
                )

        return dataset

In [35]:
labels = train_sols.to_numpy()

Picking last 20% as validation

In [36]:
total_n = len(train_image_names)
validation_n = int(round(total_n*0.2))

val_names  = train_image_names[:validation_n]
train_names = train_image_names[validation_n:]

val_labels  = labels[:validation_n]
train_labels = labels[validation_n:]

In [37]:
train_ds = GalaxyDataset(train_folder_proc, train_names, train_labels)
val_ds = GalaxyDataset(train_folder_proc, val_names, val_labels)

tf_train_ds = train_ds.get_tf_dataset()
tf_val_ds = val_ds.get_tf_dataset()

#sanity checking the dataloader's dimensions
# for batch_i, (X, y) in enumerate(tf_train_ds):
#     print(batch_i, X.shape, y.shape)
    
# for batch_i, (X, y) in enumerate(tf_val_ds):
#     print(batch_i, X.shape, y.shape)

#### 4. Defining the architecture

As a starting step, we took a vanilla implementation of AlexNet and tried to train it for the purposes of this project. AlexNet was selected due to its reputation as a well performing model in image classification tasks (at least for its time). Although our task is not really image classification, it is still interesting to see its performance for this task.

2 important things before the architecture is define are:  
- Normal AlexNet is made for image classification. That means that its normal implementation involves softmax as the activation function of the last dense layer. For our task, we elected to use signmoid as the activation function of the last layer, since the target values are continuous (from 0 to 1).
- AlexNet does not really contain any method of generalization, other that adding dropout between the dense layers. This was not immediatelly tackled, but there is experimentation with additional generalization methods later on.

In [38]:
from tensorflow.keras import layers, models
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout, ReLU
from tensorflow.keras.layers import Flatten, Dense

In [39]:
def compileModel (model):
    model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")] #tracking rmse, as suggested by the competition
        )

    return model

In [40]:
def buildVanillaAlexnet ():
    model = Sequential()
    
    model.add(Input((224, 224, 3)))
    
    model.add(Conv2D(96, 11, 4, activation='relu'))
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(256, 5, 1, activation='relu'))
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(Conv2D(256, 3, 1, activation='relu'))
    model.add(MaxPooling2D(3, 2))

    model.add(Flatten())

    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(37, activation='sigmoid'))

    model.summary()

    return model

In [41]:
anet = buildVanillaAlexnet()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_28 (Conv2D)          (None, 54, 54, 96)        34944     
                                                                 
 max_pooling2d_14 (MaxPooli  (None, 26, 26, 96)        0         
 ng2D)                                                           
                                                                 
 conv2d_29 (Conv2D)          (None, 22, 22, 256)       614656    
                                                                 
 max_pooling2d_15 (MaxPooli  (None, 10, 10, 256)       0         
 ng2D)                                                           
                                                                 
 conv2d_30 (Conv2D)          (None, 8, 8, 384)         885120    
                                                                 
 conv2d_31 (Conv2D)          (None, 6, 6, 384)        

In [42]:
anet = compileModel(anet)

#### 5. Training the vanilla model

In [43]:
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(  #saving the best model
    "best_model.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True
)

In [44]:
history = anet.fit(
    tf_train_ds,
    validation_data=tf_val_ds,
    epochs=30,
    callbacks=[checkpoint_cb]
)

Epoch 1/30
    385/Unknown - 49s 123ms/step - loss: 0.0574 - rmse: 0.2397

2026-01-31 11:21:17.140383: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 7913565352672323652
2026-01-31 11:21:28.704136: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 16480710198172150519
2026-01-31 11:21:28.704181: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 4270919493226302083


Epoch 2/30
Epoch 3/30
Epoch 4/30
 79/385 [=====>........................] - ETA: 36s - loss: 0.0569 - rmse: 0.2386

KeyboardInterrupt: 

The validation rmse remains 0.2425. Considering that the final score achieved was 0.07491, there is a long way to go.

#### 6. Try generalization methods

Our model did not seem to have problem with overfitting. That said, it should be interesting to add generalization methods, seeing how would it affect the model

1. BatchNorm

In [45]:
def buildAlexnetBatchNorm ():
    model = Sequential()
    
    model.add(Input((224, 224, 3)))
    
    model.add(Conv2D(96, 11, 4, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(256, 5, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(256, 3, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))
    model.add(Flatten())

    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(37, activation='sigmoid'))

    model.summary()

    return model

In [46]:
anetBN = buildAlexnetBatchNorm()
anetBN = compileModel(anetBN)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_33 (Conv2D)          (None, 54, 54, 96)        34944     
                                                                 
 batch_normalization_6 (Bat  (None, 54, 54, 96)        384       
 chNormalization)                                                
                                                                 
 max_pooling2d_17 (MaxPooli  (None, 26, 26, 96)        0         
 ng2D)                                                           
                                                                 
 conv2d_34 (Conv2D)          (None, 22, 22, 256)       614656    
                                                                 
 batch_normalization_7 (Bat  (None, 22, 22, 256)       1024      
 chNormalization)                                                
                                                      

In [47]:
history = anetBN.fit(
    tf_train_ds,
    validation_data=tf_val_ds,
    epochs=30,
    callbacks=[checkpoint_cb]
)

Epoch 1/30
    385/Unknown - 50s 124ms/step - loss: 0.0287 - rmse: 0.1695

2026-01-31 11:24:31.551988: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5905900669088571413


Epoch 2/30
Epoch 3/30
Epoch 4/30
 49/385 [==>...........................] - ETA: 43s - loss: 0.0270 - rmse: 0.1644

KeyboardInterrupt: 

This time, we can see that the model performs far better. RMSE achieved: 0.1630

IMPORTANT thing to mention: BatchNorm AFTER ReLU did actually perform better than putting it before ReLU, which is the usual practice. 

2. Higher Dropout Rate

In [None]:
def buildAlexnetDr():
    model = Sequential()
    
    model.add(Input((224, 224, 3)))
    
    model.add(Conv2D(96, 11, 4, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(256, 5, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))

    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(Conv2D(384, 3, 1, activation='relu'))
    model.add(Conv2D(256, 3, 1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 2))
    model.add(Flatten())

    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.7))
    
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.7))

    model.add(Dense(37, activation='sigmoid'))

    model.summary()

    return model

In [None]:
anetDr = buildAlexnetDr()
anetDr = compileModel(anetDr)

In [None]:
history = anetDr.fit(
    tf_train_ds,
    validation_data=tf_val_ds,
    epochs=30,
    callbacks=[checkpoint_cb]
)

Changing dropout did cause changes in the convergence time, yet not on the final accuracy

#### 7. Architectural Changes

1. Reducing stride, willing to allow more dense filter and therefore less information loss: No effect

2. Reducing MaxPooling for the same reason: No effect

3. Adding layers, wanting to capture more information (model does not overfit, so we were able to do that)

#### 8. Image Preprocessing

Most of the images have a large black area (aka space). It was attempted to reduce it by cropping it out. It did not help (if anything, increased validation RMSE)