**Idea:**
Note that this specific notebook is to get the raw baseline of the dataset, with little encoding.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
from google.colab import drive
drive.mount('/content/drive')
train_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train.csv"
test_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [4]:
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
train_dir = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train/train'

In [5]:
def create_generator(dataframe, directory, batch_size=32, target_size=(128, 128)):
    """
    Template function to create image generators.
    Students should complete this function to load images and labels properly.
    """
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        validate_filenames=False
    )
    return generator

# train_generator = create_generator(
#     dataframe=train_df,
#     directory='/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train/train',
#     batch_size=32,
#     class_mode='categorical',
#     target_size=(128, 128)
# )


train_generator = create_generator(train_data, train_dir)
val_generator = create_generator(val_data, train_dir)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [6]:
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model



base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(128, 128, 3))


base_model.trainable = False


x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
# x = Dense(1024, activation='relu', kernel_regularizer=l2(0.01))(x)
num_classes = 21
predictions = Dense(num_classes, activation='softmax')(x)


model = Model(inputs=base_model.input, outputs=predictions)


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    epochs=10
)

  self._warn_if_super_not_called()


Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 844ms/step - accuracy: 0.2351 - loss: 2.9385 - val_accuracy: 0.4336 - val_loss: 1.9503
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 787ms/step - accuracy: 0.6013 - loss: 1.3233 - val_accuracy: 0.4563 - val_loss: 1.8012
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 801ms/step - accuracy: 0.7609 - loss: 0.8286 - val_accuracy: 0.4860 - val_loss: 1.8067
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 812ms/step - accuracy: 0.8788 - loss: 0.4824 - val_accuracy: 0.5000 - val_loss: 1.7520
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 783ms/step - accuracy: 0.9524 - loss: 0.2527 - val_accuracy: 0.5017 - val_loss: 1.7964
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 827ms/step - accuracy: 0.9853 - loss: 0.1419 - val_accuracy: 0.5052 - val_loss: 1.9406
Epoch 7/10
[1m72/72[

<keras.src.callbacks.history.History at 0x7ad91c6e1110>

In [7]:
model.save("/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Saved_Models/my_model.keras")

In [8]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


test_datagen = ImageDataGenerator(rescale=1./255)


test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory='/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test/test',
    x_col='md5hash',
    y_col=None,
    target_size=(128, 128),
    batch_size=32,
    class_mode=None,
    shuffle=False
)

Found 1227 validated image filenames.


In [9]:
unique_labels = train_df['label'].unique()
class_to_index = {label: idx for idx, label in enumerate(unique_labels)}
index_to_class = {idx: label for label, idx in class_to_index.items()}
test_predictions = model.predict(test_generator, steps=len(test_generator))
predicted_class_indices = np.argmax(test_predictions, axis=1)
predicted_labels = [index_to_class[idx] for idx in predicted_class_indices]

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 606ms/step


In [18]:
submission_df = pd.DataFrame({
    'md5hash': test_df['md5hash'],
    'label': predicted_labels
})
submission_df.to_csv('/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/submission.csv', index=False)
print(submission_df)
print(submission_df.columns)

                                   md5hash                             label
0     0844ae634f0e6e7ef1f73c2aeecbae0e.jpg                     acne-vulgaris
1     3b290d262098f761d719aa07cf36c040.jpg              basal-cell-carcinoma
2     cf561d08ac46d0fda678bff6621005ee.jpg              basal-cell-carcinoma
3     e6371069be05c6b0a95b4b3f1bacc9a5.jpg  basal-cell-carcinoma-morpheiform
4     f76cddb37265f97508f159078dcc7e7c.jpg                     acne-vulgaris
...                                    ...                               ...
1222  270a217e6e961cf405a4ad46a110ff69.jpg                   dermatomyositis
1223  176892dfc10eabf4ad8b1f50ec5df7e5.jpg                      folliculitis
1224  91a4f52d7f99a33ec53d6d695e6c9f4d.jpg              basal-cell-carcinoma
1225  8ba5d48405d01accc3a218693a049ae9.jpg                            keloid
1226  e3e180ecaeadef19fd487caab7655316.jpg              basal-cell-carcinoma

[1227 rows x 2 columns]
Index(['md5hash', 'label'], dtype='object')


In [19]:
sample_submission_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/sample_submission.csv"
sample_submission_df = pd.read_csv(sample_submission_path)
sample_submission_df = sample_submission_df.set_index('md5hash')
submission_df = submission_df.set_index('md5hash')

reordered_submission_df = submission_df.reindex(sample_submission_df.index)
reordered_submission_df = reordered_submission_df.reset_index()

reordered_submission_df = sample_submission_df[['md5hash']].merge(
    submission_df,
    on='md5hash',
    how='left'
)

reordered_submission_df.to_csv('/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/reordered_submission.csv', index=False)
print(reordered_submission_df.head())

KeyError: "None of [Index(['md5hash'], dtype='object')] are in the [columns]"