**Idea:**
Note that this specific notebook is to get the raw baseline of the dataset, with little encoding.

In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [6]:
from google.colab import drive
drive.mount('/content/drive')
train_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train.csv"
test_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [8]:
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
train_dir = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train/train'

In [41]:
def create_generator(dataframe, directory, batch_size=32, target_size=(128, 128)):
    """
    Template function to create image generators.
    Students should complete this function to load images and labels properly.
    """
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        validate_filenames=False
    )
    return generator

# train_generator = create_generator(
#     dataframe=train_df,
#     directory='/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train/train',
#     batch_size=32,
#     class_mode='categorical',
#     target_size=(128, 128)
# )


train_generator = create_generator(train_data, train_dir)
val_generator = create_generator(val_data, train_dir)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [None]:
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model



base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(128, 128, 3))


base_model.trainable = False


x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
# x = Dense(1024, activation='relu', kernel_regularizer=l2(0.01))(x)
num_classes = 21
predictions = Dense(num_classes, activation='softmax')(x)


model = Model(inputs=base_model.input, outputs=predictions)


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    epochs=10
)

In [14]:
model.save("/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Saved_Models/my_model.keras")

In [22]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


test_datagen = ImageDataGenerator(rescale=1./255)


test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory='/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test/test',
    x_col='md5hash',
    y_col=None,
    target_size=(128, 128),
    batch_size=32,
    class_mode=None,
    shuffle=False
)

Found 1227 validated image filenames.


In [50]:
unique_labels = train_df['label'].unique()
class_to_index = {label: idx for idx, label in enumerate(unique_labels)}
index_to_class = {idx: label for label, idx in class_to_index.items()}
test_predictions = model.predict(test_generator, steps=len(test_generator))
predicted_class_indices = np.argmax(test_predictions, axis=1)
predicted_labels = [index_to_class[idx] for idx in predicted_class_indices]

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 629ms/step


In [48]:
submission_df = pd.DataFrame({
    'md5hash': test_df['md5hash'],
    'label': predicted_labels
})
submission_df.to_csv('/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/submission.csv', index=False)

In [49]:
print(submission_df)

                                   md5hash                             label
0     0844ae634f0e6e7ef1f73c2aeecbae0e.jpg                    dermatofibroma
1     3b290d262098f761d719aa07cf36c040.jpg              basal-cell-carcinoma
2     cf561d08ac46d0fda678bff6621005ee.jpg                   dermatomyositis
3     e6371069be05c6b0a95b4b3f1bacc9a5.jpg  basal-cell-carcinoma-morpheiform
4     f76cddb37265f97508f159078dcc7e7c.jpg                   dermatomyositis
...                                    ...                               ...
1222  270a217e6e961cf405a4ad46a110ff69.jpg                   dermatomyositis
1223  176892dfc10eabf4ad8b1f50ec5df7e5.jpg                     acne-vulgaris
1224  91a4f52d7f99a33ec53d6d695e6c9f4d.jpg                pyogenic-granuloma
1225  8ba5d48405d01accc3a218693a049ae9.jpg  basal-cell-carcinoma-morpheiform
1226  e3e180ecaeadef19fd487caab7655316.jpg              basal-cell-carcinoma

[1227 rows x 2 columns]
