**Idea:**
Note that this specific notebook is to get the raw baseline of the dataset, with little encoding.

In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [4]:
from google.colab import drive
drive.mount('/content/drive')
train_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train.csv"
test_path = "/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [6]:
#Pre Preprocessing Step
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
train_dir = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/train/train'

In [7]:
def create_generator(dataframe, directory, batch_size=32, target_size=(128, 128)):
    """
    Template function to create image generators.
    Students should complete this function to load images and labels properly.
    """
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        validate_filenames=False
    )
    return generator

train_generator = create_generator(train_data, train_dir)
val_generator = create_generator(val_data, train_dir)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [8]:
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model


base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False


x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
num_classes = 21
predictions = Dense(num_classes, activation='softmax')(x)


model = Model(inputs=base_model.input, outputs=predictions)


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    epochs=10
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_128_tf_no_top.h5
[1m17225924/17225924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1071s[0m 15s/step - accuracy: 0.2434 - loss: 2.8322 - val_accuracy: 0.3881 - val_loss: 1.9712
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 760ms/step - accuracy: 0.5828 - loss: 1.3371 - val_accuracy: 0.4266 - val_loss: 1.8928
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 761ms/step - accuracy: 0.7785 - loss: 0.8001 - val_accuracy: 0.4825 - val_loss: 1.8395
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 891ms/step - accuracy: 0.8639 - loss: 0.4918 - val_accuracy: 0.4476 - val_loss: 1.8968
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 810ms/step - accuracy: 0.9375 - loss: 0.2718 - val_accuracy: 0.4878 - val_loss: 1.8792
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 798ms/step - accuracy: 0.9843 - loss: 0.1358 - val_accuracy: 0.5000 - val_loss: 1.9859
Epoch 7/10
[1m72/72[

<keras.src.callbacks.history.History at 0x7e56f1cf9950>

In [9]:
model.save("/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Saved_Models/my_model.keras")

In [10]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory='/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Dermatology_Dataset/bttai-ajl-2025_unzipped/test/test',
    x_col='md5hash',
    y_col=None,
    target_size=(128, 128),
    batch_size=32,
    class_mode=None,
    shuffle=False
)

Found 1227 validated image filenames.


In [11]:

unique_labels = train_df['label'].unique()
class_to_index = {label: idx for idx, label in enumerate(unique_labels)}
index_to_class = {idx: label for label, idx in class_to_index.items()}
test_predictions = model.predict(test_generator, steps=len(test_generator))
predicted_class_indices = np.argmax(test_predictions, axis=1)
predicted_labels = [index_to_class[idx] for idx in predicted_class_indices]

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 12s/step


In [19]:
#SUBMISSION CHECKING

submission_df = pd.DataFrame({
    'md5hash': test_df['md5hash'],
    'label': predicted_labels
})

duplicate_rows = submission_df[submission_df.duplicated()]
if duplicate_rows.empty:
    print("The CSV has all unique rows.")
else:
    print("The CSV has duplicate rows.")
    print(f"Number of duplicate rows: {len(duplicate_rows)}")
    print("Duplicate rows:")
    print(duplicate_rows)



submission_df.to_csv('/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/submission.csv', index=False)
print(submission_df)
print(submission_df.columns)
print("-------------------")
print(submission_df.iloc[1])

The CSV has all unique rows.
                                   md5hash                             label
0     0844ae634f0e6e7ef1f73c2aeecbae0e.jpg                     acne-vulgaris
1     3b290d262098f761d719aa07cf36c040.jpg              basal-cell-carcinoma
2     cf561d08ac46d0fda678bff6621005ee.jpg                   dermatomyositis
3     e6371069be05c6b0a95b4b3f1bacc9a5.jpg  basal-cell-carcinoma-morpheiform
4     f76cddb37265f97508f159078dcc7e7c.jpg                     acne-vulgaris
...                                    ...                               ...
1222  270a217e6e961cf405a4ad46a110ff69.jpg                   dermatomyositis
1223  176892dfc10eabf4ad8b1f50ec5df7e5.jpg                      folliculitis
1224  91a4f52d7f99a33ec53d6d695e6c9f4d.jpg              basal-cell-carcinoma
1225  8ba5d48405d01accc3a218693a049ae9.jpg                            keloid
1226  e3e180ecaeadef19fd487caab7655316.jpg              basal-cell-carcinoma

[1227 rows x 2 columns]
Index(['md5hash', 'lab

In [33]:
import pandas as pd


sample_submission_path = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/sample_submission.csv'
your_submission_path = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/submission.csv'
output_path = '/content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/reordered_submission.csv'


sample_submission = pd.read_csv(sample_submission_path)
your_submission = pd.read_csv(your_submission_path)


your_submission['md5hash'] = your_submission['md5hash'].str.replace('.jpg', '', regex=False)


label_mapping = your_submission.set_index('md5hash')['label'].to_dict()
sample_submission['label'] = sample_submission['md5hash'].map(label_mapping)


sample_submission['label'] = sample_submission['label'].fillna('unknown')  # Fill missing labels


print("First few rows of the updated submission:")
print(sample_submission.head())

sample_submission.to_csv(output_path, index=False)

print("\nReordered submission saved to:", output_path)

First few rows of the updated submission:
                            md5hash               label
0  16d1e6b4143c88cb158a50ea8bc3a595     dermatomyositis
1  aceebbdcfd419fa960ebe3933d721550       acne-vulgaris
2  85bfb7325d93bac71fcbc08ae0a9ba23  malignant-melanoma
3  0ee34d8cc387db6430bba2d2ce793ac9     epidermal-nevus
4  268e737f44bce5f54b4bcee52baf7e66            melanoma

Reordered submission saved to: /content/drive/MyDrive/VIR_AJL_Team_Glycerin/Submissions/reordered_submission.csv
