<a href="https://colab.research.google.com/github/enamacahiya/Equitable-AI-for-Derm-Competition/blob/main/v1_google_derm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate user for HuggingFace if needed. Enter token below if requested.
import os

from huggingface_hub import login

# Paste a token here only for a one-off run and never commit it. When the
# string is left empty, the HF_TOKEN environment variable is tried next, and
# if that is unset too `login` receives None and asks for the token
# interactively instead of being handed an empty string.
hf_token = ''  # enter token here
login(hf_token or os.environ.get("HF_TOKEN") or None)

In [None]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import cv2
from io import BytesIO
from PIL import Image
from huggingface_hub import from_pretrained_keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K

In [None]:
# NOTE: this cell reads train_embeddings / train_labels / val_embeddings /
# val_labels, which are produced by the feature-extraction cell further down
# in the notebook; run that cell first.

# np.save does not create missing directories, so make sure the target exists.
os.makedirs("content", exist_ok=True)

np.save("content/train_embeddings.npy", train_embeddings)
np.save("content/train_labels.npy", train_labels)
np.save("content/val_embeddings.npy", val_embeddings)
np.save("content/val_labels.npy", val_labels)

print(f"Train embeddings shape: {train_embeddings.shape}")  # (n_train_images, 6144), e.g. (2288, 6144)
print(f"Train labels shape: {train_labels.shape}")  # (n_train_images, 21) once one-hot encoded

Train embeddings shape: (2288, 6144)
Train labels shape: (2288, 21)


In [None]:
# Pretrained Derm Foundation model; the functions below call its serving
# signature to turn images into embeddings.
base_model = from_pretrained_keras("google/derm-foundation")

# Training metadata; the file_path and encoded_label columns are derived below.
train_df = pd.read_csv('bttai-ajl-2025/train.csv')

# Training images are laid out as train/train/<label>/<md5hash>.jpg.
train_df['file_path'] = (
    'bttai-ajl-2025/train/train/'
    + train_df['label']
    + '/'
    + train_df['md5hash']
    + '.jpg'
)

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['encoded_label'] = label_encoder.transform(train_df['label'])

# Hold out 20% of the rows for validation (fixed seed for repeatability).
train_data, val_data = train_test_split(
    train_df,
    random_state=42,
    test_size=0.2,
)

def encode_image_to_tfexample(image_path):
    """Serialize one image file as a ``tf.train.Example`` byte string.

    The image is opened, converted to RGB, re-encoded as PNG and stored under
    the ``'image/encoded'`` feature key.

    Args:
        image_path: Path of the image file on disk.

    Returns:
        The serialized ``tf.train.Example`` (bytes), or ``None`` after printing
        a message when ``image_path`` is not an existing regular file.
    """
    # isfile rather than exists: a directory path exists but cannot be opened
    # as an image, so treat it like a missing file.
    if not os.path.isfile(image_path):
        print(f"file not exist: {image_path}")
        return None

    # Context managers release the image file handle and the in-memory buffer
    # as soon as the PNG bytes have been captured.
    with Image.open(image_path) as img, BytesIO() as buf:
        img.convert('RGB').save(buf, format="PNG")
        image_bytes = buf.getvalue()

    example = tf.train.Example(features=tf.train.Features(
        feature={
            'image/encoded': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[image_bytes]))
        }
    ))

    return example.SerializeToString()

def extract_embeddings(df, batch_size=32):
    """Embed every image listed in ``df`` and return embeddings with labels.

    Each image is serialized with ``encode_image_to_tfexample`` and sent in
    batches through the ``serving_default`` signature of ``base_model``. Rows
    for which serialization returns ``None`` are skipped, so the outputs can be
    shorter than ``df``.

    Args:
        df: DataFrame with a ``file_path`` and an ``encoded_label`` column.
        batch_size: Number of images per inference call (default 32).

    Returns:
        Tuple ``(embeddings, labels)`` of NumPy arrays; entry ``i`` of both
        arrays belongs to the same image.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    labels = []
    pending_images = []
    pending_labels = []
    total_samples = len(df)
    processed_samples = 0

    def _flush_pending():
        """Embed the pending batch, record its results and empty the batch."""
        nonlocal processed_samples
        infer = base_model.signatures["serving_default"]
        output = infer(inputs=tf.constant(pending_images))

        embeddings.extend(output['embedding'].numpy())
        labels.extend(pending_labels)

        processed_samples += len(pending_images)
        print(f"Processed {processed_samples}/{total_samples} images...")

        pending_images.clear()
        pending_labels.clear()

    # Iterate the two needed columns directly instead of materializing a
    # Series per row with iterrows().
    for image_path, encoded_label in zip(df['file_path'], df['encoded_label']):
        encoded_image = encode_image_to_tfexample(image_path)
        if encoded_image is None:
            continue

        pending_images.append(encoded_image)
        pending_labels.append(encoded_label)

        if len(pending_images) >= batch_size:
            _flush_pending()

    # Final, possibly smaller, batch.
    if pending_images:
        _flush_pending()

    print("✅ Feature extraction complete!")

    return np.array(embeddings), np.array(labels)

# Embed both splits with the foundation model.
train_embeddings, train_labels = extract_embeddings(train_data)
val_embeddings, val_labels = extract_embeddings(val_data)

# One-hot encode the integer class ids, using the same width for both splits.
num_classes = len(label_encoder.classes_)
train_labels, val_labels = (
    to_categorical(class_ids, num_classes=num_classes)
    for class_ids in (train_labels, val_labels)
)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Processed 32/2288 images...
Processed 64/2288 images...
Processed 96/2288 images...
Processed 128/2288 images...
Processed 160/2288 images...
Processed 192/2288 images...
Processed 224/2288 images...
Processed 256/2288 images...
Processed 288/2288 images...
Processed 320/2288 images...
Processed 352/2288 images...
Processed 384/2288 images...
Processed 416/2288 images...
Processed 448/2288 images...
Processed 480/2288 images...
Processed 512/2288 images...
Processed 544/2288 images...
Processed 576/2288 images...
Processed 608/2288 images...
Processed 640/2288 images...
Processed 672/2288 images...
Processed 704/2288 images...
Processed 736/2288 images...
Processed 768/2288 images...
Processed 800/2288 images...
Processed 832/2288 images...
Processed 864/2288 images...
Processed 896/2288 images...
Processed 928/2288 images...
Processed 960/2288 images...
Processed 992/2288 images...
Processed 1024/2288 images...
Processed 1056/2288 images...
Processed 1088/2288 images...
Processed 1120

FileNotFoundError: [Errno 2] No such file or directory: '/content/train_embeddings.npy'

In [None]:
# MLP classification head trained on the precomputed image embeddings.
mlp_model = Sequential([
    # Declare the input once, up front. The width is taken from the data
    # (6144 for the embeddings extracted above) rather than repeated as an
    # `input_shape` argument on individual Dense layers.
    tf.keras.Input(shape=(train_embeddings.shape[1],)),

    Dense(2048, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # One output unit per one-hot label column (21 classes in this dataset).
    Dense(train_labels.shape[1], activation='softmax')
])

def f1_score(y_true, y_pred):
    """Macro-averaged F1 over classes, computed on the tensors it is given.

    Args:
        y_true: One-hot ground-truth labels, one row per sample.
        y_pred: Predicted class probabilities with the same shape as ``y_true``.

    Returns:
        Scalar tensor: the mean of the per-class F1 values. A class with no
        true positives in the given tensors contributes an F1 of 0.
    """
    # The hard prediction for a single-label softmax output is the most
    # probable class. Rounding at 0.5 instead leaves a sample with no predicted
    # class whenever no probability exceeds 0.5, which counts it as a false
    # negative and deflates the score.
    y_pred = tf.one_hot(
        tf.argmax(y_pred, axis=-1),
        depth=tf.shape(y_pred)[-1],
        dtype=y_pred.dtype,
    )

    # Per-class counts, summed over the sample axis.
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # epsilon keeps the divisions finite when a count is zero.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return K.mean(f1)

def focal_loss(alpha=0.25, gamma=2.0):
    """Build a focal-loss function for one-hot targets.

    Args:
        alpha: Constant factor applied to every term of the loss.
        gamma: Exponent applied to ``1 - p``; larger values shrink the
            contribution of confidently predicted classes.

    Returns:
        A ``loss(y_true, y_pred)`` callable that returns the loss summed over
        axis 1.
    """
    def loss(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1) so the log stays finite.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1.0 - eps)

        modulating_factor = alpha * K.pow(1 - probs, gamma)
        per_class_loss = modulating_factor * (-y_true * K.log(probs))
        return K.sum(per_class_loss, axis=1)
    return loss

# Stop once val_loss has gone 10 epochs without improving, then restore the
# weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

mlp_model.compile(
    loss=focal_loss(),
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy', f1_score],
)

# 500 is only an upper bound on the epoch count; early stopping may end
# training sooner.
mlp_model.fit(
    x=train_embeddings,
    y=train_labels,
    validation_data=(val_embeddings, val_labels),
    batch_size=64,
    epochs=500,
    callbacks=[early_stopping],
)

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.1319 - f1_score: 0.0481 - loss: 0.8476 - val_accuracy: 0.5052 - val_f1_score: 0.0672 - val_loss: 0.4005
Epoch 2/500
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.3797 - f1_score: 0.2256 - loss: 0.4578 - val_accuracy: 0.5664 - val_f1_score: 0.1679 - val_loss: 0.3053
Epoch 3/500
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.4538 - f1_score: 0.2740 - loss: 0.3919 - val_accuracy: 0.6031 - val_f1_score: 0.2451 - val_loss: 0.2688
Epoch 4/500
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.4938 - f1_score: 0.3184 - loss: 0.3514 - val_accuracy: 0.6171 - val_f1_score: 0.2874 - val_loss: 0.2448
Epoch 5/500
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.5276 - f1_score: 0.3235 - loss: 0.3134 - val_accuracy: 0.6434 - val_f1_score: 0.3602 - val_loss:

<keras.src.callbacks.history.History at 0x21f93877a90>

In [None]:
# saved_mlp_model = tf.keras.models.save_model(filepath = 'content/model.keras', model = mlp_model)
# mlp_model.save('content/model.keras')

In [None]:
# load_mlp_model = load_model('content/model.keras')
# load_mlp_model.summary()
# Print the layer-by-layer summary of the in-memory model. The two disabled
# lines above would instead summarize a copy reloaded from 'content/model.keras'.
mlp_model.summary()

In [None]:
# Read the test-set metadata.
test_df = pd.read_csv('bttai-ajl-2025/test.csv')

# Test images are stored flat as test/test/<md5hash>.jpg.
test_df['file_path'] = (
    'bttai-ajl-2025/test/test/'
    + test_df['md5hash']
    + '.jpg'
)

def extract_test_embeddings(df, batch_size=100):
    """Embed every image listed in ``df`` (no labels) and return the embeddings.

    Each image is serialized with ``encode_image_to_tfexample`` and sent in
    batches through the ``serving_default`` signature of ``base_model``. Rows
    for which serialization returns ``None`` are skipped; a warning is printed
    in that case because the result then no longer lines up with ``df``.

    Args:
        df: DataFrame with a ``file_path`` column.
        batch_size: Number of images per inference call (default 100).

    Returns:
        NumPy array holding one embedding per successfully encoded image, in
        input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    pending_images = []
    total_samples = len(df)
    print("total samples: " + str(total_samples))
    processed_samples = 0
    skipped_samples = 0

    def _flush_pending():
        """Embed the pending batch, record its results and empty the batch."""
        nonlocal processed_samples
        infer = base_model.signatures["serving_default"]
        output = infer(inputs=tf.constant(pending_images))

        embeddings.extend(output['embedding'].numpy())

        processed_samples += len(pending_images)
        print(f"Processed {processed_samples}/{total_samples} images...")

        pending_images.clear()

    # Only the path column is needed, so iterate it directly.
    for image_path in df['file_path']:
        encoded_image = encode_image_to_tfexample(image_path)
        if encoded_image is None:
            skipped_samples += 1
            continue

        pending_images.append(encoded_image)

        if len(pending_images) >= batch_size:
            _flush_pending()

    # Final, possibly smaller, batch.
    if pending_images:
        _flush_pending()

    if skipped_samples:
        # Callers that pair the result with df row by row must account for this.
        print(f"Warning: skipped {skipped_samples} image(s); "
              "embeddings no longer align one-to-one with the input rows.")

    print("✅ Feature extraction complete!")
    return np.array(embeddings)

# Extract embeddings for test data
test_embeddings = extract_test_embeddings(test_df)

# extract_test_embeddings skips images it cannot find. If that happened, the
# predictions could not be matched back to test_df rows, so fail now with a
# clear message instead of after prediction, inside the column assignment.
if len(test_embeddings) != len(test_df):
    raise ValueError(
        f"Got {len(test_embeddings)} embeddings for {len(test_df)} test rows; "
        "some test images are missing, so predictions cannot be aligned with md5hash."
    )

# Predict labels using the trained MLP model
predictions = mlp_model.predict(test_embeddings)

# Convert predictions back to original labels (most probable class per row)
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# Create a submission DataFrame
submission_df = test_df[['md5hash']].copy()
submission_df['label'] = predicted_labels

# Save the submission to CSV
submission_df.to_csv('bttai-ajl-2025/submission.csv', index=False)

print("✅ Submission file saved as submission.csv")

total samples: 1227
Processed 100/1227 images...
Processed 200/1227 images...
Processed 300/1227 images...
Processed 400/1227 images...
Processed 500/1227 images...
Processed 600/1227 images...
Processed 700/1227 images...
Processed 800/1227 images...
Processed 900/1227 images...
Processed 1000/1227 images...
Processed 1100/1227 images...
Processed 1200/1227 images...
Processed 1227/1227 images...
✅ Feature extraction complete!
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
✅ Submission file saved as submission.csv


In [None]:
# Distinct classes that appear among the test-set predictions.
unique_predicted_labels = np.unique(predicted_labels)

print(unique_predicted_labels)
print('\n')
print('Count of unique labels: ', len(unique_predicted_labels))

['acne' 'acne-vulgaris' 'actinic-keratosis' 'basal-cell-carcinoma'
 'basal-cell-carcinoma-morpheiform' 'dermatofibroma' 'dermatomyositis'
 'dyshidrotic-eczema' 'eczema' 'epidermal-nevus' 'folliculitis'
 'kaposi-sarcoma' 'keloid' 'malignant-melanoma' 'melanoma'
 'mycosis-fungoides' 'prurigo-nodularis' 'pyogenic-granuloma'
 'seborrheic-keratosis' 'squamous-cell-carcinoma'
 'superficial-spreading-melanoma-ssm']


Count of unique labels:  21


In [None]:
# Re-read the saved submission to check which labels were written to disk.
record = pd.read_csv('bttai-ajl-2025/submission.csv')
submitted_labels = record['label'].unique()

print(submitted_labels)
print('\n')
print('Count of unique labels: ', len(submitted_labels))

['kaposi-sarcoma' 'actinic-keratosis' 'squamous-cell-carcinoma'
 'acne-vulgaris' 'folliculitis' 'epidermal-nevus' 'basal-cell-carcinoma'
 'melanoma' 'eczema' 'seborrheic-keratosis' 'mycosis-fungoides' 'acne'
 'dermatofibroma' 'prurigo-nodularis' 'superficial-spreading-melanoma-ssm'
 'keloid' 'dyshidrotic-eczema' 'malignant-melanoma' 'dermatomyositis'
 'pyogenic-granuloma' 'basal-cell-carcinoma-morpheiform']


Count of unique labels:  21
