In [77]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import xgboost as xgb
import lightgbm as lgb


from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

In [None]:
def build_advanced_nn(input_shape, hidden_units=(128, 64, 32), dropout_rate=0.4):
    """Build and compile a fully connected binary classifier.

    Each hidden stage is Dense(relu) -> BatchNormalization -> Dropout, followed
    by a single sigmoid output unit. With the default arguments the network is
    identical to the original fixed 128/64/32 architecture with 0.4 dropout.

    Args:
        input_shape: Shape tuple of one input sample, e.g. ``(n_features,)``.
        hidden_units: Widths of the hidden Dense layers, in order.
        dropout_rate: Dropout fraction applied after every hidden stage.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """
    if not hidden_units:
        raise ValueError("hidden_units must contain at least one layer width")

    model = models.Sequential()
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer declares the input shape.
            model.add(layers.Dense(units, activation="relu", input_shape=input_shape))
        else:
            model.add(layers.Dense(units, activation="relu"))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    # Single sigmoid unit -> probability of the positive class.
    model.add(layers.Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [None]:
# Training metadata table from the Kaggle input directory.
train_metadata_path = "/kaggle/input/isic-2024-challenge/train-metadata.csv"
train_metadata = pd.read_csv(train_metadata_path)

# Test metadata table from the same directory.
test_metadata_path = "/kaggle/input/isic-2024-challenge/test-metadata.csv"
test_metadata = pd.read_csv(test_metadata_path)

# Preview the first rows of the training table.
train_metadata.head(n=5)

In [None]:
# Count the null entries in each training column to see which ones need handling.
missing_values = train_metadata.isna().sum(axis=0)
print("Missing Values=\n", missing_values)

In [None]:
# Columns removed from the metadata before modelling; the same list is reused
# for the test table, where errors="ignore" tolerates columns that are absent.
columns_to_remove = [
    "lesion_id",
    "iddx_2",
    "iddx_3",
    "iddx_4",
    "iddx_5",
    "mel_mitotic_index",
    "mel_thick_mm",
]

train_metadata_cleaned = train_metadata.drop(
    columns=columns_to_remove,
    errors="ignore",
)

# Keep only rows that are complete in every remaining column.
train_metadata_cleaned_no_nulls = train_metadata_cleaned.dropna(axis=0, how="any")

# Class counts after cleaning.
train_metadata_cleaned_no_nulls.loc[:, "target"].value_counts()

In [None]:
# Split the cleaned rows by label; target == 0 is treated as the majority class.
majority = train_metadata_cleaned_no_nulls[
    train_metadata_cleaned_no_nulls["target"] == 0
]
minority = train_metadata_cleaned_no_nulls[
    train_metadata_cleaned_no_nulls["target"] == 1
]

# Randomly downsample the majority class (without replacement) to the size of
# the minority class so both labels are equally represented.
majority_class_downsample = resample(
    majority, replace=False, n_samples=len(minority), random_state=42
)
train_metadata_balanced = pd.concat([majority_class_downsample, minority])


print("Shape after balancing classes: ", train_metadata_balanced.shape)
print(
    "Class Dist. after balancing: \n", train_metadata_balanced["target"].value_counts()
)

In [None]:
# NOTE: this cell overwrites columns of train_metadata_balanced in place, so
# running it a second time would remap the already-encoded integers
# (sex -> NaN, the other columns -> 0). Re-run the balancing cell first.

# Binary encoding of sex; any other value becomes NaN.
train_metadata_balanced["sex"] = train_metadata_balanced["sex"].map(
    {"male": 1, "female": 0}
)

# Integer codes for the categorical location columns. Code 0 is reserved for
# values that do not appear in the corresponding table.
anatom_site_mapping = {
    "posterior torso": 1,
    "lower extremity": 2,
    "anterior torso": 3,
    "upper extremity": 4,
    "head/neck": 5,
}

tbp_lv_location_mapping = {
    "Torso Front Top Half": 1,
    "Torso Back Top Third": 2,
    "Head & Neck": 3,
    "Torso Back Middle Third": 4,
    "Left Leg - Lower": 5,
    "Right Leg - Lower": 6,
    "Torso Front Bottom Half": 7,
    "Left Arm - Upper": 8,
    "Left Leg - Upper": 9,
    "Right Arm - Upper": 10,
    "Right Leg - Upper": 11,
    "Left Arm - Lower": 12,
    "Right Arm - Lower": 13,
    "Torso Back Bottom Third": 14,
    "Left Leg": 15,
    "Right Leg": 16,
    "Left Arm": 17,
    "Right Arm": 18,
}

tbp_lv_location_simple_mapping = {
    "Torso Back": 1,
    "Torso Front": 2,
    "Left Leg": 3,
    "Head & Neck": 4,
    "Right Leg": 5,
    "Left Arm": 6,
    "Right Arm": 7,
}

# Apply each code table to its column, falling back to 0 for unknown values.
for column_name, code_table in (
    ("anatom_site_general", anatom_site_mapping),
    ("tbp_lv_location", tbp_lv_location_mapping),
    ("tbp_lv_location_simple", tbp_lv_location_simple_mapping),
):
    train_metadata_balanced[column_name] = train_metadata_balanced[column_name].apply(
        lambda value, table=code_table: table.get(value, 0)
    )

In [None]:
# Preview the first rows of the balanced, encoded training table.
train_metadata_balanced.head(n=5)


In [None]:
# Interaction feature: approximate age multiplied by the encoded sex column.
train_metadata_balanced["age_sex_interaction"] = train_metadata_balanced[
    "age_approx"
].mul(train_metadata_balanced["sex"])

In [None]:
# Label vector.
y = train_metadata_balanced["target"]

# Columns excluded from the feature matrix (the label itself plus the
# identifier, descriptive and other columns listed below).
non_feature_columns = [
    "isic_id",
    "target",
    "patient_id",
    "image_type",
    "tbp_tile_type",
    "attribution",
    "copyright_license",
    "iddx_full",
    "iddx_1",
    "tbp_lv_dnn_lesion_confidence",
]

# Feature matrix: every remaining column.
X = train_metadata_balanced.drop(columns=non_feature_columns)

In [None]:
# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only; fitting on all rows leaks validation statistics into
# training. stratify=y keeps the class ratio identical in both splits.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Kept for backward compatibility with any cell that still reads X_scaled;
# it now uses the training-only statistics.
X_scaled = scaler.transform(X)

In [None]:
# Shuffle individual samples first, then batch. Calling shuffle() after
# batch() only reorders whole batches, so every batch would contain the same
# samples in every epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(64)
)

# Validation data is evaluated in a fixed order; no shuffling needed.
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(64)

In [None]:
# Show the shape of the training split; the second entry is the number of
# features, which the network's input layer must match.
print(X_train.shape)

In [None]:
# Derive the input width from the data instead of hard-coding it, so the cell
# keeps working when features are added or removed upstream.
input_shape = (X_train.shape[1],)

# Stop when validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)
# Halve the learning rate when validation loss plateaus for 3 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=3
)

model = build_advanced_nn(input_shape)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50,
    callbacks=[early_stopping, reduce_lr],
)

In [None]:
# Score the trained network on the validation batches. evaluate() returns the
# loss followed by the compiled metrics (accuracy for this model).
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f"Validation Accuracy: {val_accuracy:.4f}")

In [None]:
# Hyperparameters shared by both gradient-boosting models.
boosting_params = {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 6}

# Stand-alone XGBoost classifier.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    **boosting_params,
)
xgb_model.fit(X_train, y_train)

# Stand-alone LightGBM classifier.
lgb_model = lgb.LGBMClassifier(**boosting_params)
lgb_model.fit(X_train, y_train)

# Stack the two boosters under a logistic-regression meta-learner.
base_learners = [
    ("xgb", xgb_model),
    ("lgb", lgb_model),
]
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
stacking_clf.fit(X_train, y_train)

In [None]:
# Neural network: reported as accuracy, the metric it was compiled with.
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f"Validation Accuracy (NN): {val_accuracy:.4f}")

# Positive-class probabilities on the validation split for each classifier.
y_val_pred_xgb, y_val_pred_lgb, y_val_pred_stacking = (
    classifier.predict_proba(X_val)[:, 1]
    for classifier in (xgb_model, lgb_model, stacking_clf)
)

# ROC AUC of each probability vector against the validation labels.
roc_auc_xgb, roc_auc_lgb, roc_auc_stacking = (
    roc_auc_score(y_val, scores)
    for scores in (y_val_pred_xgb, y_val_pred_lgb, y_val_pred_stacking)
)

print(f"XGBoost ROC AUC Score: {roc_auc_xgb:.4f}")
print(f"LightGBM ROC AUC Score: {roc_auc_lgb:.4f}")
print(f"Stacking ROC AUC Score: {roc_auc_stacking:.4f}")

In [None]:
# Apply the same column removal and encodings to the test table that were
# applied to the training table. Unlike training, rows with nulls are kept.
test_metadata_cleaned = test_metadata.drop(
    columns=columns_to_remove,
    errors="ignore",
)

# Binary encoding of sex; any other value becomes NaN.
test_metadata_cleaned["sex"] = test_metadata_cleaned["sex"].map(
    {"male": 1, "female": 0}
)

# Integer-encode the location columns, using 0 for values missing from a table.
for column_name, code_table in (
    ("anatom_site_general", anatom_site_mapping),
    ("tbp_lv_location", tbp_lv_location_mapping),
    ("tbp_lv_location_simple", tbp_lv_location_simple_mapping),
):
    test_metadata_cleaned[column_name] = test_metadata_cleaned[column_name].apply(
        lambda value, table=code_table: table.get(value, 0)
    )

# Interaction feature: approximate age multiplied by the encoded sex column.
test_metadata_cleaned["age_sex_interaction"] = test_metadata_cleaned[
    "age_approx"
].mul(test_metadata_cleaned["sex"])

In [None]:
# Select the training feature columns, in training order, from the test table.
X_test = test_metadata_cleaned.loc[:, X.columns]

# Reuse the already-fitted scaler; it must not be refit on test data.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Positive-class probability for every test row from the stacked ensemble.
test_predictions = stacking_clf.predict_proba(X_test_scaled)[:, 1]

# Hard labels: 1 where the probability exceeds 0.5, otherwise 0.
test_predicted_labels = np.where(test_predictions > 0.5, 1, 0)

In [None]:
# Submit the predicted probabilities rather than thresholded 0/1 labels.
# The models above are validated with ROC AUC, a ranking metric; hard labels
# collapse every prediction to two values and discard that ranking.
# NOTE(review): confirm the competition expects probabilities in `target`.
submission = pd.DataFrame(
    {"isic_id": test_metadata["isic_id"], "target": test_predictions}
)
submission.to_csv(
    "/kaggle/working/submission.csv",
    index=False,
)