In [None]:
# Project Title: Skin Cancer Classification with Transfer Learning

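# This project uses a Convolutional Neural Network (CNN) based on Transfer Learning (a pretrained MobileNetV2) to classify skin lesions into seven diagnostic categories (e.g. melanoma, melanocytic nevi, basal cell carcinoma).
# The dataset used is HAM10000 (downloaded from Kaggle; it was also used in the ISIC 2018 challenge), a widely recognized benchmark in skin cancer research.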

In [None]:
# Import libraries

# General-purpose libraries
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

# Deep learning (TensorFlow & Keras)
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Conv2D, MaxPooling2D, Flatten, Dropout,
                                     BatchNormalization, GlobalAveragePooling2D, Input)
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.applications import MobileNetV2, EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Scikit-learn for preprocessing & evaluation
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix,
                             ConfusionMatrixDisplay, roc_curve, auc)
from itertools import cycle

In [None]:
# Install Kaggle API
!pip install -q kaggle

# If running locally or in Colab, upload your kaggle.json here:
# from google.colab import files
# files.upload()  # Upload kaggle.json manually

# Or manually place kaggle.json in ~/.kaggle/ if you are running in a prepared environment
os.makedirs('/root/.kaggle', exist_ok=True)
# os.rename('kaggle.json', '/root/.kaggle/kaggle.json')  # Uncomment if using upload method
os.chmod('/root/.kaggle/kaggle.json', 600)

# Download and unzip the dataset
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000
!unzip -q skin-cancer-mnist-ham10000.zip -d ./skin_cancer_data

In [None]:
# Read the HAM10000 metadata CSV into a DataFrame and preview the first rows

metadata_csv = './skin_cancer_data/HAM10000_metadata.csv'
df = pd.read_csv(metadata_csv)
df.head(5)

In [None]:
# Bar chart of the number of rows per diagnosis code, most frequent code first

dx_order = df['dx'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='dx', order=dx_order, ax=ax)
ax.set_title("Label Distribution in HAM10000 Dataset")
ax.set_xlabel("Lesion Type")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Translate the short diagnosis codes stored in `dx` into readable lesion names

lesion_labels = dict([
    ('nv', 'Melanocytic nevi'),
    ('mel', 'Melanoma'),
    ('bkl', 'Benign keratosis-like lesions'),
    ('bcc', 'Basal cell carcinoma'),
    ('akiec', 'Actinic keratoses'),
    ('vasc', 'Vascular lesions'),
    ('df', 'Dermatofibroma'),
])

# Codes that are not keys of lesion_labels end up as NaN in the new column
full_names = df['dx'].map(lesion_labels)
df['lesion_type'] = full_names

In [None]:
# Tabulate how many rows carry each diagnosis code
label_counts = df['dx'].value_counts()
print("Lesion Type Distribution:", label_counts, sep="\n")

# Draw the same counts as a bar chart
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
ax.set_title("Distribution of Lesion Types")
ax.set_xlabel("Lesion Type Code")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Show one sample image per lesion type class

# Folders that are searched, in order, for each image file
image_dirs = [
    './skin_cancer_data/HAM10000_images_part_1/',
    './skin_cancer_data/HAM10000_images_part_2/'
]

classes = df['lesion_type'].unique()

# Derive the grid from the number of classes rather than fixing it at 3x3, so
# plt.subplot does not fail if there are ever more than nine classes.
n_cols = 3
n_rows = max(1, -(-len(classes) // n_cols))  # ceiling division without extra imports

plt.figure(figsize=(15, 10))
for i, lesion in enumerate(classes):
    # A fixed seed makes the figure show the same example on every run
    sample = df[df['lesion_type'] == lesion].sample(1, random_state=42).iloc[0]
    image_filename = sample['image_id'] + '.jpg'

    for dir_path in image_dirs:
        image_path = os.path.join(dir_path, image_filename)
        if os.path.exists(image_path):
            img = mpimg.imread(image_path)
            break
    else:
        # for/else: reached only when no directory contained the file
        print(f"Image {image_filename} not found in either directory.")
        continue

    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(img)
    plt.title(lesion)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Give every lesion type name an integer id and store it in a `label` column

le = LabelEncoder().fit(df['lesion_type'])
df['label'] = le.transform(df['lesion_type'])
df.loc[:, ['lesion_type', 'label']].head()

In [None]:
# Hold out 20% of the rows for testing, keeping the class proportions equal in both parts.
# NOTE(review): the split is per row. If the metadata holds several images of the same
# lesion (HAM10000 is said to ship a lesion_id column -- confirm), the same lesion can
# land in both parts; consider a group-aware split.
split_options = dict(test_size=0.2, stratify=df['label'], random_state=42)
train_df, test_df = train_test_split(df, **split_options)

# Report how many rows of each class ended up in each part
for heading, part in (("Train distribution:\n", train_df), ("\nTest distribution:\n", test_df)):
    print(heading, part['label'].value_counts())

In [None]:
# Image preprocessing settings: target size and the folders searched for the JPEG files

IMAGE_SIZE = (128, 128)
image_dirs = [
    f'./skin_cancer_data/HAM10000_images_part_{part}/'
    for part in (1, 2)
]

def preprocess_images(df):
    """Load, resize and scale the images listed in a metadata frame.

    Args:
        df: DataFrame with an 'image_id' column (file name without the '.jpg'
            extension) and a 'label' column holding the class id of each image.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays. ``images`` stacks one
        array per image that was found, resized to IMAGE_SIZE and scaled to
        [-1, 1]; ``labels`` holds the matching 'label' values in the same
        order. Rows whose file is missing from every directory in image_dirs
        are reported with a message and skipped.
    """
    images = []
    labels = []

    # Walk both columns together instead of doing a positional .iloc lookup per row
    rows = zip(df['image_id'], df['label'].to_numpy())
    for image_id, label in tqdm(rows, total=len(df)):
        filename = image_id + '.jpg'
        for dir_path in image_dirs:
            path = os.path.join(dir_path, filename)
            if os.path.exists(path):
                img = load_img(path, target_size=IMAGE_SIZE)
                # The ImageNet-pretrained MobileNetV2 used downstream was trained on
                # inputs scaled to [-1, 1] (what mobilenet_v2.preprocess_input does),
                # so map 0..255 to that range rather than to [0, 1].
                img_array = img_to_array(img) / 127.5 - 1.0
                images.append(img_array)
                labels.append(label)
                break
        else:
            # for/else: reached only when no directory contained the file
            print(f"Image {filename} not found in any directory.")

    return np.array(images), np.array(labels)

# Turn both splits into image/label arrays (training split first, then test split)
(X_train, y_train), (X_test, y_test) = (
    preprocess_images(split_df) for split_df in (train_df, test_df)
)

In [None]:
# Convert the integer class ids into one-hot rows

# Number of distinct class ids present in the training labels
num_classes = np.unique(y_train).size

y_train_encoded, y_test_encoded = (
    to_categorical(class_ids, num_classes) for class_ids in (y_train, y_test)
)

In [None]:
# Transfer-learning model: frozen ImageNet MobileNetV2 backbone plus a small classifier head

input_shape = (128, 128, 3)
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=input_shape)
base_model.trainable = False  # only the layers added below are trained

# Head: average-pool the backbone feature maps, apply dropout, then a softmax over the classes
pooled_features = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.3)(pooled_features)
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=output)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as the metric
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

# Print the layer-by-layer overview of the model
model.summary()

In [None]:
# Fit the classifier for 10 epochs with mini-batches of 32 images.
# Note: the validation data is the same held-out split that the evaluation cells
# score as the test set, so the validation curves are not an independent estimate.

history = model.fit(
    x=X_train,
    y=y_train_encoded,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test_encoded),
)

In [None]:
# Side-by-side learning curves: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 6))
curves = history.history

# Accuracy panel
acc_ax.plot(curves['accuracy'], label='Train Accuracy')
acc_ax.plot(curves['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Training vs Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

# Loss panel
loss_ax.plot(curves['loss'], label='Train Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Training vs Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Class probabilities for the test images; the predicted class is the most probable one
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Confusion matrix drawn as an annotated heatmap, labelled with the encoder's class names
cm = confusion_matrix(y_test, y_pred)
class_names = le.classes_

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()

# Per-class precision, recall and F1 summary
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

In [None]:
# One-vs-rest ROC analysis: one indicator column per class
y_test_bin = label_binarize(y_test, classes=np.arange(num_classes))

# ROC curve points and area under the curve, keyed by class index
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(num_classes):
    truth = y_test_bin[:, class_idx]
    scores = y_pred_probs[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(truth, scores)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Draw every class curve on one set of axes
fig, ax = plt.subplots(figsize=(10, 8))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'purple', 'brown'])
for class_idx, color in zip(range(num_classes), colors):
    ax.plot(fpr[class_idx], tpr[class_idx], color=color, lw=2,
            label=f'Class {le.classes_[class_idx]} (AUC = {roc_auc[class_idx]:0.2f})')

# Dashed diagonal as the chance-level reference
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for Each Class')
ax.legend(loc="lower right")
plt.show()