# Dermatology Classification with PyTorch

This notebook implements a neural network for dermatology disease classification using PyTorch.

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import os
import io

In [None]:
# Load the dermatology records from the local CSV export.
# Dataset: https://www.kaggle.com/datasets/olcaybolat1/dermatology-dataset-classification
df = pd.read_csv(filepath_or_buffer="clinical_data.csv")

In [None]:
# Missing ages are recorded as the placeholder '?'. Turn them into NaN so the
# column can be treated as numeric, then fill the gaps with the (truncated)
# mean age and store the column as integers.
age_numeric = df['age'].replace('?', np.nan).astype(float)
mean_age = int(age_numeric.mean())
df['age'] = age_numeric.fillna(mean_age).astype(int)

In [None]:
# Sanity check: True if any '?' placeholder is still present in the age column.
df['age'].eq('?').any()

In [None]:
# Randomly assign 80% of the rows to the training set (fixed seed, so the split
# is reproducible); every row that was not sampled becomes the test set.
train = df.sample(frac=0.8, random_state=200)
test = df.drop(index=train.index)

In [None]:
# Columns fed to the model as input features, in the order the network sees them.
feature_cols = [
    'erythema',
    'scaling',
    'definite_borders',
    'itching',
    'koebner_phenomenon',
    'polygonal_papules',
    'follicular_papules',
    'oral_mucosal_involvement',
    'knee_and_elbow_involvement',
    'scalp_involvement',
    'family_history',
    'age',
]

In [None]:
# Define the neural network model
class DermatologyClassifier(nn.Module):
    """Fully connected classifier that maps a feature vector to class logits.

    The network is a stack of ``Linear -> ReLU -> Dropout`` groups, one per
    entry in ``hidden_sizes``, followed by a final ``Linear`` layer that emits
    one raw (un-normalized) score per class.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output classes (width of the logit vector).
        hidden_sizes: Widths of the hidden layers, in order. The default
            ``(32, 64, 32)`` reproduces the original fixed architecture.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_size=12, num_classes=6,
                 hidden_sizes=(32, 64, 32), dropout=0.2):
        super().__init__()

        # Layers are created in the same order as the original hard-coded
        # Sequential, so the state_dict keys (model.0, model.3, ...) are
        # unchanged for the default configuration.
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_features = width
        layers.append(nn.Linear(in_features, num_classes))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        return self.model(x)

In [None]:
# Turn the train/test DataFrames into tensors, datasets, and batch loaders.
def _frame_to_arrays(frame):
    """Return (float32 feature matrix, zero-based integer labels) for a split."""
    features = frame[feature_cols].astype('float32').values
    # The 'class' column is shifted down by one so labels start at 0,
    # which is what PyTorch expects for class indices.
    labels = frame['class'].astype(int).values - 1
    return features, labels


train_features, y_train_raw = _frame_to_arrays(train)
test_features, y_test_raw = _frame_to_arrays(test)

# Features are float32; class indices must be long (int64) tensors.
X_train = torch.tensor(train_features, dtype=torch.float32)
y_train = torch.tensor(y_train_raw, dtype=torch.long)
X_test = torch.tensor(test_features, dtype=torch.float32)
y_test = torch.tensor(y_test_raw, dtype=torch.long)

# Pair features with labels.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Only the training batches are reshuffled each epoch; test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Instantiate the network together with its loss function and optimizer.
num_classes = 6
model = DermatologyClassifier(
    input_size=len(feature_cols),  # one input unit per feature column
    num_classes=num_classes,
)

# Cross-entropy over class logits (the counterpart of Keras' categorical_crossentropy).
criterion = nn.CrossEntropyLoss()

# Adam with its default hyperparameters, matching the original implementation.
optimizer = optim.Adam(params=model.parameters())

# Show the layer stack.
print(model)

In [None]:
# Training with early stopping
patience = 25                      # epochs to wait for a better validation loss
best_val_loss = float('inf')
epochs_without_improvement = 0
best_model_weights = None          # snapshot of the weights at the best validation loss
num_epochs = 1000                  # upper bound; early stopping normally ends sooner

# NOTE(review): the held-out test split (test_loader) doubles as the validation
# set for early stopping, so the test metrics reported afterwards are
# optimistically biased. Consider carving a separate validation split out of
# the training data.
for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0  # summed per-batch loss for this epoch (not logged)

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation phase (dropout disabled, no gradient tracking)
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(test_loader)   # mean of the per-batch losses
    val_accuracy = correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        # Snapshot the weights BY VALUE. A plain state_dict().copy() is only a
        # shallow dict copy: its tensors alias the live parameters, so the
        # "best" snapshot would keep changing with every later optimizer step
        # and restoring it would be a no-op.
        best_model_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break

# Restore the best weights, whether the loop stopped early or ran to completion
if best_model_weights is not None:
    model.load_state_dict(best_model_weights)

# Create checkpoint directory if it doesn't exist (no exists/makedirs race)
os.makedirs("checkpoint", exist_ok=True)

# The next checkpoint number is derived from how many entries the directory
# already holds; the results file written later reuses num_checkpoints.
num_checkpoints = len(os.listdir("checkpoint"))

# Save the model weights
torch.save(model.state_dict(), f"checkpoint/cp-{num_checkpoints + 1:01d}.weights.pt")

In [None]:
# Final evaluation on the test loader: mean batch loss and overall accuracy.
model.eval()  # disable dropout for inference
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # The predicted class is the index of the largest logit in each row.
        predicted = torch.max(outputs, 1).indices
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# Average the loss over batches; accuracy is the fraction of correct samples.
test_loss /= len(test_loader)
accuracy = correct / total

training_result = f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}"
print(training_result)

In [None]:
# Determine biopsy needs
# NOTE(review): this scores every row of df, i.e. including the rows the model
# was trained on, so the "not needed" share is not an out-of-sample estimate.
X_all = torch.tensor(df[feature_cols].astype('float32').values, dtype=torch.float32)

model.eval()
with torch.no_grad():
    outputs = model(X_all)
    probs = torch.softmax(outputs, dim=1)

    # Get predicted classes and their confidence scores (top softmax probability)
    confidences, pred_classes = torch.max(probs, dim=1)

    # Convert back to 1-indexed classes for comparison with the 'class' column
    pred_classes = pred_classes + 1
    true_classes = df['class'].astype(int).values

    # Compute biopsy_needed: 0 if model is ≥80% confident AND correct; else 1
    confident_and_correct = (
        (pred_classes == torch.tensor(true_classes)) & (confidences >= 0.8)
    )
    biopsy_needed = ~confident_and_correct

    # Convert to numpy for pandas; stored as a 0/1 integer column on df
    df['biopsy_needed'] = biopsy_needed.numpy().astype(int)

num_biopsy_needed = (df["biopsy_needed"] == 1).sum()
num_biopsy_not_needed = (df["biopsy_needed"] == 0).sum()

biopsy_result = f"Biopsies needed: {num_biopsy_needed}, Not needed: {num_biopsy_not_needed}, {num_biopsy_not_needed / (num_biopsy_needed + num_biopsy_not_needed) * 100:.2f}% not needed"
print(biopsy_result)

# Create data directory if it doesn't exist (no exists/makedirs race)
os.makedirs("data", exist_ok=True)

# Textual description of the architecture, as produced by print(model)
summary_str = str(model)

# Save results to file; the file number matches the checkpoint saved after
# training (num_checkpoints is set in the training cell).
with open(f"data/results-pytorch-{num_checkpoints + 1:01d}.txt", "w", encoding="utf-8") as f:
    f.write(training_result)
    f.write("\n\n")
    f.write("Model Summary:\n")
    f.write(summary_str)
    f.write("\n\n")
    f.write(biopsy_result)

In [None]:
# Persist the trained model in two forms: the whole module object, and
# separately just its state_dict (the learned weights).
torch.save(obj=model, f="dermatology_model_pytorch.pt")
torch.save(obj=model.state_dict(), f="dermatology_model_pytorch.weights.pt")