In [1]:
#!/bin/bash
!kaggle datasets download hasnainjaved/melanoma-skin-cancer-dataset-of-10000-images

Dataset URL: https://www.kaggle.com/datasets/hasnainjaved/melanoma-skin-cancer-dataset-of-10000-images
License(s): CC0-1.0
Downloading melanoma-skin-cancer-dataset-of-10000-images.zip to d:\CodeBackground\AI-ML-Intern-tasks\3-Week\Skin_cancer_detection




  0%|          | 0.00/98.7M [00:00<?, ?B/s]
  1%|          | 1.00M/98.7M [00:01<01:58, 867kB/s]
  2%|▏         | 2.00M/98.7M [00:01<01:01, 1.64MB/s]
  3%|▎         | 3.00M/98.7M [00:01<00:42, 2.38MB/s]
  4%|▍         | 4.00M/98.7M [00:01<00:34, 2.86MB/s]
  5%|▌         | 5.00M/98.7M [00:02<00:38, 2.56MB/s]
  6%|▌         | 6.00M/98.7M [00:02<00:38, 2.50MB/s]
  7%|▋         | 7.00M/98.7M [00:03<00:43, 2.20MB/s]
  8%|▊         | 8.00M/98.7M [00:04<00:51, 1.83MB/s]
  9%|▉         | 9.00M/98.7M [00:05<01:06, 1.41MB/s]
 10%|█         | 10.0M/98.7M [00:06<01:11, 1.30MB/s]
 11%|█         | 11.0M/98.7M [00:06<01:04, 1.42MB/s]
 12%|█▏        | 12.0M/98.7M [00:07<01:04, 1.42MB/s]
 13%|█▎        | 13.0M/98.7M [00:08<01:01, 1.45MB/s]
 14%|█▍        | 14.0M/98.7M [00:08<00:58, 1.53MB/s]
 15%|█▌        | 15.0M/98.7M [00:09<00:55, 1.59MB/s]
 16%|█▌        | 16.0M/98.7M [00:10<00:53, 1.63MB/s]
 17%|█▋        | 17.0M/98.7M [00:10<00:51, 1.66MB/s]
 18%|█▊        | 18.0M/98.7M [00:11<00:48, 1.74MB/s]
 1

In [3]:
import zipfile
import os

# Specify the path to the zip file
zip_file_path = 'melanoma-skin-cancer-dataset-of-10000-images.zip'
extract_to_path = '.'

# Create the directory if it doesn't exist
os.makedirs(extract_to_path, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

print(f"Files extracted to {extract_to_path}")

Files extracted to .


In [33]:
import os
import cv2
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict
from tqdm import tqdm
import logging

class MelanomaDataProcessor:
    def __init__(self, img_size: int = 50):
        self.img_size = img_size
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('melanoma_processing.log'),
                logging.StreamHandler()
            ]
        )

    def load_and_process_image(self, image_path: str) -> np.ndarray:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise ValueError(f"Failed to load image: {image_path}")

        return cv2.resize(img, (self.img_size, self.img_size))

    def process_directory(self, directory: str, label: np.ndarray) -> Tuple[List, List]:
        """
        Process all images in a directory.

        Returns:
            Tuple of (images, labels)
        """
        images = []
        labels = []
        directory_path = Path(directory)

        if not directory_path.exists():
            logging.error(f"Directory not found: {directory}")
            return images, labels

        files = list(directory_path.glob('*.jpg')) + list(directory_path.glob('*.png'))

        for file_path in tqdm(files, desc=f"Processing {directory_path.name}"):
            try:
                img_array = self.load_and_process_image(str(file_path))
                images.append(img_array)
                labels.append(label)
            except Exception as e:
                logging.warning(f"Error processing {file_path}: {str(e)}")

        return images, labels

    def process_dataset(self, data_config: Dict) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                                         Tuple[np.ndarray, np.ndarray]]:
        """
        Process the entire dataset.

        Returns:
            Tuple of ((train_images, train_labels), (test_images, test_labels))
        """
        # Process training data
        ben_train_images, ben_train_labels = self.process_directory(
            data_config['ben_training_folder'],
            np.array([1, 0])
        )
        mal_train_images, mal_train_labels = self.process_directory(
            data_config['mal_training_folder'],
            np.array([0, 1])
        )

        # Balance benign training data
        ben_train_images = ben_train_images[:len(mal_train_images)]
        ben_train_labels = ben_train_labels[:len(mal_train_labels)]

        # Process testing data
        ben_test_images, ben_test_labels = self.process_directory(
            data_config['ben_testing_folder'],
            np.array([1, 0])
        )
        mal_test_images, mal_test_labels = self.process_directory(
            data_config['mal_testing_folder'],
            np.array([0, 1])
        )

        # Combine images and labels
        train_images = np.array(ben_train_images + mal_train_images)
        train_labels = np.array(ben_train_labels + mal_train_labels)
        test_images = np.array(ben_test_images + mal_test_images)
        test_labels = np.array(ben_test_labels + mal_test_labels)

        # Create shuffling index
        train_shuffle_idx = np.random.permutation(len(train_images))
        test_shuffle_idx = np.random.permutation(len(test_images))

        # Shuffle both images and labels using the same index
        train_images = train_images[train_shuffle_idx]
        train_labels = train_labels[train_shuffle_idx]
        test_images = test_images[test_shuffle_idx]
        test_labels = test_labels[test_shuffle_idx]

        # Log dataset statistics
        self.log_dataset_stats(
            len(ben_train_images),
            len(mal_train_images),
            len(ben_test_images),
            len(mal_test_images)
        )

        return (train_images, train_labels), (test_images, test_labels)

    def log_dataset_stats(self, ben_train: int, mal_train: int,
                         ben_test: int, mal_test: int):
        logging.info("\nDataset Statistics:")
        logging.info(f"Benign training samples: {ben_train}")
        logging.info(f"Malignant training samples: {mal_train}")
        logging.info(f"Benign testing samples: {ben_test}")
        logging.info(f"Malignant testing samples: {mal_test}")
        logging.info(f"Total training samples: {ben_train + mal_train}")
        logging.info(f"Total testing samples: {ben_test + mal_test}")

def main():
    # Configuration
    data_config = {
        'ben_training_folder': "melanoma_cancer_dataset/train/benign",
        'mal_training_folder': "melanoma_cancer_dataset/train/malignant",
        'ben_testing_folder': "melanoma_cancer_dataset/test/benign",
        'mal_testing_folder': "melanoma_cancer_dataset/test/malignant",
    }

    # Initialize and run processor
    processor = MelanomaDataProcessor(img_size=224)
    (train_images, train_labels), (test_images, test_labels) = processor.process_dataset(data_config)

    # Save processed data
    np.savez_compressed(
        'melanoma_dataset.npz',
        train_images=train_images,
        train_labels=train_labels,
        test_images=test_images,
        test_labels=test_labels
    )

if __name__ == "__main__":
    main()

Processing benign: 100%|██████████| 5000/5000 [00:16<00:00, 295.08it/s]
Processing malignant: 100%|██████████| 4605/4605 [00:17<00:00, 268.75it/s]
Processing benign: 100%|██████████| 500/500 [00:02<00:00, 229.02it/s]
Processing malignant: 100%|██████████| 500/500 [00:02<00:00, 235.20it/s]
2024-12-30 16:11:10,205 - INFO - 
Dataset Statistics:
2024-12-30 16:11:10,267 - INFO - Benign training samples: 4605
2024-12-30 16:11:10,273 - INFO - Malignant training samples: 4605
2024-12-30 16:11:10,275 - INFO - Benign testing samples: 500
2024-12-30 16:11:10,277 - INFO - Malignant testing samples: 500
2024-12-30 16:11:10,277 - INFO - Total training samples: 9210
2024-12-30 16:11:10,282 - INFO - Total testing samples: 1000


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# statquest explanation video
# https://www.youtube.com/watch?v=HGwBXDKFk9I



![Architecture](diagram.png)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from melanomaCNN import MelanomaCNN
# from net_class import Net

# Constants
IMG_SIZE = 224  # Image dimensions (50x50 pixels)
BATCH_SIZE = 32 # Number of samples per batch
LEARNING_RATE = 0.0001  # Learning rate for the optimizer
EPOCHS = 2  # Number of training epochs

# Load training data
# Loading the data
data = np.load('melanoma_dataset.npz')
train_images = data['train_images']
train_labels = data['train_labels']
test_images = data['test_images']
test_labels = data['test_labels']
# training_data = np.load("melanoma_training_data.npy", allow_pickle=True)

# Prepare input and label tensors
train_X = torch.Tensor([item for item in train_images]) / 255.0  # Normalize pixel values to [0, 1]
train_y = torch.Tensor([item for item in train_labels])  # One-hot encoded labels

# Initialize model, optimizer, and loss function
net = MelanomaCNN()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
loss_function = nn.MSELoss()  # Mean Squared Error Loss

# Training loop
for epoch in range(EPOCHS):
    print(f"Starting epoch {epoch + 1}/{EPOCHS}")
    for i in range(0, len(train_X), BATCH_SIZE):

        # Prepare batches
        batch_X = train_X[i:i + BATCH_SIZE].view(-1, 1, IMG_SIZE, IMG_SIZE)
        batch_y = train_y[i:i + BATCH_SIZE]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = net(batch_X)

        # Compute loss
        loss = loss_function(outputs, batch_y)

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        # Print progress
        progress = (i + BATCH_SIZE) / len(train_X) * 100
    
        print(f"Epoch {epoch + 1}/{EPOCHS}, Progress: {progress:.2f}% - Loss: {loss.item():.4f}")

# Save the trained model
model_path = "Models/saved_model.pth"
torch.save(net.state_dict(), model_path)
print(f"Model saved to {model_path}")


  train_X = torch.Tensor([item for item in train_images]) / 255.0  # Normalize pixel values to [0, 1]


Starting epoch 1/2
Epoch 1/2, Progress: 0.35% - Loss: 0.2512
Epoch 1/2, Progress: 0.69% - Loss: 0.3284
Epoch 1/2, Progress: 1.04% - Loss: 0.2704
Epoch 1/2, Progress: 1.39% - Loss: 0.2615
Epoch 1/2, Progress: 1.74% - Loss: 0.2563
Epoch 1/2, Progress: 2.08% - Loss: 0.2535
Epoch 1/2, Progress: 2.43% - Loss: 0.2466
Epoch 1/2, Progress: 2.78% - Loss: 0.2478
Epoch 1/2, Progress: 3.13% - Loss: 0.2445
Epoch 1/2, Progress: 3.47% - Loss: 0.2373
Epoch 1/2, Progress: 3.82% - Loss: 0.2388
Epoch 1/2, Progress: 4.17% - Loss: 0.2328
Epoch 1/2, Progress: 4.52% - Loss: 0.2178
Epoch 1/2, Progress: 4.86% - Loss: 0.2649
Epoch 1/2, Progress: 5.21% - Loss: 0.2212
Epoch 1/2, Progress: 5.56% - Loss: 0.2214
Epoch 1/2, Progress: 5.91% - Loss: 0.2270
Epoch 1/2, Progress: 6.25% - Loss: 0.1989
Epoch 1/2, Progress: 6.60% - Loss: 0.2317
Epoch 1/2, Progress: 6.95% - Loss: 0.2014
Epoch 1/2, Progress: 7.30% - Loss: 0.2027
Epoch 1/2, Progress: 7.64% - Loss: 0.2368
Epoch 1/2, Progress: 7.99% - Loss: 0.2243
Epoch 1/2, Prog

In [43]:
train_X[1].shape

torch.Size([224, 224])

In [4]:
net.eval()

correct = 0
total = 0
with torch.no_grad():
    for i in range(0, len(train_X), BATCH_SIZE):
        batch_X = train_X[i:i + BATCH_SIZE].view(-1, 1, IMG_SIZE, IMG_SIZE)
        batch_y = train_y[i:i + BATCH_SIZE]

        outputs = net(batch_X)
        predicted = torch.argmax(outputs, dim=1)
        labels = torch.argmax(batch_y, dim=1)

        correct += (predicted == labels).sum().item()
        total += batch_y.size(0)

accuracy = (correct / total) * 100
print(f"Training Accuracy: {accuracy:.2f}%")

Training Accuracy: 85.52%


In [5]:
model_path = 'Models/saved_model.pth'

net.load_state_dict(torch.load(model_path))
net.eval()
print(f"Model loaded from {model_path}")

Model loaded from Models/saved_model.pth


In [6]:
test_X = torch.Tensor([item for item in test_images]) / 255.0  # Normalize pixel values to [0, 1]
test_y = torch.Tensor([item for item in test_labels])  # One-hot encoded labels


In [7]:
net.eval()

correct = 0
total = 0
with torch.no_grad():
    for i in range(0, len(test_X), BATCH_SIZE):
        batch_X = test_X[i:i + BATCH_SIZE].view(-1, 1, IMG_SIZE, IMG_SIZE)
        batch_y = test_y[i:i + BATCH_SIZE]

        outputs = net(batch_X)
        predicted = torch.argmax(outputs, dim=1)
        labels = torch.argmax(batch_y, dim=1)

        correct += (predicted == labels).sum().item()
        total += batch_y.size(0)

accuracy = (correct / total) * 100
print(f"Testing Accuracy: {accuracy:.2f}%")


Testing Accuracy: 85.20%


In [8]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [9]:
from torchsummary import summary

# model = model.to('cuda')  # Move to GPU if available
summary(net, input_size=(1, 224, 224))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 220, 220]             832
            Conv2d-2         [-1, 64, 106, 106]          51,264
            Conv2d-3          [-1, 128, 49, 49]         204,928
            Linear-4                  [-1, 512]      37,749,248
            Linear-5                    [-1, 2]           1,026
Total params: 38,007,298
Trainable params: 38,007,298
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.19
Forward/backward pass size (MB): 19.65
Params size (MB): 144.99
Estimated Total Size (MB): 164.83
----------------------------------------------------------------
