Skin cancer machine learning project

In [None]:
# Libraries

import os
import random
import shutil
import cv2
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from skimage import color, img_as_ubyte
from skimage.feature import hog, local_binary_pattern
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

In [None]:
# Configuration

# Fractions of each class used for the train / validation / test splits.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.7, 0.15, 0.15

# Google Drive locations: the raw images (one sub-folder per class) and the
# root under which the train/val/test folders are created.
IMAGES_DIR = "/content/drive/My Drive/Colab Notebooks/SkinCancerML/images"
DATASET_DIR = "/content/drive/My Drive/Colab Notebooks/SkinCancerML/data"

# Every model type is trained with every feature extractor.
MODEL_TYPE = ["SVM", "KNN", "RandomForest"]
FEATURE_EXTRACTOR = ["HOG", "LBP", "ColorHist"]

# Size handed to cv2.resize before any feature is extracted.
IMG_SIZE = (224, 224)

In [None]:
# Data shuffling: copy the raw images into train/val/test folders

drive.mount("/content/drive")

# Seed for the split shuffle, so rebuilding the folders from the same images
# always produces the same train/val/test assignment.
SPLIT_SEED = 42

required_dirs = ["train", "val", "test"]
# NOTE: only the three split roots are checked; the contents of an existing
# tree are not verified.
if all(os.path.exists(os.path.join(DATASET_DIR, d)) for d in required_dirs):
    print("[INFO]: train/val/test folders already exist")
else:
    print("[INFO]: Creating train/val/test folders...")
    for d in required_dirs:
        for category in ["benign", "malignant"]:
            os.makedirs(os.path.join(DATASET_DIR, d, category), exist_ok=True)

    # Dedicated generator: the split does not depend on (or disturb) the
    # state of the global `random` module.
    split_rng = random.Random(SPLIT_SEED)

    for category in ["benign", "malignant"]:
        # os.listdir returns names in arbitrary order, so sort first;
        # otherwise a seeded shuffle would still not be reproducible.
        imgs = sorted(f for f in os.listdir(os.path.join(IMAGES_DIR, category))
                      if f.lower().endswith((".jpg", ".jpeg", ".png")))
        split_rng.shuffle(imgs)

        total = len(imgs)
        train_end = int(total * TRAIN_RATIO)
        val_end   = train_end + int(total * VAL_RATIO)

        # The test split takes whatever remains after train and val;
        # TEST_RATIO itself is only reported below, not used for slicing.
        splits = {
            "train": imgs[:train_end],
            "val": imgs[train_end:val_end],
            "test": imgs[val_end:]
        }

        for split_name, split_imgs in splits.items():
            for img in split_imgs:
                shutil.copyfile(
                    os.path.join(IMAGES_DIR, category, img),
                    os.path.join(DATASET_DIR, split_name, category, img)
                )

    print("[INFO]: Image shuffling completed")
    print(f"[INFO]: train={TRAIN_RATIO*100}%, val={VAL_RATIO*100}%, test={TEST_RATIO*100}%")

In [None]:
# Functions

def extract_hog_features(img):
    """Return the HOG descriptor of an RGB image.

    The image is converted to grayscale with ``color.rgb2gray`` and passed to
    ``hog`` using 8x8-pixel cells, 2x2-cell blocks and ``L2-Hys`` block
    normalisation.
    """
    return hog(
        color.rgb2gray(img),
        block_norm='L2-Hys',
        cells_per_block=(2, 2),
        pixels_per_cell=(8, 8),
    )

def extract_lbp_features(img, P=8, R=1):
    """Return a normalised histogram of uniform LBP codes for an RGB image.

    Args:
        img: RGB image; it is converted to grayscale and then to uint8.
        P: Passed to ``local_binary_pattern`` as its ``P`` argument.
        R: Passed to ``local_binary_pattern`` as its ``R`` argument.

    Returns:
        Float array with ``P + 2`` entries (one per unit-width bin over
        ``0 .. P + 2``), divided by the total count.
    """
    lbp_codes = local_binary_pattern(
        img_as_ubyte(color.rgb2gray(img)),  # 0-255 uint8
        P,
        R,
        method="uniform",
    )
    bin_edges = np.arange(0, P + 3)
    counts, _ = np.histogram(lbp_codes.ravel(), bins=bin_edges, range=(0, P + 2))
    counts = counts.astype("float")
    # The small constant keeps the division defined even for an all-zero histogram.
    return counts / (counts.sum() + 1e-6)

def extract_color_hist(img):
    """Return a flattened, normalised 3-D colour histogram of a 3-channel image.

    Each channel is divided into 8 bins over the range [0, 256), giving
    8 * 8 * 8 = 512 values. The histogram is normalised with ``cv2.normalize``
    using its default settings.
    """
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]

    hist = cv2.calcHist([img], channels, None, bins_per_channel, value_ranges)
    normalised = cv2.normalize(hist, hist)
    return normalised.flatten()

def extract_features(img_path, extractor):
    """Load an image from disk and compute the requested feature vector.

    The image is read with OpenCV, resized to ``IMG_SIZE`` and converted from
    BGR to RGB before being handed to the selected extractor.

    Args:
        img_path: Path of the image file to load.
        extractor: One of ``"HOG"``, ``"LBP"`` or ``"ColorHist"``.

    Returns:
        The feature array produced by the selected extractor.

    Raises:
        ValueError: If the image cannot be read or ``extractor`` is unknown.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure (missing, unreadable or corrupt file) by
    # returning None instead of raising; fail here with the offending path
    # rather than with an opaque error from cv2.resize.
    if img is None:
        raise ValueError(f"[ERROR]: Could not read image: {img_path}")

    img = cv2.resize(img, IMG_SIZE)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    if extractor == "HOG":
        return extract_hog_features(img)
    elif extractor == "LBP":
        return extract_lbp_features(img)
    elif extractor == "ColorHist":
        return extract_color_hist(img)
    else:
        raise ValueError("[ERROR]: Unknown extractor selected")

def load_dataset(split, extractor):
    """Extract features for every image of one dataset split.

    Args:
        split: Name of the split folder under ``DATASET_DIR``.
        extractor: Feature extractor name forwarded to ``extract_features``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the per-image features and the
        matching labels (0 for "benign", 1 for "malignant").
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    features, labels = [], []

    for class_index, class_name in enumerate(["benign", "malignant"]):
        class_dir = os.path.join(DATASET_DIR, split, class_name)
        for filename in os.listdir(class_dir):
            # Skip anything that is not one of the accepted image types.
            if not filename.lower().endswith(image_suffixes):
                continue
            image_path = os.path.join(class_dir, filename)
            features.append(extract_features(image_path, extractor))
            labels.append(class_index)

    return np.array(features), np.array(labels)

In [None]:
# Train and evaluate every (model, feature extractor) combination

results = []

# Extracted features depend only on the extractor, not on the model, so the
# three splits are extracted once per extractor and reused for every model
# instead of being recomputed on each pass of the outer loop.
feature_cache = {}

for mod in MODEL_TYPE:
    for extractor in FEATURE_EXTRACTOR:
        print(f"[INFO]: Training with {mod} model and {extractor} feature extractor")
        if extractor not in feature_cache:
            feature_cache[extractor] = {
                split: load_dataset(split, extractor)
                for split in ("train", "val", "test")
            }
        X_train, y_train = feature_cache[extractor]["train"]
        X_val, y_val = feature_cache[extractor]["val"]
        X_test, y_test = feature_cache[extractor]["test"]
        print(f"[INFO]: Feature extraction completed: {X_train.shape[1]} features per image \n")

        # Model training
        if mod == "SVM":
            model = SVC(kernel='rbf', C=1.0, gamma='scale')
        elif mod == "KNN":
            model = KNeighborsClassifier(n_neighbors=5)
        elif mod == "RandomForest":
            model = RandomForestClassifier(n_estimators=100, random_state=42)
        else:
            raise ValueError("[ERROR]: Unknown model type selected")

        model.fit(X_train, y_train)

        # Evaluation on the test split only; the validation split is loaded
        # above but not used in this loop.
        y_test_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_test_pred)

        results.append({
            "Model": mod,
            "Extractor": extractor,
            "Accuracy": acc
        })

# Horizontal bar chart of test accuracy, best combination at the top
results_sorted = list(results)
results_sorted.sort(key=lambda entry: entry["Accuracy"], reverse=True)

labels = []
accuracies = []
for entry in results_sorted:
    labels.append(f"{entry['Model']}-{entry['Extractor']}")
    accuracies.append(entry['Accuracy']*100)  # fraction -> percent

fig, ax = plt.subplots(figsize=(10,6))
ax.barh(labels, accuracies, color='skyblue')
ax.set_xlabel("Accuracy (%)")
ax.set_title("(Model - Feature Extractor) Accuracy")
# barh draws the first entry at the bottom; flip so the highest accuracy is on top.
ax.invert_yaxis()
plt.show()
