# Skin Cancer Classification kNN Analysis

Importing required libraries and splitting features and labels as well as training and test data. In the dataset it used its own split ratio, as 16800 training instance and 4192 test instance. Hence, the split ratio is 0.8 for training data and 0.2 test data. From that we will split the training data into 0.7 train, 0.1 validation.

In [1]:
import pandas as pd
import numpy as np


def load_dataset():

    splits = {
        "train": "balanced_skin_cancer_dataset_train_2.parquet",
        "test": "balanced_skin_cancer_dataset_test.parquet",
    }
    # Load train dataset
    train_df = pd.read_parquet(
        "hf://datasets/akinsanyaayomide/skin_cancer_dataset_balanced_labels/"
        + splits["train"]
    )
    train_arr = train_df.values
    train_x = train_arr[:, :-1]
    train_y = train_arr[:, -1]

    # Load test dataset
    test_df = pd.read_parquet(
        "hf://datasets/akinsanyaayomide/skin_cancer_dataset_balanced_labels/"
        + splits["test"]
    )
    test_arr = test_df.values
    test_x = test_arr[:, :-1]
    test_y = test_arr[:, -1]

    return train_x, train_y, test_x, test_y

Test for loading the dataset.

In [2]:
train_x, train_y, test_x, test_y = load_dataset()

print("Training dataset X shape:", train_x.shape)
print("Training dataset Y shape:", train_y.shape)
print("Test dataset X shape:", test_x.shape)
print("Test dataset Y shape:", test_y.shape)
print("An instance in the training dataset:", train_x[0], train_y[0])
print("An instance in the test dataset:", test_x[0], test_y[0])

  from .autonotebook import tqdm as notebook_tqdm


Training dataset X shape: (16800, 1)
Training dataset Y shape: (16800,)
Test dataset X shape: (4192, 1)
Test dataset Y shape: (4192,)
An instance in the training dataset: [{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xdb\x00C\x01\t\t\t\x0c\x0b\x0c\x18\r\r\x182!\x1c!22222222222222222222222222222222222222222222222222\xff\xc0\x00\x11\x08\x00\xe0\x00\xe0\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x8

Apply transformations to artificially expand the dataset. For this case, we applied rotation.

In [3]:
import cv2


# Apply rotation to the training image dataset to artificially increase the size of the dataset
# Images are hold as byte arrays in the dataset
def rotate_images(images, angle=90):
    """
    Rotate images by the given angle.
    Args:
    images: numpy array of images(bytes, path)
    angle: angle to rotate the images by
    Returns:
    numpy array of rotated images
    """
    rotated_images = []
    for image in images:
        image_bytes = image[0]["bytes"]
        image_path = image[0]["path"]

        # Rotate the byte array image
        nparr = np.frombuffer(image_bytes, np.uint8)
        img_np = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        img_np = cv2.rotate(img_np, cv2.ROTATE_90_CLOCKWISE * (angle // 90))
        _, img_encoded = cv2.imencode(".jpg", img_np)
        rotated_images.append([{"bytes": img_encoded.tobytes(), "path": image_path}])

    return np.array(rotated_images)


print(
    "Applying rotation to the training dataset to artificially increase the size of the dataset..."
)

# Double the size of the training dataset by rotating the images by 90(first 1/3), 180(second 1/3), and 270(third 1/3) degrees.
# Split the training dataset into 3 equal parts
train_x_1, train_x_2, train_x_3 = np.array_split(train_x, 3)
train_y_1, train_y_2, train_y_3 = np.array_split(train_y, 3)
train_x_1_rotated = rotate_images(train_x_1, angle=90)
train_x_2_rotated = rotate_images(train_x_2, angle=180)
train_x_3_rotated = rotate_images(train_x_3, angle=270)

print(train_x_1_rotated)
print("Training dataset Y shape after rotation:", train_y_1.shape)

# Concatenate the rotated images with the original training dataset
train_x = np.concatenate(
    (train_x, train_x_1_rotated, train_x_2_rotated, train_x_3_rotated)
)
train_y = np.concatenate((train_y, train_y_1, train_y_2, train_y_3))

print("Training dataset X shape after rotation:", train_x.shape)
print("Training dataset Y shape after rotation:", train_y.shape)
print("Test dataset X shape:", test_x.shape)
print("Test dataset Y shape:", test_y.shape)

Applying rotation to the training dataset to artificially increase the size of the dataset...
[[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xc0\x00\x11\x08\x00\xe0\x00\xe0\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\

In [4]:
def train_val_split(train_x, train_y, val_size=0.125, random_state=464):
    """
    Manually splits the training data into a training and validation set.

    Parameters:
    -----------
    train_x : np.ndarray
        Feature matrix of the training data.

    train_y : np.ndarray
        Labels for the training data.

    val_size : float
        Proportion of the training data to be used as validation set.

    random_state : int
        Random seed for reproducibility.

    Returns:
    --------
    X_train, X_val, y_train, y_val : np.ndarray
        Training and validation splits of the features and labels.
    """
    np.random.seed(random_state)

    indices = np.arange(train_x.shape[0])
    np.random.shuffle(indices)

    split_index = int((1 - val_size) * len(indices))

    X_train, X_val = train_x[indices[:split_index]], train_x[indices[split_index:]]
    y_train, y_val = train_y[indices[:split_index]], train_y[indices[split_index:]]

    return X_train, X_val, y_train, y_val

In [5]:
def bytes_to_vector(byte_data):
    """
    Converts a sequence of bytes into a numeric vector.

    Parameters:
    -----------
    byte_data : A byte sequence representing each row of dataset contained after 'byte:' dict.

    Returns:
    --------
    np.ndarray
        A NumPy array of type uint8, representing the byte data as a vector.
    """
    return np.frombuffer(byte_data, dtype=np.uint8)


def calc_distance(instance_1, instance_2):
    # Turn the byte data of the iamge to vector and normalize
    vector_1 = bytes_to_vector(instance_1) / 255.0  # Scale to [0, 1]
    vector_2 = bytes_to_vector(instance_2) / 255.0  # Scale to [0, 1]

    min_len = min(len(vector_1), len(vector_2))
    vector_1, vector_2 = vector_1[:min_len], vector_2[:min_len]

    # Compute Norm
    distance = np.linalg.norm(vector_1 - vector_2)
    return distance

In [6]:
def get_neighbors(train_X, train_Y, test_instance, k):
    """
    Finds the k nearest neighbors of a given test instance in the training data.

    Parameters:
    -----------
    train_X : The feature matrix of the training data.

    train_Y : The target labels corresponding to the training data.

    test_instance : The feature vector of the test instance for which neighbors are being found.

    k : int
        The number of nearest neighbors to consider when making a prediction.

    Returns:
    --------
    sorted_classes[:k] : The class labels of the k nearest neighbors to the test instance, sorted by distance.
    """
    distances = []
    for train_instance in train_X:
        distance = calc_distance(train_instance, test_instance)
        distances.append(distance)
    distances = np.array(distances)
    sorted_indexes = np.argsort(distances)
    sorted_classes = train_Y[sorted_indexes]
    return sorted_classes[:k]

In [7]:
def classify(neighbors_classes):
    """
    Determines the most occuring label among the neighboring data points.

    Parameters:
    -----------
    neighbors_classes : list
        A list of class labels for the nearest neighbors of a given test point.

    Returns:
    --------
    most_voted_class : object
        The class label with the highest number of votes among the neighbors.
        If there is a tie, the first encountered class with the highest count is returned.
    """
    class_votes = {cls: 0 for cls in neighbors_classes}
    for cls in neighbors_classes:
        class_votes[cls] += 1
    most_vote_count = 0
    most_voted_class = None
    for cls in class_votes:
        if class_votes[cls] > most_vote_count:
            most_voted_class = cls
            most_vote_count = class_votes[cls]
    return most_voted_class

In [8]:
class KNN:
    """
    A simple implementation of the K-Nearest Neighbors (KNN) classifier.

    Attributes:
    -----------
    n_neighbors : int
        The number of nearest neighbors to consider when making predictions.

    Methods:
    --------
    fit(train_x, train_y):
        Stores the training data for future predictions.

    predict(test_x):
        Predicts labels for each data point in the test set by identifying the most common label among
        the nearest neighbors in the training data.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, train_x, train_y):
        self.train_X = train_x
        self.train_y = train_y

    def predict(self, test_x):
        predictions = []
        for x in test_x:
            neighbors_classes = get_neighbors(
                self.train_X, self.train_y, x, self.n_neighbors
            )
            predicted_label = classify(neighbors_classes)
            predictions.append(predicted_label)
        return predictions

Testing kNN for the image data. We first load the dataset and set training data and test data as well as features and labels. Then we split the training data 0.7 train and 0.1 validation approximately. Then from k = 1, 3, ..., 9 we choose the best three k values as our metric for the performances that will be used on the test data.

In [9]:
train_x, train_y, test_x, test_y = load_dataset()
X_train, X_val, y_train, y_val = train_val_split(train_x, train_y)

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
k_values = range(1, 10, 2)
validation_scores = {}

test_y = test_y.astype(int)
y_val = y_val.astype(int)

for k in k_values:
    knn = KNN(n_neighbors=k)
    knn.fit(X_train, y_train)
    val_pred_y = knn.predict(X_val)

    val_accuracy = accuracy_score(y_val, val_pred_y)
    validation_scores[k] = val_accuracy
    print(f"Validation Accuracy for K={k}: {val_accuracy * 100:.3f}%")

top_k = sorted(validation_scores, key=validation_scores.get, reverse=True)[:3]
print("\nTop (3) K values based on validation set:", top_k)


for k in top_k:
    knn = KNN(n_neighbors=k)
    knn.fit(train_x, train_y)
    pred_y = knn.predict(test_x)

    test_accuracy = accuracy_score(test_y, pred_y)
    print("\nK={} | Test Accuracy: {:.3f}%".format(k, test_accuracy * 100))

    conf_matrix = confusion_matrix(test_y, pred_y)
    print("Confusion Matrix for K = {}:\n".format(k), conf_matrix)

    print(
        "Classification Report for K = {}:\n".format(k),
        classification_report(test_y, pred_y),
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=np.unique(test_y),
        yticklabels=np.unique(test_y),
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix for K = {}".format(k))
    plt.show()

Validation Accuracy for K=1: 12.857%
Validation Accuracy for K=3: 12.524%


KeyboardInterrupt: 