In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "talk" plotting context to every figure drawn below.
sns.set_context("talk")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Initialise plotly's offline notebook mode.
# NOTE(review): the `py` alias is not referenced again in the visible cells — confirm it is still needed.
import plotly.offline as py
py.init_notebook_mode(connected=True)

In [None]:
# Class names used throughout the notebook. Each one is also the name of the
# sub-folder of ./data that load_images_dataset reads its images from.
labels = ("glioma", "meningioma", "notumor", "pituitary")
# Target size handed to cv2.resize for every loaded image.
img_dim = (512, 512)

In [None]:
import numpy as np
import os
import cv2

def load_images_dataset(label, num_images, img_dim):
    """Load a random sample of preprocessed grayscale images for one class.

    Files are read from ``<current working directory>/data/<label>``. Each
    sampled file is decoded as grayscale, resized to ``img_dim`` and min-max
    normalised to the 0-255 range.

    Parameters
    ----------
    label : str
        Name of the class sub-folder inside the ``data`` directory.
    num_images : int
        Number of distinct files to sample (sampling is without replacement).
    img_dim : tuple of int
        Target size, forwarded to ``cv2.resize`` as its ``dsize`` argument.

    Returns
    -------
    list of numpy.ndarray
        One preprocessed image per sampled file, in sampling order.

    Raises
    ------
    ValueError
        If the folder holds fewer than ``num_images`` usable files, or if a
        sampled file cannot be decoded as an image.
    """
    path = os.path.join(os.getcwd(), 'data', label)

    # Keep only regular, non-hidden files. Data folders used from notebooks
    # routinely pick up entries such as ``.ipynb_checkpoints`` or ``.DS_Store``
    # that cv2 cannot decode. Sorting matters too: os.listdir returns entries
    # in arbitrary order, so without it a seeded NumPy RNG would still pick
    # different files from run to run.
    file_names = sorted(
        name for name in os.listdir(path)
        if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
    )

    if num_images > len(file_names):
        raise ValueError(
            f"Requested {num_images} images but only {len(file_names)} "
            f"files are available in {path}"
        )

    # Draws from NumPy's global RNG; call np.random.seed beforehand for a
    # reproducible selection.
    random_select = np.random.choice(file_names, num_images, replace=False)

    images = []
    for f in random_select:
        img_path = os.path.join(path, f)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cv2.resize.
            raise ValueError(f"Could not read image file: {img_path}")
        img = cv2.resize(img, img_dim, interpolation=cv2.INTER_LINEAR)
        # Stretch intensities so every image spans the full 0-255 range.
        normalized_img = cv2.normalize(img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
        images.append(normalized_img)

    return images

In [None]:
# Sanity check: load 200 preprocessed 'glioma' images and display the second
# one (index 1) with a grayscale colormap.
images = load_images_dataset('glioma', 200, img_dim)
plt.imshow(images[1], cmap='gray')

In [None]:
def convert_images_to_np_array(img_list):
    """Flatten a list of images into a single 2-D array, one image per row.

    Parameters
    ----------
    img_list : sequence of numpy.ndarray
        Images that all contain the same number of pixels. The sequence and
        its elements are left untouched.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(img_list), n_pixels)`` where row ``i`` is image
        ``i`` flattened in row-major order. The dtype of the inputs is kept.
        An empty ``img_list`` yields an empty array of shape ``(0, 0)``.

    Raises
    ------
    ValueError
        If the images do not all flatten to the same length.
    """
    if len(img_list) == 0:
        return np.empty((0, 0))

    # Build new flattened rows instead of overwriting img_list[i]; the previous
    # in-place assignment silently replaced the caller's 2-D images.
    return np.stack([np.asarray(img).reshape(-1) for img in img_list])


In [None]:
from sklearn.utils import shuffle
def create_dataframe(num_images, img_dim, random_state=None):
    """Build one shuffled DataFrame of flattened images for every class.

    For each entry of the module-level ``labels`` tuple, ``num_images`` images
    are loaded with ``load_images_dataset``, flattened with
    ``convert_images_to_np_array`` and tagged with their class name.

    Parameters
    ----------
    num_images : int
        Number of images to load per class.
    img_dim : tuple of int
        Target image size forwarded to ``load_images_dataset``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle`` to make the row order
        reproducible. It only controls the shuffle; which files get sampled is
        decided inside ``load_images_dataset``. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per image: integer-named pixel columns followed by a final
        ``'label'`` column, with a fresh 0..n-1 index.
    """
    per_label_frames = []

    for label_name in labels:
        label_images = load_images_dataset(label_name, num_images, img_dim)   # load images
        pixel_matrix = convert_images_to_np_array(label_images)               # convert images to np arrays
        label_df = pd.DataFrame(pixel_matrix)                                 # one row per image
        label_df['label'] = label_name                                        # tag the whole group with its class
        per_label_frames.append(label_df)

    combined = pd.concat(per_label_frames, ignore_index=True)

    # drop=True discards the pre-shuffle index directly, so there is no
    # temporary 'index' column to remove afterwards.
    return shuffle(combined, random_state=random_state).reset_index(drop=True)

In [None]:
# Build the full dataset: 200 images per class, one flattened image per row.
df = create_dataframe(num_images=200, img_dim=img_dim)
print(df.shape)

# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell was silently discarded. Display both
# ends of the frame explicitly (`display` is provided by the IPython kernel).
display(df.head())
display(df.tail())

In [None]:
from sklearn.preprocessing import MinMaxScaler


def split_x_and_y(df):
    """Separate a dataset frame into scaled features and labels.

    The last column of ``df`` is treated as the label column and every column
    before it as a feature. The features are passed through a freshly created
    ``MinMaxScaler`` (default settings) fitted on this same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose final column holds the labels.

    Returns
    -------
    tuple
        ``(x_scaled, y)`` where ``x_scaled`` is the output of
        ``MinMaxScaler.fit_transform`` on the feature columns and ``y`` is the
        untouched label column.
    """
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    return MinMaxScaler().fit_transform(features), target

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA

# Split features and labels, then project the scaled pixel features onto a few
# principal components so K-means runs in a low-dimensional space.
x, y = split_x_and_y(df)
x_reduced = PCA(n_components=4, random_state=42).fit_transform(x)

# One cluster per class. Derived from `labels` so the cluster count cannot
# drift out of sync with the class list.
n_clusters = len(labels)

# n_init is pinned explicitly: scikit-learn changed its default in 1.4
# (10 -> 'auto'), so leaving it implicit gives version-dependent clusterings.
kmeans_model = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=42,
    max_iter=300,
    init='k-means++',
    algorithm='elkan',
)
kmeans_model.fit(x_reduced)

# Get cluster predictions
y_pred_clusters = kmeans_model.predict(x_reduced)

# Map clusters to actual labels by finding the most common label in each cluster.
# Several clusters may end up mapped to the same label.
cluster_to_label = {}
for cluster_id in range(n_clusters):
    mask = y_pred_clusters == cluster_id
    if not mask.any():
        # An empty cluster has no majority label (mode()[0] would raise); no
        # sample refers to it, so it needs no entry in the mapping.
        continue
    cluster_to_label[cluster_id] = y[mask].mode()[0]

# Map predictions to actual label names
y_pred_mapped = [cluster_to_label[cluster] for cluster in y_pred_clusters]

# Calculate accuracy. The cluster->label mapping above was chosen using the
# true labels, so this is an optimistic, purity-style score rather than a
# held-out classification accuracy.
accuracy = metrics.accuracy_score(y, y_pred_mapped)
print(f"K-means clustering accuracy: {accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import classification_report

# Create confusion matrix with rows/columns ordered as in `labels`
cm = confusion_matrix(y, y_pred_mapped, labels=labels)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('K-means Clustering Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Print classification report.
# `labels=` is passed alongside `target_names=` so each display name is tied to
# its own class; without it the names are paired with the sorted label values,
# which only matches when `labels` happens to be in sorted order.
# zero_division=0 keeps the reported value (0) for classes that no cluster was
# mapped to, without emitting an UndefinedMetricWarning for each of them.
print("\nClassification Report:")
print(classification_report(y, y_pred_mapped, labels=list(labels), target_names=labels, zero_division=0))

In [None]:
from matplotlib.patches import Patch

# Scatter plot of the samples in PCA space, coloured by the label that was
# assigned to each sample's cluster, with the K-means centroids on top.
fig, ax = plt.subplots(figsize=(12, 8))

# One fixed colour per label
color_map = {'glioma': 'red', 'meningioma': 'blue', 'notumor': 'green', 'pituitary': 'orange'}
colors = [color_map[label] for label in y_pred_mapped]

# Samples: first principal component on x, second on y
scatter = ax.scatter(x_reduced[:, 0], x_reduced[:, 1], c=colors, alpha=0.6, s=50)

# Centroids, taken from the fitted model (same PCA coordinates as the samples)
centers = kmeans_model.cluster_centers_
ax.scatter(
    centers[:, 0],
    centers[:, 1],
    c='black',
    s=300,
    alpha=0.8,
    marker='X',
    edgecolors='white',
    linewidths=2,
    label='Centroids',
)

# Hand-built legend: one colour patch per label plus a marker for the centroids
legend_elements = [Patch(facecolor=color_map[label], label=label) for label in labels]
legend_elements.append(
    plt.Line2D([0], [0], marker='X', color='w', markerfacecolor='black', markersize=15, label='Centroids')
)
ax.legend(handles=legend_elements, loc='best')

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title('K-means Clustering Results (PCA Visualization)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()