In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml

In [2]:
import torch
# Collect the name of every CUDA device PyTorch reports, then print one per line.
# When the device count is zero the list is empty and nothing is printed.
cuda_device_names = [
    torch.cuda.get_device_properties(index).name
    for index in range(torch.cuda.device_count())
]
for device_name in cuda_device_names:
    print(device_name)

In [3]:
# Fetch MNIST from OpenML; as_frame=False returns NumPy arrays rather than DataFrames.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='liac-arff')

# Each row of `data` is one already-flattened image; `target` holds its label.
# X.shape is (70000, 784).
X, y = mnist.data, mnist.target

# The first 60000 rows are the training set, the remainder is the test set.
X_train, X_test = X[:60000], X[60000:]  # X_train.shape is (60000, 784)

# Labels are converted to integers as they are sliced.
y_train = y[:60000].astype(int)
y_test = y[60000:].astype(int)

In [8]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

def kmeans_cosine(X, k):
    """Cluster the rows of ``X`` into ``k`` groups after scaling each row to unit length.

    KMeans itself works with Euclidean distance; running it on unit-length
    rows is how this notebook approximates clustering by cosine distance.

    Args:
        X: 2-D array-like of samples, one row per sample.
        k: Number of clusters to fit.

    Returns:
        A tuple ``(labels, centroids)`` taken from the fitted model's
        ``labels_`` and ``cluster_centers_`` attributes.
    """
    unit_rows = normalize(X)

    # Fixed random_state so repeated runs use the same initialisation seed.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(unit_rows)

    return model.labels_, model.cluster_centers_


In [6]:
def cluster_consistency(labels, y_train, k):
    """Mean, over clusters, of the share of points that carry the cluster's majority class.

    For cluster ``i`` containing ``N_i`` points, of which ``m_i`` have the most
    frequent true class, the per-cluster score is ``Q_i = m_i / N_i``. The
    function returns the average of ``Q_i`` over all non-empty clusters.

    Args:
        labels: Cluster index assigned to each sample (array-like, values in
            ``range(k)``).
        y_train: True class of each sample as non-negative integers, aligned
            with ``labels``. Despite the name, test labels are passed here too.
        k: Number of clusters; cluster indices ``0 .. k-1`` are inspected.

    Returns:
        The mean per-cluster consistency. Clusters that received no samples
        are skipped and do not count toward the mean; when every cluster is
        populated this equals the sum of ``Q_i`` divided by ``k``.

    Raises:
        ValueError: If none of the ``k`` clusters contains a sample.
    """
    # Coerce so that `labels == i` is always an element-wise mask, even when
    # plain Python lists are passed in.
    labels = np.asarray(labels)
    y_train = np.asarray(y_train)

    purities = []
    for i in range(k):
        cluster_labels = y_train[labels == i]
        Ni = cluster_labels.size

        # A centroid can end up with no assigned samples (e.g. when a model
        # fitted on train data is applied to a smaller test set). Taking the
        # max of an empty bincount would raise, so skip such clusters.
        if Ni == 0:
            continue

        class_counts = np.bincount(cluster_labels)
        mi = np.max(class_counts)
        purities.append(mi / Ni)

    if not purities:
        raise ValueError("cluster_consistency: no cluster contains any sample")

    Q = 0
    for Qi in purities:
        Q += Qi
    Q /= len(purities)

    return Q

# Training set: print the cluster-consistency score for each candidate number of clusters.

k_values = [5, 10, 20, 40, 200]

for k in k_values:
    labels, centroids = kmeans_cosine(X_train, k) # centroids are returned but not used in this cell
    Q = cluster_consistency(labels, y_train, k)
    print(f"Consistency for k={k}: {Q}")



Consistency for k=5: 0.5348332465334867




Consistency for k=10: 0.6470038256087342




Consistency for k=20: 0.7456912065105913




Consistency for k=40: 0.8170488682569534




Consistency for k=200: 0.9181505413656041


In [9]:
# Test set: fit on the training images only, then score the clusters on held-out images.
k_values = [5, 10, 20, 40, 200]
X_train_norm, X_test_norm = normalize(X_train), normalize(X_test)

for k in k_values:
    # k-means on the unit-normalized training rows
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_train_norm)

    # Apply the fitted model to the normalized test rows
    labels_test = kmeans.predict(X_test_norm)

    # Consistency of the test assignments against the true test labels
    Q_test = cluster_consistency(labels_test, y_test, k)
    print(f"Consistency for test data with k={k}: {Q_test}")



Consistency for test data with k=5: 0.5369658666209592




Consistency for test data with k=10: 0.6493407470345633




Consistency for test data with k=20: 0.7486155096417354




Consistency for test data with k=40: 0.8235276208242197




Consistency for test data with k=200: 0.9198500728384915


In [11]:
#Use another distance measure for evaluating the cluster consistency
def cluster_internal_distance(X, labels, centroids):
    """Sum of squared Euclidean distances from every point to its cluster's centroid.

    Computes ``J = sum_i sum_{x in cluster i} ||x - centroids[i]||^2``. Smaller
    values mean the points sit closer to their assigned centroids.

    Args:
        X: 2-D array-like of samples, one row per sample.
        labels: Cluster index of each row of ``X`` (any array-like).
        centroids: 2-D array-like whose row ``i`` is the centroid of cluster ``i``.

    Returns:
        The total within-cluster squared distance as a float. Clusters with no
        assigned points contribute 0; with no centroids the result is 0.0.
    """
    # Work in floating point so integer inputs cannot wrap around on subtraction.
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    # With a plain list, `labels == i` would be a single bool instead of a
    # per-row mask and the wrong rows would be selected.
    labels = np.asarray(labels)

    J = 0.0
    for i, centroid in enumerate(centroids):
        offsets = X[labels == i] - centroid
        # Sum the squared components directly instead of taking a norm
        # (square root) and squaring it again.
        J += float(np.sum(offsets * offsets))
    return J

for k in k_values:
    # k-means on the unit-normalized training rows
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_train_norm)
    labels, centroids = kmeans.labels_, kmeans.cluster_centers_

    # Within-cluster squared distance of the training points (smaller is better)
    J = cluster_internal_distance(X_train_norm, labels, centroids)
    print(f"Internal distance for k={k}: {J}")




Internal distance for k=5: 29785.909253899536




Internal distance for k=10: 26443.18348876443




Internal distance for k=20: 23562.846742489706




Internal distance for k=40: 21209.92920703419




Internal distance for k=200: 16937.056012729696


In [13]:
# Repeat the clustering evaluation after projecting the unit-normalized images
# onto a few principal components.
# random_state is fixed so that a randomized SVD solver, if PCA selects one,
# gives the same projection on every run.
pca = PCA(n_components=5, random_state=0)  # You can adjust the number of components

# Normalize both splits here so the cell does not rely on variables left over
# from an earlier cell.
X_train_norm = normalize(X_train)
X_test_norm = normalize(X_test)

# Fit the projection on the training data only, then apply it to the test data.
# Neither projection depends on k, so both are computed once, outside the loop.
X_train_pca = pca.fit_transform(X_train_norm)
X_test_pca = pca.transform(X_test_norm)

k_values = [5, 10, 20, 40, 200]

for k in k_values:
    # k-means in the PCA space. The projected rows are no longer unit length,
    # so this is ordinary Euclidean k-means rather than a cosine approximation.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X_train_pca)

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    # Training-set scores
    Q_train = cluster_consistency(labels, y_train, k)
    print(f"Consistency for train data with k={k}: {Q_train}")
    J = cluster_internal_distance(X_train_pca, labels, centroids)
    print(f"Internal distance for train k={k}: {J}")

    # Apply the k-means model to the projected test data
    labels_test = kmeans.predict(X_test_pca)

    # Calculate the consistency for the test data
    Q_test = cluster_consistency(labels_test, y_test, k)
    print(f"Consistency for test data with k={k}: {Q_test}")

    # Internal distance for the test data. J is a sum over samples, so the
    # train (60000 points) and test (10000 points) values are not directly
    # comparable in magnitude.
    J = cluster_internal_distance(X_test_pca, labels_test, centroids)
    print(f"Internal distance for test k={k}: {J}")



Consistency for train data with k=5: 0.4636483374321534
Internal distance for train k=5: 5789.59061535925
Consistency for test data with k=5: 0.4668687781581492
Internal distance for test k=5: 965.9782852643928




Consistency for train data with k=10: 0.6001417286376383
Internal distance for train k=10: 3712.477430035757
Consistency for test data with k=10: 0.5991502338696233
Internal distance for test k=10: 626.9490124379665




Consistency for train data with k=20: 0.6654042604547693
Internal distance for train k=20: 2597.8075699815704
Consistency for test data with k=20: 0.680306140999449
Internal distance for test k=20: 437.0491594797717




Consistency for train data with k=40: 0.6976721160282737
Internal distance for train k=40: 1876.2938096451476
Consistency for test data with k=40: 0.6969698015121765
Internal distance for test k=40: 316.82006091035737




Consistency for train data with k=200: 0.7385425704488598
Internal distance for train k=200: 927.3869154714818
Consistency for test data with k=200: 0.7432973742491391
Internal distance for test k=200: 161.89169633718842


4. (5 pts) Which k value produces the best results? Explain. Can the results from cluster consistency be misleading? Explain. [HINT Intuitively, what k value should produce the best results on the MNIST dataset?]



k=200 produced the highest consistency value: 0.9181505413656041 on the training set and 0.9198500728384915 on the test set, so by this metric alone k=200 gives the best result. This is because as k increases, the size of each cluster (i.e., the number of points it contains) usually decreases. The proportion of the most common category in each cluster therefore tends to increase, which raises the consistency.
However, this result may be misleading. Although consistency is a useful indicator, it does not fully reflect the quality of the clusters. For example, if k is too large, the consistency may be high simply because each cluster contains only a few points. In the extreme case where k equals the total number of data points, every data point becomes its own cluster and the consistency reaches its maximum value of 1, but this does not mean that we have obtained a good clustering result.
For the MNIST dataset, intuitively, the best value of k should be 10, because the MNIST dataset contains 10 categories of handwritten digits (0 to 9). However, due to the distribution and variability of the data, the actual optimal value of k may be different.

5. (10 pts) What can you do to further validate your results if the cluster consistency metric is not working?
Can you use the objective function defined in the class to find out the internal cluster distance of the data points from the mean? How can this objective help determine any misleading clustering results?
Explain and demonstrate this method on the clustering results of the previous steps.



I will use the internal cluster distance, and additionally apply PCA to the dataset, to further validate the results. The internal cluster distance is the sum of squared distances between each point and the center of the cluster it is assigned to: $J = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2$. The smaller the value of this objective function, the closer each point in a cluster is to the center of that cluster, that is to say, the more compact the clusters are. Therefore, this objective function can help us identify possibly misleading clustering results. For example, if the internal distance of a cluster is large, it may mean that the cluster contains multiple different sub-clusters, so we may need to increase the value of k to get better clustering results.
When k=200, the internal distance reached its minimum value of 16937.056012729696, so by this objective k=200 again produced the best result. However, this does not mean that k=200 is the optimal number of clusters: the internal distance tends to keep decreasing as k grows, and a k value that is too large leads to over-clustering.
After applying PCA, k=200 still gives the best results (see the results in the Jupyter notebook). I notice that if k equals the number of data points, the cluster consistency will be 1 (the highest score) and the internal distance will be 0 (the lowest score), which means the data is over-clustered. Besides, the internal distance drops more when going from k=5 to k=10 than from k=10 to k=20. Therefore, setting a threshold on the internal distance (or on how much it decreases between successive k values) would be a feasible way to choose the best-fitting k.
