In [132]:
import numpy as np
from keras.datasets import mnist
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

(X_train, y_train), (X_test, y_test) = mnist.load_data()


In [150]:
M = 1000

In [151]:
random_indices = np.random.choice(len(X_train), size=M, replace=False)

#selecting M=10000 randomly
selected_M_x = X_train[random_indices]
selected_M_y = y_train[random_indices]

#1nn classifier from sklearn
knn_classifier = KNeighborsClassifier(n_neighbors=1)

reshaped_X_train= X_train.reshape(60000,-1)
reshaped_X_test = X_test.reshape(10000,-1)

reshaped_M_x= selected_M_x.reshape(M,-1)

# Train the classifier on the training data
knn_classifier.fit(reshaped_M_x, selected_M_y)

# Make predictions on the test data
y_pred = knn_classifier.predict(reshaped_X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("1NN random Accuracy:", accuracy)


1NN random Accuracy: 0.8833


In [152]:
#Kmeans using 10 buckets method

# Create a dictionary to hold the buckets
buckets = {}
cluster_centers = []

kmeans_x_train = []
kmeans_y_train = []

# Iterate through each label and populate the corresponding bucket
for label in range(10):  # Assuming labels range from 0 to 9
    bucket_indices = np.where(y_train == label)[0]
    buckets[label] = reshaped_X_train[bucket_indices]

# Print the shapes of the resulting buckets
for label, bucket in buckets.items():
    print(f"Bucket for label {label}, shape: {bucket.shape}")

# Perform k-means clustering multiple times on different subsets of the dataset
k=M//10

for i in range(10):
    # Create a KMeans instance
    kmeans = KMeans(n_clusters= k, random_state=42,n_init="auto")
    
    # Fit the model to the subset of the data
    kmeans.fit(buckets[i])
    
    # Get cluster labels and cluster centers
    cluster_centers.append(kmeans.cluster_centers_)

kmeans_x_train = np.concatenate(cluster_centers, axis=0)
for i in range(10):
    kmeans_y_train.append([i] * cluster_centers[i].shape[0])
kmeans_y_train = np.concatenate(kmeans_y_train)
print(kmeans_y_train[0])

Bucket for label 0, shape: (5923, 784)
Bucket for label 1, shape: (6742, 784)
Bucket for label 2, shape: (5958, 784)
Bucket for label 3, shape: (6131, 784)
Bucket for label 4, shape: (5842, 784)
Bucket for label 5, shape: (5421, 784)
Bucket for label 6, shape: (5918, 784)
Bucket for label 7, shape: (6265, 784)
Bucket for label 8, shape: (5851, 784)
Bucket for label 9, shape: (5949, 784)
0


In [153]:
kmeans_buckets_1nn = KNeighborsClassifier(n_neighbors=1)
kmeans_buckets_1nn.fit(kmeans_x_train, kmeans_y_train)

y_pred = kmeans_buckets_1nn.predict(reshaped_X_test)
kmeans_buckets_accuracy = accuracy_score(y_test, y_pred)
print("Kmeans with buckets accuracy_1NN: ",kmeans_buckets_accuracy)

Kmeans with buckets accuracy_1NN:  0.9579


In [137]:
def majority_vote(M):
    kmeans = KMeans(n_clusters=M, random_state=0, n_init="auto").fit(train_X)

    clusters = []
    train_X_majority = []
    train_y_majority = []

    for _ in range(M):
        clusters.append([])
    for i in range(train_X.shape[0]):
        clusters[kmeans.labels_[i]].append(i)
    for clust in clusters:
        # clust contains indices of training data
        count = [0] * 10
        for j in clust:
            count[train_y[j]] += 1
        label = count.index(max(count)) # Majority vote. Break ties by the smallest label
        centroid = np.mean(train_X[clust], axis=0)
        train_X_majority.append(centroid)
        train_y_majority.append(label)
    train_X_majority = np.array(train_X_majority)
    train_y_majority = np.array(train_y_majority)
    assert(train_X_majority.shape[0] == M)
    assert(train_y_majority.shape[0] == M)
    neigh_majority_1nn = KNeighborsClassifier(n_neighbors=1)
    neigh_majority_1nn.fit(train_X_majority, train_y_majority)
    neigh_majority_1nn.score(test_X, test_y)

In [138]:
# 5. What if we do KMeans multiple times with different random seeds? 
# Since KMeans is random, I suspect averaging over these will give a better performance.
# Then we would have multiple copies of M/10 centroids. 
# What if we do another KMeans clustering over these?

# Do this for the 10 buckets, not the whole dataset, since KMeans on the whole dataset takes too long

In [139]:
k = M//10
cluster_centers_0 = []
kmeans_y_train_0 = []
for i in range(10):
    # Create a KMeans instance
    kmeans_random_seeds_0 = KMeans(n_clusters= k, random_state=0,n_init="auto")
    
    # Fit the model to the subset of the data
    kmeans_random_seeds_0.fit(buckets[i])
    
    # Get cluster labels and cluster centers
    cluster_centers_0.append(kmeans_random_seeds_0.cluster_centers_)
cluster_centers_0 = np.concatenate(cluster_centers_0, axis=0)
    
for i in range(10):
    kmeans_y_train_0.append([i] * cluster_centers_0[i].shape[0])
kmeans_y_train_0 = np.concatenate(kmeans_y_train_0)

In [140]:
cluster_centers_42 = []
kmeans_y_train_42 = []
for i in range(10):
    # Create a KMeans instance
    kmeans_random_seeds_42 = KMeans(n_clusters= k, random_state=42, n_init="auto")
    
    # Fit the model to the subset of the data
    kmeans_random_seeds_42.fit(buckets[i])
    
    # Get cluster labels and cluster centers
    cluster_centers_42.append(kmeans_random_seeds_42.cluster_centers_)

cluster_centers_42 = np.concatenate(cluster_centers_42, axis=0)    

for i in range(10):
    kmeans_y_train_42.append([i] * cluster_centers_42[i].shape[0])

kmeans_y_train_42 = np.concatenate(kmeans_y_train_42)

In [141]:
cluster_centers_88 = []
kmeans_y_train_88 = []

for i in range(10):
    # Create a KMeans instance
    kmeans_random_seeds_88 = KMeans(n_clusters= k, random_state=88, n_init="auto")
    
    # Fit the model to the subset of the data
    kmeans_random_seeds_88.fit(buckets[i])
    
    # Get cluster labels and cluster centers
    cluster_centers_88.append(kmeans_random_seeds_88.cluster_centers_)

cluster_centers_88 = np.concatenate(cluster_centers_88, axis=0) 
    
for i in range(10):
    kmeans_y_train_88.append([i] * cluster_centers_88[i].shape[0])

kmeans_y_train_88 = np.concatenate(kmeans_y_train_88)


In [142]:
x_train_randomseed = []
x_train_randomseed.append(cluster_centers_0)
x_train_randomseed.append(cluster_centers_42)
x_train_randomseed.append(cluster_centers_88)

x_train_randomseed = np.concatenate(x_train_randomseed, axis=0)

y_train_randomseed = []
y_train_randomseed.append(kmeans_y_train_0)
y_train_randomseed.append(kmeans_y_train_42)
y_train_randomseed.append(kmeans_y_train_88)

y_train_randomseed = np.concatenate(y_train_randomseed)

cluster_centers_total = []


# Create a KMeans instance
kmeans_random_seeds = KMeans(n_clusters= M, random_state=42, n_init="auto")
    
# Fit the model to the subset of the data
kmeans_random_seeds.fit(x_train_randomseed)
    
cluster_centers_total.append(kmeans_random_seeds.cluster_centers_)

cluster_centers_total = np.concatenate(cluster_centers_total, axis=0) 


In [143]:
cluster_centers_total.shape

(10000, 784)

In [144]:
kmeans_random_seeds.labels_.shape

(30000,)

In [145]:
kmeans_y_train_total = []
for i in range(10000):
    kmeans_y_train_total.append(y_train_randomseed[kmeans_random_seeds.labels_[i]])


kmeans_buckets_1nn = KNeighborsClassifier(n_neighbors=1)
kmeans_buckets_1nn.fit(cluster_centers_total, kmeans_y_train_total)

y_pred = kmeans_buckets_1nn.predict(reshaped_X_test)
print(y_test)
kmeans_randomseeds_accuracy = accuracy_score(y_test, y_pred)
print("Kmeans with random seeds combined accuracy_1NN: ",kmeans_randomseeds_accuracy)

[7 2 1 ... 4 5 6]
Kmeans with random seeds combined accuracy_1NN:  0.0996
