### MNIST Dataset

Parsing the MNIST dataset

In [1]:
import os
import codecs
import numpy as np
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
data_folder_path = os.environ.get("MNIST_DATA_PATH")
data_files = os.listdir(data_folder_path)
mnist_data_files = [file for file in data_files if file.endswith("ubyte")]

def convert_to_int(byts):
    integer = int(codecs.encode(byts, 'hex'), 16)
    return integer

dataset = {}
for file in mnist_data_files:
    print("Reading...", file)
    with open(data_folder_path + file, "rb") as f:
        data = f.read()
        type_of_data = convert_to_int(data[:4])
        length = convert_to_int(data[4:8])
        if type_of_data == 2051:
            category = "images"
            number_of_rows = convert_to_int(data[8:12])
            number_of_columns = convert_to_int(data[12:16])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
            parsed = parsed.reshape(length, number_of_rows, number_of_columns);
        
        if type_of_data == 2049:
            category = "labels"
            parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
            parsed = parsed.reshape(length)

        if length == 60000:
            set_type = "train"
        if length == 10000:
            set_type = "test"
    dataset[set_type + '_' + category] = parsed

print(dataset['train_images'][0, :, :])

Reading... t10k-images-idx3-ubyte
Reading... t10k-labels-idx1-ubyte
Reading... train-images-idx3-ubyte
Reading... train-labels-idx1-ubyte
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 2

In [3]:
train_images = dataset['train_images']
train_images_flattened = train_images.reshape(60000, -1)

In [6]:
def initialize_k_centroids(k):
    np.random.seed(96)
    cen = train_images_flattened.shape[0]
    centroids = np.random.choice(cen, k, replace=False)
    return train_images_flattened[centroids]


def calculate_distances(centroids):
    distances = np.linalg.norm(train_images_flattened[:, np.newaxis] - centroids, axis=2)
    return distances


def assign_clusters(centroids):
    distances = calculate_distances(centroids)
    return np.argmin(distances, axis=1)


def update_centroids(labels, k):
    new_centroids = np.zeros((k, train_images_flattened.shape[1]))
    for i in range(k):
        cluster_points = train_images_flattened[labels == i]
        if len(cluster_points) > 0:
            new_centroids[i] = cluster_points.mean(axis=0)
    return new_centroids


def k_means(k, max_iterations=100, tols=1e-4):
    centroids = initialize_k_centroids(k)

    for iterations in range(max_iterations):
        labels = assign_clusters(centroids)
        new_centroids = update_centroids(labels, k)

        if np.linalg.norm(new_centroids - centroids) < tols:
            print(f"Converged in {iterations + 1} iterations.")
            break

        centroids = new_centroids

    return centroids, labels



In [13]:
k = 10
centroids, labels = k_means(k)

print(centroids.shape)
print(centroids[:, :20])

print(labels.shape)
print(labels[:10])

Converged in 61 iterations.
(10, 784)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.02067368 0.04526822 0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.       

In [7]:
def calculate_total_variance(centroids, labels):
    total_variance = 0.0

    for k in range(centroids.shape[0]):
        cluster_points = train_images_flattened[labels == k]
        
        squared_distances = np.sum((cluster_points - centroids[k])**2, axis=1)
        
        total_variance += np.sum(squared_distances)

    return total_variance

In [11]:
variance_k_10 = calculate_total_variance(centroids, labels)
print("Value of objective function (variance) at k = 10 is", variance_k_10)

Value of objective function (variance) at k = 10 is 153760292292.6878


Applying KMeans at k = 5

In [12]:
k_5 = 5
centroids_5, labels_5 = k_means(k_5)

print(centroids_5.shape)
print(centroids_5[:, :20])

print(labels_5.shape)
print(labels_5[:10])

variance_k_5 = calculate_total_variance(centroids_5, labels_5)
print("Value of objective function (variance) at k = 5 is", variance_k_5)

Converged in 49 iterations.
(5, 784)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.01167099 0.04353464 0.02000741 0.00083364 0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        

Applying KMeans at k = 20

In [9]:
k_20 = 20
centroids_20, labels_20 = k_means(10)

print(centroids_20.shape)
# print(centroids_20[:, :20])

print(labels_20.shape)
# print(labels_20[:10])

# variance_k_20 = calculate_total_variance(centroids_20, labels_20)
# print("Value of objective function (variance) at k = 20 is", variance_k_20)

Converged in 61 iterations.
(10, 784)
(60000,)


In [17]:
print(centroids_20.shape)

(10, 784)


153760292292.6878
169452578273.97925
153760292292.6878

In [4]:
def calculate_purity(original_labels, kmeans_labels, k):
    mode_counts = 0
    for i in range(k):
        cluster_k = original_labels[kmeans_labels == i]
        if len(cluster_k) == 0:
            continue
        unique, counts = np.unique(cluster_k, return_counts=True)
        mode_c = np.max(counts)
        mode_counts += mode_c

    purity_index = mode_counts / 11314 #60000
    return purity_index


In [19]:
orig_labels = dataset['train_labels']

purity_index = calculate_purity(orig_labels, labels, 10)
print(purity_index)

0.6369833333333333


In [5]:
def calculate_gini(original_labels, kmeans_labels, k):
    total_gini = 0
    for i in range(k):
        cluster_k = original_labels[kmeans_labels == i]
        dp_in_cluster_k = cluster_k.shape[0]
        if dp_in_cluster_k == 0:
            continue
        unique, counts = np.unique(cluster_k, return_counts=True)
        cluster_ks_gini = 0
        for count in counts:
            cluster_ks_gini += ((count / dp_in_cluster_k) ** 2)
        gini_k = 1 - cluster_ks_gini
        gini_k = gini_k * dp_in_cluster_k
        total_gini += gini_k
    final = total_gini / 11314
    return final

In [21]:
mnist_gini_index = calculate_gini(orig_labels, labels, 10)
print(mnist_gini_index)

0.4954065677981258


In [7]:
from scipy.spatial.distance import cdist

In [8]:
def soft_k_means(main_dataset, k, beta, max_iters=100, threshold=1e-5):
    N, D = main_dataset.shape
    centroids = np.random.choice(N, k, replace=False)
    centroids = main_dataset[centroids]

    for i in range(max_iters):
        dist = cdist(main_dataset, centroids, 'euclidean')

        max_dist = np.max(-beta * dist, axis=1, keepdims=True)
        log_sum_exp = max_dist + np.log(np.sum(np.exp(-beta * dist - max_dist), axis=1, keepdims=True))
        
        responsibilities = np.exp(-beta * dist - log_sum_exp)
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)

        denominators = responsibilities.T.sum(axis=1, keepdims=True)
        denominators[denominators == 0] = 1e-10
        new_centroids = (responsibilities.T @ main_dataset) / denominators

        if np.linalg.norm(new_centroids - centroids) < threshold:
            print(f"Converged in {i + 1} iterations.")
            break
        centroids = new_centroids
    return centroids, responsibilities

In [9]:
def calculate_purity_soft(original_labels, responsibilities):
    n = len(original_labels)
    assigned_clusters = np.argmax(responsibilities, axis=1)

    mode_counts = 0
    k = responsibilities.shape[1]
    for i in range(k):
        cluster_k = original_labels[assigned_clusters == i]
        if len(cluster_k) == 0:
            continue
        unique, counts = np.unique(cluster_k, return_counts=True)
        mode_c = np.max(counts)
        mode_counts += mode_c

    purity_index = mode_counts / n
    return purity_index

In [10]:
def calculate_gini_soft(original_labels, responsibilities):
    n = len(original_labels)
    k = responsibilities.shape[1]

    total_gini = 0
    for i in range(k):
        cluster_probs = responsibilities[:, i]
        if np.sum(cluster_probs) == 0:
            continue
        
        weighted_counts = {}
        for label, prob in zip(original_labels, cluster_probs):
            if label not in weighted_counts:
                weighted_counts[label] = 0
            weighted_counts[label] += prob

        cluster_ks_gini = 0
        total_weight = np.sum(cluster_probs)
        for count in weighted_counts.values():
            cluster_ks_gini += (count / total_weight) ** 2
        
        gini_k = 1 - cluster_ks_gini
        gini_k = gini_k * total_weight
        total_gini += gini_k
    
    final = total_gini / n
    return final

In [11]:
orig_labels = dataset['train_labels']

In [14]:
for beta in [0.1, 1.0, 10.0]:
        centers, responsibilities = soft_k_means(train_images_flattened, 10, beta)
        print(f"Beta={beta}, Final Centers:\n", centers)
        p = calculate_purity_soft(orig_labels, responsibilities)
        print(f"Purity: {p}")
        g = calculate_gini_soft(orig_labels, responsibilities)
        print(f"Gini: {g}")

Beta=0.1, Final Centers:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Purity: 0.5906
Gini: 0.5503717216074996
Beta=1.0, Final Centers:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Purity: 0.5947333333333333
Gini: 0.5468512155258705
Beta=10.0, Final Centers:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Purity: 0.5774333333333334
Gini: 0.555427468337574


### Fashion MNIST dataset

In [4]:
path_to_fmnist_dataset = os.environ.get("FASHION_MNIST_DATA_PATH")
files = os.listdir(path_to_fmnist_dataset)
fmnist_files = [x for x in files if x.endswith("ubyte")]

fmnist_dataset = {}

def convert_to_int(byts):
    integer = int(codecs.encode(byts, 'hex'), 16)
    return integer

for file in fmnist_files:
    with open(path_to_fmnist_dataset + file, 'rb') as fd:
        fmnist_data = fd.read()

        category = convert_to_int(fmnist_data[:4])
        length = convert_to_int(fmnist_data[4:8])
        if category == 2051:
            category = "images"
            no_of_rows = convert_to_int(fmnist_data[8: 12])
            no_of_cols = convert_to_int(fmnist_data[12: 16])
            parsed = np.frombuffer(fmnist_data, dtype=np.uint8, offset=16)
            parsed = parsed.reshape(length, no_of_rows, no_of_cols)
        if category == 2049:
            category = "labels"
            parsed = np.frombuffer(fmnist_data, dtype=np.uint8, offset=8)
            parsed = parsed.reshape(length)
        if length == 60000:
            set_type = "train"
        if length == 10000:
            set_type = "test"
    fmnist_dataset[set_type + "_" + category] = parsed

print(fmnist_dataset['train_images'][:5])
print(fmnist_dataset.keys())

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
dict_keys(['test_images', 'test_labels', 'train_images', 'train_labels'])


In [5]:
fmnist_train_images = fmnist_dataset['train_images']
fmnist_train_images_flattened = fmnist_train_images.reshape(fmnist_train_images.shape[0], -1)

In [15]:
def initialize_k_centroids(flattened_images_dataset, k):
    np.random.seed(97)
    centroids = np.random.choice(flattened_images_dataset.shape[0], size=k, replace=False)
    return flattened_images_dataset[centroids]


def calculate_distances(flattened_images_dataset, centroids):
    reshaped_images_datset = flattened_images_dataset[:, np.newaxis]
    minus_centroids = reshaped_images_datset - centroids
    dist = np.linalg.norm(minus_centroids, axis=2)
    return dist


def reassign_clusters(flattened_image_dataset, centroids):
    distances = calculate_distances(flattened_image_dataset, centroids)
    cluster_labels = np.argmin(distances, axis=1)
    return cluster_labels


def recalculate_centroids(cluster_labels, flattened_images_dataset, k):
    new_centroids = np.zeros((k, flattened_images_dataset.shape[1]))
    for i in range(k):
        dps_in_cluster_k = flattened_images_dataset[cluster_labels == i]
        avg = np.mean(dps_in_cluster_k, axis=0)
        new_centroids[i] = avg
    return new_centroids

def general_k_means(k, flattened_images_dataset, max_iterations=100, threshold=1e-4):
    centroids = initialize_k_centroids(flattened_images_dataset, k)
    for iteration in range(max_iterations):
        cluster_labels = reassign_clusters(flattened_images_dataset, centroids)
        new_centroids = recalculate_centroids(cluster_labels, flattened_images_dataset, k)

        diff_in_centroids = centroids - new_centroids
        diff_in_centroids = np.linalg.norm(diff_in_centroids)
        if diff_in_centroids < threshold:
            print(f"Converged in {iteration + 1}th iteration.")
            break
        centroids = new_centroids

    return centroids, cluster_labels

In [17]:
fmnist_k_10 = 10
fmnist_centroids, fmnist_labels = general_k_means(fmnist_k_10, fmnist_train_images_flattened)

print("Completed")

print(fmnist_centroids.shape)
print(fmnist_centroids[:5, :20])

print(fmnist_labels.shape)
print(fmnist_labels[:10])


Converged in 72th iteration.
Completed
(10, 784)
[[1.81370644e-03 3.88651380e-03 2.35781837e-02 7.33255603e-02
  2.30081617e-01 2.89933929e-01 4.99287472e-01 1.40743620e+00
  3.19238243e+00 7.13499158e+00 1.37442674e+01 2.10883534e+01
  2.51936779e+01 2.29187719e+01 2.25083560e+01 2.50352377e+01
  2.55494235e+01 1.83579479e+01 1.04183184e+01 5.59709807e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.51287497e-03 4.51287497e-03 4.51287497e-03 4.51287497e-03
  5.84019113e-03 4.51287497e-03 2.99973454e-02 2.65463233e-02
  8.22936023e-03 4.51287497e-03 4.51287497e-03 4.51287497e-03
  3.05282718e-02 2.65463233e-02 4.51287497e-03 6.37111760e-03]
 [2.42336120e-04 6.66424331e-03 4.78613837e-02 2.21616382e-01
  4.71707258e-01 7.68569005e-01 1.96958682e+00 5.12686296e+00
  1.11101418e+01 2.12935902e+01 3.52104689e+01 5.08641706e+01
  6.48262450e+01 6.66003877e+01 6.96165031e+01 7.11936266e+01
  6.28343633e+01 4.49200291e+01 2.88854962e+01 1.60631286e+01]
 [0.00000000e+00 3

In [23]:
fmnist_purity = calculate_purity(fmnist_dataset['train_labels'], fmnist_labels, 10)
print(fmnist_purity)

0.5241166666666667


In [26]:
fmnist_gini = calculate_gini(fmnist_dataset['train_labels'], fmnist_labels, 10)
print(fmnist_gini)

0.5900829479702764


In [18]:
fmnist_k_5 = 5
fmnist_centroids_5, fmnist_labels_5 = general_k_means(fmnist_k_5, fmnist_train_images_flattened)

print("Completed")

print(fmnist_centroids_5.shape)
print(fmnist_centroids_5[:5, :20])

print(fmnist_labels_5.shape)
print(fmnist_labels_5[:10])

Converged in 64th iteration.
Completed
(5, 784)
[[1.34544231e-03 5.71812984e-03 4.49041372e-02 1.78187016e-01
  4.49545913e-01 6.47157753e-01 1.44542550e+00 3.76639758e+00
  8.24117053e+00 1.71300875e+01 3.13654558e+01 4.51570804e+01
  5.45450723e+01 5.37599226e+01 5.55428019e+01 5.75831652e+01
  5.39743525e+01 3.97001345e+01 2.45809788e+01 1.32411705e+01]
 [0.00000000e+00 0.00000000e+00 6.21007807e-04 4.52448545e-03
  4.16962385e-03 1.33073101e-03 1.24201561e-03 1.24201561e-03
  1.68559262e-03 3.54861604e-03 1.23314407e-02 2.41305891e-02
  1.99609652e-02 1.68559262e-02 2.49290277e-02 1.84528034e-02
  2.49290277e-02 1.54364798e-02 5.76650106e-03 3.63733144e-03]
 [2.11445331e-03 9.95839302e-03 6.55480527e-02 2.48755201e-01
  5.55419139e-01 1.00115954e+00 1.80328763e+00 4.51899598e+00
  1.06874702e+01 2.20622059e+01 4.20225087e+01 6.70118682e+01
  8.84813451e+01 8.72486870e+01 8.64341450e+01 9.09075097e+01
  8.22304754e+01 5.34299843e+01 2.96832413e+01 1.44903485e+01]
 [1.06541658e-04 1.

In [19]:
fmnist_k_20 = 20
fmnist_centroids_20, fmnist_labels_20 = general_k_means(fmnist_k_20, fmnist_train_images_flattened)

print("Completed")

print(fmnist_centroids_20.shape)
print(fmnist_centroids_20[:5, :20])

print(fmnist_labels_20.shape)
print(fmnist_labels_20[:10])

KeyboardInterrupt: 

### 20NG Dataset

In [38]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

newsgroups = fetch_20newsgroups(subset="train")
documents = newsgroups.data
document_labels = newsgroups.target

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
vectors = tfidf_vectorizer.fit_transform(documents).toarray()

print("documents", vectors.shape)
print("labels", document_labels.shape)


def initialize_centroids(data, k):
    indices = np.random.choice(data.shape[0], k, replace=False)
    return data[indices]


def assign_clusters(data, centroids):
    distances = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)


def update_centroids(data, labels, k):
    centroids = np.zeros((k, data.shape[1]))
    for i in range(k):
        cluster_points = data[labels == i]
        if cluster_points.shape[0] > 0:
            centroids[i] = np.mean(cluster_points, axis=0)
    return centroids


def kmeans(data, k, max_iters=100, tol=1e-4):
    centroids = initialize_centroids(data, k)
    for i in range(max_iters):
        labels = assign_clusters(data, centroids)
        new_centroids = update_centroids(data, labels, k)
        if np.allclose(centroids, new_centroids, atol=tol):
            print(f"Converged in {i + 1}th iteration")
            break
        centroids = new_centroids
    return labels, centroids


ng_k = 20
ng_labels, ng_centroids = kmeans(vectors, ng_k)

documents (11314, 1000)
labels (11314,)
Converged in 60th iteration


In [41]:
newsgroups_gini_index = calculate_gini(document_labels, ng_labels, 20)
print(newsgroups_gini_index)

0.8806129460379966


In [42]:
newsgroups_purity = calculate_purity(document_labels, ng_labels, 20)
print(newsgroups_purity)

0.2121265688527488
