In [38]:
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean as edist, cosine as cdist
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance functions: each takes two 1-D numeric vectors and returns a scalar
# dissimilarity (smaller means more alike).
def edist_calculation(k, l):
    """Return the Euclidean distance between vectors ``k`` and ``l``."""
    return edist(k, l)


def cosine_similaritycheck(k, l):
    """Return the cosine distance ``1 - (k . l) / (|k| * |l|)``.

    Despite the name, the value is a distance: 0 for vectors pointing the same
    way, 1 for orthogonal vectors. A zero-norm input is not guarded against and
    leads to a division by zero.
    """
    return 1 - np.dot(k, l) / (np.linalg.norm(k) * np.linalg.norm(l))


def jaccard_similaritycheck(k, l):
    """Return ``1 - sum(min(k, l)) / sum(max(k, l))``.

    Despite the name, the value is a distance. When the sum of element-wise
    maxima is 0 the ratio is undefined and 0 is returned instead.
    """
    # Compute the denominator once; it is needed for both the guard and the ratio.
    max_total = np.maximum(k, l).sum()
    if max_total == 0:
        return 0
    return 1 - np.minimum(k, l).sum() / max_total



In [39]:
# K-means clustering with a pluggable distance function
def kmeans(data, n, dist_func, max_iter=500):
    """Cluster the rows of ``data`` into ``n`` groups with K-means.

    Each iteration assigns every row to its nearest center according to
    ``dist_func`` and then recomputes each center as the arithmetic mean of
    the rows assigned to it. Initial centers are ``n`` distinct rows chosen
    with ``np.random.choice``, so results vary between runs unless NumPy's
    global seed is fixed.

    Iteration stops at the first of:
      * the recomputed centers equal the current centers (converged);
      * the SSE of the current assignment exceeds the previous iteration's;
      * ``max_iter`` center updates have been performed.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one sample per row.
    n : int
        Number of clusters; must not exceed the number of rows.
    dist_func : callable
        ``dist_func(point, center) -> scalar`` distance.
    max_iter : int, optional
        Upper bound on the number of center updates.

    Returns
    -------
    tuple
        ``(clusters, centers, sse, iterations, elapsed_seconds)`` where
        ``clusters`` holds the center index assigned to each row (``None`` if
        no iteration ran) and ``elapsed_seconds`` is wall-clock time measured
        with ``time.time()``. On convergence ``sse`` is the sum of squared
        distances of the returned assignment. When stopping because the SSE
        increased, ``sse`` is the last value recorded before the increase.
        If ``max_iter`` is exhausted, ``sse`` and ``clusters`` belong to the
        centers used before the final update.
    """
    rand_indices = np.random.choice(data.shape[0], n, replace=False)
    centers = data[rand_indices]
    clusters = None  # stays None only when max_iter <= 0
    i = 0  # number of center updates performed
    prev_sse = None
    start_time = time.time()

    while i < max_iter:
        # Evaluate every point-to-center distance exactly once; both the
        # assignment and the SSE are derived from this matrix.
        distances = np.array([[dist_func(point, center) for center in centers]
                              for point in data])
        clusters = distances.argmin(axis=1)
        sse = np.sum(np.square(distances.min(axis=1)))

        # A cluster that received no points keeps its previous center; taking
        # the mean of an empty slice would inject NaNs into every later step.
        new_centers = np.array([
            data[clusters == idx].mean(axis=0) if np.any(clusters == idx) else centers[idx]
            for idx in range(n)
        ])

        if np.array_equal(centers, new_centers):
            # Converged: report the SSE that belongs to the returned assignment.
            prev_sse = sse
            break

        if prev_sse is not None and sse > prev_sse:
            break

        prev_sse = sse
        centers = new_centers
        i += 1

    end_time = time.time()
    return clusters, centers, prev_sse, i, end_time - start_time

In [40]:
# Feature matrix: one sample per CSV row.
# NOTE(review): read_csv treats the first row as a header by default; if these
# files have no header row, the first sample/label is silently dropped —
# confirm, and pass header=None if so.
data = pd.read_csv('data.csv').to_numpy()
# Ground-truth labels collapsed to a 1-D array.
labels = pd.read_csv('label.csv').to_numpy().flatten()

In [41]:
# Number of distinct ground-truth labels
uniqueclusters = np.unique(labels).size

In [42]:
def accuracycalc(clusters, labels):
    """Return the fraction of samples whose label equals their cluster's majority label.

    Each cluster id found in ``clusters`` is mapped to the most frequent label
    (after casting to ``int``) among its members; a sample counts as correct
    when its own label equals the label mapped to its cluster.
    """
    majority_of = {}
    for cluster_id in np.unique(clusters):
        # Labels of the members of this cluster, as a flat integer array
        member_labels = np.ravel(labels[clusters == cluster_id]).astype(int)
        majority_of[cluster_id] = np.bincount(member_labels).argmax()

    # One boolean per sample: does the cluster's majority label match it?
    matches = [majority_of[cluster_id] == true_label
               for cluster_id, true_label in zip(clusters, labels)]
    return sum(matches) / len(labels)

In [43]:
# Pair each distance function with the name used in the report
distance_functions = [
    (edist_calculation, 'Euclidean'),
    (cosine_similaritycheck, 'Cosine'),
    (jaccard_similaritycheck, 'Jaccard')
]

# Results keyed by distance-metric name
kmeans_results = {}

# Run K-means once per distance metric and record its statistics.
# A plain for-loop is used so that a StopIteration raised inside kmeans or
# accuracycalc is not mistaken for the end of the metric list.
for d_function, f_name in distance_functions:
    # kmeans returns five values; the last one is the elapsed wall-clock time
    clusters, centers, sse, iters, time_to_converge = kmeans(data, uniqueclusters, d_function)
    accuracy = accuracycalc(clusters, labels)
    kmeans_results[f_name] = {
        'Sum of Squared Errors': sse,
        'Iterations Completed': iters,
        'Accuracy': accuracy,
        'Time to Converge': time_to_converge
    }

# Print one summary line per metric
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")


Method: Euclidean, SSE: 25473898606.485294, Iterations: 76, Accuracy: 0.603060306030603
Method: Cosine, SSE: 682.2266686639317, Iterations: 33, Accuracy: 0.5937593759375938
Method: Jaccard, SSE: 4208.689080624973, Iterations: 1, Accuracy: 0.42824282428242827


In [47]:
# Q4: K-means that stops only when the centroids no longer change
# (or when max_iter is reached); the SSE-increase stop is not used here.
def kmeans(data, n, dist_func, max_iter=500):
    """Cluster the rows of ``data`` into ``n`` groups with K-means.

    Each iteration assigns every row to its nearest center according to
    ``dist_func`` and then recomputes each center as the arithmetic mean of
    the rows assigned to it. Initial centers are ``n`` distinct rows chosen
    with ``np.random.choice``, so results vary between runs unless NumPy's
    global seed is fixed.

    Iteration stops when the recomputed centers equal the current centers or
    after ``max_iter`` center updates, whichever comes first.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one sample per row.
    n : int
        Number of clusters; must not exceed the number of rows.
    dist_func : callable
        ``dist_func(point, center) -> scalar`` distance.
    max_iter : int, optional
        Upper bound on the number of center updates.

    Returns
    -------
    tuple
        ``(clusters, centers, sse, iterations)`` where ``clusters`` holds the
        center index assigned to each row (``None`` if no iteration ran). On
        convergence ``sse`` is the sum of squared distances of the returned
        assignment. If ``max_iter`` is exhausted, ``sse`` and ``clusters``
        belong to the centers used before the final update.
    """
    rand_indices = np.random.choice(data.shape[0], n, replace=False)
    centers = data[rand_indices]
    clusters = None  # stays None only when max_iter <= 0
    i = 0  # number of center updates performed
    prev_sse = None

    while i < max_iter:
        # Evaluate every point-to-center distance exactly once; both the
        # assignment and the SSE are derived from this matrix.
        distances = np.array([[dist_func(point, center) for center in centers]
                              for point in data])
        clusters = distances.argmin(axis=1)
        sse = np.sum(np.square(distances.min(axis=1)))

        # A cluster that received no points keeps its previous center; taking
        # the mean of an empty slice would inject NaNs into every later step.
        new_centers = np.array([
            data[clusters == idx].mean(axis=0) if np.any(clusters == idx) else centers[idx]
            for idx in range(n)
        ])

        if np.array_equal(centers, new_centers):
            # Converged: report the SSE that belongs to the returned assignment.
            prev_sse = sse
            break

        prev_sse = sse
        centers = new_centers
        i += 1

    return clusters, centers, prev_sse, i
# Feature matrix: one sample per CSV row.
# NOTE(review): read_csv treats the first row as a header by default; if these
# files have no header row, the first sample/label is silently dropped —
# confirm, and pass header=None if so.
data = pd.read_csv('data.csv').to_numpy()
# Ground-truth labels collapsed to a 1-D array.
labels = pd.read_csv('label.csv').to_numpy().flatten()
# Number of distinct ground-truth labels
uniqueclusters = np.unique(labels).size
def accuracycalc(clusters, labels):
    """Return the fraction of samples whose label equals their cluster's majority label.

    Each cluster id found in ``clusters`` is mapped to the most frequent label
    (after casting to ``int``) among its members; a sample counts as correct
    when its own label equals the label mapped to its cluster.
    """
    majority_of = {}
    for cluster_id in np.unique(clusters):
        # Labels of the members of this cluster, as a flat integer array
        member_labels = np.ravel(labels[clusters == cluster_id]).astype(int)
        majority_of[cluster_id] = np.bincount(member_labels).argmax()

    # One boolean per sample: does the cluster's majority label match it?
    matches = [majority_of[cluster_id] == true_label
               for cluster_id, true_label in zip(clusters, labels)]
    return sum(matches) / len(labels)
# Pair each distance function with the name used in the report
distance_functions = [
    (edist_calculation, 'Euclidean'),
    (cosine_similaritycheck, 'Cosine'),
    (jaccard_similaritycheck, 'Jaccard')
]

# Results keyed by distance-metric name
kmeans_results = {}

# Run K-means once per distance metric and record its statistics.
# A plain for-loop is used so that a StopIteration raised inside kmeans or
# accuracycalc is not mistaken for the end of the metric list.
for d_function, f_name in distance_functions:
    clusters, centers, sse, iters = kmeans(data, uniqueclusters, d_function)
    accuracy = accuracycalc(clusters, labels)
    kmeans_results[f_name] = {
        'Sum of Squared Errors': sse,
        'Iterations Completed': iters,
        'Accuracy': accuracy
    }

# Print one summary line per metric
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")


Method: Euclidean, SSE: 25406097880.991848, Iterations: 59, Accuracy: 0.5978597859785979
Method: Cosine, SSE: 692.3585331924694, Iterations: 61, Accuracy: 0.5978597859785979
Method: Jaccard, SSE: 3660.6903681275694, Iterations: 44, Accuracy: 0.6015601560156015
