In [1]:
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset into a DataFrame. The relative path is resolved against the
# kernel's current working directory, so the CSV must sit next to the notebook
# (or the working directory must be set accordingly).
data = pd.read_csv("Mall_Customers.csv")

In [3]:
# Select the clustering features by column position (3 and 4). Per the original
# note these are Annual Income and Spending Score -- confirm against the CSV
# header, since positional selection silently picks the wrong data if the
# column order ever changes.
X = data.iloc[:, [3, 4]].values

In [4]:
# Standardize the features: learn the scaling parameters from X, then apply
# them to the same array. `scaler` stays in scope for later reuse.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [5]:
# Convert the scaled array to a tensor. torch.tensor copies its input, so
# X_tensor does not alias X_scaled.
X_tensor = torch.tensor(X_scaled)

In [6]:
# Number of clusters: determines how many initial centroids are drawn and how
# many centroid means are recomputed on every k-means iteration.
k = 5

In [7]:
# Initialize centroids by drawing k distinct row indices of X_tensor at random.
# A dedicated, seeded generator makes the draw repeatable across kernel
# restarts without touching torch's global RNG state.
init_generator = torch.Generator().manual_seed(42)
centroids = X_tensor[torch.randperm(X_tensor.size(0), generator=init_generator)[:k]]

In [8]:
# k-means main loop: alternate the assignment step and the centroid update for
# at most max_iters rounds, stopping early once the centroids stop moving.
max_iters = 100
for _ in range(max_iters):
    # Pairwise Euclidean distances: one row per point, one column per centroid
    distances = torch.cdist(X_tensor, centroids)

    # Assign each point to the cluster with the nearest centroid
    cluster_assignments = torch.argmin(distances, dim=1)

    # Update each centroid to the mean of the points assigned to it. A cluster
    # that received no points keeps its previous centroid: the mean of an empty
    # selection is NaN, which would corrupt every later distance computation
    # and make the equality check below impossible to satisfy.
    new_centroids = centroids.clone()
    for i in range(k):
        members = X_tensor[cluster_assignments == i]
        if members.size(0) > 0:
            new_centroids[i] = members.mean(0)

    # Check for convergence: an update that leaves every centroid unchanged
    if torch.all(new_centroids == centroids):
        break

    centroids = new_centroids

# Re-derive the labels from the final centroids. After an early break this
# reproduces the same labels; if the loop ran out of iterations it replaces
# labels that were computed against the centroids before the last update.
distances = torch.cdist(X_tensor, centroids)
cluster_assignments = torch.argmin(distances, dim=1)

In [9]:
# Final cluster assignments as a NumPy array.
# Going through torch.as_tensor keeps this cell re-runnable: on a second run
# the name already holds a NumPy array, which has no .numpy() method.
cluster_assignments = torch.as_tensor(cluster_assignments).numpy()

# Add cluster assignments to the original dataframe
data['Cluster'] = cluster_assignments

# Print the cluster centers (in the scaled feature space of X_tensor)
print("Cluster Centers:")
print(centroids)

# Print the count of points in each cluster
print("\nCount of points in each cluster:")
print(data['Cluster'].value_counts())

Cluster Centers:
tensor([[-1.3295,  1.1322],
        [-0.2009, -0.0265],
        [ 1.0550, -1.2844],
        [ 0.9916,  1.2395],
        [-1.3075, -1.1370]], dtype=torch.float64)

Count of points in each cluster:
Cluster
1    81
3    39
2    35
4    23
0    22
Name: count, dtype: int64


In [10]:
# Calculate the within-cluster sum of squared distances
def calculate_error(X_tensor, cluster_assignments, centroids):
    """Return the within-cluster sum of squared Euclidean distances.

    Each point is scored against the centroid it is assigned to, so the result
    reflects the supplied labelling rather than always the nearest centroid.

    Args:
        X_tensor: Tensor of points, one row per point.
        cluster_assignments: Per-point centroid indices; a tensor, NumPy array
            or sequence of ints. If ``None``, every point is scored against its
            nearest centroid instead.
        centroids: Tensor of centroids, one row per cluster.

    Returns:
        A 0-dim tensor holding the summed squared distances.
    """
    if cluster_assignments is None:
        # No labels supplied: fall back to the nearest-centroid distance.
        nearest = torch.cdist(X_tensor, centroids).min(dim=1).values
        return torch.sum(nearest ** 2)

    # Accept NumPy arrays as well as tensors, and make the labels usable as
    # an index into `centroids`.
    labels = torch.as_tensor(
        cluster_assignments, dtype=torch.long, device=centroids.device
    )
    # Squared Euclidean distance computed directly, avoiding the sqrt/square
    # round trip of cdist followed by ** 2.
    residuals = X_tensor - centroids[labels]
    return torch.sum(residuals ** 2)

# Score the final clustering and report the result
error = calculate_error(
    X_tensor=X_tensor,
    cluster_assignments=cluster_assignments,
    centroids=centroids,
)
print("Error (Within-Cluster Sum of Squares):", error.item())


Error (Within-Cluster Sum of Squares): 65.56840815571682
