In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Load the arrests data set from a CSV file in the current working directory.
# NOTE(review): later cells skip column 0 via iloc[:, 1:], so it presumably
# holds a row label (e.g. the state name) rather than a feature; confirm.
US = pd.read_csv("USArrests.csv")

In [2]:
# Numeric feature matrix: every column of US except the first one.
USm = US.iloc[:, 1:].to_numpy()

# Standardise the feature columns so that variables measured on large scales
# do not dominate the squared distances used by the clustering below.
scaler = StandardScaler()
USsc = scaler.fit_transform(USm)

In [3]:
def distance(x, y):
    """Return the squared Euclidean distance between `x` and `y`.

    No square root is taken: the result is the sum of the squared
    element-wise differences of the two inputs.
    """
    difference = x - y
    return np.sum(np.square(difference))

def centroid(X):
    """Return the centroid of the observations stored in `X`.

    The mean is taken along axis 0, so for a 2-D input with one observation
    per row the result has one entry per column.
    """
    column_means = np.mean(X, axis=0)
    return column_means

In [9]:
def kmeans1(X, k, max_iter=300):
    """Run one randomly initialised pass of Lloyd's k-means algorithm.

    Every observation first receives a random label drawn from NumPy's global
    random state. The function then alternates between recomputing the cluster
    centroids and reassigning each observation to its nearest centroid
    (squared Euclidean distance, ties going to the lowest cluster index) until
    the labels stop changing.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Data matrix with one observation per row.
    k : int
        Number of clusters; must satisfy 1 <= k <= m.
    max_iter : int, default 300
        Upper bound on the number of update/assignment rounds. If it is
        reached, the most recent labels are returned.

    Returns
    -------
    numpy.ndarray of shape (m,)
        Integer cluster label in ``0..k-1`` for every observation.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of observations.
    """
    X = np.asarray(X, dtype=float)
    m, n = X.shape
    if k < 1:
        raise ValueError(f"Number of clusters must be at least 1, got {k}")
    if k > m:
        raise ValueError(f"Number of clusters {k} > number of observations {m}")

    # Random initial assignment
    previous = np.random.randint(0, k, m)
    clusters = previous

    for _ in range(max_iter):
        sizes = np.bincount(previous, minlength=k)
        empty = np.flatnonzero(sizes == 0)

        # Compute the centroid of every cluster that currently has members
        centroids = np.zeros((k, n))
        for j in np.flatnonzero(sizes > 0):
            centroids[j] = X[previous == j].mean(axis=0)

        # The mean of an empty cluster is NaN, which would make every distance
        # comparison fail. Instead, place each empty cluster's centroid on one
        # of the observations that is currently worst fitted by its own
        # centroid, so the cluster can pick up members again.
        if empty.size:
            spread = np.sum((X - centroids[previous]) ** 2, axis=1)
            farthest = np.argsort(-spread, kind="stable")[:empty.size]
            centroids[empty] = X[farthest]

        # Assign each observation to the nearest centroid; argmin returns the
        # first minimum, so ties go to the lowest cluster index.
        sq_dists = np.sum((X[:, None, :] - centroids[None, :, :]) ** 2, axis=2)
        clusters = sq_dists.argmin(axis=1).astype(int)

        if np.array_equal(clusters, previous):
            break
        previous = clusters

    # Check if solution produced less than k clusters (still possible when the
    # data contain fewer than k distinct observations)
    if len(np.unique(clusters)) != k:
        print(f"Warning: Clustering solution contains less than {k} clusters.")

    return clusters

In [5]:
def objective(X, k, cl):
    """Return the k-means objective for a given clustering.

    The objective is the total, over clusters ``0..k-1``, of the squared
    Euclidean distances between each member observation and the centroid
    (column-wise mean) of its cluster.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Data matrix with one observation per row.
    k : int
        Number of cluster labels to scan (labels ``0..k-1``).
    cl : array-like of shape (m,)
        Cluster label of every observation.

    Returns
    -------
    float
        Total within-cluster sum of squares; ``0.0`` if no observation
        carries a label in ``0..k-1``.
    """
    X = np.asarray(X)
    # A plain list would make `cl == i` a single bool instead of a row mask.
    cl = np.asarray(cl)

    total_distance = 0.0
    for i in range(k):
        cluster_points = X[cl == i, :]
        if cluster_points.shape[0] == 0:
            # An empty cluster contributes nothing; skipping it also avoids the
            # RuntimeWarning and NaN centroid from averaging an empty slice.
            continue
        c = cluster_points.mean(axis=0)
        total_distance += np.sum((cluster_points - c) ** 2)
    return total_distance

In [6]:
def kmeans(X, k, niter=50):
    """Run `kmeans1` `niter` times and keep the best clustering.

    Each restart calls ``kmeans1(X, k)`` and scores the result with
    ``objective(X, k, ...)``. The labels with the strictly lowest score seen
    are returned; if no restart scores below infinity (for example when
    ``niter`` is 0), the result is ``None``.
    """
    winner, lowest = None, np.inf
    for _ in range(niter):
        labels = kmeans1(X, k)
        score = objective(X, k, labels)
        if score < lowest:
            lowest, winner = score, labels
    return winner

In [10]:
# Seed NumPy's global random state so the random restarts inside kmeans,
# and therefore the printed results, are reproducible across runs.
np.random.seed(0)

k = 4
cl = kmeans(USsc, k)
# A bare expression in the middle of a cell is not displayed, so print it.
print("Objective:", objective(USsc, k, cl))

# .tolist() yields plain Python ints, so the Counter keys print the same way
# regardless of the NumPy version's scalar repr.
cldict = Counter(cl.tolist())
print("Cluster sizes:", cldict)
# Per-cluster means on the original (unscaled) columns. Headings are 1-based
# while the labels in `cl` (and the Counter keys) are 0-based.
for i in range(k):
    cluster_points = US.iloc[cl == i, 1:]
    print(f"Cluster {i + 1}:")
    print(cluster_points.mean())

Cluster sizes: Counter({1: 16, 2: 13, 3: 13, 0: 8})
Cluster 1:
Murder       13.9375
Assault     243.6250
UrbanPop     53.7500
Rape         21.4125
dtype: float64
Cluster 2:
Murder        5.65625
Assault     138.87500
UrbanPop     73.87500
Rape         18.78125
dtype: float64
Cluster 3:
Murder       10.815385
Assault     257.384615
UrbanPop     76.000000
Rape         33.192308
dtype: float64
Cluster 4:
Murder       3.600000
Assault     78.538462
UrbanPop    52.076923
Rape        12.176923
dtype: float64
