In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

In [2]:
# Generate a synthetic dataset with 300 samples and 2 features
data = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
# Standardize the dataset
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[0])

In [8]:
def initialize_centroids(data, k):
    # Randomly select k data points as initial centroids
    # Your implementation here
    np.random.shuffle(data)
    centroids = data[:k]
    return centroids

def compute_distances(data, centroids):
    # Compute distances between data points and centroids
    # Your implementation here
    distances = np.zeros((len(data), len(centroids)))
    for i, point in enumerate(data):
        for j, centroid in enumerate(centroids):
            distances[i, j] = np.linalg.norm(point - centroid)
    return distances


def assign_clusters(distances):
    # Assign each data point to the closest centroid
    # Your implementation here
    return np.argmin(distances, axis=1)


def update_centroids(data, clusters, k):
    # Update centroids by computing the mean of points in each cluster
    # Your implementation here
    new_centroids = np.zeros((k, data.shape[1]))
    for i in range(k):
        cluster_points = data[clusters == i]
        if len(cluster_points) > 0:
            new_centroids[i] = np.mean(cluster_points, axis=0)
        else:
            # If no points in the cluster, keep the old one
            new_centroids[i] = centroids[i]
    return new_centroids


def k_means(data, k, max_iterations=100):
    centroids = initialize_centroids(data, k)
    for _ in range(max_iterations):
        distances = compute_distances(data, centroids)
        clusters = assign_clusters(distances)
        new_centroids = update_centroids(data, clusters, k)
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return centroids, clusters

import numpy as np

#main or random usage
np.random.seed(42)
data = np.random.rand(100, 2)
k = 3
centroids, clusters = k_means(data, k)
print("Final centroids:")
print(centroids)
print("\nCluster assignments for each data point:")
print(clusters)



Final centroids:
[[0.8039633  0.57026999]
 [0.18520943 0.72228065]
 [0.36376248 0.20008043]]

Cluster assignments for each data point:
[0 2 2 1 2 2 2 2 1 0 0 0 2 1 2 2 2 0 0 0 2 1 1 1 1 0 1 2 1 0 2 1 1 0 2 2 2
 2 0 0 1 1 0 1 2 0 0 0 0 2 2 1 2 2 0 0 0 0 2 0 2 1 0 0 1 0 1 1 2 1 0 0 2 1
 2 0 2 0 0 1 1 2 0 1 0 0 1 1 1 2 0 0 2 0 2 0 0 2 2 1]
