In [4]:
import numpy as np

def make_clusters(
        n: int = 500,
        m: int = 2,
        k: int = 3,
        bounds: tuple = (-10, 10),
        scale: float = 1,
        seed: int = 42) -> (np.ndarray, np.ndarray):
    """
    creates some clustered data

    inputs:
        n: int
            number of observations
        m: int
            number of features
        k: int
            number of clusters
        bounds: tuple
            minimum and maximum bounds for cluster grid
        scale: float
            standard deviation of normal distribution
        seed: int
            random seed

    outputs:
        (np.ndarray, np.ndarray)
            returns a 2D matrix of `n` observations and `m` features that are clustered into `k` groups
            returns a 1D array of `n` size that defines the cluster origin for each observation
    """
    np.random.seed(seed)
    assert k <= n

    labels = np.sort(np.random.randint(0, k, size=n))
    centers = np.random.uniform(bounds[0], bounds[1], size=(k,m))
    mat = np.vstack([
        np.random.normal(
            loc=centers[idx],
            scale=scale,
            size=(np.sum(labels==idx), m))
        for idx in np.arange(0, k)])

    return mat, labels

In [54]:
mat, labels = make_clusters(3, 2, 3, (-10, 10), 1, 42)

In [55]:
print(mat)

[[ 6.21909165  2.74060441]
 [-9.30780214  7.86608296]
 [-9.30174545  6.85779316]]


In [28]:
print(labels)

[0 2 2]


In [40]:
random_index = np.random.choice(np.arange(mat.shape[0]), size=1)
random_center_point = mat[random_index]
print(random_center_point)

[[-9.30174545  6.85779316]]


In [41]:
from scipy.spatial.distance import cdist
dist = cdist(random_center_point, mat, 'euclidean')
print(dist)

[[16.05763456  1.00830799  0.        ]]


In [42]:
square_dist = np.square(dist)
print(square_dist)

[[257.84762752   1.016685     0.        ]]


In [44]:
sum_dist = np.sum(square_dist)
print(sum_dist)

258.8643125136015


In [45]:
prob_dist = square_dist / sum_dist
print(prob_dist)

[[0.99607252 0.00392748 0.        ]]


In [113]:
print(mat)
mat.ndim

[[ 6.21909165  2.74060441]
 [-9.30780214  7.86608296]
 [-9.30174545  6.85779316]]


2

In [109]:
# Generate random x coordinate for initial centroid within range of data points
random_centroid_x = np.random.uniform(low=np.min(mat[:, 0]), high=np.max(mat[:, 0]))

# Generate random y coordinate for initial centroid within range of data points
random_centroid_y = np.random.uniform(low=np.min(mat[:, 1]), high=np.max(mat[:, 1]))

In [117]:
bounds = (random_centroid_x, random_centroid_y)

In [118]:
random_centroid = np.random.uniform(bounds[0], bounds[1], size=(1,2))

In [119]:
print(random_centroid)

[[ 1.37798466 -1.46121733]]


In [122]:
# Calculate the Euclidean distance between the random centroid and the rest of the points
dist = cdist(random_centroid, mat, 'euclidean')

# Square the distances between the random centroid and the rest of the points
square_dist = np.square(dist)

# Sum the squared the distances
sum_dist = np.sum(square_dist)

# Divide each element in the squared distances array by the sum of all the squared distances
prob_dist = square_dist / sum_dist

print(prob_dist)

[[0.09656376 0.47277613 0.43066011]]


In [123]:
%config IPCompleter.greedy=True