In [291]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
import math

In [281]:
sigma = 0.15


def sample_de_guitarra(center_x, center_y):
    """Draw one 2-D point from a Gaussian centered at (center_x, center_y).

    The covariance matrix is diagonal with the module-level ``sigma`` on both
    diagonal entries, so ``sigma`` is used as the per-axis variance here.

    Returns:
        The single drawn sample as a length-2 NumPy array.
    """
    mean = [center_x, center_y]
    covariance = [[sigma, 0], [0, sigma]]
    # Ask for a batch of one sample, then unwrap it to a flat (x, y) array.
    batch = np.random.multivariate_normal(mean, covariance, 1)
    return batch[0]

In [282]:
def data_simulation(n_points=1000):
    """Simulate points from an equal-weight mixture of four 2-D Gaussians.

    For every point, one of four centers -- (1, 0), (0, -1), (-1, 0), (0, 1) --
    is picked uniformly at random with ``random.randrange`` and a sample is
    drawn around it with ``sample_de_guitarra``.

    Args:
        n_points: Number of points to generate. Defaults to 1000.

    Returns:
        A list of ``n_points`` samples, each as returned by
        ``sample_de_guitarra``.
    """
    # Position i in this tuple is the center used when the random draw equals i.
    centers = ((1, 0), (0, -1), (-1, 0), (0, 1))
    points = []
    for _ in range(n_points):
        center_x, center_y = centers[random.randrange(len(centers))]
        points.append(sample_de_guitarra(center_x, center_y))
    return points

def data_uniform(min_x, max_x, min_y, max_y, n_points=1000):
    """Sample points uniformly from an axis-aligned rectangle.

    Args:
        min_x, max_x: Bounds passed to ``random.uniform`` for the x coordinate.
        min_y, max_y: Bounds passed to ``random.uniform`` for the y coordinate.
        n_points: Number of points to generate. Defaults to 1000.

    Returns:
        A list of ``n_points`` two-element lists ``[x, y]``.
    """
    # List elements are evaluated left to right, so x is drawn before y for
    # every point.
    return [
        [random.uniform(min_x, max_x), random.uniform(min_y, max_y)]
        for _ in range(n_points)
    ]

In [283]:
def objective_value(num_cluster, points, random_state=None):
    """Fit K-means on ``points`` and return the objective at the fitted centers.

    Args:
        num_cluster: Number of clusters passed to ``KMeans`` as ``n_clusters``.
        points: The samples to cluster; also used to evaluate the score.
        random_state: Forwarded to ``KMeans`` so the centroid initialization
            can be made reproducible. The default ``None`` keeps the
            estimator's own default behaviour.

    Returns:
        The negated ``KMeans.score`` of ``points`` under the fitted model.
    """
    kmeans = KMeans(n_clusters=num_cluster, random_state=random_state).fit(points)
    # KMeans.score reports the objective with its sign flipped (higher is
    # better), so negate it to get a value where lower means a tighter fit.
    return -kmeans.score(points)

In [284]:
# Seed both random sources before the first draw so that a fresh
# "Restart Kernel -> Run All" reproduces the same data: data_simulation picks
# centers with `random` and draws the Gaussian samples with `np.random`.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Observed data set: simulated points from the four-Gaussian mixture.
points = data_simulation()

In [289]:
# Candidate cluster counts: 1 through 20.
ks = list(range(1, 21))
# K-means objective on `points` for each candidate, in the same order as `ks`.
results = [objective_value(k, points) for k in ks]

In [286]:
# K-means objective versus number of clusters for the simulated data.
fig, ax = plt.subplots()
ax.plot(ks, results, color='black')
ax.set_title("K-Means in 4 Gaussian Distribution")
ax.set_xlabel("K")
# The y axis was previously unlabeled; name the plotted quantity.
ax.set_ylabel("K-means objective")
plt.show()

In [287]:
# Reference curve: the K-means objective on uniformly distributed points,
# averaged over many independent runs.
n_reference_runs = 100
ks = [k + 1 for k in range(20)]
total_results = []
for _ in range(n_reference_runs):
    # Run-local names are used on purpose: rebinding the globals `points` and
    # `results` here would replace the simulated-data curve computed earlier,
    # and the gap statistic would then compare the uniform average against a
    # single uniform run instead of the simulated data.
    simulated_points = data_simulation()
    # Bounding box of a fresh simulated sample defines the uniform rectangle.
    x_min = min(p[0] for p in simulated_points)
    x_max = max(p[0] for p in simulated_points)
    y_min = min(p[1] for p in simulated_points)
    y_max = max(p[1] for p in simulated_points)
    points_uniform = data_uniform(x_min, x_max, y_min, y_max)
    total_results.append([objective_value(k, points_uniform) for k in ks])

# Despite its name, sum_results holds the per-k *mean* over all runs:
# zip(*total_results) groups the values of every run by cluster count.
sum_results = [sum(per_k) / len(total_results) for per_k in zip(*total_results)]

In [288]:
# Averaged K-means objective versus number of clusters for uniform data.
fig, ax = plt.subplots()
ax.plot(ks, sum_results, color='black')
ax.set_title("K-Means in Uniform Distribution")
ax.set_xlabel("K")
# The y axis was previously unlabeled; sum_results is an average over runs.
ax.set_ylabel("Mean K-means objective")
plt.show()

In [292]:
# Gap per cluster count: log of the averaged uniform-reference objective minus
# log of the objective stored in `results`, paired by position.
# NOTE(review): `results` is expected to be the curve of the simulated data,
# not of a uniform run -- confirm no cell in between rebinds it.
# NOTE(review): this takes the log of the mean reference objective; the
# published gap statistic averages the logs instead -- confirm intent.
G_K = [
    math.log(sum_results[idx]) - math.log(observed)
    for idx, observed in enumerate(results)
]

In [294]:
# Gap value versus number of clusters.
fig, ax = plt.subplots()
ax.plot(ks, G_K, color='black')
ax.set_title("Gap statistic")
ax.set_xlabel("K")
# The y axis was previously unlabeled; name the plotted quantity.
ax.set_ylabel("Gap(K)")
plt.show()