<a href="https://colab.research.google.com/github/daothuphuong98/machine_learning2/blob/main/Week_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from ipywidgets import widgets
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly
from copy import copy
import warnings
warnings.filterwarnings("ignore")

In [None]:
def random_data():
    """Sample a 2-D toy dataset made of four Gaussian blobs.

    Each blob contributes 100 points drawn with
    ``np.random.multivariate_normal``; the blobs are sampled and stacked in
    the order their means are listed.

    Returns:
        tuple: ``(X, original_label)`` where ``X`` has shape ``(400, 2)`` and
        ``original_label`` has shape ``(400,)`` and holds the index (0-3) of
        the blob each row of ``X`` was drawn from.
    """
    samples_per_blob = 100
    blob_means = [[2, 2], [8, 4], [3, 6], [5, -1]]
    # NOTE(review): some of these matrices are not symmetric (e.g. 0.4 vs 0.6
    # off the diagonal). They are kept exactly as written so the sampled
    # data does not change.
    blob_covs = [
        [[1.5, 0.4], [0.6, 1.2]],
        [[0.6, 0.7], [0.5, 1.8]],
        [[2, 0.4], [0.8, 1]],
        [[0.6, 0.5], [0.5, 1.1]],
    ]

    # Sampling happens blob by blob, in list order, so a seeded RNG yields
    # the same rows in the same positions.
    blobs = [
        np.random.multivariate_normal(mu, sigma, samples_per_blob)
        for mu, sigma in zip(blob_means, blob_covs)
    ]

    X = np.concatenate(blobs, axis=0)
    original_label = np.repeat(np.arange(len(blobs)), samples_per_blob)
    return X, original_label

def get_centroid(x, centroid):
    """Return the index of the centroid closest to the point ``x``.

    Args:
        x: 1-D array-like with the coordinates of a single point.
        centroid: 2-D array-like with one centroid per row and the same
            number of columns as ``x`` has entries.

    Returns:
        Index of the row of ``centroid`` with the smallest Euclidean
        distance to ``x``. When several centroids are equally close the
        lowest index is returned.
    """
    # Use every coordinate rather than only columns 0 and 1, so the helper
    # works for any number of features (identical result for 2-D input).
    offsets = np.asarray(centroid) - np.asarray(x)
    distances = np.sqrt((offsets ** 2).sum(axis=1))
    return distances.argmin()

def kmeans(X, k):
    """Cluster the rows of ``X`` into ``k`` groups with k-means.

    Starting from ``k`` distinct rows of ``X`` picked at random, the loop
    alternates between assigning every point to its nearest centroid
    (Euclidean distance) and moving each centroid to the mean of its
    points, until the centroids stop changing.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``.
        k: Number of clusters. Must not exceed ``n_samples`` because the
            initial centroids are drawn without replacement.

    Returns:
        tuple: ``(label, centroid, kmean_log)`` where

        * ``label`` is an integer array of length ``n_samples`` with the
          cluster index of every row,
        * ``centroid`` is a ``(k, n_features)`` float array,
        * ``kmean_log`` is a list with one dict per iteration holding
          ``'iter'`` (1-based counter), ``'label'`` (the assignment made in
          that iteration) and ``'centroid'`` (the centroids after that
          iteration's update).
    """
    X = np.asarray(X)
    random_row = np.random.choice(X.shape[0], size=k, replace=False)
    centroid = X[random_row, :].astype(float)
    kmean_log = []
    n_iter = 0

    while True:
        # Assignment step: distance from every point to every centroid in
        # one broadcast, then pick the nearest (ties go to the lowest index).
        offsets = X[:, None, :] - centroid[None, :, :]
        distances = np.sqrt((offsets ** 2).sum(axis=2))
        label = distances.argmin(axis=1)

        # Update step. Start from the current centroids so a cluster that
        # received no points keeps its position instead of turning into NaN,
        # which would make the convergence test below fail forever.
        new_centroid = centroid.copy()
        for c in range(k):
            members = X[label == c]
            if len(members):
                new_centroid[c] = members.mean(axis=0)

        converged = np.array_equal(new_centroid, centroid)
        centroid = new_centroid

        n_iter += 1
        kmean_log.append({'iter': n_iter, 'label': label, 'centroid': centroid})
        if converged:
            break

    return label, centroid, kmean_log

In [None]:
# Number of clusters requested from k-means.
K = 4
# Sample the toy dataset along with the index of the blob each point came from.
data, original_label = random_data()
# Run k-means: `l` is the final labels, `c` the final centroids and `log`
# the per-iteration history consumed by the cells below.
l, c, log = kmeans(data, K)

In [None]:
# Wide table: one row per data point, one `iter_<n>` column per logged
# iteration holding the cluster assigned in that iteration.
df = pd.DataFrame(data, columns=['x0', 'x1'])
df['original_label'] = original_label
for entry in log:
    df[f'iter_{entry["iter"]}'] = entry['label']

# Long format for the animation: `variable` names the iteration and `value`
# is the assigned cluster. The slice starts at column 3, so `x0`, `x1` and
# `original_label` are not melted.
df = df.melt(id_vars=['x0', 'x1'], value_vars=df.columns[3:])
df['type'] = 'Data'

# One row per centroid per iteration, tagged with the same `iter_<n>` name.
cent = pd.DataFrame(np.concatenate([d['centroid'] for d in log]), columns=['x0', 'x1'])
# Repeat each iteration name once per centroid in that log entry rather
# than a hard-coded 4, so the cell keeps working when K changes.
cent['variable'] = np.repeat(
    [f'iter_{d["iter"]}' for d in log],
    [len(d['centroid']) for d in log],
)
cent['type'] = 'Centroid'

# Centroid rows carry no `value`, so that column is NaN for them after concat.
df = pd.concat([df, cent])
# Marker size per row type; `map` yields a numeric column directly.
df['size'] = df['type'].map({'Centroid': 7, 'Data': 5})

In [None]:
# Animated scatter: one frame per k-means iteration (`variable`), points
# colored by assigned cluster, marker symbol and size separating data points
# from centroids.
fig = px.scatter(
    df,
    x="x0",
    y="x1",
    animation_frame="variable",
    color="value",
    symbol='type',
    size='size',
    size_max=10,
    width=700,
    height=700,
    # `value` is numeric, so Plotly Express colors it through a continuous
    # color axis; the palette has to be passed as a continuous scale
    # (a discrete sequence only applies to non-numeric color columns).
    color_continuous_scale=px.colors.sequential.Plasma_r,
)
# Hide both the symbol legend and the color bar.
fig.update_layout(showlegend=False, coloraxis_showscale=False)
fig.show()