In [None]:
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

import holoviews as hv; hv.extension('bokeh', 'plotly', logo=False)
import panel as pn;     pn.extension()
import plotly.graph_objs as go

In [None]:
# Training and testing set sizes
n1 = 100                                  # Train
n2 = 50                                   # Test

# Seeded generator so that Restart & Run All reproduces the same point clouds
SEED = 0
rng  = np.random.default_rng(SEED)

#                                           Random ellipse 1 centered at (0,0)
x = rng.standard_normal(n1+n2)
y = 0.5*rng.standard_normal(n1+n2)

#                                           Random ellipse 2 centered at (1,-2)
x2 = rng.standard_normal(n1+n2) + 1
y2 = 0.2*rng.standard_normal(n1+n2) - 2

#                                           Rotate ellipse 2 by theta
theta = np.pi/4
# 2x2 rotation matrix [[cos, -sin], [sin, cos]] written out as a literal
A = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])

# (x3, y3) = A @ (x2, y2), applied element-wise to every sample
x3 = A[0,0]*x2 + A[0,1]*y2
y3 = A[1,0]*x2 + A[1,1]*y2

In [None]:
# Overlay the first n1 samples of ellipse 1 with the first n1 samples of the rotated ellipse 2
train_ellipse1 = hv.Scatter((x[:n1], y[:n1]))
train_ellipse2 = hv.Scatter((x3[:n1], y3[:n1]))
train_ellipse1 * train_ellipse2

In [None]:
# Training set: the first n1 samples of each ellipse (2*n1 = 200 of the 2*(n1+n2) = 300 points)
X1    = np.column_stack((x3[:n1],y3[:n1]))      # rotated ellipse 2, one (x, y) row per sample
X2    = np.column_stack((x[:n1],y[:n1]))        # ellipse 1

Y     = np.concatenate((X1,X2))                 # X1 rows first, then X2 rows
# One class label per row of Y: 1 for the X1 rows, 2 for the X2 rows.
# (The name Z is rebound to the linkage matrix in the dendrogram cell further down.)
Z     = np.concatenate((np.ones(n1),2*np.ones(n1)))

# Test set: the remaining n2 samples of each ellipse (2*n2 = 100 points)
x1test = np.column_stack((x3[n1:],y3[n1:]))
x2test = np.column_stack((x[n1:],y[n1:]))

In [None]:
imgs = []                                                  # one overlay per iteration

g1 = np.array([-1, 0])                                     # Initial guess
g2 = np.array([1, 0])

for j in range(4):
    # Assignment step: distance from every row of Y to each current centroid.
    d1 = np.linalg.norm(Y - g1, ord=2, axis=1)
    d2 = np.linalg.norm(Y - g2, ord=2, axis=1)
    nearer_g1 = d1 < d2                                    # strict: ties go to class 2
    class1 = Y[nearer_g1]
    class2 = Y[~nearer_g1]

    # Snapshot of this iteration: both classes plus the centroids used to form them
    imgs.append( hv.Scatter((class1[:,0],   class1[:,1]))*\
                 hv.Scatter((class2[:,0],   class2[:,1]))*\
                 hv.Scatter(([g1[0],g2[0]], [g1[1], g2[1]] )).opts(color="black", size=10, marker="star")
               )

    # Update step: move each centroid to the mean of its members.
    # An empty class keeps its previous centroid instead of turning into NaN.
    if class1.shape[0]:
        g1 = class1.mean(axis=0)
    if class2.shape[0]:
        g2 = class2.mean(axis=0)

hv.Layout(imgs).opts("Scatter", width=300).cols(2)

In [None]:
# Reference clustering with scikit-learn: two clusters, fixed random_state
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(Y)
c, ind = kmeans.cluster_centers_, kmeans.labels_

# Centroids left in g1/g2 by the hand-written iteration
prev_xy = ([g1[0], g2[0]], [g1[1], g2[1]])
h_prev = hv.Scatter(prev_xy, label='prev')
# Centroids found by KMeans
kmeans_xy = (c[:, 0], c[:, 1])
h_kmeans = hv.Scatter(kmeans_xy, label='kmeans')

# Overlay both centroid pairs for comparison
(h_kmeans * h_prev).opts("Scatter", size=6)

In [None]:
# Separating line: perpendicular bisector of the segment joining the two centroids in c
(cx0, cy0), (cx1, cy1) = c

# Midpoint between the centroids
midx = (cx0 + cx1) / 2
midy = (cy0 + cy1) / 2

slope = (cy1 - cy0) / (cx1 - cx0)    # rise/run of the centroid-to-centroid segment
inv_slope = 1 / slope                # the bisector has slope -inv_slope

b = midy + inv_slope * midx          # intercept chosen so the line passes through the midpoint
xsep = np.arange(-1, 2, 0.1)
ysep = -inv_slope * xsep + b

In [None]:
# Styling shared by the separator curve in both panels
sep_opts = dict(ylim=(-3,3), color='black', width=400, height=200)

# Top panel: training samples (first n1 of each ellipse) with the separator line
train_panel = (hv.Scatter((x[:n1], y[:n1]))
               * hv.Scatter((x3[:n1], y3[:n1]))
               * hv.Curve((xsep, ysep)).opts(**sep_opts))

# Bottom panel: held-out samples (from index n1 onward) with the same line
test_panel = (hv.Scatter((x[n1:], y[n1:]))
              * hv.Scatter((x3[n1:], y3[n1:]))
              * hv.Curve((xsep, ysep)).opts(**sep_opts))

h = train_panel + test_panel
h.cols(1)

In [None]:
## Dendrograms

# Subsample: the first 50 rows of X1 stacked on top of the first 50 rows of X2
n_per_class = 50
Y3 = np.concatenate([X1[:n_per_class], X2[:n_per_class]], axis=0)

# Pairwise Euclidean distances, then average-linkage hierarchical clustering
Y2 = pdist(Y3, metric='euclidean')
Z = hierarchy.linkage(Y2, method='average')

# Colour threshold: 85% of the largest value in column 2 of the linkage matrix
merge_heights = Z[:, 2]
thresh = 0.85 * merge_heights.max()

dn = hierarchy.dendrogram(Z, p=100, color_threshold=thresh, no_labels=True)

In [None]:
# Leaf order returned by the dendrogram: position along x, original sample index as spike height
leaf_order = hv.Spikes((range(100), dn['leaves']), "x", "y").opts(width=600, line_width=2)

# Red dotted guides at y = 50 and x = 50.5
guide_opts = dict(color='red', line_dash='dotted')
horizontal_guide = hv.Curve((np.array([0, 100]), np.array([50, 50]))).opts(**guide_opts)
vertical_guide = hv.Curve((np.array([50.5, 50.5]), np.array([0, 100]))).opts(**guide_opts)

leaf_order * horizontal_guide * vertical_guide