In [18]:
# imports 
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.spatial import  distance 
from scipy.cluster import hierarchy
import scipy.cluster as sc




Load the two moons data provided in “twomoons.csv”. The data contains two interleaved half-circle clusters (“moons”), where the third column denotes which cluster each example belongs to (out of two). Using only the 2 features (first 2 columns).


In [19]:
df = pd.read_csv('twomoons.csv')    # read the data

X = np.array(df.iloc[:, [0, 1]])   # convert the data to numpy array
y_true = np.array(df.iloc[:, 2])   # get the true labels

scaler = StandardScaler()   # create a scaler object

X = scaler.fit_transform(X)   # scale the data

Kmeans clustering with n = 2 

In [20]:
kmeans = KMeans(n_clusters=2, random_state=26).fit(X)   # create a kmeans object and fit the data
# predict the clusters
y_kmeans = kmeans.predict(X)


# calculate the sum of squared errors (SSE)
cluster_centers =  kmeans.cluster_centers_

cluster_sse = [0, 0]
for point, label in zip(X, kmeans.labels_):
    cluster_sse[label] += np.linalg.norm(point - cluster_centers[label])**2

print('Clusters SSE ', cluster_sse)
print('Cluster SSE when k is 2 =', cluster_sse[0])

# calculate the misclassification rate
mis_classification_rate = 1 - accuracy_score(y_true, y_kmeans)
print('Kmeans Misclassification rate =', mis_classification_rate)


Clusters SSE  [41.87412868894992, 41.49533639092751]
Cluster SSE when k is 2 = 41.87412868894992
Kmeans Misclassification rate = 0.15000000000000002


In [21]:
# create an agglomerative clustering object

'''
k=2
linkage =[ 'single', 'complete', 'average']
agg = AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(X)
y_pred = agg.labels_
'''
k = 2
linkage =['ward', 'single', 'complete', 'average']

# calculate the sse and misclassification rate for agglomerative clustering using single linkage

for link in linkage:
    agg = AgglomerativeClustering(n_clusters=k, linkage=link).fit(X)
    y_pred = agg.labels_
    clusters = [X[y_pred == i] for i in range(k)]
    centroids = [np.mean(cluster, axis=0) for cluster in clusters]
    agg_cluster_sse = [0, 0]
    # calculate the sum of squared errors (SSE) with all the linkages:
    for point, label in zip(X, y_pred):
       agg_cluster_sse[label] += np.square(distance.euclidean(point, centroids[label]))
       
    print('Agglomerative Clustering', link, 'Linkage SSE =', agg_cluster_sse[0])
    print('Agglomerative Clustering', link, 'Linkage Misclassification rate =', 1 - accuracy_score(y_true, y_pred))

'''
agg_cluster_sse = [0, 0]
clusters = [X[y_pred == i] for i in range(k)]
centroids = [np.mean(cluster, axis=0) for cluster in clusters]
# calculate the sum of squared errors (SSE) with all the linkages:
for point, label in zip(X, y_pred):
    agg_cluster_sse[label] += np.linalg.norm(point - centroids[label])**2
   ''' 
 

 
    






Agglomerative Clustering ward Linkage SSE = 63.71447253277438
Agglomerative Clustering ward Linkage Misclassification rate = 0.12
Agglomerative Clustering single Linkage SSE = 195.43664033808255
Agglomerative Clustering single Linkage Misclassification rate = 0.51
Agglomerative Clustering complete Linkage SSE = 67.78792019783519
Agglomerative Clustering complete Linkage Misclassification rate = 0.85
Agglomerative Clustering average Linkage SSE = 63.71447253277438
Agglomerative Clustering average Linkage Misclassification rate = 0.12


'\nagg_cluster_sse = [0, 0]\nclusters = [X[y_pred == i] for i in range(k)]\ncentroids = [np.mean(cluster, axis=0) for cluster in clusters]\n# calculate the sum of squared errors (SSE) with all the linkages:\nfor point, label in zip(X, y_pred):\n    agg_cluster_sse[label] += np.linalg.norm(point - centroids[label])**2\n   '