In [27]:
# imports 
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import  cdist   




Load the two-moons data provided in “twomoons.csv”. The data contains two interleaved half-circle clusters (“moons”); the third column denotes which of the two clusters each example belongs to. Only the two features (the first two columns) are used for clustering; the third column is kept as the ground-truth label for evaluation.


In [28]:
# Read the CSV; columns 0-1 are used as features, column 2 as the label.
df = pd.read_csv('twomoons.csv')

# Ground-truth cluster assignment as a NumPy array.
y_true = np.array(df.iloc[:, 2])

# Fit the scaler on the two feature columns and keep the scaled array as X.
scaler = StandardScaler()
X = scaler.fit_transform(np.array(df.iloc[:, [0, 1]]))

K-means clustering with k = 2 clusters

In [29]:
# Fit k-means with k = 2 on the scaled features.
# n_init is passed explicitly: it pins the number of restarts to 10 (the
# default named in the FutureWarning this call used to emit) and silences
# that warning, so results do not change when the library default does.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=26).fit(X)

# calculate the sum of squared errors (SSE) of each cluster
cluster_centers = kmeans.cluster_centers_

# Squared Euclidean distance from every point to the centre of its own cluster.
squared_dist = np.sum((X - cluster_centers[kmeans.labels_]) ** 2, axis=1)

# Sum those distances per cluster; minlength guarantees one entry per cluster.
# .tolist() yields plain Python floats so the printed list stays readable.
cluster_sse = np.bincount(
    kmeans.labels_, weights=squared_dist, minlength=kmeans.n_clusters
).tolist()

print('Clusters SSE ', cluster_sse)

# NOTE(review): this line reports the SSE of cluster 0 only. If the total SSE
# of the clustering is wanted, use sum(cluster_sse) (== kmeans.inertia_).
print('Cluster SSE when k is 2 =', cluster_sse[0])

# predict the clusters
y_kmeans = kmeans.predict(X)

# calculate the misclassification rate
# Cluster ids are arbitrary (cluster 0 need not correspond to class 0), so the
# raw error depends on which id k-means happened to assign. Encode the true
# labels as 0/1 and take the better of the two possible id-to-class matchings.
# This min() trick is only valid for the two-class / two-cluster case.
_, y_true_codes = np.unique(y_true, return_inverse=True)
mis_classification_rate = 1 - accuracy_score(y_true_codes, y_kmeans)
mis_classification_rate = min(mis_classification_rate, 1 - mis_classification_rate)

print('Kmeans Misclassification rate =', mis_classification_rate)


Clusters SSE  [41.87412868894992, 41.49533639092751]
Cluster SSE when k is 2 = 41.87412868894992
Kmeans Misclassification rate = 0.15000000000000002


  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Agglomerative (hierarchical) clustering into two clusters, library defaults otherwise.
agg = AgglomerativeClustering(n_clusters=2)

# Fit on the scaled features, then read the cluster id assigned to each sample.
agg.fit(X)
y_pred = agg.labels_


