# Homework 11 (K-Means Clustering Skeleton)

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned Iris data set and preview its first three rows.
iris_csv = '../Data/Iris/Iris-cleaned.csv'
df = pd.read_csv(iris_csv)
df.head(n=3)

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [3]:
# Keep only the measurement columns; the species label is not used for clustering.
attributes = df.drop(columns='species')
attributes.head(n=3)

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [4]:
def InitializeCentroids(attributes, K):
    """Pick K records at random to serve as the initial centroids.

    Parameters
    ----------
    attributes : pandas.DataFrame
        One row per record, one column per attribute.
    K : int
        Number of centroids to draw.

    Returns
    -------
    pandas.DataFrame
        K rows of ``attributes`` (original index labels kept), sampled
        without replacement.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when K exceeds the number of records.
    """
    # Two identical starting centroids would split one location between
    # them; since ties resolve to the first minimum, the second centroid
    # never receives a record.  Draw from distinct records when possible.
    candidates = attributes.drop_duplicates()
    if len(candidates) < K:
        # Not enough distinct records: fall back to sampling all rows so
        # the call still succeeds whenever the plain sample would.
        candidates = attributes
    return candidates.sample(K)

def ComputeDistanceMatrix(centroids, attributes):
    """Return the squared Euclidean distance from every record to every centroid.

    Parameters
    ----------
    centroids : pandas.DataFrame
        One row per centroid; columns are matched to ``attributes`` by label.
    attributes : pandas.DataFrame
        One row per record, one column per attribute.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``attributes``; column ``k`` holds the squared distance
        to the centroid at *position* ``k`` of ``centroids``.
    """
    number_centroids = centroids.shape[0]
    # Index by the records' own labels (not range(n)) so callers that align
    # on index get the right rows even for subsets / re-indexed frames.
    DistMat = pd.DataFrame(index=attributes.index,
                           columns=range(number_centroids),
                           dtype=float)
    for k in range(number_centroids):
        # Centroids are addressed by position: their index labels are either
        # sampled record labels or cluster ids, neither of which is 0..K-1
        # in general.
        difference = attributes.sub(centroids.iloc[k, :])
        # Whole-column assignment replaces the placeholder column, avoiding
        # the int -> float upcast that an in-place ``iloc`` write would need.
        DistMat[k] = (difference * difference).sum(axis=1)
    return DistMat
 

def UpdateClusters(centroids, attributes):
    """Assign every record to its nearest centroid.

    Parameters
    ----------
    centroids : pandas.DataFrame
        Current centroids, one per row.
    attributes : pandas.DataFrame
        One row per record, one column per attribute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``attributes`` with two extra columns: ``cluster`` (column
        label of the smallest entry in the record's distance-matrix row) and
        ``distance2`` (that smallest squared distance).
    """
    DistMat = ComputeDistanceMatrix(centroids, attributes)
    records = attributes.copy()
    # Rows of DistMat are in the same order as the rows of ``attributes``,
    # so assign by position.  Assigning the Series directly would align on
    # index labels and yield NaN whenever the two indices differ.
    # idxmin returns the first occurrence of the minimum, so ties go to the
    # lowest-numbered centroid.
    records['cluster'] = DistMat.idxmin(axis=1).to_numpy()
    records['distance2'] = DistMat.min(axis=1).to_numpy()
    return records

def UpdateCentroids(records):
    """Recompute each centroid as the mean of the records assigned to it.

    ``records`` must carry a ``cluster`` column (the assignment) and a
    ``distance2`` column (ignored here).  The result has one row per
    cluster label present in ``records`` and one column per attribute.
    """
    # Discard the bookkeeping column first, then average per cluster.
    features = records.drop(columns='distance2')
    per_cluster = features.groupby('cluster')
    return per_cluster.mean()

def kmeans(attributes, K=2, tol=0.01, itermax=100):
    """Cluster the records with Lloyd's K-means iteration.

    Starting from K randomly chosen records, alternately recompute the
    centroids and reassign the records until the sum of squared distances
    (SSE) stops improving.

    Parameters
    ----------
    attributes : pandas.DataFrame
        One row per record, one numeric column per attribute.
    K : int, default 2
        Number of initial centroids.
    tol : float, default 0.01
        Stop once an update lowers the SSE by less than ``tol`` percent.
    itermax : int, default 100
        Maximum number of centroid updates.

    Returns
    -------
    pandas.DataFrame
        ``attributes`` plus a ``cluster`` column (assigned cluster) and a
        ``distance2`` column (squared distance to that cluster's centroid).
    """
    centroids = InitializeCentroids(attributes, K)
    records = UpdateClusters(centroids, attributes)
    sse = records.distance2.sum()

    # A single bounded loop replaces the hand-unrolled first pass plus
    # ``while iter <= itermax`` (which shadowed the builtin ``iter`` and
    # allowed itermax + 2 updates in total).
    for _ in range(itermax):
        centroids = UpdateCentroids(records)
        records = UpdateClusters(centroids, attributes)
        sse_old, sse = sse, records.distance2.sum()
        if sse_old == 0:
            # Already a perfect fit; the percent change would be 0/0.
            break
        sse_percent_change = 100 * (sse_old - sse) / sse_old
        # Written as ``not (>=)`` so a NaN change also ends the loop.
        if not (sse_percent_change >= tol):
            break
    return records

In [5]:
# Repeat K-means from fresh random starts and record the final SSE of each run.
n_trials = 100
trial = list(range(1, n_trials + 1))
SSE = []
for k in range(n_trials):
    records = kmeans(attributes, K=3)
    SSE.append(records['distance2'].sum())

# One row per run: its 1-based trial number and the SSE it converged to.
results = pd.DataFrame({'trial': trial, 'SSE': SSE})

In [6]:
# Compare the best SSE over all trials with scikit-learn's inertia
# (its sum of squared distances to the closest centroid).
Kmeans = KMeans(n_clusters=3).fit(attributes)
best_sse = results['SSE'].min()
print('min SSE           =', best_sse)
print('min SSE (sklearn) =', Kmeans.inertia_)

min SSE           = 78.94084142614602
min SSE (sklearn) = 78.94084142614601


In [7]:
# Step-by-step check: draw three starting centroids and display them.
centroids = InitializeCentroids(attributes, K=3)
centroids

Unnamed: 0,sepal length,sepal width,petal length,petal width
127,6.1,3.0,4.9,1.8
12,4.8,3.0,1.4,0.1
84,5.4,3.0,4.5,1.5


In [8]:
# Step-by-step check: squared distances from every record to each centroid.
DistMat = ComputeDistanceMatrix(centroids=centroids, attributes=attributes)
print(DistMat.shape)
DistMat.head(n=5)

(150, 3)


Unnamed: 0,0,1,2
0,16.06,0.35,11.64
1,16.25,0.02,11.55
2,17.52,0.07,12.46
3,16.38,0.07,11.34
4,16.38,0.41,11.82


In [9]:
# Step-by-step check: label each record with its nearest centroid.
records = UpdateClusters(centroids=centroids, attributes=attributes)
records.head(n=5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,cluster,distance2
0,5.1,3.5,1.4,0.2,1,0.35
1,4.9,3.0,1.4,0.2,1,0.02
2,4.7,3.2,1.3,0.2,1,0.07
3,4.6,3.1,1.5,0.2,1,0.07
4,5.0,3.6,1.4,0.2,1,0.41
