In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance

In [3]:
students = pd.read_csv("students_info.csv")
justtwo_np = students[["sleep","coffee"]].to_numpy()

In [13]:
justtwo_np.shape[0]

300

In [67]:
def calculate_within_cluster_sse(fit, data):
    """
    fit: the fit get from sklearn
    data: a numpy array
    """
    SSE = 0

    # Loop over all clusters
    for c in range(len(fit.cluster_centers_)):
        # Extract the cluster's center and associated points:
        cluster_center = [fit.cluster_centers_[c]]
        cluster_points = data[np.where(fit.labels_ == c)]
        # Compute the following for each cluster:
        cluster_spread = distance.cdist(cluster_points, cluster_center, 'euclidean')
        cluster_total = np.sum(cluster_spread)
        # Add this cluster's within sum of squares to within_cluster_sumsqs
        SSE += cluster_total
        
    return SSE

In [68]:
def looping_kmeans(data, kList):
    """
    data: a numpy array
    kList: a list of all k values
    """
    
    # 1. normalize the data
    shape = data.shape
    data_norm = np.empty([shape[0],0])
    
    for i in range(shape[1]):
        var = data[:,i]
        mx = np.max(var)
        mn = np.min(var)

        var_norm = (var - mn)/(mx - mn)
        var_norm = np.around(var_norm, decimals = 2)
        data_norm = np.hstack((data_norm, np.array([var_norm]).transpose()))
        
    # 2. calculate within-cluster SSE for each K
    SSE_list = []
    for i in kList:
        km = KMeans(n_clusters=i, init="k-means++", random_state = 1, max_iter = 200)
        fit = km.fit(data_norm)
        SSE = calculate_within_cluster_sse(fit, data_norm)
        SSE_list.append(SSE)

    return SSE_list

In [69]:
justtwo_norm = looping_kmeans(justtwo_np, range(1,10))
print(justtwo_norm)

[93.0346530174882, 69.58881417527036, 54.56236696521816, 49.52734745444229, 42.724192778743664, 36.99089389387997, 34.136722598193174, 31.923517354038758, 29.771168801295513]
