In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import string
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial import distance
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords

# 1. Lloyd k-means

In [13]:
class Ck:
    """Lloyd's k-means clustering over the rows of a numeric DataFrame.

    Centroids start as rows sampled from the dataset and are refined by
    alternating nearest-centroid assignment with per-cluster mean updates
    until the average centroid displacement drops below ``t``.

    Attributes:
        df_dataset: Copy of the input frame (feature columns only).
        df_dataset_with_cluster: Copy of the input frame plus a ``cluster``
            column (NaN until ``k_means`` has been run).
        k: Number of clusters requested.
        t: Convergence tolerance on the mean per-centroid shift.
        centroids: DataFrame holding one centroid per row, indexed from 0.
    """

    def __init__(self, df_dataset, k, t=0.1, random_state=None):
        """Store the data and draw the initial centroids.

        Args:
            df_dataset: DataFrame whose rows are the points to cluster.
            k: Number of clusters.
            t: Stop once the average centroid shift is below this value.
            random_state: Optional seed. When None the global ``np.random``
                state is used, so ``np.random.seed`` keeps working.
        """
        self.df_dataset = df_dataset.copy()
        self.df_dataset_with_cluster = df_dataset.copy()
        self.df_dataset_with_cluster['cluster'] = np.nan
        self.k = k
        self.t = t
        # Must be set before initialize_centroids(), which draws from it.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.centroids = self.initialize_centroids()

    def initialize_centroids(self):
        """Return ``k`` dataset rows as a fresh DataFrame indexed 0..k-1.

        Rows are drawn by position and without replacement, so the initial
        centroids are distinct rows; replacement is only used when ``k``
        exceeds the number of rows.
        """
        n_rows = len(self.df_dataset)
        picks = self._rng.choice(n_rows, size=self.k, replace=self.k > n_rows)
        return self.df_dataset.iloc[picks].reset_index(drop=True)

    def calc_euclidean_dist(self, series_1, series_2):
        """Return the Euclidean distance between two equal-length vectors."""
        return np.linalg.norm(np.asarray(series_1) - np.asarray(series_2))

    def re_initialize_centroids(self):
        """Recompute every centroid from the current cluster assignment.

        A centroid becomes the mean of the points labelled with its index.
        A centroid with no points is re-seeded with a randomly chosen row.
        ``self.centroids`` is replaced by a new DataFrame, so references to
        the previous frame are left untouched.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        labels = self.df_dataset_with_cluster['cluster'].to_numpy()
        updated = self.centroids.to_numpy(dtype=float).copy()
        for c_index in range(len(updated)):
            members = points[labels == c_index]
            if len(members) == 0:
                # Empty cluster: restart it from a random data point.
                updated[c_index] = points[self._rng.choice(len(points))]
            else:
                updated[c_index] = members.mean(axis=0)
        self.centroids = pd.DataFrame(updated, columns=self.centroids.columns)

    def k_means(self):
        """Run Lloyd iterations until the centroids stop moving.

        Returns:
            ``df_dataset_with_cluster``: the data with a ``cluster`` column
            holding the index of the nearest centroid for every row. After
            the call ``self.centroids`` holds the means of those clusters.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        while True:
            # Snapshot by value; comparing against an alias of the live
            # centroids would always report zero movement.
            previous = self.centroids.to_numpy(dtype=float).copy()

            # Assign every point to its nearest centroid.
            nearest = np.argmin(distance.cdist(points, previous), axis=1)
            self.df_dataset_with_cluster['cluster'] = nearest.tolist()

            # Move the centroids to the means of their clusters.
            self.re_initialize_centroids()

            # Average displacement of the centroids in this iteration.
            current = self.centroids.to_numpy(dtype=float)
            shifts = np.linalg.norm(current - previous, axis=1)
            if shifts.sum() / self.k < self.t:
                break
        return self.df_dataset_with_cluster

# 2. k-means with SSE

In [4]:
class CkSSE:
    """k-means clustering that stops when the total SSE stabilises.

    Same assignment/update steps as Lloyd's algorithm, but convergence is
    judged on the change in the total sum of squared errors (SSE) between
    consecutive iterations rather than on centroid movement.

    Attributes:
        df_dataset: Copy of the input frame (feature columns only).
        df_dataset_with_cluster: Copy of the input frame plus a ``cluster``
            column (NaN until ``k_means`` has been run).
        k: Number of clusters requested.
        t: Convergence tolerance on the absolute SSE change.
        centroids: DataFrame holding one centroid per row, indexed from 0.
    """

    def __init__(self, df_dataset, k, t=0.1, random_state=None):
        """Store the data and draw the initial centroids.

        Args:
            df_dataset: DataFrame whose rows are the points to cluster.
            k: Number of clusters.
            t: Stop once the SSE changes by less than this between iterations.
            random_state: Optional seed. When None the global ``np.random``
                state is used, so ``np.random.seed`` keeps working.
        """
        self.df_dataset = df_dataset.copy()
        self.df_dataset_with_cluster = df_dataset.copy()
        self.df_dataset_with_cluster['cluster'] = np.nan
        self.k = k
        self.t = t
        # Must be set before initialize_centroids(), which draws from it.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.centroids = self.initialize_centroids()

    def initialize_centroids(self):
        """Return ``k`` dataset rows as a fresh DataFrame indexed 0..k-1.

        Rows are drawn by position and without replacement, so the initial
        centroids are distinct rows; replacement is only used when ``k``
        exceeds the number of rows.
        """
        n_rows = len(self.df_dataset)
        picks = self._rng.choice(n_rows, size=self.k, replace=self.k > n_rows)
        return self.df_dataset.iloc[picks].reset_index(drop=True)

    def calc_euclidean_dist(self, series_1, series_2):
        """Return the Euclidean distance between two equal-length vectors."""
        return np.linalg.norm(np.asarray(series_1) - np.asarray(series_2))

    def re_initialize_centroids(self):
        """Recompute every centroid from the current cluster assignment.

        A centroid becomes the mean of the points labelled with its index.
        A centroid with no points is re-seeded with a randomly chosen row.
        ``self.centroids`` is replaced by a new DataFrame.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        labels = self.df_dataset_with_cluster['cluster'].to_numpy()
        updated = self.centroids.to_numpy(dtype=float).copy()
        for c_index in range(len(updated)):
            members = points[labels == c_index]
            if len(members) == 0:
                # Empty cluster: restart it from a random data point.
                updated[c_index] = points[self._rng.choice(len(points))]
            else:
                updated[c_index] = members.mean(axis=0)
        self.centroids = pd.DataFrame(updated, columns=self.centroids.columns)

    def calc_total_sse(self):
        """Return the sum of squared distances from each point to the
        centroid of the cluster it is currently labelled with."""
        points = self.df_dataset.to_numpy(dtype=float)
        labels = self.df_dataset_with_cluster['cluster'].to_numpy()
        centers = self.centroids.to_numpy(dtype=float)
        total_sse = 0.0
        for c_index in range(len(centers)):
            residuals = points[labels == c_index] - centers[c_index]
            total_sse += float(np.sum(np.square(residuals)))
        return total_sse

    def k_means(self):
        """Iterate until the total SSE changes by less than ``t``.

        Returns:
            A tuple ``(df_dataset_with_cluster, total_sse)`` where the SSE is
            the value computed for the returned assignment and the final
            ``self.centroids``.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        # No SSE exists before the first pass; infinity guarantees the first
        # comparison never counts as convergence.
        prev_total_sse = np.inf
        while True:
            # Assign every point to its nearest centroid.
            centers = self.centroids.to_numpy(dtype=float)
            nearest = np.argmin(distance.cdist(points, centers), axis=1)
            self.df_dataset_with_cluster['cluster'] = nearest.tolist()

            # Move the centroids to the means of their clusters.
            self.re_initialize_centroids()

            # Converged once the SSE barely changes between iterations.
            curr_total_sse = self.calc_total_sse()
            if abs(curr_total_sse - prev_total_sse) < self.t:
                break
            prev_total_sse = curr_total_sse
        return self.df_dataset_with_cluster, curr_total_sse
    

# 3. k-means++

In [19]:
class Ckplusplus:
    """k-means clustering with k-means++ style seeding.

    The first centroid is a uniformly random row; each further centroid is a
    row drawn with probability proportional to its squared distance from the
    nearest centroid chosen so far. Iterations then follow Lloyd's algorithm
    and stop when the average centroid displacement drops below ``t``.

    Attributes:
        df_dataset: Copy of the input frame (feature columns only).
        df_dataset_with_cluster: Copy of the input frame plus a ``cluster``
            column (NaN until ``k_means`` has been run).
        k: Number of clusters requested.
        t: Convergence tolerance on the mean per-centroid shift.
        centroids: DataFrame holding one centroid per row, indexed from 0.
    """

    def __init__(self, df_dataset, k, t=0.1, random_state=None):
        """Store the data and draw the initial centroids.

        Args:
            df_dataset: DataFrame whose rows are the points to cluster.
            k: Number of clusters.
            t: Stop once the average centroid shift is below this value.
            random_state: Optional seed. When None the global ``np.random``
                state is used, so ``np.random.seed`` keeps working.
        """
        self.df_dataset = df_dataset.copy()
        self.df_dataset_with_cluster = df_dataset.copy()
        self.df_dataset_with_cluster['cluster'] = np.nan
        self.k = k
        self.t = t
        # Must be set before initialize_centroids(), which draws from it.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.centroids = self.initialize_centroids()

    def _draw_by_sq_distance(self, points, anchors):
        """Return the position of one row of ``points`` drawn with probability
        proportional to its squared distance from the nearest row of
        ``anchors``.

        Falls back to a uniform draw when there are no anchors or when every
        point coincides with an anchor (the weights would sum to zero).
        """
        if len(anchors) > 0:
            sq_dist = np.min(np.square(distance.cdist(points, anchors)), axis=1)
            total = sq_dist.sum()
            if np.isfinite(total) and total > 0:
                return int(self._rng.choice(len(points), p=sq_dist / total))
        return int(self._rng.choice(len(points)))

    def initialize_centroids(self):
        """Return the seeded centroids as a fresh DataFrame indexed from 0.

        Rows are selected by position, so the sampling weights always line up
        with the rows regardless of the frame's index labels.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        chosen = [int(self._rng.choice(len(points)))]
        for _ in range(self.k - 1):
            chosen.append(self._draw_by_sq_distance(points, points[chosen]))
        return self.df_dataset.iloc[chosen].reset_index(drop=True)

    def calc_euclidean_dist(self, series_1, series_2):
        """Return the Euclidean distance between two equal-length vectors."""
        return np.linalg.norm(np.asarray(series_1) - np.asarray(series_2))

    def re_initialize_centroids(self):
        """Recompute every centroid from the current cluster assignment.

        A centroid becomes the mean of the points labelled with its index.
        A centroid with no points is re-seeded with a row drawn by squared
        distance from the other centroids (those with a lower index have
        already been updated in this pass). ``self.centroids`` is replaced by
        a new DataFrame.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        labels = self.df_dataset_with_cluster['cluster'].to_numpy()
        updated = self.centroids.to_numpy(dtype=float).copy()
        for c_index in range(len(updated)):
            members = points[labels == c_index]
            if len(members) == 0:
                # Empty cluster: re-seed away from the remaining centroids.
                others = np.delete(updated, c_index, axis=0)
                updated[c_index] = points[self._draw_by_sq_distance(points, others)]
            else:
                updated[c_index] = members.mean(axis=0)
        self.centroids = pd.DataFrame(updated, columns=self.centroids.columns)

    def k_means(self):
        """Run Lloyd iterations until the centroids stop moving.

        Returns:
            ``df_dataset_with_cluster``: the data with a ``cluster`` column
            holding the index of the nearest centroid for every row. After
            the call ``self.centroids`` holds the means of those clusters.
        """
        points = self.df_dataset.to_numpy(dtype=float)
        while True:
            # Snapshot by value; comparing against an alias of the live
            # centroids would always report zero movement.
            previous = self.centroids.to_numpy(dtype=float).copy()

            # Assign every point to its nearest centroid.
            nearest = np.argmin(distance.cdist(points, previous), axis=1)
            self.df_dataset_with_cluster['cluster'] = nearest.tolist()

            # Move the centroids to the means of their clusters.
            self.re_initialize_centroids()

            # Average displacement of the centroids in this iteration.
            current = self.centroids.to_numpy(dtype=float)
            shifts = np.linalg.norm(current - previous, axis=1)
            if shifts.sum() / self.k < self.t:
                break
        return self.df_dataset_with_cluster
    