In [None]:
# If you want, you can use the following function to efficiently compute pairwise distances.
# Read the docstring to learn how to use it.
from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
class KMeans(object):
    '''K-means clustering (Lloyd's algorithm) with several random restarts.

    Parameters
    ----------
    k: number of clusters
    n_inits: number of random initializations; the run with the lowest
        cost is kept
    random_seed: value passed to ``np.random.seed`` at the start of ``fit``
        (``None`` reseeds from OS entropy)
    tol: absolute change of the cost between two consecutive iterations
        below which a run is considered converged

    Attributes (set by ``fit``)
    ---------------------------
    centroids_: array of shape (k, n_features), centroids of the best run
    centroids_init: array of shape (k, n_features), the random starting
        centroids of the best run
    labels_: array of shape (n_samples,), index of the closest centroid
        for every training sample, consistent with ``centroids_``
    cost_: sum of squared distances of the samples to ``centroids_``
    num_iterations_: number of centroid updates performed in the best run
    '''

    def __init__(self, k=3, n_inits=10, random_seed=None, tol=1e-5):
        # Parameters
        self.k = k
        self.n_inits = n_inits
        self.random_seed = random_seed
        self.tol = tol

        # The following attributes will be computed through execution of the
        # KMeans algorithm in the fit method.
        self.centroids_ = None
        self.centroids_init = None
        self.labels_ = None
        self.cost_ = None
        self.num_iterations_ = 0

    def fit(self, X):
        '''Clusters the dataset X into k clusters.

        Runs Lloyd's algorithm ``n_inits`` times from random starting
        centroids and stores the result of the run with the lowest cost.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)

        Returns
        -------
        The centroids of the best run, shape (k, n_features).

        Raises
        ------
        ValueError: if X is not a non-empty 2-D array of finite values, or
            if ``k`` or ``n_inits`` is smaller than 1.
        '''
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError('X must be a 2-dimensional array')
        if X.size == 0:
            raise ValueError('X must not be empty')
        # NaN/inf would make the cost NaN and the convergence test never true.
        if not np.all(np.isfinite(X)):
            raise ValueError('X must contain only finite values')
        if self.k < 1:
            raise ValueError('k must be at least 1')
        if self.n_inits < 1:
            raise ValueError('n_inits must be at least 1')

        n_features = X.shape[1]
        np.random.seed(self.random_seed)

        # Starting centroids are drawn uniformly between the global minimum
        # and maximum value of the data set.
        low = np.amin(X)
        high = np.amax(X)

        best = None
        for _ in range(self.n_inits):
            start = low + (high - low) * np.random.rand(self.k, n_features)
            run = self._run_lloyd(X, start)
            # "<=" keeps the latest run among exact ties.
            if best is None or run[0] <= best[0]:
                best = run + (start,)

        # Take every attribute from the same run so they stay consistent.
        (self.cost_, self.centroids_, self.labels_,
         self.num_iterations_, self.centroids_init) = best

        return self.centroids_

    def _run_lloyd(self, X, centroids):
        '''Iterates assignment and update steps until the cost stops changing.

        Returns a tuple (cost, centroids, labels, n_iterations) in which
        cost and labels are computed from the returned centroids.
        '''
        previous_cost = None
        n_iterations = 0

        while True:
            # Assignment step: closest centroid and cost from one cdist call.
            distances = cdist(X, centroids, metric='euclidean')
            labels = np.argmin(distances, axis=1)
            cost = np.sum(np.min(distances, axis=1) ** 2)

            # Stop before updating, so cost/labels describe these centroids.
            if previous_cost is not None and abs(cost - previous_cost) < self.tol:
                return cost, centroids, labels, n_iterations

            # Update step on a fresh array; the caller's array is not mutated.
            updated = centroids.copy()
            for j in range(self.k):
                members = X[labels == j]
                # A centroid without assigned samples keeps its position.
                if len(members):
                    updated[j] = members.mean(axis=0)

            centroids = updated
            previous_cost = cost
            n_iterations += 1

    def _require_fitted(self):
        '''Raises ValueError if no centroids are available yet.'''
        if self.centroids_ is None:
            raise ValueError('KMeans has no centroids yet; call fit first')

    def cost_function(self, X):
        '''Computes the KMeans cost function for a given dataset X.

        The cost is the sum over all samples of the squared euclidean
        distance to the closest centroid.

        Raises
        ------
        ValueError: if no centroids are available yet.
        '''
        self._require_fitted()
        distances = cdist(X, self.centroids_, metric='euclidean')
        return np.sum(np.min(distances, axis=1) ** 2)

    def predict(self, X):
        '''Assigns each data point in X to the closest cluster.

        Can only be used after the clustering algorithm has been executed.

        Returns
        -------
        Array of shape (n_samples,) with the index of the closest centroid.

        Raises
        ------
        ValueError: if no centroids are available yet.
        '''
        self._require_fitted()
        distances = cdist(X, self.centroids_, metric='euclidean')
        return np.argmin(distances, axis=1)
