In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm

import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE

import random
import scipy
import math

import os

# Clustering problem

**Technique** <br>
K-Means clustering algorithm.

**Dataset** <br>
Leaf descriptive data.

**Description** <br>
The following code implements and applies the K-Means algorithm, searching for a suitable number of centroids and for the assignment that minimizes the clustering cost. <br>
The output consists of the best model parameters, a visualization of the results, and the resulting clustering of the data.

In [None]:
path = "../input"
df = pd.read_csv(os.path.join(path, "train.csv"))
# Drop the first column (presumably a row identifier -- confirm against train.csv).
df = df.drop(columns=df.columns[0])

# Encode the species names as integer ids so they can be correlated with the
# numeric features and used directly as scatter colors.
df["species"] = LabelEncoder().fit_transform(df["species"])

# Keep only the samples whose encoded species id is below 5.
df = df[df["species"] < 5]

# Rank columns by absolute correlation with the species id. "species" itself
# ranks first (correlation 1), so the selection is the label followed by the
# 15 most correlated features.
selected_features = df.corr()["species"].abs().sort_values(ascending=False).head(16)
df = df[selected_features.index.values]

# Column 0 of `data` is the species id, columns 1.. are the selected features.
data = df.values

# Fixed random_state: t-SNE is stochastic, and without it the embedding (and
# therefore the plot) changes on every run of the notebook.
data_tsne_X = TSNE(n_components=2, random_state=0).fit_transform(data[:,1:])
data_tsne_Y = data[:,0]

plt.figure(figsize=(15,10))
plt.title("t-SNE projection of the selected features (color = species)")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.scatter(data_tsne_X[:,0], data_tsne_X[:,1], c=data_tsne_Y, cmap=matplotlib.cm.Set1, s=100)
plt.show()

In [None]:
def k_means(data, n_centroids = 0, iteration_params = (2,1), seed = None):
    """Cluster ``data`` with the K-Means (Lloyd) algorithm.

    Parameters
    ----------
    data : array-like of shape (n, d)
        One sample per row.
    n_centroids : int, optional
        Number of clusters to fit. When greater than 0 a single model is
        fitted and returned. When 0 (default) a sweep over several cluster
        counts is performed instead (see ``iteration_params``).
    iteration_params : tuple (max_clusters, step), optional
        Only used when ``n_centroids`` is 0. One model is fitted for every
        ``k`` in ``range(2, max_clusters, step)``; note that ``max_clusters``
        is an exclusive upper bound.
    seed : int or None, optional
        Seed for a private random generator used to pick the initial
        centroids. With ``None`` (default) the module-level ``random``
        generator is used, so ``random.seed(...)`` set by the caller is
        honored.

    Returns
    -------
    dict
        Single model (``n_centroids > 0``):
        ``{"cost": float, "centroids": ndarray (k', d), "labels": list of int}``
        where ``cost`` is the mean Euclidean distance of each sample to its
        assigned centroid and ``labels[i]`` indexes into ``centroids``.
        ``k'`` can be smaller than requested: clusters that lose all their
        samples are dropped, and the request is capped at the number of samples.

        Sweep (``n_centroids == 0``):
        ``{"results": [single-model dict, ...], "costs": [float, ...]}``,
        one entry per evaluated ``k``.

    Raises
    ------
    ValueError
        If a model has to be fitted on a dataset with no samples, or if
        ``data`` is not two-dimensional.
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global `random` state.
    rng = random if seed is None else random.Random(seed)

    def init_centroids(data, num_centroids):
        """Pick the initial centroids as distinct samples of ``data``."""
        n, d = data.shape
        if n == 0:
            raise ValueError("k_means requires at least one sample")
        # Sample row indices WITHOUT replacement: drawing the same row twice
        # would create two identical centroids, one of which can never win a
        # sample and would be silently dropped on the first update.
        chosen = rng.sample(range(n), min(num_centroids, n))
        return data[chosen]

    def cost_fun(data, centroids, labeling):
        """Mean Euclidean distance between each sample and its centroid."""
        assigned = centroids[np.asarray(labeling)]
        return np.linalg.norm(data - assigned, axis=1).mean()

    def assign_labels(data, centroids):
        """Label every sample with the index of its nearest centroid."""
        centroids = np.asarray(centroids)
        # (n, k) matrix of sample-to-centroid distances via broadcasting.
        distances = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
        # argmin returns the first minimum, so ties go to the lowest index.
        return distances.argmin(axis=1).tolist()

    def update_centroids(data, centroids, labeling):
        """Move every centroid to the mean of the samples assigned to it.

        Centroids without any assigned sample are dropped, so the returned
        array can have fewer rows than ``centroids``.
        """
        labels = np.asarray(labeling)
        new_centroids = []
        for c in range(len(centroids)):
            members = data[labels == c]
            if len(members) > 0:
                new_centroids.append(members.mean(axis=0))
        return np.array(new_centroids)

    def k_means_proc(data, num_centroids):
        """Fit one model: alternate update/assignment until labels are stable."""
        centroids = init_centroids(data, num_centroids)
        labeling = assign_labels(data, centroids)
        while True:
            centroids = update_centroids(data, centroids, labeling)
            new_labeling = assign_labels(data, centroids)
            if new_labeling == labeling:
                break
            labeling = new_labeling

        return {"cost":cost_fun(data, centroids, labeling), "centroids":centroids, "labels":labeling}

    data = np.asarray(data, dtype=float)

    if n_centroids > 0:
        return k_means_proc(data, n_centroids)

    # No fixed cluster count: evaluate every k in [2, max_clusters) with the
    # requested stride and report each model together with its cost.
    max_clusters, step = iteration_params
    results = [k_means_proc(data, c) for c in range(2, max_clusters, step)]
    costs = [res["cost"] for res in results]
    return {"results":results, "costs":costs}

# K-Means algorithm

This algorithm uses K points, called *centroids*, to represent the clusters the dataset is divided into. Each sample is assigned to its nearest centroid; the centroids and the corresponding labels are then updated to reflect the current state of the model. After multiple iterations, each sample is associated with a particular cluster, i.e. a group of samples that share similar features. <br><br>
**Input** Training data, number of centroids (optional). <br>
**Functioning** Repeated update iterations, each moving every centroid to the barycenter of the samples assigned to it, until the assignments no longer change. <br>
**Output** Sample labels, final centroids (and therefore their number), cost of the solution found. <br>
<br>
# Data structures

**data** : matrix (n x d) containing all the samples of the dataset considered (sample index to sample). <br>
**centroids** : list of all the centroids at each algorithm iteration (centroid id to centroid features). <br>
**labeling** : list of all the labels corresponding to each sample of the dataset (sample index to centroid id). <br>

In [None]:
# Column 0 of `data` is the encoded species id (the ground truth), so only the
# feature columns are clustered and plotted; including the label would leak
# the answer into the clustering and dominate the Euclidean distances.
features = data[:,1:]
feature_names = df.columns[1:]  # data = df.values, so columns line up

# Settings are defined once, before the loop, because `cmap` is also needed
# for the "best model" figure after the loop.
cluster_number = 4
n_restarts = 4
cmap = matplotlib.cm.viridis

# K-Means depends on its random initial centroids; seed the generator so the
# restarts (and the selected best model) are reproducible.
random.seed(0)

best_res = {"cost":float("inf"), "result":None}

plt.figure(figsize=(30,15))
for i in range(n_restarts):
    result = k_means(data=features, n_centroids=cluster_number)
    # Keep the restart with the lowest cost.
    if result["cost"] < best_res["cost"]:
        best_res["cost"] = result["cost"]
        best_res["result"] = result
    labels = result["labels"]
    centroids = result["centroids"]
    plt.subplot(2,2,i+1)
    plt.title("Restart {run}. Cost: {cost}".format(run=i + 1, cost=result["cost"]))
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    # Samples colored by cluster; centroids drawn as large crosses in the same colors.
    plt.scatter(features[:,0], features[:,1], c=labels, cmap=cmap, s=50)
    plt.scatter(centroids[:,0], centroids[:,1], c=range(0,len(centroids)), marker="X", cmap=cmap, s=400)

best = best_res["result"]
plt.figure(figsize=(11,7))
plt.title("Best K-means model. Cost: {cost}".format(cost=best_res["cost"]))
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.scatter(features[:,0], features[:,1], c=best["labels"], cmap=cmap, s=50)
plt.scatter(best["centroids"][:,0], best["centroids"][:,1], c=range(0,len(best["centroids"])), marker="X", cmap=cmap, s=400)

plt.show()

In [None]:
# Column 0 of `data` is the encoded species id (the ground truth); cluster on
# the feature columns only so the label does not leak into the cost curves.
features = data[:,1:]

max_cluster_options = range(4,20,2)
step = 1

# Size the grid from the number of panels actually drawn, so no rows of the
# figure are left blank.
n_cols = 3
n_rows = math.ceil(len(max_cluster_options) / n_cols)

plt.figure(figsize=(30, 7 * n_rows))
for index, max_clusters in enumerate(max_cluster_options):
    results = k_means(data=features, iteration_params=(max_clusters,step))
    plt.subplot(n_rows, n_cols, index + 1)
    plt.title("Max clusters: {clusters}".format(clusters=max_clusters))
    # k_means evaluates every k in range(2, max_clusters, step); plot each cost
    # against that k rather than against a 0-based position.
    plt.plot(np.arange(2, max_clusters, step), results["costs"], c="b")
    plt.xlabel("Number of centroids")
    plt.ylabel("Cost")
plt.show()