## Preliminaries

In [None]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import random
import colorsys
from functools import partial
from functools import reduce

# Run configuration: input file, partition proportions and cluster count.
filename = 'Iris.csv'
sample_proportions = {'Train': 0.6, 'Validation': 0.2, 'Test': 0.2}
k = 3

# The first CSV column is used as the row index.
data = pd.read_csv(filename, index_col=0)

# Encode the category names found in the last column as consecutive integers
# (in order of first appearance) and keep the reverse map for plot legends.
name_to_index = {name: i for i, name in enumerate(data.iloc[:, -1].unique())}
i_to_category = {i: name for name, i in name_to_index.items()}

# Replace the column itself instead of writing through `.iloc[:, -1] = ...`:
# an `.iloc` write is attempted in place, which keeps the column's original
# object/string dtype (and may be rejected outright by newer pandas), whereas
# assigning by column name stores the integer codes with an integer dtype.
data[data.columns[-1]] = \
    data.iloc[:, -1].apply(lambda x: name_to_index[x])


## Plotting

In [None]:
def get_color(N, n):
    """Return the RGB tuple for item ``n`` out of ``N`` evenly spaced hues.

    The hue is ``n / N``; saturation and value are fixed at 0.75 and 1.
    """
    hue = n / N
    saturation, value = 0.75, 1
    return colorsys.hsv_to_rgb(hue, saturation, value)

def plot_clusters(data, index_to_name, centroids=None):
    """Show a scatter-plot matrix of the feature columns, coloured by label.

    The last column of ``data`` is treated as the label; every other column
    is a feature. Cell (r, c) of the grid plots feature ``c`` on the x axis
    against feature ``r`` on the y axis; diagonal cells only show the
    feature name.

    Args:
        data: DataFrame whose last column holds label indices. Each label
            ``x`` is coloured with ``get_color(len(index_to_name), x)``.
        index_to_name: mapping from label index ``0 .. len - 1`` to the text
            shown in the legend.
        centroids: accepted for call compatibility; it is not used by the
            plot at the moment.
    """
    n_features = data.shape[1] - 1

    # NOTE: this changes matplotlib's global default figure size, so it also
    # affects figures created after this call.
    plt.rcParams["figure.figsize"] = (8, 8)
    # squeeze=False keeps `axs` two-dimensional even when there is a single
    # feature column, so the `axs[r, c]` indexing below always works.
    fig, axs = plt.subplots(n_features, n_features,
                            sharex='col', sharey='row',
                            constrained_layout=True, squeeze=False)

    get_colorN = lambda x: get_color(len(index_to_name), x)
    colors = data.iloc[:, -1].apply(get_colorN)

    for r, r_name in enumerate(data.columns[:-1]):
        y_data = data.iloc[:, r]
        for c, c_name in enumerate(data.columns[:-1]):
            if r != c:
                axs[r, c].scatter(data.iloc[:, c], y_data, c=colors)
            else:
                # Diagonal: label the row/column instead of plotting a
                # feature against itself.
                axs[r, c].text(0.5, 0.5, r_name,
                               fontweight='bold',
                               horizontalalignment='center',
                               verticalalignment='center',
                               transform=axs[r, c].transAxes)

    # One square legend marker per label, in the same colour as its points.
    legend_elements = [Line2D([0], [0], color='w', marker='s', markersize=12,
                              markerfacecolor=get_colorN(i),
                              label=index_to_name[i])
                       for i in range(len(index_to_name))]
    fig.legend(handles=legend_elements, loc='center')
    plt.show()


## Sampling

In [None]:
def sampling_uniform(data, sample_proportions):
    """Shuffle ``data`` and label every row with the partition it falls in.

    The rows are shuffled into a new DataFrame (``data`` itself is left
    untouched) and cut into consecutive slices whose sizes follow
    ``sample_proportions``. The last column of the shuffled frame is
    overwritten with the index of the slice each row belongs to.

    Args:
        data: DataFrame whose last column will be replaced by the partition
            index in the returned copy.
        sample_proportions: mapping from partition name to its share of the
            rows; the shares must sum to 1.

    Returns:
        A tuple ``(shuffled_data, index_to_name)`` where ``index_to_name``
        maps each partition index to its name, in the iteration order of
        ``sample_proportions``.

    Raises:
        ValueError: if the proportions do not sum to 1 (within 1e-9).
    """
    total = sum(sample_proportions.values())
    # Compare with a tolerance: shares such as 0.7 + 0.2 + 0.1 do not add up
    # to exactly 1.0 in binary floating point.
    if abs(total - 1) > 1e-9:
        raise ValueError(
            'sample proportions must sum to 1, got {}'.format(total))

    shuffled_data = data.sample(frac=1)
    N = len(shuffled_data)
    last = len(sample_proportions) - 1
    p_low = 0
    index_to_name = {}
    for i, (sample, proportion) in enumerate(sample_proportions.items()):
        p_high = p_low + proportion
        i_low = int(N * p_low)
        # The final partition always runs to the end of the frame; otherwise
        # accumulated rounding error (p_high slightly below 1) could leave
        # trailing rows with their previous label.
        i_high = N if i == last else int(N * p_high)
        shuffled_data.iloc[i_low:i_high, -1] = i
        index_to_name[i] = sample
        p_low = p_high

    return shuffled_data, index_to_name


## Distances

In [None]:
def dist_euclidean2(p1, p2):
    """Return the squared Euclidean distance between ``p1`` and ``p2``.

    Computed as the dot product of the difference vector with itself; no
    square root is taken.
    """
    offset = p1 - p2
    squared_norm = offset.dot(offset)
    return squared_norm

def dist_manhattan(p1, p2):
    """Return the Manhattan (L1) distance between ``p1`` and ``p2``.

    This is the sum of the absolute component-wise differences.
    """
    offset = p1 - p2
    magnitudes = abs(offset)
    return magnitudes.sum()

# The precision matrix is the inverse of the covariance matrix.
def dist_mahalanobis2(precision, p1, p2):
    """Return the squared Mahalanobis distance between ``p1`` and ``p2``.

    Evaluates ``d^T . precision . d`` for the difference ``d = p1 - p2``.
    """
    offset = p1 - p2
    weighted = offset.transpose().dot(precision)
    return weighted.dot(offset)


## Clustering

In [None]:
def get_centroids(points):
    """Return the per-group mean of ``points``, grouped by its last column.

    The result has one row per distinct value of the last column (used as
    the index) and one column per remaining column of ``points``.
    """
    label_column = points.columns[-1]
    grouped = points.groupby(label_column)
    return grouped.mean()
    
def init_randompartition(points, k):
    """Assign every row a random cluster in ``range(k)`` and compute centroids.

    The last column of ``points`` is overwritten in place with the random
    cluster indices. Returns the same ``points`` object together with the
    centroids of that random partition.
    """
    n_points = points.shape[0]
    assignments = []
    for _ in range(n_points):
        assignments.append(random.randrange(0, k))
    points.iloc[:, -1] = assignments
    return points, get_centroids(points)

def cluster_kmeans(points, dist, k, tol=0.01, max_iter=None):
    """Cluster ``points`` with k-means, starting from a random partition.

    The last column of ``points`` is used as the cluster label and is
    overwritten in place; all other columns are the coordinates.

    Args:
        points: DataFrame to cluster (modified in place).
        dist: callable ``dist(a, b)`` returning the distance between two
            rows of coordinates.
        k: number of clusters used for the random initial partition.
        tol: the iteration stops once the summed distance between the
            previous and the new centroids drops below this value.
        max_iter: optional cap on the number of iterations; ``None`` (the
            default) iterates until the tolerance is met.

    Returns:
        A tuple ``(points, index_to_name, means)``: the labelled points, the
        identity mapping ``{i: i for i in range(k)}``, and the centroids
        that produced the final assignment.

    Raises:
        ValueError: if the centroid movement evaluates to NaN.
    """
    points, means = init_randompartition(points, k)
    iterations = 0
    while max_iter is None or iterations < max_iter:
        iterations += 1

        # Assignment step: each point takes the position (within `means`)
        # of its closest centroid as its new label.
        clusters_new = [means.apply(lambda m: dist(r, m), axis=1).argmin()
                        for _, r in points.iloc[:, :-1].iterrows()]
        points.iloc[:, -1] = clusters_new

        # Update step: recompute the centroids and measure how far they
        # moved. The labels written above are positions in `means`, so the
        # index label `i` of `means_new` is looked up positionally.
        means_new = get_centroids(points)
        delta = np.sum([dist(r, means.iloc[i, :])
                        for i, r in means_new.iterrows()])

        # Raise explicitly rather than assert: with assertions disabled a
        # NaN delta would never satisfy `delta < tol` and loop forever.
        if np.isnan(delta):
            raise ValueError('centroid movement is NaN; check the input '
                             'data and the distance function')
        if delta < tol:
            return points, {i: i for i in range(k)}, means
        means = means_new

    # Iteration cap reached before the tolerance was met.
    return points, {i: i for i in range(k)}, means


## Run

In [None]:
# Show the loaded data coloured by its original category.
plot_clusters(data, i_to_category)

# Shuffle the rows; the last column of `sample` now holds the partition
# index instead of the category.
sample, i_to_partition = sampling_uniform(data, sample_proportions)
#plot_clusters(sample, i_to_partition)

# Cluster a copy, because cluster_kmeans overwrites the last column of the
# frame it is given. NOTE: `i_to_partition` is rebound here to the
# cluster-index mapping returned by cluster_kmeans.
trained, i_to_partition, centroids = cluster_kmeans(
    sample.copy(), dist_euclidean2, k)
plot_clusters(trained, i_to_partition)
