## Preliminaries

In [None]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import random
import colorsys
from functools import partial

# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (20,20)

# CSV file loaded by the run cell.
filename = 'Iris.csv'
# Column removed from the data before sampling and clustering.
column_category = 'Species'
# The three columns handed to plot_clusters as the x, y and z axes.
columns_plot = ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm')
# Sample name -> fraction of rows; sampling_uniform asserts that these sum to 1.
sample_proportions = {'Train': 0.6, 'Validation': 0.2, 'Test': 0.2}

## Plotting

In [None]:
def plot_clusters(clusters, attributes):
    """Draw one 3D scatter plot per cluster plus a combined 'Total' plot.

    Each cluster gets its own subplot and its own hue; the last subplot
    overlays every cluster using the same colors.

    Args:
        clusters: Mapping of display name -> DataFrame holding the points of
            that cluster. May be empty, in which case only the (empty)
            'Total' subplot is drawn.
        attributes: Sequence whose first three entries are the column names
            plotted on the x, y and z axes.
    """
    x_name, y_name, z_name = attributes[0], attributes[1], attributes[2]

    # squeeze=False always yields a 2-D array of axes; without it a single
    # subplot (empty `clusters`) comes back as a bare Axes object that can
    # be neither iterated nor indexed.
    _, axes_grid = plt.subplots(1, len(clusters) + 1, squeeze=False,
                                subplot_kw=dict(projection='3d'))
    axs = axes_grid[0]
    total_ax = axs[-1]

    def label_axes(ax, title):
        ax.set_title(title)
        ax.set_xlabel(x_name)
        ax.set_ylabel(y_name)
        ax.set_zlabel(z_name)

    # Evenly spaced hues at full saturation/value, one per cluster.
    colors = [colorsys.hsv_to_rgb(i / len(clusters), 1, 1)
              for i in range(len(clusters))]

    for ax, (name, data), color in zip(axs, clusters.items(), colors):
        xs = data.loc[:, x_name]
        ys = data.loc[:, y_name]
        zs = data.loc[:, z_name]
        # Same points go on the cluster's own subplot and on the overlay.
        for target in (ax, total_ax):
            target.scatter(xs, ys, zs, color=color)
        label_axes(ax, name)

    label_axes(total_ax, 'Total')

## Sampling

In [None]:
def sampling_uniform(data, sample_proportions):
    """Randomly partition the rows of ``data`` into named, disjoint samples.

    The rows are shuffled once and then cut into consecutive slices whose
    sizes follow ``sample_proportions``.

    Args:
        data: DataFrame whose rows are shuffled and split.
        sample_proportions: Mapping of sample name -> fraction of the rows.
            The fractions must add up to 1 (within floating-point tolerance).

    Returns:
        dict mapping each sample name to its slice of the shuffled rows, in
        the iteration order of ``sample_proportions``. Every row of ``data``
        ends up in exactly one sample.

    Raises:
        AssertionError: if the proportions do not sum to 1.
    """
    total = sum(sample_proportions.values())
    # Tolerant comparison: sums such as 0.7 + 0.2 + 0.1 are not guaranteed
    # to be exactly 1.0 in binary floating point.
    assert abs(total - 1) < 1e-9, total

    shuffled_data = data.sample(frac=1)
    row_count = len(shuffled_data)
    last_position = len(sample_proportions) - 1

    samples = {}
    start = 0
    cumulative = 0
    for position, (name, proportion) in enumerate(sample_proportions.items()):
        cumulative += proportion
        if position == last_position:
            # The final slice always runs to the end so that accumulated
            # rounding error can never drop the last row.
            stop = row_count
        else:
            # The small epsilon keeps a boundary like 7.999999999999999
            # from being truncated to 7 instead of 8.
            stop = int(row_count * cumulative + 1e-9)
        samples[name] = shuffled_data.iloc[start:stop, :]
        start = stop

    return samples

## Distances

In [None]:
def dist_euclidean2(p1, p2):
    """Return the squared Euclidean distance between ``p1`` and ``p2``.

    Both arguments must support elementwise subtraction and the result must
    provide a ``dot`` method (e.g. NumPy arrays or pandas Series).
    """
    offset = p1 - p2
    squared_length = offset.dot(offset)
    return squared_length

def dist_manhattan(p1, p2):
    """Return the Manhattan (L1) distance between ``p1`` and ``p2``.

    The difference ``p1 - p2`` must support ``abs()`` and expose ``sum()``.
    """
    offset = p1 - p2
    magnitudes = abs(offset)
    return magnitudes.sum()

def dist_mahalanobis2(precision, p1, p2):
    """Return the squared Mahalanobis distance between ``p1`` and ``p2``.

    ``precision`` is the precision matrix, i.e. the inverse of the
    covariance matrix. The result is ``d^T . precision . d`` for
    ``d = p1 - p2``.
    """
    offset = p1 - p2
    weighted = offset.transpose().dot(precision)
    return weighted.dot(offset)

## Clustering

In [None]:
def centroids(points, k):
    """Return the column-wise mean of each cluster, for labels 0..k-1.

    ``points`` must carry a 'Cluster' column with the label of every row.
    The result is a list of ``k`` Series; entry ``i`` is the mean over all
    columns (including 'Cluster' itself) of the rows labelled ``i``.
    """
    labels = points['Cluster']
    means = []
    for label in range(k):
        members = points[labels == label]
        means.append(members.mean())
    return means
    
def init_randompartition(points, k):
    """Assign every row a random cluster label and compute the initial means.

    A 'Cluster' column with labels drawn from 0..k-1 is written onto
    ``points`` in place. Returns the labelled ``points`` together with the
    list of per-cluster means produced by ``centroids``.
    """
    row_count = points.shape[0]
    assignments = []
    for _ in range(row_count):
        assignments.append(random.randrange(0, k))
    points['Cluster'] = assignments
    return points, centroids(points, k)

def cluster_kmeans(points, dist, k, tolerance=0.01):
    """Cluster the rows of ``points`` into ``k`` groups with k-means.

    Starts from a random partition, then alternates between assigning each
    row to its nearest centroid and recomputing the centroids, until the
    centroids move less than ``tolerance`` on average.

    Args:
        points: DataFrame of numeric feature columns. A 'Cluster' column
            holding the assigned label is written onto it in place.
        dist: Callable ``dist(point, centroid)`` returning a scalar distance
            between two Series of feature values.
        k: Number of clusters.
        tolerance: Convergence threshold on the mean distance between the
            previous and the updated centroids.

    Returns:
        Tuple ``(points, means)``: the labelled DataFrame and the list of
        final per-cluster means as returned by ``centroids``.

    Raises:
        AssertionError: if the centroid shift becomes NaN (for example
            because a cluster ended up without any member).
    """
    def coordinates(mean):
        # The 'Cluster' entry is a label, not a coordinate, and must not
        # take part in distance computations.
        return mean.drop(labels='Cluster', errors='ignore')

    points, means = init_randompartition(points, k)
    # The feature values never change; only the labels do.
    features = points.drop(columns='Cluster')

    while True:
        centers = [coordinates(mean) for mean in means]
        # iterrows() yields (index, row) pairs, and argmin needs a concrete
        # sequence: handed a lazy map object it would always answer 0.
        points['Cluster'] = [
            int(np.argmin([dist(row, center) for center in centers]))
            for _, row in features.iterrows()
        ]
        means_new = centroids(points, k)
        delta = np.mean([dist(coordinates(old), coordinates(new))
                         for old, new in zip(means, means_new)])
        assert not np.isnan(delta)
        print(delta)
        if delta < tolerance:
            return points, means_new
        means = means_new

## Run

In [None]:
# Load the data set, using the first CSV column as the row index.
data = pd.read_csv(filename, index_col=0)
# Drop the category column, then split the remaining rows into the
# samples named in sample_proportions.
sample = sampling_uniform(data.drop([column_category], axis=1), sample_proportions)
# One 3D subplot per sample plus a combined one, over the columns_plot axes.
plot_clusters(sample, columns_plot)
# Run k-means with k=3 and squared Euclidean distance on the 'Train' sample.
cluster_kmeans(sample['Train'], dist_euclidean2, 3)