## Preliminaries

In [None]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import random
import colorsys
from functools import partial

# Default every figure to a 10x10 inch canvas.
plt.rcParams["figure.figsize"] = (10, 10)

# --- Experiment configuration ---
filename = 'Iris.csv'
column_category = 'Species'
columns_plot = ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm')
sample_proportions = {'Train': 0.6, 'Validation': 0.2, 'Test': 0.2}
k = 3

# Load the data set; the first CSV column becomes the row index.
data = pd.read_csv(filename, index_col=0)

# Number the distinct values of the last column in order of first appearance
# and keep both directions of the mapping.
labels = data.iloc[:, -1]
index_to_name = dict(enumerate(labels.unique()))
name_to_index = {name: i for i, name in index_to_name.items()}

# Replace every label in the last column by its integer index.
data.iloc[:, -1] = labels.apply(lambda label: name_to_index[label])


## Plotting

In [None]:
def get_color(N, n):
    """Return an RGB tuple for item ``n`` out of ``N``.

    The hue is ``n / N``; saturation is fixed at 0.75 and value at 1, so
    consecutive ``n`` values are spread evenly around the colour wheel.
    """
    hue = n / N
    saturation, value = 0.75, 1
    return colorsys.hsv_to_rgb(hue, saturation, value)

def plot_clusters(data, index_to_name):
    """Draw a scatter-plot matrix of ``data`` coloured by category.

    Every column except the last is treated as a feature; the last column
    must hold the integer category index of each row. Off-diagonal cells
    show one feature against another, diagonal cells show the feature name.

    Args:
        data: DataFrame whose last column is the category index.
        index_to_name: mapping from category index (0 .. len-1) to the
            label shown in the legend.
    """
    n_features = data.shape[1] - 1
    # squeeze=False keeps `axs` two-dimensional even for a 1x1 grid, so the
    # axs[r, c] indexing below also works with a single feature column.
    fig, axs = plt.subplots(
        n_features, n_features, sharex='col', sharey='row', squeeze=False)

    def get_colorN(x):
        # One evenly spaced hue per category.
        return get_color(len(index_to_name), x)

    colors = data.iloc[:, -1].apply(get_colorN)

    for r, r_name in enumerate(data.columns[:-1]):
        y_data = data.iloc[:, r]
        for c in range(n_features):
            if r == c:
                # Diagonal: label the row/column instead of plotting a
                # feature against itself.
                axs[r, c].text(0.5, 0.5, r_name,
                               fontweight='bold',
                               horizontalalignment='center',
                               verticalalignment='center',
                               transform=axs[r, c].transAxes)
            else:
                axs[r, c].scatter(data.iloc[:, c], y_data, c=colors)

    # Square markers on an invisible (white) line act as colour swatches.
    legend_elements = [Line2D([0], [0], color='w', marker='s', markersize=12,
                              markerfacecolor=get_colorN(i), label=index_to_name[i])
                       for i in range(len(index_to_name))]
    fig.legend(handles=legend_elements, loc='center')


## Sampling

In [None]:
def sampling_uniform(data, sample_proportions):
    """Randomly partition the rows of ``data`` into named samples.

    The rows are shuffled and then cut into consecutive slices whose sizes
    follow ``sample_proportions``. Every row ends up in exactly one sample.

    Args:
        data: DataFrame to split.
        sample_proportions: mapping from sample name to its fraction of the
            rows; the fractions must sum to 1 (within floating-point
            tolerance).

    Returns:
        Dict mapping each sample name to a DataFrame holding its rows.

    Raises:
        AssertionError: if the proportions do not sum to 1.
    """
    total = sum(sample_proportions.values())
    # Proportions are floats, so an exact `== 1` test can reject valid input
    # (0.7 + 0.2 + 0.1 accumulates to 0.9999999999999999); use a tolerance.
    assert abs(total - 1) <= 1e-9, total

    shuffled_data = data.sample(frac=1)
    n_rows = len(shuffled_data)
    last_position = len(sample_proportions) - 1

    samples = {}
    cumulative = 0
    row_low = 0
    for position, (sample, proportion) in enumerate(sample_proportions.items()):
        cumulative += proportion
        if position == last_position:
            # The running total may end just below 1.0, and truncating
            # n_rows * cumulative would then drop the final row. The last
            # sample therefore always runs to the end of the data.
            row_high = n_rows
        else:
            row_high = int(n_rows * cumulative)
        samples[sample] = shuffled_data.iloc[row_low:row_high, :]
        row_low = row_high

    return samples

## Distances

In [None]:
def dist_euclidean2(p1, p2):
    """Return the squared Euclidean distance between ``p1`` and ``p2``.

    Computed as the dot product of the difference vector with itself, so
    no square root is taken.
    """
    offset = p1 - p2
    return offset.dot(offset)

def dist_manhattan(p1, p2):
    """Return the Manhattan (L1) distance between ``p1`` and ``p2``.

    This is the sum of the absolute coordinate differences.
    """
    offset = p1 - p2
    magnitudes = abs(offset)
    return magnitudes.sum()

def dist_mahalanobis2(precision, p1, p2):
    """Return the squared Mahalanobis distance between ``p1`` and ``p2``.

    ``precision`` is the precision matrix, i.e. the inverse of the
    covariance matrix. The result is ``d^T . precision . d`` with
    ``d = p1 - p2``.
    """
    offset = p1 - p2
    projected = offset.transpose().dot(precision)
    return projected.dot(offset)

## Clustering

In [None]:
def get_centroids(points, k):
    """Return the column-wise mean of each cluster's rows.

    ``points`` must have a 'Cluster' column holding cluster indices. The
    result is a list of ``k`` Series, one per cluster index 0 .. k-1, in
    order. The mean is taken over every column of the selected rows.
    """
    centroids = []
    for cluster in range(k):
        members = points[points['Cluster'] == cluster]
        centroids.append(members.mean())
    return centroids
    
def init_randompartition(points, k):
    """Assign every row to a random cluster and compute the centroids.

    Args:
        points: DataFrame with one row per point. It is left untouched.
        k: number of clusters; labels are drawn uniformly from 0 .. k-1.

    Returns:
        A pair ``(labelled, means)``: a copy of ``points`` with an added
        'Cluster' column, and the list of ``k`` centroids computed from it
        by ``get_centroids``.
    """
    # Work on a copy: adding the column to the argument itself would leak a
    # 'Cluster' column into the caller's frame (and triggers a pandas
    # chained-assignment warning when that frame is a slice of another).
    points = points.copy()
    points['Cluster'] = [random.randrange(0, k) for _ in range(points.shape[0])]
    means = get_centroids(points, k)
    return points, means

def cluster_kmeans(points, dist, k):
    """Cluster the rows of ``points`` into ``k`` groups with k-means.

    Starts from a random partition, then repeatedly reassigns every row to
    its nearest centroid and recomputes the centroids, stopping once the
    mean ``dist`` between consecutive centroid sets drops below 0.01.

    Args:
        points: DataFrame with one row per point. It is not modified.
        dist: callable ``dist(p1, p2)`` giving the distance between two
            feature vectors.
        k: number of clusters.

    Returns:
        A pair ``(clusters, centroids)``. ``clusters`` maps each cluster
        index to a DataFrame of its rows (without the 'Cluster' column);
        ``centroids`` is the list returned by ``get_centroids`` for the
        final assignment.

    Raises:
        AssertionError: if the centroid shift is NaN, which happens when a
            cluster ends up with no members.
    """
    def without_label(centroids):
        # Centroids are means over all columns, including the bookkeeping
        # 'Cluster' column; strip it so only real features are compared.
        return [centroid.drop('Cluster') for centroid in centroids]

    # Pass a copy so the caller's frame never gains a 'Cluster' column.
    points, means = init_randompartition(points.copy(), k)
    while True:
        centers = without_label(means)
        # Distances must be measured on the features only. Leaving the
        # current label in the vectors penalises moving to another cluster
        # and breaks metrics sized to the feature count.
        features = points.drop('Cluster', axis=1)
        clusters_new = [np.argmin([dist(row, center) for center in centers])
                        for _, row in features.iterrows()]
        points['Cluster'] = clusters_new
        means_new = get_centroids(points, k)
        delta = np.mean([dist(old, new)
                         for old, new in zip(centers, without_label(means_new))])
        assert not np.isnan(delta)
        if delta < 0.01:
            clusters = {cluster: points[points['Cluster'] == cluster].drop('Cluster', axis=1)
                        for cluster in range(k)}
            return clusters, means_new
        means = means_new


## Run

In [None]:
# Scatter matrix of the full data set, coloured by the category column.
plot_clusters(data, index_to_name)

# Split the feature columns (category column removed) into the samples.
sample = sampling_uniform(
    data.drop([column_category], axis=1), sample_proportions)

# plot_clusters needs a single DataFrame whose last column is an integer
# category index, plus an index -> name mapping. `sample` is a dict of
# frames, so stack them and tag each row with the index of its sample.
sample_names = dict(enumerate(sample))
sampled = pd.concat(
    [frame.assign(Sample=i) for i, frame in enumerate(sample.values())])
plot_clusters(sampled, sample_names)

trained, centroids = cluster_kmeans(sample['Train'], dist_euclidean2, k)

# cluster_kmeans returns {cluster index: rows of that cluster}. Rebuild one
# frame limited to the columns selected for plotting, with the cluster index
# as last column. The column name 'TFR' is kept from the original rename.
clustered = pd.concat(
    [frame[list(columns_plot)].assign(TFR=cluster)
     for cluster, frame in trained.items()])
plot_clusters(
    clustered, {cluster: 'Cluster {}'.format(cluster) for cluster in trained})
