## Preliminaries

In [None]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import random
import colorsys
from functools import partial
from functools import reduce

# --- Configuration ---------------------------------------------------------
filename = 'Iris.csv'
# Fraction of the rows assigned to each partition.
sample_proportions = {'Train': 0.6, 'Validation': 0.2, 'Test': 0.2}
# NOTE(review): presumably the number of clusters; this module-level value is
# not referenced by the code visible in this file - confirm before removing.
k = 3

# --- Data loading ----------------------------------------------------------
# The first CSV column becomes the row index; the last column is treated as
# the category label.
data = pd.read_csv(filename, index_col=0)

# Encode each distinct category name as an integer (in order of first
# appearance) and keep the reverse mapping for display purposes.
name_to_index = {name: i for i, name in enumerate(data.iloc[:, -1].unique())}
i_to_category = {i: name for name, i in name_to_index.items()}

# Replace the whole column by name rather than writing through ``iloc``:
# recent pandas versions set ``iloc`` values in place, which would leave the
# integer codes stored in an object-dtype column.
data[data.columns[-1]] = data.iloc[:, -1].map(name_to_index)


## Plotting

In [None]:
def plot_clusters(data, i_to_name, centroids=None):
    """Draw a scatter-plot matrix of the feature columns, colored by label.

    Every column of ``data`` except the last is treated as a feature; the
    last column holds the integer label of each row. Off-diagonal cells show
    one feature against another, diagonal cells show the feature name.

    Args:
        data: DataFrame whose last column is the integer label of each row.
        i_to_name: Mapping from label index to the name shown in the legend.
            Its length determines how many distinct colors are generated.
        centroids: Optional DataFrame with the same column layout as ``data``
            (one row per centroid). Centroids are drawn as stars and colored
            by their row position. Defaults to no centroids.
    """
    # Build the "no centroids" frame from the data actually passed in instead
    # of a default argument evaluated once against a global at import time.
    if centroids is None:
        centroids = pd.DataFrame(columns=data.columns)

    n_labels = len(i_to_name)

    def get_color(label):
        # Spread the hues evenly over the number of labels.
        return colorsys.hsv_to_rgb(label / n_labels, 0.65, 1)

    n_features = data.shape[1] - 1
    feature_names = data.columns[:-1]

    plt.rcParams["figure.figsize"] = (8, 8)
    # squeeze=False keeps ``axs`` two-dimensional even with a single feature.
    fig, axs = plt.subplots(n_features, n_features,
                            sharex='col', sharey='row',
                            constrained_layout=True, squeeze=False)

    colors = [get_color(label) for label in data.iloc[:, -1]]
    color_clusters = [get_color(i) for i in range(n_labels)]
    color_centroids = [get_color(i) for i in range(centroids.shape[0])]
    has_centroids = centroids.shape[0] > 0

    for r, r_name in enumerate(feature_names):
        for c in range(n_features):
            ax = axs[r, c]
            if r == c:
                # Diagonal cell: label the row/column with the feature name.
                ax.text(0.5, 0.5, r_name,
                        fontweight='bold',
                        horizontalalignment='center',
                        verticalalignment='center',
                        transform=ax.transAxes)
                continue
            ax.scatter(data.iloc[:, c], data.iloc[:, r], c=colors)
            if has_centroids:
                ax.scatter(centroids.iloc[:, c], centroids.iloc[:, r],
                           c=color_centroids, marker=(5, 1),
                           edgecolors='black')

    legend_elements = [Line2D([0], [0], label=i_to_name[i], markerfacecolor=color,
                              color='w', marker='s', markersize=12)
                       for i, color in enumerate(color_clusters)]
    fig.legend(handles=legend_elements, loc='center')
    plt.show()


## Sampling

In [None]:
def sampling_uniform(data, sample_proportions):
    """Shuffle ``data`` and split its rows into named partitions.

    Args:
        data: DataFrame to partition. It is not modified; a shuffled copy is
            returned.
        sample_proportions: Mapping from partition name to the fraction of
            rows it should receive. The fractions must add up to 1.

    Returns:
        A tuple ``(shuffled_data, index_to_name)``. ``shuffled_data`` is the
        shuffled copy with an extra trailing ``'Sample'`` column holding the
        integer index of each row's partition; ``index_to_name`` maps that
        index back to the partition name.

    Raises:
        AssertionError: If the proportions do not add up to 1.
    """
    total = sum(sample_proportions.values())
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1
    # in floating point.
    assert abs(total - 1) < 1e-9, total

    shuffled_data = data.sample(frac=1)
    N = len(shuffled_data)
    shuffled_data['Sample'] = -1

    index_to_name = {}
    last = len(sample_proportions) - 1
    cumulative = 0
    i_low = 0
    for i, (sample, proportion) in enumerate(sample_proportions.items()):
        cumulative += proportion
        if i == last:
            # The final partition always runs to the end so that rounding
            # noise can never leave a row unassigned.
            i_high = N
        else:
            # The small epsilon stops a boundary like 8.999999999999998 from
            # being truncated one row short.
            i_high = int(N * cumulative + 1e-9)
        shuffled_data.iloc[i_low:i_high, -1] = i
        index_to_name[i] = sample
        i_low = i_high

    return shuffled_data, index_to_name


## Distances

In [None]:
def dist_euclidean2(p1, p2):
    """Return the squared Euclidean distance between two points.

    The last entry of each point is not a coordinate and is left out.
    """
    diff = p1.iloc[:-1].sub(p2.iloc[:-1])
    return diff.dot(diff)


def dist_manhattan(p1, p2):
    """Return the Manhattan (L1) distance between two points.

    The last entry of each point is not a coordinate and is left out.
    """
    diff = p1.iloc[:-1] - p2.iloc[:-1]
    return diff.abs().sum()


def dist_mahalanobis2(precision, p1, p2):
    """Return the squared Mahalanobis distance between two points.

    ``precision`` is the precision matrix, i.e. the inverse of the covariance
    matrix. The last entry of each point is not a coordinate and is left out.
    """
    offset = p1.iloc[:-1] - p2.iloc[:-1]
    projected = offset.transpose().dot(precision)
    return projected.dot(offset)


def closest(p, points, dist):
    """Find the row of ``points`` nearest to ``p``.

    Returns a tuple ``(position, distance)``: the positional index of the
    nearest row and its distance to ``p`` as measured by ``dist``.
    """
    def distance_from_p(candidate):
        return dist(p, candidate)

    distances = points.apply(distance_from_p, axis=1)
    nearest = distances.argmin()
    return (nearest, distances.min())


def normalize(points):
    """Min-max scale every feature column of ``points`` to the range [0, 1].

    All columns except the last are treated as features; the last column is
    carried over unchanged.

    Args:
        points: DataFrame whose last column is not a feature (e.g. a label).

    Returns:
        A tuple ``(normalized, revert)``. ``normalized`` is a new DataFrame
        with the scaled features followed by the untouched last column.
        ``revert`` is a function that takes a DataFrame with the same column
        layout and maps its feature columns back to the original scale, again
        leaving the last column untouched.
    """
    features = points.iloc[:, :-1]
    min_elem = features.min()
    scale = features.max() - min_elem
    # A constant column has zero range; scale it by 1 so it maps to 0 instead
    # of producing 0/0 = NaN. ``revert`` uses the same scale, so it still
    # restores the original constant.
    scale = scale.where(scale != 0, 1)

    def _with_last_column(transformed, frame):
        # ``transformed`` is a freshly computed frame, so extending it in
        # place does not touch the caller's data.
        transformed[frame.columns[-1]] = frame.iloc[:, -1]
        return transformed

    def revert(frame):
        restored = frame.iloc[:, :-1] * scale + min_elem
        return _with_last_column(restored, frame)

    # Vectorised arithmetic replaces the previous row-by-row apply, which
    # relied on ``Series.append`` (no longer available in current pandas).
    normalized = (features - min_elem) / scale
    return _with_last_column(normalized, points), revert


def get_centroids(points):
    """Compute the centroid and size of every cluster in ``points``.

    Args:
        points: DataFrame whose last column is the cluster label of each row
            and whose other columns are features.

    Returns:
        A DataFrame with one row per distinct label (in sorted label order,
        indexed 0..n-1). The feature columns hold the per-cluster means, and
        the last column - which keeps the name of the label column - holds
        the number of points in the cluster rather than the label itself.
    """
    label = points.columns[-1]
    grouped = points.groupby(label)
    centroids = grouped.mean()
    # Both frames are indexed by the cluster label here, so the counts are
    # matched to their cluster by key rather than by row position.
    centroids[label] = grouped.size()
    return centroids.reset_index(drop=True)


## Clustering

In [None]:
def get_internal_validation(points, centroids, dist):
    """Score a clustering with the Calinski-Harabasz criterion.

    Args:
        points: DataFrame whose last column is the cluster index of each row.
        centroids: DataFrame with one row per cluster; its last column holds
            the number of points in that cluster. Rows are looked up by
            position using the cluster index stored in ``points``.
        dist: Function ``dist(a, b)`` returning the distance contribution
            between two rows.

    Returns:
        The variance ratio: between-cluster dispersion over within-cluster
        dispersion, scaled by ``(N - k) / (k - 1)``.
    """
    n_clusters = centroids.shape[0]
    n_points = centroids.iloc[:, -1].sum()

    # Mean of all points; its last slot is not a coordinate and is set to -1.
    overall = points.mean()
    overall.iloc[-1] = -1

    def within(row):
        # Distance from a point to the centroid of its own cluster.
        own_centroid = centroids.iloc[int(row.iloc[-1])]
        return dist(row, own_centroid)

    def between(centroid):
        # Distance from a centroid to the overall mean, weighted by size.
        return centroid.iloc[-1] * dist(centroid, overall)

    ssw = sum(points.apply(within, axis=1))
    ssb = sum(centroids.apply(between, axis=1))
    ratio = (ssb / ssw) * ((n_points - n_clusters) / (n_clusters - 1))
    return ratio


def get_external_validation(points, answers):
    """Compare a clustering against reference labels using pair counting.

    Every unordered pair of rows is classified by whether the two rows share
    a cluster and whether they share a reference label:

    * true positive: same cluster and same reference label,
    * false positive: same cluster but different reference labels,
    * false negative: different clusters but same reference label.

    Args:
        points: DataFrame whose last column is the cluster assigned to each
            row.
        answers: Reference labels, one per row and in the same order as the
            rows of ``points``.

    Returns:
        A dict with the keys ``'Precision'``, ``'Recall'`` and ``'F1'``. A
        ratio whose denominator is zero (for example when no two rows share
        a cluster) is reported as 0.0.

    Raises:
        ValueError: If ``answers`` does not have one entry per row.
    """
    predicted = np.asarray(points.iloc[:, -1])
    expected = np.asarray(answers)
    if predicted.shape != expected.shape:
        raise ValueError(
            'answers must contain exactly one label per row of points')

    def same_label(labels):
        # Square matrix: entry (i, j) tells whether rows i and j agree.
        return np.asarray(labels[:, None] == labels[None, :], dtype=bool)

    n = predicted.shape[0]
    # Strict lower triangle: every unordered pair exactly once, no diagonal.
    below_diagonal = ~np.triu(np.ones((n, n))).astype(bool)
    same_cluster = same_label(predicted)[below_diagonal]
    same_answer = same_label(expected)[below_diagonal]

    TP = int((same_cluster & same_answer).sum())
    FP = int((same_cluster & ~same_answer).sum())
    FN = int((~same_cluster & same_answer).sum())

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    Pr = ratio(TP, TP + FP)
    R = ratio(TP, TP + FN)
    f1_measure = ratio(2 * (Pr * R), Pr + R)
    return {'Precision': Pr, 'Recall': R, 'F1': f1_measure}


def init_randompartition(points, k):
    """Initialise a clustering by giving every point a random cluster.

    The last column of ``points`` is overwritten in place with a random
    integer in ``[0, k)`` per row. Returns the (modified) ``points`` together
    with the centroids of the resulting random partition.
    """
    n_points = points.shape[0]
    random_labels = [random.randrange(0, k) for _ in range(n_points)]
    points.iloc[:, -1] = random_labels
    return points, get_centroids(points)


def cluster_kmeans(points, dist, k, tol=0.0001):
    """Cluster ``points`` with k-means, starting from a random partition.

    Args:
        points: DataFrame whose last column is used to store the cluster
            index of each row. It is modified in place.
        dist: Function ``dist(a, b)`` giving the cost of assigning row ``a``
            to centroid ``b``.
        k: Number of clusters used for the random initial partition.
        tol: Iteration stops once the total cost changes by less than this
            amount between two consecutive assignments.

    Returns:
        A tuple ``(points, i_to_partition, means)``: the labelled points, an
        identity mapping over ``range(k)``, and the centroid table that the
        final assignment was computed against (so a label in ``points`` is
        the row position of its centroid in ``means``).
    """
    points, means = init_randompartition(points, k)
    # ``.iloc[-1]`` reads the label by position; plain ``p[-1]`` on a
    # string-labelled row depends on a deprecated positional fallback.
    objective = points.apply(
        lambda p: dist(p, means.iloc[int(p.iloc[-1]), :]), axis=1).sum()
    while True:
        clusters_new, distances_new = assign_clusters(points, means, dist)
        objective_new = sum(distances_new)
        points.iloc[:, -1] = clusters_new
        means_new = get_centroids(points)
        delta = abs(objective - objective_new)
        # A NaN cost would make the convergence test below never succeed.
        assert not np.isnan(delta)
        if delta < tol:
            return points, {i: i for i in range(k)}, means
        means = means_new
        objective = objective_new


def assign_clusters(points, centroids, dist):
    """Assign every point to its nearest centroid.

    Returns an iterator yielding two sequences: the positional index of the
    nearest centroid for each point, and the corresponding distances.
    """
    def nearest(row):
        return closest(row, centroids, dist)

    index_distance_pairs = points.apply(nearest, axis=1)
    return zip(*index_distance_pairs)


def train(points, method, dist, n_restarts=5):
    """Run a clustering method several times and keep the best result.

    Args:
        points: DataFrame to cluster. It is not modified; each restart works
            on its own copy.
        method: Function taking the points and returning a tuple
            ``(trained, i_to_partition, centroids)``.
        dist: Distance function passed to the internal validation score.
        n_restarts: Number of times ``method`` is run.

    Returns:
        The ``(trained, i_to_partition, centroids)`` tuple of the run with
        the highest internal validation score, or ``(None, None, None)`` if
        ``n_restarts`` is not positive.
    """
    best = (None, None, None)
    best_score = None
    for _ in range(n_restarts):
        # The method may label its input in place; handing it a fresh copy
        # keeps a later restart from overwriting the best labelling so far.
        trained, i_to_partition, centroids = method(points.copy())
        new_score = get_internal_validation(trained, centroids, dist)
        # The first run is always kept so a result exists even when no score
        # is positive.
        if best_score is None or new_score > best_score:
            best = (trained, i_to_partition, centroids)
            best_score = new_score
    return best


## Run

In [None]:
# Rescale the feature columns; revert_norm maps a frame with the same column
# layout back to the original units.
data, revert_norm = normalize(data)
#plot_clusters(data, i_to_category)

# Shuffle the rows and tag each one with a partition index in a trailing
# 'Sample' column; i_to_partition maps that index to the partition name.
sample, i_to_partition = sampling_uniform(data, sample_proportions)
#plot_clusters(sample.drop(sample.columns[-2], axis=1), i_to_partition)

# Drop the trailing 'Sample' column so the category label is the last column
# again.
# NOTE(review): despite its name, sample_train keeps every row, not only the
# rows of the 'Train' partition - confirm whether filtering was intended.
sample_train = sample.iloc[:, :-1].copy()
# Cluster a copy of the data into 3 groups with k-means on squared Euclidean
# distance. i_to_partition is rebound here to the mapping returned by the
# clustering method.
trained, i_to_partition, centroids = train(
    sample_train.copy(),
    lambda points: cluster_kmeans(points, dist_euclidean2, 3),
    dist_euclidean2)

# Plot the clustered points and their centroids in the original units.
plot_clusters(revert_norm(trained), i_to_partition,
              revert_norm(centroids))
#plot_clusters(revert_norm(sample_train), i_to_category)

# Compare the cluster assignment against the category labels kept in
# sample_train and print the resulting precision / recall / F1 dict.
print(get_external_validation(trained, sample_train.iloc[:, -1]).copy())
