# A comparison of extrinsic clustering evaluation metrics based on formal constraints

In [1]:
from sklearn import metrics #pip install scikit-learn==0.24, import sklearn, print('The scikit-learn version is {}.'.format(sklearn.__version__))
import numpy as np
from pandas import DataFrame as df
from pandas import set_option as pd_set_option
import bcubed #https://pypi.org/project/bcubed/
from math import log
# Notebook-wide pandas display settings, applied in a single call as
# (option, value) pairs: show up to 500 columns, use a 1000-character wide
# console, print 3 decimals, and never truncate rows.
pd_set_option(
    'display.max_columns', 500,
    'display.width', 1000,
    'display.precision', 3,
    'display.max_rows', None,
)

## Define the metrics not provided by scikit-learn, plus display helpers

In [2]:
def purity_score(y_true, y_pred):
    """Return the purity of the clustering ``y_pred`` with respect to ``y_true``.

    A contingency table is built from ``(y_true, y_pred)``; scikit-learn lays
    it out with one row per distinct ``y_true`` value and one column per
    distinct ``y_pred`` value.  For every column the largest cell is kept, and
    the sum of those maxima is divided by the total number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Size of the dominant reference class inside each predicted cluster.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()
def inv_purity_score(y_true, y_pred):
    """Return the inverse purity of ``y_pred`` with respect to ``y_true``.

    Same computation as ``purity_score`` but with the two labelings swapped
    when the contingency table is built, so the per-column maximum is taken
    for every distinct ``y_true`` value instead of every ``y_pred`` value.
    """
    swapped_table = metrics.cluster.contingency_matrix(y_pred, y_true)
    # Largest overlap with any predicted cluster, one value per reference class.
    best_overlap = swapped_table.max(axis=0)
    return best_overlap.sum() / swapped_table.sum()

def labels_to_sets(lab_vec):
    """Group the positions of ``lab_vec`` by label value.

    Returns a list with one entry per distinct label, in the sorted order
    produced by ``np.unique``; each entry is the ascending list of positions
    at which that label occurs.
    """
    return [
        [position for position, value in enumerate(lab_vec) if value == label]
        for label in np.unique(lab_vec)
    ]
def variation_of_information(y_true, y_pred):
    """Return the negated variation of information (base-2 logs) of two labelings.

    Both label vectors are turned into groups of sample positions.  For every
    pair of groups with a non-empty intersection the term
    ``r * (log2(r / p) + log2(r / q))`` is accumulated, where ``p`` and ``q``
    are the relative group sizes and ``r`` the relative size of the
    intersection.  The result is ``-abs(total)``, i.e. never positive
    (presumably so that larger values mean better agreement, like the other
    scores in this notebook).

    Adapted from https://gist.github.com/jwcarr/626cbc80e0006b526688
    """
    # Build each membership set once instead of once per pair of groups.
    true_groups = [set(members) for members in labels_to_sets(y_true)]
    pred_groups = [set(members) for members in labels_to_sets(y_pred)]
    total = float(sum(len(group) for group in true_groups))
    accumulated = 0.0
    for true_group in true_groups:
        p_true = len(true_group) / total
        for pred_group in pred_groups:
            p_pred = len(pred_group) / total
            p_joint = len(true_group & pred_group) / total
            if p_joint > 0.0:
                accumulated += p_joint * (log(p_joint / p_true, 2) + log(p_joint / p_pred, 2))
    return -abs(accumulated)

def create_bcubed_dict(y):
    """Convert a label sequence into the item -> set-of-labels mapping used with ``bcubed``.

    Element ``i`` of ``y`` is stored under the key ``"itemNN"`` (1-based,
    zero-padded to at least two digits) as a one-element set holding the
    label converted to ``str``.
    """
    return {
        "item{:02d}".format(position + 1): {str(y[position])}
        for position in range(len(y))
    }
        
def bcubed_precision(y_true, y_pred):
    """Return ``bcubed.precision`` for the labeling ``y_pred`` against ``y_true``.

    Both label vectors are first converted with ``create_bcubed_dict``; the
    predicted mapping is passed first and the reference mapping second.
    """
    reference = create_bcubed_dict(y_true)
    clustering = create_bcubed_dict(y_pred)
    return bcubed.precision(clustering, reference)

def bcubed_recall(y_true, y_pred):
    """Return ``bcubed.recall`` for the labeling ``y_pred`` against ``y_true``.

    Both label vectors are first converted with ``create_bcubed_dict``; the
    predicted mapping is passed first and the reference mapping second.
    """
    reference = create_bcubed_dict(y_true)
    clustering = create_bcubed_dict(y_pred)
    return bcubed.recall(clustering, reference)

def bcubed_fscore(y_true, y_pred):
    """Return ``bcubed.fscore`` of the BCubed precision and recall of ``y_pred``.

    Precision and recall are computed from the same pair of converted
    mappings (predicted first, reference second) and then combined by
    ``bcubed.fscore`` with its default settings.
    """
    reference = create_bcubed_dict(y_true)
    clustering = create_bcubed_dict(y_pred)
    p_value = bcubed.precision(clustering, reference)
    r_value = bcubed.recall(clustering, reference)
    return bcubed.fscore(p_value, r_value)

def highlight_cells(df_filled):
    """Build the per-cell CSS frame for ``Styler.apply(highlight_cells, axis=None)``.

    Columns are read as consecutive (left, right) pairs: (0, 1), (2, 3), ...
    For every row, both cells of a pair are coloured cyan when the right value
    is strictly greater than the left one, and red otherwise (ties, and
    comparisons that evaluate to False such as those involving NaN, end up
    red).  A trailing unpaired column, if the frame has an odd number of
    columns, keeps the default ``papayawhip`` background.

    Parameters
    ----------
    df_filled : pandas.DataFrame
        Table of comparable values whose columns are arranged in pairs.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df_filled`` holding one
        ``'background-color: ...'`` string per cell.
    """
    # Work on an object-dtype copy: writing strings into float columns is a
    # dtype mismatch that recent pandas versions warn about.
    styles = df_filled.astype(object)
    styles.loc[:, :] = 'background-color: papayawhip'  # default colour
    n_rows, n_cols = styles.shape
    # Floor division: round(n_cols / 2) overshoots for some odd widths
    # (round(1.5) == 2) and would index past the last column.
    n_pairs = n_cols // 2
    for row in range(n_rows):
        for pair in range(n_pairs):
            left = 2 * pair
            right = left + 1
            if df_filled.iloc[row, right] > df_filled.iloc[row, left]:
                colour = 'background-color: cyan'
            else:
                colour = 'background-color: red'
            styles.iloc[row, left] = colour
            styles.iloc[row, right] = colour
    return styles

## Create the example set

In [3]:
# Label vectors for the four constraint scenarios.  Keys follow "c<k><j>":
# <k> selects the scenario; the evaluation cell passes "c<k>0" as the first
# argument of every metric and "c<k>1" / "c<k>2" as the second one.
labels_dict = {
    # Homogeneity: Base / Less / More
    "c00": [1, 1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 3, 4, 5],
    "c01": [2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4],
    "c02": [2, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5],
    # Completeness: Base / Less / More
    "c10": [0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 4, 4, 4, 4],
    "c11": [1, 1, 1, 1, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4],
    "c12": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3],
    # RagBag: Base / Less / More
    "c20": [0, 0, 0, 0, 1, 2, 3, 4, 5],
    "c21": [1, 1, 1, 1, 1, 2, 2, 2, 2],
    "c22": [1, 1, 1, 1, 2, 2, 2, 2, 2],
    # ClustSizeQty: Base / Less / More
    "c30": [0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
    "c31": [1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    "c32": [1, 1, 1, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6],
}

# Every scenario contributes exactly three label vectors.
sample_count = len(labels_dict) // 3

# Two result columns per scenario, in scenario order.
column_names = [
    'Homo_Less', 'Homo_More',
    'Compl_Less', 'Comp_More',
    'RagBag_L', 'RagBag_M',
    'Clustsq_L', 'Clustsq_M',
]

## Evaluate and print

In [4]:
# Evaluation criteria, one table row each.  A name is looked up in
# sklearn.metrics first and, when absent there, among the functions defined
# in this notebook.
eval_crit = ['purity_score', 'inv_purity_score', 'f1_score',
             'rand_score','adjusted_rand_score', 'jaccard_score','fowlkes_mallows_score',
             'mutual_info_score','variation_of_information', 
             'bcubed_precision', 'bcubed_recall', 'bcubed_fscore', 
             'v_measure_score', 'completeness_score', 'homogeneity_score',
             ]

# Extra keyword arguments required by specific criteria.
# NOTE(review): f1_score and jaccard_score are classification metrics that
# compare label values position by position, so their results depend on the
# arbitrary cluster ids - confirm this is intended.
extra_kwargs_by_metric = {
    'f1_score': {'average': 'micro'},
    'jaccard_score': {'average': 'micro'},
}


def _resolve_metric(name):
    """Return the callable for criterion ``name``.

    sklearn.metrics takes precedence; otherwise the notebook namespace is
    searched.  An explicit lookup replaces the former ``eval`` + bare
    ``except`` so that a genuine error raised inside a metric is no longer
    swallowed and re-reported as a misleading NameError.
    """
    metric_fn = getattr(metrics, name, None)
    if metric_fn is None:
        metric_fn = globals().get(name)
    if not callable(metric_fn):
        raise NameError("unknown evaluation criterion: {!r}".format(name))
    return metric_fn


# Result table: one row per criterion, one column per entry of column_names
# (previously hard-coded to 8 columns).
df_mat = np.zeros((len(eval_crit), len(column_names)), dtype=float)
df_table = df(df_mat, columns=column_names)
df_table.index = eval_crit

# Companion frame of empty strings with the same layout, built directly with
# object dtype rather than by overwriting a float copy with strings.
df_table_style = df('', index=df_table.index, columns=df_table.columns)

for ec in eval_crit:
    metric_fn = _resolve_metric(ec)
    metric_kwargs = extra_kwargs_by_metric.get(ec, {})
    row_pos = df_table.index.get_loc(ec)
    for i in range(sample_count):
        # "c<i>0" is always the first argument; "c<i>1" fills the left column
        # of the pair and "c<i>2" the right one.
        base_labels = labels_dict['c{:d}0'.format(i)]
        x_left = metric_fn(base_labels, labels_dict['c{:d}1'.format(i)], **metric_kwargs)
        x_right = metric_fn(base_labels, labels_dict['c{:d}2'.format(i)], **metric_kwargs)

        df_table.iloc[row_pos, i*2] = x_left
        df_table.iloc[row_pos, 1+i*2] = x_right

In [5]:
# Colour the result table with highlight_cells; axis=None hands the whole
# frame to the function in one call instead of row by row or column by column.
t = df_table.style.apply(highlight_cells, axis=None)
# Bare last expression so the notebook renders the styled table.
t

Unnamed: 0,Homo_Less,Homo_More,Compl_Less,Comp_More,RagBag_L,RagBag_M,Clustsq_L,Clustsq_M
purity_score,0.714,0.786,0.786,0.786,0.556,0.556,1.0,1.0
inv_purity_score,0.929,0.929,0.786,0.786,1.0,1.0,0.692,0.923
f1_score,0.071,0.071,0.286,0.071,0.222,0.111,0.0,0.0
rand_score,0.769,0.791,0.681,0.703,0.722,0.722,0.949,0.949
adjusted_rand_score,0.413,0.45,0.244,0.311,0.4,0.4,0.804,0.804
jaccard_score,0.037,0.037,0.167,0.037,0.125,0.059,0.0,0.0
fowlkes_mallows_score,0.576,0.596,0.473,0.527,0.612,0.612,0.845,0.845
mutual_info_score,0.856,0.992,0.693,0.693,0.687,0.687,1.519,1.519
variation_of_information,-1.287,-1.09,-1.522,-1.325,-1.29,-1.29,-0.615,-0.278
bcubed_precision,0.599,0.694,0.694,0.694,0.489,0.556,1.0,1.0
