In [91]:
import os
import json
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from s_dbw import S_Dbw
from pyDRMetrics.pyDRMetrics import *

In [135]:
# Path to the results JSON file exported by
# https://observablehq.com/@siliconjazz/epivecs-results-calculate
# Replace the "..." placeholder before running. The metrics file produced at
# the end of this notebook is written to the same directory, with "metrics_"
# prepended to this file's name.
PATH = "..."

## Helpers

In [136]:
def log(source, message):
    """Print a single-line status message that overwrites itself.

    The text "<source> | <message>" is padded with spaces to 256 characters
    and terminated with a carriage return instead of a newline, so the next
    call is printed over the previous one.

    Args:
        source: Label identifying who is reporting.
        message: The status text to show.
    """
    line = "{} | {}".format(source, message)
    # ljust leaves strings that are already 256+ characters long untouched.
    print(line.ljust(256), end='\r')
    
def index(rows, f):
    """Build a lookup dict from ``rows`` keyed by ``f(row, position)``.

    Args:
        rows: Iterable of items to index.
        f: Callable taking ``(row, position)`` and returning a hashable key.

    Returns:
        dict: Maps each key to its row. When several rows produce the same
        key, the last one wins.
    """
    return {f(row, position): row for position, row in enumerate(rows)}

def quantization_mse(X, labels, centroids, squared=True):
    """Return the mean quantization error of a clustering.

    Every sample in ``X`` is compared with the centroid of the cluster it is
    assigned to (``centroids[labels[i]]``) and the per-sample errors are
    averaged.

    Args:
        X: Sequence of sample vectors, one per row.
        labels: Integer cluster index for every row of ``X``.
        centroids: Sequence of centroid vectors, indexed by cluster label.
        squared: When True (default) the squared Euclidean distances are
            averaged, which is what "mean squared error" denotes. Pass False
            to average the plain Euclidean distances instead.

    Returns:
        float: Mean (squared) distance between the samples and their
        assigned centroids.

    Raises:
        ValueError: If ``X`` is empty or ``labels`` does not contain exactly
            one entry per sample.
    """
    samples = np.asarray(X, dtype=float)
    n_samples = len(samples)
    if n_samples == 0:
        raise ValueError("quantization_mse requires at least one sample")

    label_idx = np.asarray(labels, dtype=int)
    if len(label_idx) != n_samples:
        raise ValueError("labels must contain exactly one entry per sample")

    # Fancy indexing picks the assigned centroid for every sample at once.
    assigned = np.asarray(centroids, dtype=float)[label_idx]
    # Flatten per sample so the row-wise reduction works for any sample shape.
    residuals = (samples - assigned).reshape(n_samples, -1)
    squared_distances = np.sum(residuals ** 2, axis=1)

    errors = squared_distances if squared else np.sqrt(squared_distances)
    return float(np.mean(errors))

## Load data, calculate metrics

In [137]:
def load_ec_results():
    """Read and parse the results JSON file located at ``PATH``.

    Returns:
        The decoded JSON document, as produced by ``json.load``.

    Raises:
        OSError: If the file at ``PATH`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing fails. The
    # encoding is pinned so decoding does not depend on the platform default.
    with open(PATH, encoding="utf-8") as f:
        return json.load(f)

results = load_ec_results()

# Look-up table from dataset name to that dataset's vectors as a NumPy array.
vectors_dict = {
    dataset['name']: np.array(dataset['vectors'])
    for dataset in results['datasets']
}

In [138]:
def calculate_cluster_metrics(cluster_results):
    """Score every clustering result against the vectors of its dataset.

    Each entry is shallow-copied, its 'labels', 'centroids' and (when
    present) 'embeddedCentroids' fields are dropped from the copy, and the
    scores are stored in the copy under 'silhouette', 'db', 'ch', 'sdbw' and
    'qmse'. The input entries themselves are left untouched.

    Args:
        cluster_results: Sequence of dicts with at least the keys 'dataset'
            (a key of ``vectors_dict``), 'labels' and 'centroids'.

    Returns:
        list: One scored dict per input entry, in input order.
    """
    total = len(cluster_results)
    scored = []

    for position, result in enumerate(cluster_results, start=1):
        entry = result.copy()
        # pop() both strips the bulky fields from the copy and hands us the
        # values needed for scoring; a missing key still raises KeyError.
        labels = entry.pop('labels')
        centroids = entry.pop('centroids')
        entry.pop('embeddedCentroids', None)

        X = vectors_dict[result['dataset']]
        entry['silhouette'] = silhouette_score(X, labels)
        entry['db'] = davies_bouldin_score(X, labels)
        entry['ch'] = calinski_harabasz_score(X, labels)
        entry['sdbw'] = S_Dbw(X, labels)
        entry['qmse'] = quantization_mse(X, labels, centroids)

        scored.append(entry)
        log('Cluster metrics', str(round((position / total) * 100)) + "%")

    return scored
        
# Score every entry under the 'clustering' key of the loaded results file.
cluster_metric_results = calculate_cluster_metrics(results['clustering'])

Cluster metrics | 100%                                                                                                                                                                                                                                          

In [139]:
# Names of the DRMetrics attributes that are copied into each embedding result.
DR_METRICS = ["AUC", "AUC_C", "Qlocal", "Qglobal", "AUC_T", "Vr", "Vrs"]

def calculate_embedding_metrics(embedding_results):
    """Attach dimensionality-reduction metrics to every embedding result.

    For each entry a ``DRMetrics`` object is built from the entry's
    'centroids' and 'embeddedCentroids' arrays, and every attribute named in
    ``DR_METRICS`` is copied into a shallow copy of the entry. The
    'centroids' and 'embeddedCentroids' fields are dropped from that copy;
    the input entries themselves are left untouched.

    Args:
        embedding_results: Sequence of dicts with at least the keys
            'centroids' and 'embeddedCentroids'.

    Returns:
        list: One dict per input entry, in input order, with the metric
        values added. NaN metric values are stored as ``None``.

    Raises:
        KeyError: If an entry lacks 'centroids' or 'embeddedCentroids'.
    """
    metric_results = []
    total = len(embedding_results)

    for i, result in enumerate(embedding_results):
        metric_result = result.copy()
        del metric_result['centroids']
        metric_result.pop('embeddedCentroids', None)

        drm = DRMetrics(np.array(result['centroids']), np.array(result['embeddedCentroids']))
        for metric in DR_METRICS:
            value = getattr(drm, metric)
            # np.isnan is used because numpy is explicitly imported by this
            # notebook, whereas the `math` module is not.
            # NaN becomes None, presumably so it serialises as JSON null.
            metric_result[metric] = None if np.isnan(value) else value

        metric_results.append(metric_result)
        log('Embedding metrics', str(round(((i + 1) / total) * 100)) + "%")

    return metric_results
        
# Score every entry under the 'embedding' key of the loaded results file.
embedding_metric_results = calculate_embedding_metrics(results['embedding'])

Embedding metrics | 0%                                                                                                                                                                                                                                          Embedding metrics | 0%                                                                                                                                                                                                                                          Embedding metrics | 0%                                                                                                                                                                                                                                          Embedding metrics | 0%                                                                                                                                                                                                               

Embedding metrics | 17%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 18%                                                                                                                                                                                                                                         Embedding metrics | 18%                                                                                                                                                                                                                                         Embedding metrics | 18%                                                                                                                                                                                                                                         Embedding metrics | 18%                                                                                                                                                                                                              

Embedding metrics | 29%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 29%                                                                                                                                                                                                                                         Embedding metrics | 29%                                                                                                                                                                                                                                         Embedding metrics | 29%                                                                                                                                                                                                                                         Embedding metrics | 29%                                                                                                                                                                                                              

Embedding metrics | 39%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 39%                                                                                                                                                                                                                                         Embedding metrics | 39%                                                                                                                                                                                                                                         Embedding metrics | 39%                                                                                                                                                                                                                                         Embedding metrics | 39%                                                                                                                                                                                                              

Embedding metrics | 55%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 55%                                                                                                                                                                                                                                         Embedding metrics | 55%                                                                                                                                                                                                                                         Embedding metrics | 55%                                                                                                                                                                                                                                         Embedding metrics | 55%                                                                                                                                                                                                              

Embedding metrics | 75%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 84%                                                                                                                                                                                                                                         

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 84%                                                                                                                                                                                                                                         Embedding metrics | 84%                                                                                                                                                                                                                                         Embedding metrics | 84%                                                                                                                                                                                                                                         Embedding metrics | 84%                                                                                                                                                                                                              

Embedding metrics | 86%                                                                                                                                                                                                                                         Embedding metrics | 86%                                                                                                                                                                                                                                         Embedding metrics | 86%                                                                                                                                                                                                                                         Embedding metrics | 86%                                                                                                                                                                                                              

Embedding metrics | 87%                                                                                                                                                                                                                                         Embedding metrics | 87%                                                                                                                                                                                                                                         Embedding metrics | 87%                                                                                                                                                                                                                                         Embedding metrics | 87%                                                                                                                                                                                                              

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.
  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 90%                                                                                                                                                                                                                                         Embedding metrics | 90%                                                                                                                                                                                                                                         Embedding metrics | 90%                                                                                                                                                                                                                                         Embedding metrics | 90%                                                                                                                                                                                                              

  Qglobal = np.sum(QNN[kmax:-1])/(m - kmax -1) # skip the last. The last is (m-1)-nearest neighbor, including all samples.


Embedding metrics | 96%                                                                                                                                                                                                                                         Embedding metrics | 96%                                                                                                                                                                                                                                         Embedding metrics | 96%                                                                                                                                                                                                                                         Embedding metrics | 96%                                                                                                                                                                                                              

Embedding metrics | 98%                                                                                                                                                                                                                                         Embedding metrics | 98%                                                                                                                                                                                                                                         Embedding metrics | 98%                                                                                                                                                                                                                                         Embedding metrics | 98%                                                                                                                                                                                                              

Embedding metrics | 99%                                                                                                                                                                                                                                         Embedding metrics | 100%                                                                                                                                                                                                                                        Embedding metrics | 100%                                                                                                                                                                                                                                        Embedding metrics | 100%                                                                                                                                                                                                             

In [140]:
def create_output_results():
    """Assemble the dict that is exported as the metrics file.

    Every top-level entry of ``results`` except 'clustering', 'embedding'
    and 'datasets' is carried over unchanged; the computed metrics are added
    under 'clustering_results' and 'embedding_results'.

    Returns:
        dict: The export payload.

    Raises:
        ValueError: If ``results`` lacks one of the three excluded keys.
    """
    passthrough_keys = list(results)
    # list.remove() raises ValueError when a key is absent, so a results file
    # without these sections fails here.
    for consumed in ('clustering', 'embedding', 'datasets'):
        passthrough_keys.remove(consumed)

    output_results = {key: results[key] for key in passthrough_keys}
    output_results['clustering_results'] = cluster_metric_results
    output_results['embedding_results'] = embedding_metric_results
    return output_results

# Assemble the export payload and serialise it to a JSON string.
output_results = create_output_results()
output_results_json = json.dumps(output_results)

# The metrics file is placed beside the input file and reuses its name with a
# "metrics_" prefix.
source_dir, source_name = os.path.split(PATH)
out_path = os.path.join(source_dir, "metrics_" + source_name)

with open(out_path, "w+") as outfile:
    outfile.write(output_results_json)