# Computing clustering stability

### Loading dependencies and variables

In [1]:
import os
import numpy as np
import json
import logging
import math

import ast
import pandas as pd

from tqdm import tqdm

from os import makedirs
from os.path import dirname, abspath, join, exists

import time

from sklearn.cluster import KMeans
from sklearn.base import clone
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

from multiprocessing import Process, Manager

#from rpy2.rinterface_lib.embedded import RRuntimeError
#import rpy2.robjects as ro
#from rpy2.robjects.packages import importr
#from rpy2.robjects.conversion import localconverter
#from rpy2.robjects import pandas2ri
#pandas2ri.activate()

#clue = importr("clue")
#bootcluster = importr("bootcluster")
#OTclust = importr("OTclust")

#from config.definitions import ROOT_DIR
#os.chdir(ROOT_DIR + '\\src\\model\\')

from compute_stability_exp_args import ComputerStabilityArguments

rng = np.random.RandomState(1)

### Clustering stability methods

In [2]:
class Stability:
    """Estimate clustering stability on the CPU via bootstrap resampling.

    For every epoch and every requested number of clusters, a KMeans model
    is fitted on several bootstrap resamples of the data and the agreement
    between the resulting partitions is summarised by the metrics that are
    switched on in ``stability_args``. The collected scores are printed and
    written to a numbered JSON file.
    """

    # Keys of the per-epoch result dictionary (these are also the JSON keys).
    _RESULT_KEYS = (
        "adjusted_rand_score",
        "adjusted_mutual_info_score",
        "bagclust",
        "han",
        "OTclust",
    )

    def __init__(self, stability_args):
        """Keep a reference to the experiment configuration.

        Args:
            stability_args: configuration object. ``run`` reads its
                ``clusters``, ``num_bootstrap_samples``, ``num_train_epochs``,
                ``output_dir``, ``output_stab_name``, ``RNDN`` attributes and
                the boolean switches ``adjusted_rand_score``,
                ``adjusted_mutual_info_score``, ``bagclust``, ``han`` and
                ``OTstab``.
        """
        self.stability_args = stability_args

    def run(self, data):
        """Compute the enabled stability scores and save them to disk.

        Args:
            data: 2-D array exposing ``.shape`` and supporting indexing with
                an integer index array (the driver passes a NumPy array).
                Each bootstrap resample has ``data.shape[0]`` rows.

        Side effects:
            Prints the per-epoch results and writes them, together with the
            run arguments, through ``save``.
        """
        args = self.stability_args
        size_sample = data.shape[0]
        clusters = args.clusters
        num_bootstrap_samples = args.num_bootstrap_samples
        num_train_epochs = args.num_train_epochs
        # os.path.join tolerates an output_dir without a trailing separator.
        path_and_file_name_to_save = os.path.join(args.output_dir, args.output_stab_name)
        random_state = args.RNDN

        arguments = {
            "clusters": clusters,
            "num_train_epochs": num_train_epochs,
            "num_bootstrap_samples": num_bootstrap_samples,
            "random_state": random_state
        }
        stab_epochs = {}

        # R-side copy of the data for the rpy2-backed methods. It stays None
        # while the rpy2 conversion is disabled.
        rData = None
        #with localconverter(ro.default_converter + pandas2ri.converter):
        #    rData = ro.conversion.rpy2py(data)
        for ep in tqdm(range(num_train_epochs)):
            # Plain containers are enough: nothing here runs in another
            # process, and they are JSON serialisable as they are.
            stab_methods = {key: [] for key in self._RESULT_KEYS}
            for cluster in clusters:

                kmeans = KMeans(n_clusters=cluster, n_init=10)
                labels, indices = self.get_labels_and_indices(data, kmeans, size_sample, num_bootstrap_samples, random_state)

                if args.adjusted_rand_score:
                    self.adjusted_rand_score(labels, indices, cluster, stab_methods)
                if args.adjusted_mutual_info_score:
                    self.adjusted_mutual_info_score(labels, indices, cluster, stab_methods)
                if args.bagclust:
                    self.bagclust(rData, num_bootstrap_samples, cluster, stab_methods)
                if args.han:
                    self.han(rData, num_bootstrap_samples, cluster, stab_methods)
                if args.OTstab:
                    self.OTstab(rData, num_bootstrap_samples, cluster, stab_methods)

            stab_epochs[ep] = stab_methods
        print(stab_epochs)
        self.save(stab_epochs, arguments, path_and_file_name_to_save)

    def get_labels_and_indices(self, data, clrt_algorithm, size_sample, num_bootstrap_samples, random_state):
        """Fit the estimator on bootstrap resamples and collect the partitions.

        Args:
            data: array indexed by row with an integer index array.
            clrt_algorithm: scikit-learn style estimator; a fresh clone is
                fitted on every resample and its ``labels_`` are read.
            size_sample: number of rows drawn (with replacement) per resample.
            num_bootstrap_samples: number of resamples to draw.
            random_state: accepted for interface compatibility but not used;
                all randomness comes from the module-level ``rng``.

        Returns:
            ``(labels, indices)``: two lists of length
            ``num_bootstrap_samples``. ``indices[b]`` holds the row indices
            of resample ``b``; ``labels[b]`` has one entry per row of
            ``data``, with the cluster label for rows that were drawn and
            ``-1`` for rows that were not.
        """
        labels = []
        indices = []
        n_points = data.shape[0]
        for _ in range(num_bootstrap_samples):
            # draw bootstrap samples, store indices
            sample_indices = rng.randint(0, n_points, size_sample)
            indices.append(sample_indices)
            clrt_algorithm = clone(clrt_algorithm)
            if hasattr(clrt_algorithm, "random_state"):
                # randomize estimator if possible
                clrt_algorithm.random_state = rng.randint(1e5)
            data_bootstrap = data[sample_indices]
            clrt_algorithm.fit(data_bootstrap)
            # store clustering outcome using original indices
            relabel = -np.ones(n_points, dtype=int)
            relabel[sample_indices] = clrt_algorithm.labels_
            labels.append(relabel)
        return (labels, indices)

    @staticmethod
    def _mean_pairwise_score(labels, indices, metric):
        """Return the mean of ``metric`` over all ordered pairs of partitions.

        Each pair is compared only on the points drawn in both resamples.
        NOTE(review): the pairs include every partition compared with itself
        and both orderings of each distinct pair, exactly as in the original
        loops; confirm whether self-pairs are meant to be part of the average.
        """
        scores = []
        for l, i in zip(labels, indices):
            for k, j in zip(labels, indices):
                in_both = np.intersect1d(i, j)
                scores.append(metric(l[in_both], k[in_both]))
        # Built-in float keeps the JSON/printed output independent of the
        # NumPy scalar representation.
        return float(np.mean(scores))

    #https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
    def adjusted_rand_score(self, labels, indices, cluster, stab_methods):
        """Append the mean pairwise adjusted Rand index to ``stab_methods``.

        ``cluster`` is accepted for signature parity with the other metrics
        and is not used.
        """
        stab_methods['adjusted_rand_score'].append(
            Stability._mean_pairwise_score(labels, indices, adjusted_rand_score)
        )

    #https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html
    def adjusted_mutual_info_score(self, labels, indices, cluster, stab_methods):
        """Append the mean pairwise adjusted mutual information to ``stab_methods``.

        ``cluster`` is accepted for signature parity with the other metrics
        and is not used.
        """
        stab_methods['adjusted_mutual_info_score'].append(
            Stability._mean_pairwise_score(labels, indices, adjusted_mutual_info_score)
        )

    #"A prediction-based resampling method for estimating the number of clusters in a dataset."
    #"Bagging to improve the accuracy of a clustering procedure."
    #Explanation: Stability estimation for unsupervised clustering: A review
    def bagclust(self, rData, num_bootstrap_samples, n_cluster, stab_methods):
        """Append the bagged-clustering score from the R package ``clue``.

        Requires the module-level ``clue`` handle (rpy2 ``importr``), which
        is currently commented out in the import cell.
        """
        rDataStab = clue.cl_bag(x = rData, B = num_bootstrap_samples, k = n_cluster)
        stab_methods['bagclust'].append(rDataStab.rx2['.Data'].max(axis = 1).mean())

    #Bootstrapping estimates of stability for clusters,observations and model selection
    #To understand it, see page 4, Section 2, Fig. 1.
    def han(self, rData, num_bootstrap_samples, n_cluster, stab_methods):
        """Append the overall stability from the R package ``bootcluster``.

        A NaN result or an R runtime error is recorded as ``0.0``. Requires
        the module-level ``bootcluster`` handle and ``RRuntimeError``, both
        currently commented out in the import cell.
        """
        try:
            hanStab = bootcluster.stability(x = rData, k = n_cluster, B = num_bootstrap_samples)
            stab_overall = float(0) if math.isnan(float(hanStab.rx2['overall'])) else float(hanStab.rx2['overall'])
            stab_methods['han'].append(stab_overall)
        except RRuntimeError:
            stab_methods['han'].append(float(0))

    #CPS Analysis for cluster validation
    #Install from github https://github.com/cran/OTclust
    #Best explanation: Denoising Methods for Inferring Microbiome Community Content and Abundance
    def OTstab(self, rData, num_bootstrap_samples, n_cluster,stab_methods):
        """Append the ``tight_all`` value returned by ``OTclust.clustCPS``.

        The result is stored under the ``'OTclust'`` key. Requires the
        module-level ``OTclust`` handle, currently commented out in the
        import cell.
        """
        otclust = OTclust.clustCPS(rData, k=n_cluster, l= False, pre=False, noi="after",
                 nPCA = 2, nEXP = num_bootstrap_samples)
        stab_methods['OTclust'].append(float(otclust.rx2['tight_all']))

    def save(self, stab_epochs, arguments, path_and_file_name_to_save):
        """Write the results as JSON to the first free ``<stem>-<i>.json``.

        Args:
            stab_epochs: per-epoch results; must be JSON serialisable.
            arguments: run arguments stored next to the results.
            path_and_file_name_to_save: target path; a trailing ``.json``
                extension is dropped before the ``-<i>.json`` suffix is added.
        """
        data = json.dumps([stab_epochs, arguments], indent = 4)
        # Strip only a trailing ".json": a blanket str.replace would also
        # rewrite directory names that happen to contain ".json".
        stem, ext = os.path.splitext(path_and_file_name_to_save)
        if ext != ".json":
            stem = path_and_file_name_to_save
        i = 1
        while os.path.exists(f"{stem}-{i}.json"):
            i += 1
        # The context manager closes the handle even if the write fails.
        with open(f"{stem}-{i}.json", "w") as file:
            file.write(data)

### Stability arguments

In [14]:
# Configuration for the stability experiment, grouped by purpose.
stability_args = ComputerStabilityArguments(
    # -- identification / runtime (not read by the visible Stability code
    #    beyond project_name; semantics defined by ComputerStabilityArguments)
    project_name='Stability - SERIEMA',
    mode='CPU',
    report_to=False,

    # -- input and output locations
    data_path='/hadatasets/fillipe.silva/LLMSegm/data/yelp/gpt2-medium_25_test_embeddings.csv',
    output_dir='/hadatasets/fillipe.silva/LLMSegm/data/yelp/',
    output_stab_name='stability.json',
    output_log_name='stability.json',

    # -- experiment size
    clusters=[2],              # cluster counts handed to KMeans in Stability.run
    num_train_epochs=1,        # outer repetitions inside Stability.run
    num_bootstrap_samples=5,   # bootstrap resamples per cluster count
    num_random_samples=0,      # 0 skips the random row subsampling in the driver cell
    repeat_experimet=1,        # how many times the driver cell runs the experiment
    RNDN=1,                    # recorded as "random_state" in the saved arguments

    # -- which stability measures to compute
    adjusted_rand_score=True,
    adjusted_mutual_info_score=True,
    bagclust=False,
    han=False,
    OTstab=False,
)

In [15]:

# Driver: load the embeddings, optionally subsample rows, and run the
# stability computation `repeat_experimet` times while timing the whole run.

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate exists() test.
makedirs(stability_args.output_dir, exist_ok=True)

st = time.time()
for _ in range(stability_args.repeat_experimet):
    data = pd.read_csv(stability_args.data_path, skiprows = 0)
    if stability_args.num_random_samples:
        # NOTE(review): the sample is drawn without a seed, so the selected
        # rows differ between runs - confirm this is intended.
        data = data.sample(n=stability_args.num_random_samples, replace = True)
    data = data.to_numpy()

    stabO = Stability(stability_args)
    stabO.run(data)

et = time.time()
# Wall-clock duration of all repetitions, in seconds.
elapsed_time = et - st

100%|██████████| 1/1 [02:38<00:00, 158.30s/it]

{0: {'adjusted_rand_score': [0.9659552926148248], 'adjusted_mutual_info_score': [0.9410173995499824], 'bagclust': [], 'han': [], 'OTclust': []}}



