In [6]:
import os
import numpy as np
import pandas as pd
import dataset as d
import parameters as p
from sklearn.datasets import load_iris
from sklearn.utils.validation import check_random_state

In [7]:
X, y = d.load_breast_cancer_wisconsin('')

In [8]:
X.shape

(569, 30)

In [9]:
ann_algorithms = list(p.prm.copy())

In [10]:
out_dir = 'out'
datasets = d.data_path.copy()
datasets.pop('cmc')
datasets.pop('ml100k')
datasets = list(datasets.keys())
datasets

['blood',
 'bcw',
 'ccrf',
 'diabetic',
 'google_reviews',
 'frogs',
 'mm',
 'spam',
 'ml1m']

In [11]:
recog_rate_cols = ['algorithm', 'max_k', 'k', 'n_clusters', 'kmeans_gmm_db_score', 'opf_gmm_db_score', 'random_gmm_db_score']
comp_cost_cols = ['algorithm', 'max_k', 'k', 'n_clusters', 'kmeans_init_time', 'opf_init_time', 'random_init_time']

In [12]:
db_cols = ['max_k', 'k', 'n_clusters', 'kmeans_gmm_db_score', 'opf_gmm_db_score', 'random_gmm_db_score']
v_cols = ['max_k', 'k', 'n_clusters', 'kmeans_gmm_v_measure', 'opf_gmm_v_measure', 'random_gmm_v_measure']

In [13]:
tex_cols = ['max k', 'k', '\# clusters', '$k$-means', 'Proposed', 'Random']
tex_index = pd.Index(['KNN', 'KD-Tree', 'ANNOY', 'HNSW'], name='Search algorithm')

In [14]:
for ds in datasets:
    
    print('Data: ', ds)

    for f in os.scandir('/'.join([out_dir, ds])):
        
        if f.name.find('nov-19-2021') > -1:
            out_rr_file = pd.read_csv(f.path, index_col=1, engine='c', dtype=object)
    
    # Datasets without true target labels --> DB-index measure
    if ds in ['frogs', 'google_reviews', 'ml1m', 'spam']:
        
        res_file = out_rr_file[db_cols]
        
        for alg in ann_algorithms:

            db_scores = res_file.loc[f'opf_{alg}'][3:].values
            res_file.loc[f'opf_{alg}'][3:] = db_scores.astype(float).round(4)

            idx_min = np.argmin(db_scores.astype(float))
            min_score = float(res_file.loc[f'opf_{alg}'][idx_min + 3])
            res_file.loc[f'opf_{alg}'][idx_min + 3] = '\textbf{' + f'{min_score:.4f}' + '}'
    
    # Datasets with true target labels --> V-measure
    else:
        
        res_file = out_rr_file[v_cols]
    
        for alg in ann_algorithms:

            v_measures = res_file.loc[f'opf_{alg}'][3:].values
            res_file.loc[f'opf_{alg}'][3:] = v_measures.astype(float).round(4)

            idx_max = np.argmax(v_measures.astype(float))
            max_score = float(res_file.loc[f'opf_{alg}'][idx_max + 3])
            res_file.loc[f'opf_{alg}'][idx_max + 3] = '\textbf{' + f'{max_score:.4f}' + '}'
 

    res_file.columns = tex_cols
    res_file.index = tex_index
    
    res_file.insert(0, 'Search algorithm', out_rr_file.index)
    
    # Saving dataframe as .tex
    res_file.to_latex(f'tables/{ds}.tex', index=False, column_format='ccccccc', sparsify=False, 
                         escape=False, caption=f'Experimental results concerning {ds.upper()} dataset.', position='!ht')    

Data:  blood
Data:  bcw
Data:  ccrf
Data:  diabetic
Data:  google_reviews
Data:  frogs
Data:  mm
Data:  spam
Data:  ml1m


In [None]:
df = pd.DataFrame('out/blood/')

In [27]:
out_columns = ['c_index', 'ch_score', 'db_score', 'homogeneity', 'completeness', 'v_measure',
               'max_k', 'best_k', 'n_clusters', 'gmm_time', 'gmm_iterations', 'gmm_log_likelihood']
# out_columns[:6]

In [28]:
max_k = list(range(20, 110, 10))
datasets = ['Blood', 'CCRF', 'CMC', 'Diabetic', 'Google_Reviews', 'MM', 'SPAM']
cols = ['c_index', 'ch_score', 'db_score', 'homogeneity', 'completeness', 'v_measure', 'n_clusters']

In [29]:
index_names = ['GMM_{Kmeans}', 'GMM_{OPF}']
multi_index = pd.MultiIndex.from_product([datasets, index_names])
df = pd.DataFrame(index=multi_index, columns=cols)

for k in [100]:
    for data in datasets:
        out_file = '/'.join([out_dir, 'k_' + str(k) + '_' + nn_search, data.lower()])
        
        print(f'Data path: {out_file}')
        
        i = 0
        for f in os.scandir(out_file):
            if f.path.endswith('.csv'):
                print(f.path)
                res_file = pd.read_csv(f.path, usecols=cols)
                df.loc[data, index_names[i]] = res_file.iloc[0].values
                i += 1
    
    df_tab = df.T

    for data in datasets:
        # Format c-index best result
        measure = 'c_index'
        opf = df_tab.loc[measure][data, index_names[1]]
        kmeans = df_tab.loc[measure][data, index_names[0]]

        if opf < kmeans:
            df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
            df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
        else:
            df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
            df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'

        # Format c-index best result
        measure = 'ch_score'
        opf = df_tab.loc[measure][data, index_names[1]]
        kmeans = df_tab.loc[measure][data, index_names[0]]

        if opf > kmeans:
            df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
            df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
        else:
            df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
            df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'

        # Format db_score best result
        measure = 'db_score'
        opf = df_tab.loc[measure][data, index_names[1]]
        kmeans = df_tab.loc[measure][data, index_names[0]]

        if opf < kmeans:
            df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
            df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
        else:
            df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
            df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'

        # Format homogeneity best result
        measure = 'homogeneity'
        opf = float(df_tab.loc[measure][data, index_names[1]])
        kmeans = float(df_tab.loc[measure][data, index_names[0]])

        if (opf + kmeans) != np.nan:
            if opf > kmeans:
                df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
                df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
            else:
                df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
                df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'

        # Format completeness best result
        measure = 'completeness'
        opf = float(df_tab.loc[measure][data, index_names[1]])
        kmeans = float(df_tab.loc[measure][data, index_names[0]])
        
        if (opf + kmeans) != np.nan:
            if opf > kmeans:
                df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
                df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
            else:
                df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
                df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'

        # Format v-measure best result
        measure = 'v_measure'
        opf = float(df_tab.loc[measure][data, index_names[1]])
        kmeans = float(df_tab.loc[measure][data, index_names[0]])

        if (opf + kmeans) != np.nan:
            if opf > kmeans:
                df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
                df_tab.loc[measure][data, index_names[0]] = f'{kmeans:.4f}'
            else:
                df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
                df_tab.loc[measure][data, index_names[1]] = f'{opf:.4f}'
        
        df_tab.rename(columns={data: ''.join(['\textbf{', data, '}'])}, inplace=True)

    # Format clusters column as integer number
    df_tab.loc['n_clusters'] = df_tab.loc['n_clusters'].apply(lambda x: int(x))
    
    # Reset and rename dataframe index
    df_tab.index.name = '\textbf{Measure}'
    df_tab.reset_index(inplace=True)
    
    measures_names = ['C-index', 'Calinski-Harabasz score', 'Davies-Bouldin score', 'Homogeneity', 'Completeness', 'V-measure', '\# clusters']
    df_tab['\textbf{Measure}'] = measures_names
    
    # Save tex table to file
    df_tab.to_latex(
        f'tables/results_k{k}_{nn_search}.tex',
        index=False,
        column_format='ccccccccccccc',
        multicolumn=True,
        multicolumn_format='ccccccccccccc',
        sparsify=False,
        escape=False,
        caption=f'Experiments for maximum k = {k}.',
        position='!ht')

Data path: out/k_100_hnsw/blood
out/k_100_hnsw/blood/kmeans_init_gmm.csv
out/k_100_hnsw/blood/opf_init_gmm.csv
Data path: out/k_100_hnsw/ccrf
out/k_100_hnsw/ccrf/kmeans_init_gmm.csv
out/k_100_hnsw/ccrf/opf_init_gmm.csv
Data path: out/k_100_hnsw/cmc
out/k_100_hnsw/cmc/kmeans_init_gmm.csv
out/k_100_hnsw/cmc/opf_init_gmm.csv
Data path: out/k_100_hnsw/diabetic
out/k_100_hnsw/diabetic/kmeans_init_gmm.csv
out/k_100_hnsw/diabetic/opf_init_gmm.csv
Data path: out/k_100_hnsw/google_reviews
out/k_100_hnsw/google_reviews/kmeans_init_gmm.csv
out/k_100_hnsw/google_reviews/opf_init_gmm.csv
Data path: out/k_100_hnsw/mm
out/k_100_hnsw/mm/kmeans_init_gmm.csv
out/k_100_hnsw/mm/opf_init_gmm.csv
Data path: out/k_100_hnsw/spam
out/k_100_hnsw/spam/kmeans_init_gmm.csv
out/k_100_hnsw/spam/opf_init_gmm.csv


In [176]:
df_tab = df.T

for data in datasets:
    # Format c-index best result
    measure = 'c_index'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf < kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'

    # Format c-index best result
    measure = 'ch_score'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf > kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'
    
    # Format db_score best result
    measure = 'db_score'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf < kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'
    
    # Format homogeneity best result
    measure = 'homogeneity'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf > kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'
    
    # Format completeness best result
    measure = 'completeness'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf > kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'
    
    # Format v-measure best result
    measure = 'v_measure'
    opf = df_tab.loc[measure][data, index_names[0]]
    kmeans = df_tab.loc[measure][data, index_names[1]]
    
    if opf > kmeans:
        df_tab.loc[measure][data, index_names[0]] = ''.join(['\textbf{', f'{opf:.4f}', '}'])
        df_tab.loc[measure][data, index_names[1]] = f'{kmeans:.4f}'
    else:
        df_tab.loc[measure][data, index_names[1]] = ''.join(['\textbf{', f'{kmeans:.4f}', '}'])
        df_tab.loc[measure][data, index_names[0]] = f'{opf:.4f}'

# Format clusters column as integer number
df_tab.loc['n_clusters'] = df_tab.loc['n_clusters'].apply(lambda x: int(x))

In [21]:
df_tab

Unnamed: 0_level_0,\textbf{Measure},\textbf{Blood},\textbf{Blood},\textbf{CCRF},\textbf{CCRF},\textbf{CMC},\textbf{CMC},\textbf{Diabetic},\textbf{Diabetic},\textbf{Google_Reviews},\textbf{Google_Reviews},\textbf{MM},\textbf{MM},\textbf{SPAM},\textbf{SPAM}
Unnamed: 0_level_1,Unnamed: 1_level_1,GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF},GMM_{Kmeans},GMM_{OPF}
0,C-index,\textbf{0.1934},0.4292,0.079,\textbf{0.0716},0.1699,\textbf{0.1628},\textbf{0.2599},0.3649,\textbf{0.3500},0.5585,0.1167,\textbf{0.0982},\textbf{0.1417},0.4232
1,Calinski-Harabasz score,\textbf{140.8150},55.7465,32.9564,\textbf{77.0668},167.0799,\textbf{186.9043},\textbf{104.4707},36.7954,\textbf{133.5460},64.3268,106.3788,\textbf{312.0561},\textbf{54.3774},23.7821
2,Davies-Bouldin score,1.2722,\textbf{0.7979},1.959,\textbf{1.3171},1.8291,\textbf{1.6385},\textbf{1.6132},1.6799,2.5029,\textbf{2.2417},3.3007,\textbf{1.0180},2.2556,\textbf{1.2843}
3,Homogeneity,\textbf{0.1121},0.0343,0.122,\textbf{0.1264},\textbf{0.0391},0.0261,\textbf{0.0277},0.0096,\textbf{nan},,\textbf{0.2675},0.2459,\textbf{0.5317},0.3105
4,Completeness,\textbf{0.0482},0.0321,0.0137,\textbf{0.0153},\textbf{0.0304},0.0188,\textbf{0.0186},0.007,\textbf{nan},,0.1251,\textbf{0.1280},0.1081,\textbf{0.1791}
5,V-measure,\textbf{0.0675},0.0331,0.0246,\textbf{0.0274},\textbf{0.0342},0.0219,\textbf{0.0222},0.0081,\textbf{nan},,\textbf{0.1705},0.1683,0.1796,\textbf{0.2272}
6,\# clusters,4,4,19.0,19,5,5,5,5.0,15,15,6,6,46,46
