In [94]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from networkx import Graph, write_adjlist
from networkx.algorithms.dominating import dominating_set
from networkx.algorithms.mis import maximal_independent_set
from sklearn.metrics.pairwise import cosine_similarity

from ela_feature_definition import ela_feature_names

In [112]:
# Read the complete ELA table, remove the rows we do not want, report the shape
# after each step, and save the filtered table for later cells.
ela = pd.read_csv("data/ela_all.csv", index_col=[0])
print(ela.shape)

# Step 1: keep only rows whose suite is not the literal value 'all'.
keep_not_all_suite = ela['suite'] != 'all'
ela = ela.loc[keep_not_all_suite]
print(ela.shape)

# Step 2: drop function id 2 of the cec2017 suite.
# NOTE(review): the recorded output shows the same shape before and after this
# step, so the filter removed no rows on that run; confirm it is still needed.
is_cec2017_fid2 = (ela['suite'] == 'cec2017') & (ela['fid'] == 2)
ela = ela.loc[~is_cec2017_fid2]
print(ela.shape)

ela.to_csv('data/ela.csv')

(10170, 70)
(6630, 70)
(6630, 70)


In [None]:
# Display every row except function id 2 of the cec2017 suite.
cec2017_fid2_mask = (ela['suite'] == 'cec2017') & (ela['fid'] == 2)
ela[~cec2017_fid2_mask]

In [111]:
# List the distinct function ids present for the cec2017 suite
# (the index shown is the first row at which each id occurs).
ela.loc[ela['suite'] == 'cec2017', 'fid'].drop_duplicates()

5761     1
5791     3
5821     4
5851     5
5881     6
5911     7
5941     8
5971     9
6001    10
6031    11
6061    12
6091    13
6121    14
6151    15
6181    16
6211    17
6241    18
6271    19
6301    20
6331    21
6361    22
6391    23
6421    24
6451    25
6481    26
6511    27
6541    28
6571    29
6601    30
Name: fid, dtype: int64

In [101]:
# List the distinct suite names remaining in the filtered ELA table.
ela.loc[:, 'suite'].drop_duplicates()

1          bbob
3601    cec2013
4411    cec2014
5311    cec2015
5761    cec2017
Name: suite, dtype: object

In [3]:
# NOTE(review): wildcard import, so the names it brings into the notebook are not visible here.
# `generate_graph_from_similarity_matrix`, called in a later cell, is presumably one of them;
# confirm and consider importing the needed names explicitly.
from dom_mis import *

In [69]:
# Columns that together identify one problem instance in the result and ELA tables.
id_columns = ['suite', 'fid', 'iid']

# Minimum cosine-similarity values passed to the graph construction, one graph per value.
similarity_thresholds = [0.5, 0.7, 0.9]

In [91]:
# Load the saved selection results for every (NA handling, threshold, algorithm, run)
# combination and summarise how many rows (selected instances) each run produced.
#
# Names defined here and used by later cells:
#   all_results                   - a single DataFrame holding the rows of every result
#                                   file, tagged with run_id, algorithm_name,
#                                   min_similarity_threshold and na_handling_setting_name
#   produced_instances_statistics - list of dicts, one per (threshold, algorithm), with
#                                   the edge count of the graph and the min / max / mean
#                                   number of rows over the runs
#
# The per-run frames are collected in a list and concatenated once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside the loop
# copies all previously loaded rows on every iteration.
#
# NOTE(review): `generate_graph_from_similarity_matrix` is not imported by name in this
# notebook; presumably it comes from `from dom_mis import *` - confirm.
n_runs = 30  # result files are read for run ids 0 .. n_runs - 1

result_frames = []
produced_instances_statistics = []
for na_handling_setting_name in ['dropna']:
    ela_representation_df = pd.read_csv(
        f'data/aggregated_ela_representation_{na_handling_setting_name}.csv',
        index_col=[0, 1, 2],
    )
    # Pairwise cosine similarity between all rows of the representation table,
    # labelled on both axes with the table's index.
    s = cosine_similarity(ela_representation_df.values, ela_representation_df.values)
    similarity_df = pd.DataFrame(
        s,
        index=ela_representation_df.index,
        columns=ela_representation_df.index,
    )

    for min_similarity_threshold in similarity_thresholds:
        print(f'missing value handling: {na_handling_setting_name} THRESHOLD: {min_similarity_threshold}')
        g = generate_graph_from_similarity_matrix(similarity_df, min_similarity_threshold)
        print(f' -node count: {len(g.nodes)}')
        print(f' -edge count: {len(g.edges)}')

        for algorithm_name in ['dominant', 'mis']:
            result_directory = os.path.join('results', na_handling_setting_name, algorithm_name)

            instances_produced = []
            for run_id in range(n_runs):
                result_file_name = os.path.join(
                    result_directory, f'{min_similarity_threshold}_{run_id}.csv'
                )
                result_df = pd.read_csv(result_file_name, index_col=[0, 1, 2])
                # Tag every row with the configuration it came from so the runs can be
                # told apart after concatenation.
                result_df['run_id'] = run_id
                result_df['algorithm_name'] = algorithm_name
                result_df['min_similarity_threshold'] = min_similarity_threshold
                result_df['na_handling_setting_name'] = na_handling_setting_name
                result_frames.append(result_df)
                instances_produced.append(result_df.shape[0])

            instances_produced = np.array(instances_produced)
            algorithm_config_stats = {
                'edge count': len(g.edges),
                'min': instances_produced.min(),
                'max': instances_produced.max(),
                'mean': round(instances_produced.mean(), 2),
                'algorithm': 'DS' if algorithm_name == 'dominant' else 'MIS',
                'min similarity threshold': min_similarity_threshold,
            }
            produced_instances_statistics.append(algorithm_config_stats)
            for stat_name in ['min', 'max', 'mean']:
                print(f' -{algorithm_name} {stat_name}: {algorithm_config_stats[stat_name]}')

        print()

# One concatenation instead of one append per file; fall back to an empty frame so
# `all_results` is always defined even if no file was loaded.
all_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

missing value handling: dropna THRESHOLD: 0.5
 -node count: 339
 -edge count: 34642
 -dominant min: 7
 -dominant max: 10
 -dominant mean: 8.73
 -mis min: 8
 -mis max: 10
 -mis mean: 8.77

missing value handling: dropna THRESHOLD: 0.7
 -node count: 339
 -edge count: 32275
 -dominant min: 11
 -dominant max: 15
 -dominant mean: 12.77
 -mis min: 11
 -mis max: 15
 -mis mean: 12.37

missing value handling: dropna THRESHOLD: 0.9
 -node count: 339
 -edge count: 30513
 -dominant min: 31
 -dominant max: 35
 -dominant mean: 32.97
 -mis min: 31
 -mis max: 35
 -mis mean: 33.33



In [92]:
# Turn the per-configuration statistics records into a table indexed by
# (min similarity threshold, algorithm).
statistics_index_columns = ['min similarity threshold', 'algorithm']
statistics_records_df = pd.DataFrame.from_records(produced_instances_statistics)
produced_instances_statistics_df = statistics_records_df.set_index(statistics_index_columns)

In [93]:
# Export the statistics table as LaTeX; note the output file name has no extension.
latex_output_path = 'produced_instances_statistics'
produced_instances_statistics_df.to_latex(latex_output_path)

In [None]:
# Load the ELA table from 'data/ela.csv', using the first column as the index.
ela_csv_path = 'data/ela.csv'
df = pd.read_csv(ela_csv_path, index_col=[0])

In [None]:
# List the distinct suite names present in the reloaded table.
df.loc[:, 'suite'].drop_duplicates()

In [42]:
# Build the lookup of distinct (cluster, suite, fid, iid) combinations found in `df`.
cluster_lookup_columns = ['cluster', *id_columns]
id_cluster = df[cluster_lookup_columns].drop_duplicates()

In [47]:
# Move the index levels of `all_results` into ordinary columns so they can be used as
# merge and group-by keys; `reset_index` returns a new frame, `all_results` is unchanged.
all_results_copy=all_results.reset_index()

In [49]:
# Inspect the column names produced by reset_index (an unnamed index level shows up as 'level_0').
all_results_copy.columns

Index(['level_0', 'suite', 'fid', 'iid', 'run_id', 'algorithm_name',
       'min_similarity_threshold', 'na_handling_setting_name'],
      dtype='object')

In [52]:
# Attach the cluster label to every result row by joining on the instance id columns.
#
# The frame is rebuilt from `all_results` rather than merged onto itself, so the cell
# can be re-run safely: merging `all_results_copy` into itself a second time would
# add the cluster column again as `cluster_x` / `cluster_y` and break the cells
# below that refer to 'cluster'.
#
# The default inner join is kept, so result rows without a match in `id_cluster`
# are dropped.
# NOTE(review): if an id combination maps to more than one cluster in `id_cluster`,
# its result rows are duplicated by the join - confirm the lookup is unique per id.
all_results_copy = all_results.reset_index().merge(id_cluster, on=id_columns)

In [61]:
# For each run and cluster, count the non-null values of every remaining column,
# then save the 'fid' counts (rows per cluster per run) to 'cluster_dist.csv'.
run_and_cluster_keys = [
    'na_handling_setting_name',
    'min_similarity_threshold',
    'algorithm_name',
    'run_id',
    'cluster',
]
all_results_grouped = all_results_copy.groupby(run_and_cluster_keys).count()
rows_per_cluster = all_results_grouped['fid']
rows_per_cluster.to_csv('cluster_dist.csv')

In [65]:
# Count the distinct (non-null) cluster values covered by each run, print the table,
# and save it. Note this rebinds `all_results_grouped` from the previous cell.
run_keys = ['na_handling_setting_name', 'min_similarity_threshold', 'algorithm_name', 'run_id']
# One row per (run, cluster) pair, so counting rows per run counts distinct clusters.
run_cluster_pairs = all_results_copy[run_keys + ['cluster']].drop_duplicates()
all_results_grouped = run_cluster_pairs.groupby(run_keys).count()
print(all_results_grouped)
clusters_per_run = all_results_grouped['cluster']
clusters_per_run.to_csv('number of unique clusters.csv')

                                                                         cluster
na_handling_setting_name min_similarity_threshold algorithm_name run_id         
dropna                   0.50                     dominant       0             6
                                                                 1             6
                                                                 2             6
                                                                 3             8
                                                                 4             5
...                                                                          ...
fillna                   0.95                     mis            25           14
                                                                 26           16
                                                                 27           16
                                                                 28           19
                            

In [58]:
# Display the results table (pandas truncates the rendering of long frames).
# NOTE(review): the recorded output below lists 'fillna' and 0.95 rows that the current
# loading loop does not produce, so it appears to come from an earlier run - re-execute.
all_results_copy

Unnamed: 0,level_0,suite,fid,iid,run_id,algorithm_name,min_similarity_threshold,na_handling_setting_name,cluster
0,0,bbob,17,5,0,dominant,0.50,dropna,20
1,0,bbob,17,5,8,dominant,0.50,dropna,20
2,0,bbob,17,5,26,dominant,0.50,dropna,20
3,0,bbob,17,5,0,dominant,0.70,dropna,20
4,0,bbob,17,5,8,dominant,0.70,dropna,20
...,...,...,...,...,...,...,...,...,...
12554,0,all,12,1,17,mis,0.95,fillna,23
12555,0,cec2015,10,1,19,mis,0.95,fillna,21
12556,0,cec2017,22,1,23,mis,0.95,fillna,19
12557,0,cec2014,20,1,25,mis,0.95,fillna,21
