In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from networkx import Graph, write_adjlist
from networkx.algorithms.dominating import dominating_set
from networkx.algorithms.mis import maximal_independent_set
from sklearn.metrics.pairwise import cosine_similarity

from ela_feature_definition import ela_feature_names

In [3]:
from dom_mis import *

In [12]:
# Minimum cosine-similarity values to sweep over; each one is handed to the
# graph builder as the similarity threshold.
similarity_thresholds = [0.5, 0.7, 0.9, 0.95]

# Names of the identifying columns of an instance.
# NOTE(review): presumably these match the three index levels read with
# index_col=[0,1,2] from the CSV files -- confirm against the data.
id_columns = ['suite', 'fid', 'iid']

In [11]:
# Aggregate the stored per-run results for every combination of missing-value
# handling, similarity threshold and selection algorithm into `all_results`,
# printing the graph size and the min/max/mean number of selected instances
# for each combination along the way.

# Number of stored runs per configuration; result files are named
# '<threshold>_<run_id>.csv' with run_id in 0..N_RUNS-1.
N_RUNS = 30

# Per-run frames are collected here and concatenated once after the loops.
# DataFrame.append was removed in pandas 2.0, and calling it per iteration
# re-copied the whole accumulated frame every time (quadratic cost).
result_frames = []

for na_handling_setting_name in ['dropna','fillna']:
    ela_representation_df = pd.read_csv(f'data/aggregated_ela_representation_{na_handling_setting_name}.csv', index_col=[0,1,2])
    # Pairwise cosine similarity between all rows of the ELA representation,
    # labelled on both axes with the representation's (multi-level) index.
    s = cosine_similarity(ela_representation_df.values,ela_representation_df.values)
    similarity_df=pd.DataFrame(s,index=ela_representation_df.index,columns=ela_representation_df.index)

    for min_similarity_threshold in similarity_thresholds:
        print(f'missing value handling: {na_handling_setting_name} THRESHOLD: {min_similarity_threshold}')
        # Helper comes from the dom_mis star import.
        # NOTE(review): presumably links instances by thresholded similarity --
        # confirm the exact edge rule in dom_mis.
        g=generate_graph_from_similarity_matrix(similarity_df, min_similarity_threshold)
        print(f' -node count: {len(g.nodes)}')
        print(f' -edge count: {len(g.edges)}')
        for algorithm_name in ['dominant','mis']:
            # The directory only depends on the NA setting and the algorithm,
            # so build it once instead of once per run.
            result_directory=os.path.join('results',na_handling_setting_name,algorithm_name)
            instances_produced=[]
            for run_id in range(N_RUNS):
                result_file_name=os.path.join(result_directory,f'{min_similarity_threshold}_{run_id}.csv')
                result_df = pd.read_csv(result_file_name,index_col=[0,1,2])
                # Tag every row with the configuration it came from so the
                # combined frame can be grouped/filtered later.
                result_df['run_id']=run_id
                result_df['algorithm_name']=algorithm_name
                result_df['min_similarity_threshold']=min_similarity_threshold
                result_df['na_handling_setting_name']=na_handling_setting_name
                result_frames.append(result_df)
                # Row count of a result file = number of instances selected in that run.
                instances_produced.append(result_df.shape[0])

            instances_produced=np.array(instances_produced)
            print(f' -{algorithm_name} min: {instances_produced.min()}')
            print(f' -{algorithm_name} max: {instances_produced.max()}')
            print(f' -{algorithm_name} average: {instances_produced.mean()}')
        print()

# Single concatenation; fall back to an empty frame if nothing was loaded
# (pd.concat raises on an empty list).
all_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

missing value handling: dropna THRESHOLD: 0.5
 -node count: 339
 -edge count: 34642
 -dominant min: 7
 -dominant max: 10
 -dominant average: 8.733333333333333
 -mis min: 8
 -mis max: 10
 -mis average: 8.766666666666667

missing value handling: dropna THRESHOLD: 0.7
 -node count: 339
 -edge count: 32275
 -dominant min: 11
 -dominant max: 15
 -dominant average: 12.766666666666667
 -mis min: 11
 -mis max: 15
 -mis average: 12.366666666666667

missing value handling: dropna THRESHOLD: 0.9
 -node count: 339
 -edge count: 30513
 -dominant min: 31
 -dominant max: 35
 -dominant average: 32.96666666666667
 -mis min: 31
 -mis max: 35
 -mis average: 33.333333333333336

missing value handling: dropna THRESHOLD: 0.95
 -node count: 339
 -edge count: 27678
 -dominant min: 48
 -dominant max: 53
 -dominant average: 50.233333333333334
 -mis min: 47
 -mis max: 52
 -mis average: 49.833333333333336

missing value handling: fillna THRESHOLD: 0.5
 -node count: 339
 -edge count: 34637
 -dominant min: 7
 -domi