# Compute statistical results of the experiment

The goal of this notebook is to load the results obtained from the multi-graph matching algorithms implemented in MATLAB (which were run on previously simulated graphs) and to compute statistics about these results.

## Load the libraries

In [2]:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
import os
import pickle

## Load the data

In [3]:
# Root folder of the experiment results. The cells below read
# "<parameter folder>/<run folder>/ground_truth.npy" files under it.
path = "test_for_pairwise/"

We will load the data into a 3-dimensional tensor where the first dimension corresponds to the noise parameter, the second to the outliers parameter and the third to the different runs for a given set of parameters. First we need to assign an integer index to each possible value of a parameter.

In [4]:
# We first go through all files to determine the parameters that were
# chosen and assign an integer to them

def get_noise_and_outliers_params(path_to_folder, number_parameters=2):
    """Collect the parameter values encoded in the sub-folder names.

    Every entry of ``path_to_folder`` is expected to be named like
    ``"noise_1000,outliers_4"``, i.e. comma-separated ``<name>_<value>``
    pairs. The position of a value in the returned sorted list is the
    integer that represents this value.

    Arguments:
        path_to_folder: The folder that holds one entry per set of parameters
        number_parameters: Unused, kept for backward compatibility

    Returns:
        A dict that maps each parameter name to the sorted list of the
        distinct values (as floats) found for this parameter.
    """

    values_by_name = {}

    for folder_name in os.listdir(path_to_folder):

        # Hidden entries (".DS_Store", ".ipynb_checkpoints", ...) are not
        # parameter folders and would break the parsing below.
        if folder_name.startswith("."):
            continue

        for elem_param in folder_name.split(","):
            # Split on the last underscore only so that a parameter name
            # that itself contains underscores is still parsed correctly.
            param_name, param_value = elem_param.rsplit("_", 1)
            values_by_name.setdefault(param_name, set()).add(float(param_value))

    # Sorted lists: list.index() then gives a stable integer for each value
    # and the lists can directly be used as x axis when plotting.
    return {name: sorted(values) for name, values in values_by_name.items()}


#dict_parameters_correspondence = get_noise_and_outliers_params(path)
#print(dict_parameters_correspondence)

# MES TESTS D'ACCURACY

In [None]:
# Manual test of the accuracy functions: load the ground truth of run "0"
# of the "noise_1000,outliers_4" parameter folder
path_run_folder = os.path.join(path, "noise_1000,outliers_4","0")
ground_truth = np.load(os.path.join(path_run_folder,"ground_truth.npy"))

In [None]:
# path_run_folder_1000 = os.path.join(path, "noise_1000,outliers_0","0")
# ground_truth_1000 = np.load(os.path.join(path_run_folder_1000,"ground_truth.npy"))

In [None]:
# # Pairwise
# print("gt shape :", ground_truth.shape) # nb_graphs, nb_graphs, nb_vertices

# name_of_result_file = "X_cao_cst_o"
# algorithm_res = sio.loadmat(os.path.join(path_run_folder,name_of_result_file+".mat"))["X"]
# print("algo res shape :",algorithm_res.shape)

In [None]:
# # Multigraph
# print("gt shape :", ground_truth.shape) # nb_graphs, nb_graphs, nb_vertices

# name_of_result_file = "A_mALS"
# algorithm_res = sio.loadmat(os.path.join(path_run_folder,name_of_result_file+".mat"))["A"]
# algorithm_res = algorithm_res@algorithm_res.T

# algorithm_res = np.where(algorithm_res > 0.3, 1, 0)

# print("algo res shape :",algorithm_res.shape)

In [None]:
# algorithm_res= np.load(os.path.join(path_run_folder+"/matchALS_result_new.npy"))
# algorithm_res.shape
# algorithm_res = np.where(algorithm_res > 0.45, 1, 0)
# algorithm_res

In [6]:
# Load a matching result and a ground truth stored next to the notebook
# NOTE(review): presumably the output of the HiPPI algorithm and its
# matching ground truth - confirm where these two files come from
hippi_result = np.load("Hippi_res_mat.npy")
gt = np.load("ground_truth.npy")

In [9]:
# Display the loaded matching matrix
hippi_result

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
def get_accuracy_permutation(target_permutation, given_permutation):
    """ Return the accuracy of the matching of an algorithm knowing the ground truth
        (verbose version: the counters are printed before returning)
        - target_permutation is a np array indexed as [graph_1, graph_2, node_1],
        each entry is the index of the node of graph_2 that corresponds
        to node_1 of graph_1 in the ground truth
        - given_permutation is a bulk matrix where all the matching matrices
        of the algorithm are put together as square blocks, the block
        (graph_1, graph_2) holding the matching between these two graphs.

        The accuracy is the proportion of ground truth correspondences for
        which the entry of the bulk matrix is equal to 1. A graph is never
        compared with itself.
    """

    nb_graphs = target_permutation.shape[0]
    nb_nodes = int(given_permutation.shape[0] / nb_graphs)

    nb_match = 0
    nb_mismatch = 0
    nb_checked = 0

    for first_graph in range(nb_graphs):
        rows = slice(first_graph * nb_nodes, (first_graph + 1) * nb_nodes)

        for second_graph in range(nb_graphs):

            # Nothing to evaluate between a graph and itself
            if first_graph == second_graph:
                continue

            cols = slice(second_graph * nb_nodes, (second_graph + 1) * nb_nodes)
            expected_matches = target_permutation[first_graph, second_graph, :]
            block = given_permutation[rows, cols]

            for source_node, target_node in enumerate(expected_matches):
                found = block[source_node, target_node]
                if found == 1:
                    nb_match += 1
                elif found == 0:
                    nb_mismatch += 1

                # Entries that are neither 0 nor 1 only count in the total
                nb_checked += 1

    print("Number of Mismatch: ",nb_mismatch)
    print(nb_checked)
    print(nb_match)

    return nb_match/nb_checked
                
                
# Accuracy of the matching loaded from "Hippi_res_mat.npy" against "ground_truth.npy"
#get_accuracy_permutation(ground_truth, algorithm_res)
get_accuracy_permutation(gt, hippi_result)

In [None]:
#get_accuracy_permutation(gt,Qmatch_mat)

In [None]:
#get_accuracy_permutation(gt,Qmatch_pyt)

In [None]:
# algorithm_res[0*30:2*30,1*30:2*30]

In [None]:
# Accuracy value associated with each noise value (plotted in the next cell)
# NOTE(review): presumably copied by hand from previous runs - confirm the source
noise = [100,400,1000]
acc = [0.42,0.88,0.98]

In [None]:
# Accuracy as a function of the noise: black dashed line with round markers
plt.figure(figsize=(9, 5))
plt.plot(
    noise,
    acc,
    marker='o',
    linestyle='--',
    color='black',
    label="Noise vs accuracy",
)
plt.xlabel("Noise")
plt.ylabel("Accuracy")
plt.legend()

In [None]:
# outliers = [0,4,8,12,16]
# noise_1000 = [0.98,0.65,0.36,0.34,0.19]
# noise_200 = [0.88,0.33,0.4,0.13,0.14]

In [None]:
# Accuracy as a function of the number of outliers for two noise levels.
# The data is defined here so that the cell can run: these three lists
# were only available as commented-out code, which made the plot calls
# below fail with a NameError.
outliers = [0,4,8,12,16]
noise_1000 = [0.98,0.65,0.36,0.34,0.19]
noise_200 = [0.88,0.33,0.4,0.13,0.14]

plt.figure(figsize=(9,5))
plt.plot(outliers,noise_1000,linestyle='--', marker='o', color='b',label = "noise 1000")
plt.plot(outliers,noise_200,linestyle='--', marker='o', color='r',label = "noise 200")
plt.xlabel("Outliers")
plt.ylabel("Accuracy")
plt.legend()

# FIN DES TESTS

We now need to go through all the folders and build the 3D tensors that will hold the results. But first we need to define an accuracy metric.

In [None]:
def get_accuracy_permutation(target_permutation, given_permutation):
    """ Return the accuracy of the matching of an algorithm knowing the ground truth
        - target_permutation is a np array indexed as [graph_1, graph_2, node_1],
        each entry is the index of the node of graph_2 that corresponds
        to node_1 of graph_1 in the ground truth
        - given_permutation is a bulk matrix where all the matching matrices
        of the algorithm are put together as square blocks, the block
        (graph_1, graph_2) holding the matching between these two graphs.

        The accuracy is the proportion of ground truth correspondences for
        which the entry of the bulk matrix is equal to 1. A graph is never
        compared with itself.
    """

    nb_graphs = target_permutation.shape[0]
    nb_nodes = int(given_permutation.shape[0] / nb_graphs)

    # All the ordered pairs of distinct graphs
    graph_pairs = [
        (first, second)
        for first in range(nb_graphs)
        for second in range(nb_graphs)
        if first != second
    ]

    nb_correct = 0
    nb_compared = 0

    for first, second in graph_pairs:
        expected_matches = target_permutation[first, second, :]
        block = given_permutation[first * nb_nodes: (first + 1) * nb_nodes,
                                  second * nb_nodes: (second + 1) * nb_nodes]

        # A correspondence is correct when the algorithm put a 1 at the
        # position given by the ground truth
        nb_correct += sum(
            1
            for node, match in enumerate(expected_matches)
            if block[node, match] == 1
        )
        nb_compared += len(expected_matches)

    return nb_correct/nb_compared

In [None]:
def get_result_tensor_for_given_algorithm(path_to_folder, param_correspondence_dict, name_of_result_file="X_kergm", transposed=False):
    """Go through all folders and build a 3D tensor that holds
    the accuracy metric for all set of parameters for a given algorithm.

    The expected layout is
    ``<path_to_folder>/<name_1>_<value_1>,<name_2>_<value_2>/<run>/`` where
    each run folder holds ``ground_truth.npy`` and
    ``<name_of_result_file>.mat`` (the matching is read from its "X" variable).

    Arguments:
        path_to_folder: The path where the results have been calculated
        param_correspondence_dict: The result of the get_noise_and_outliers_params
    function that give an integer correspondence to each parameter value
        name_of_result_file: The file name to load in each folder (each one
    corresponds to a given algorithm)
        transposed: If True the loaded matching matrix is transposed
    before the accuracy is computed

    Returns:
        A np array indexed as [first parameter, second parameter, run] that
        holds the accuracy of each run. The number of runs is read from the
        first parameter folder; entries without a run are left to 0.
    """

    def visible_entries(folder):
        # os.listdir returns the entries in arbitrary order and includes the
        # hidden ones (".DS_Store", ".ipynb_checkpoints", ...): drop those and
        # sort so that the index of each run in the tensor is reproducible.
        return sorted(entry for entry in os.listdir(folder) if not entry.startswith("."))

    def integer_of(elem_param):
        # "<name>_<value>" -> position of the value in the correspondence dict.
        # Split on the last underscore so names with underscores are accepted.
        param_name, param_value = elem_param.rsplit("_", 1)
        return param_correspondence_dict[param_name].index(float(param_value))

    parameter_folders = visible_entries(path_to_folder)

    # We need to find how many runs were used so we go though the first folder
    nbRuns = len(visible_entries(os.path.join(path_to_folder, parameter_folders[0])))

    # We initialise the final tensor that hold the results
    result_tensor_shape = [len(values) for values in param_correspondence_dict.values()] + [nbRuns]
    result_tensor = np.zeros(result_tensor_shape)

    # We go through all folders
    for parameter_folder in parameter_folders:

        path_parameter_folder = os.path.join(path_to_folder, parameter_folder)

        # get the integers associated to the two parameters of the folder
        splitted_param = parameter_folder.split(",")
        param_1_integer = integer_of(splitted_param[0])
        param_2_integer = integer_of(splitted_param[1])

        # We go through all the runs
        for run_i, run_folder in enumerate(visible_entries(path_parameter_folder)):

            path_run_folder = os.path.join(path_parameter_folder, run_folder)

            # load the ground truth correspondence
            ground_truth = np.load(os.path.join(path_run_folder, "ground_truth.npy"))

            # load the algorithm result
            algorithm_res = sio.loadmat(os.path.join(path_run_folder, name_of_result_file + ".mat"))["X"]

            # transpose it if necessary
            if transposed:
                algorithm_res = np.transpose(algorithm_res)

            # add the accuracy of the run to the tensor
            result_tensor[param_1_integer, param_2_integer, run_i] = get_accuracy_permutation(ground_truth, algorithm_res)

    return result_tensor
    
#kergm_res = get_result_tensor_for_given_algorithm(path, res)
#kergm_res.shape

In [None]:
# Choose the dataset to analyse
path = "generated_graphs_medium/"
name_dataset = "medium"

# Integer correspondence of every parameter value found in the dataset
dict_parameters_correspondence = get_noise_and_outliers_params(path)

# Compute the accuracy tensor of the chosen algorithms.
# The "transposed" flag is only used by the commented-out calls below.
transposed = True
kergm_res = get_result_tensor_for_given_algorithm(
    path,
    dict_parameters_correspondence,
    name_of_result_file="X_kergm",
    transposed=False,
)
# ipf_res = get_result_tensor_for_given_algorithm(path, dict_parameters_correspondence, "X_ipf", transposed)
# rrwm_res = get_result_tensor_for_given_algorithm(path, dict_parameters_correspondence, "X_rrwm", transposed)
# smac_res = get_result_tensor_for_given_algorithm(path, dict_parameters_correspondence, "X_smac", transposed)

In [None]:
# Display the sorted values found for each parameter
dict_parameters_correspondence

## Load the data from script

In [None]:
path_to_data = "results/medium"

# The context manager closes the file once the results are loaded.
# NOTE(review): pickle.load can execute arbitrary code, only load result
# files that were produced by our own scripts.
with open(path_to_data,"rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Assign all the necessary variables
dict_parameters_correspondence = dict_results["parameter_correspondence"]

## Plot interesting statistics

First we will plot the effect of the noise on the matching

In [None]:
# Plot the accuracy of the four pairwise algorithms, first against the noise
# (outliers fixed) and then against the outliers (noise fixed).
# Requires kergm_res, ipf_res, rrwm_res and smac_res to have been computed
# with get_result_tensor_for_given_algorithm beforehand.

name_dataset = "big_2"

# The figures are saved in this folder: create it if it does not exist
os.makedirs("plots", exist_ok=True)


def _plot_mean_and_std(x, runs_matrix, label):
    """Plot the mean over the runs (axis 1) and a band of +/- one std around it."""
    y = np.mean(runs_matrix, 1)
    std = np.std(runs_matrix, 1)
    plt.plot(x, y, label=label, alpha=1)
    plt.fill_between(x, y - std, y + std, alpha=0.2)


# (legend label, result tensor) in plotting order
algorithm_results = [
    ("KerGM", kergm_res),
    ("IPF", ipf_res),
    ("RRWM", rrwm_res),
    ("smac", smac_res),
]


##### Plot the accuracy vs noise #####
# index (in the sorted outliers values) of the outliers value kept constant
outliers_constant_integer = 3
x = dict_parameters_correspondence["noise"]

plt.figure(figsize=(10,6))
for algo_label, algo_res in algorithm_results:
    _plot_mean_and_std(x, algo_res[:, outliers_constant_integer, :], algo_label)
plt.xlabel("Noise value")
plt.ylabel("Accuracy")
plt.ylim(-0.2,1.1)
plt.title("Accuracy vs Noise (with "+str(int(dict_parameters_correspondence["outliers"][outliers_constant_integer]))+" outliers)")
plt.legend()
plt.savefig("plots/Accuracy_vs_noise_"+str(name_dataset))


##### Plot the accuracy vs_outlier #####
# index (in the sorted noise values) of the noise value kept constant
noise_constant_integer = 3
x = dict_parameters_correspondence["outliers"]

plt.figure(figsize=(10,6))
for algo_label, algo_res in algorithm_results:
    _plot_mean_and_std(x, algo_res[noise_constant_integer, :, :], algo_label)
plt.xlabel("nb_outliers value")
plt.ylabel("Accuracy")
plt.ylim(-0.1,1.1)
plt.title("Accuracy vs Outliers (with noise = "+str(dict_parameters_correspondence["noise"][noise_constant_integer])+")")
plt.legend()
plt.savefig("plots/Accuracy_vs_outliers_"+str(name_dataset))

In [None]:
def plot_one_curve(dict_results, name_in_dict, label, x, variable_to_show="accuracy", outliers_constant_integer = None, noise_constant_integer=None, color=None):
    """
    Plot the mean and std of the result of one algorithm
    given some constraints on the parameters

    Arguments:
        dict_results: dict where dict_results[name_in_dict] is a sequence
    whose element 0 is used for "accuracy" and element 1 for "time"; both
    are indexed as [noise, outliers, run]
        name_in_dict: key of the algorithm in dict_results
        label: label of the curve in the legend
        x: x values of the curve
        variable_to_show: "accuracy" or "time"
        outliers_constant_integer: index of the outliers value to keep
    constant (the curve then goes over the noise values)
        noise_constant_integer: index of the noise value to keep constant
    (the curve then goes over the outliers values), only used when
    outliers_constant_integer is None
        color: color of the curve, None to use the next default color

    Raises:
        ValueError: if variable_to_show is unknown or if neither
    outliers_constant_integer nor noise_constant_integer is given
    """

    place_by_variable = {"accuracy": 0, "time": 1}
    if variable_to_show not in place_by_variable:
        raise ValueError("variable_to_show must be 'accuracy' or 'time', got {!r}".format(variable_to_show))
    if outliers_constant_integer is None and noise_constant_integer is None:
        raise ValueError("either outliers_constant_integer or noise_constant_integer must be given")

    alg_res = dict_results[name_in_dict][place_by_variable[variable_to_show]]

    # "is not None" on purpose: 0 is a valid index
    if outliers_constant_integer is not None:
        runs_matrix = alg_res[:,outliers_constant_integer,:]
    else:
        runs_matrix = alg_res[noise_constant_integer,:,:]

    # statistics over the runs
    y = np.mean(runs_matrix,1)
    std = np.std(runs_matrix,1)

    # Only force the color when one is given so that matplotlib
    # keeps cycling through its default colors otherwise
    color_kwargs = {} if color is None else {"color": color}
    plt.plot(x, y, label=label, alpha=1, **color_kwargs)
    plt.fill_between(x, y-std, y+std, alpha=0.2, **color_kwargs)

In [None]:
# For every number of outliers, plot the results of the chosen algorithms
# as a function of the noise and save the figure in the "plots" folder.
name_dataset = "simus_complete_F1"
nb_vertices = 85
nb_graphs = 25
method = "KerGM"
path_to_data = "results/simus_complete_F1.pickle" # change if recall is needed

data_to_show = "F1"

# Label of the y axis for each quantity that can be shown
y_label_by_data = {
    "accuracy": "Accuracy",
    "recall": "Recall",
    "prec": "Precision",
    "F1": "F1-Score",
    "time": "Time (in seconds)",
}

# The shown quantity is used as prefix of the dataset name in the file names
if data_to_show in y_label_by_data:
    name_dataset = data_to_show + "_" + name_dataset

# The context manager closes the file once the results are loaded.
# NOTE(review): pickle.load can execute arbitrary code, only load result
# files that were produced by our own scripts.
with open(path_to_data,"rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Assign all the necessary variables
dict_parameters_correspondence = dict_results["parameter_correspondence"]

# Curves to plot as (key in dict_results, legend label, color); a None color
# uses the next default color. Comment/uncomment lines to choose the algorithms.
pairwise_curve_by_method = {
    "KerGM": ("KerGM", "KerGM", None),
    "good_guess": ("good_guess", "Good Guess", None),
}
curves_to_plot = []
if method in pairwise_curve_by_method:
    curves_to_plot.append(pairwise_curve_by_method[method])
curves_to_plot += [
    #("mSync", "mSync", "tab:orange"),
    ("mALS", "mALS", "tab:red"),
    #("cao_cst_o", "cao_cst_o", "tab:purple"),
    ("ipf", "ipf", None),
    ("smac", "smac", None),
    ("rrwm", "rrwm", None),
    #("KerGM_2", "KerGM_2", None),
    #("cao_cst_s_o", "cao_cst_s_o", None),
]

# The figures are saved in this folder: create it if it does not exist
os.makedirs("plots", exist_ok=True)

print("======================================")
print("================ NOISE ===============")
print("======================================")


for i, nb_outliers in enumerate(dict_parameters_correspondence["outliers"]):

    outliers_constant_integer = i
    percent_outliers = int(nb_outliers / (nb_vertices + nb_outliers) * 100)

    x = dict_parameters_correspondence["noise"]

    plt.figure(figsize=(10,6))
    for name_in_dict, label, color in curves_to_plot:
        # variable_to_show is deliberately not forwarded (it was commented
        # out in every call), so element 0 of each result tuple is plotted.
        # NOTE(review): confirm that the loaded pickle stores the quantity
        # named by data_to_show at this position.
        plot_one_curve(dict_results,
                       name_in_dict,
                       label,
                       x,
                       outliers_constant_integer = outliers_constant_integer,
                       color = color
                      )

    # raw string: "\s" is not a valid escape sequence in a normal string
    plt.xlabel(r"Noise variance $\sigma$")
    if data_to_show in y_label_by_data:
        plt.ylabel(y_label_by_data[data_to_show])
        # every quantity but the time lies between 0 and 1
        if data_to_show != "time":
            plt.ylim(-0.1,1.1)
    #plt.title("Accuracy vs Noise Variance (with {:0d}% outliers, {:0d} graphs, pairwise method {})".format(percent_outliers, nb_graphs, method))
    plt.legend()
    # NOTE(review): set after the texts of this figure are created, so the
    # font size presumably only applies from the next figure on - confirm
    plt.rc('font', size=18)
    plt.xticks(x)
    plt.savefig("plots/"+data_to_show+"_vs_noise_"+str(name_dataset)+"_"+str(int(nb_outliers))+","+str(nb_graphs)+","+method+".png")
    plt.show()

print("======================================")
print("============== OUTLIERS ==============")
print("======================================")

#for i in range(len(dict_parameters_correspondence["noise"])):
#    noise_constant_integer = i
#    x = dict_parameters_correspondence["outliers"]
#    x = [int(nb_outliers / (nb_vertices + nb_outliers) * 100) for nb_outliers in x]
#    plt.figure(figsize=(10,6))
    #plot_one_curve(dict_results, "KerGM", "KerGM", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "mSync", "mSync", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "mOpt", "mOpt", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao", "cao", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_o", "cao_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_s_o", "cao_s_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_uc", "cao_uc", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_uc_o", "cao_uc_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_uc_s_o", "cao_uc_s_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_pc", "cao_pc", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_pc_o", "cao_pc_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_pc_s_o", "cao_pc_s_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_c", "cao_c", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_c_o", "cao_c_o", x, noise_constant_integer = noise_constant_integer)
    #plot_one_curve(dict_results, "cao_c_s_o", "cao_c_s_o", x, noise_constant_integer = noise_constant_integer)
    
    
    #plt.xlabel("% of outliers")
    #plt.ylabel("Accuracy")
    #plt.ylim(-0.2,1.1)
    #plt.title("Accuracy vs Outliers (with noise variance = "+str(dict_parameters_correspondence["noise"][noise_constant_integer])+")")
    #plt.legend()
    #plt.savefig("plots/Accuracy_vs_outliers_"+str(name_dataset)+"_"+str(dict_parameters_correspondence["noise"][noise_constant_integer])+".png" )
    #plt.show()
    
    

In [None]:
# Display the keys available in the loaded results
dict_results.keys()

## Plots results from mixed pairwise

In [None]:
def plot_one_curve_mix(dict_results, name_in_dict, label, x, variable_to_show="accuracy", outliers_constant_integer = 0, noise_constant_integer=0, color=None):
    """
    Plot the mean and std of the result of one algorithm - changed for mixed pairwise
    given some constraints on the parameters

    Arguments:
        dict_results: dict read as dict_results[elem][type][name_in_dict][0]
    for each elem of x, where type is "accuracy" or "distance_to_original";
    the value is indexed as [noise, outliers, run]
        name_in_dict: key of the algorithm in dict_results
        label: label of the curve in the legend
        x: keys of dict_results, also used as x values of the curve
        variable_to_show: "accuracy" or "distance"
        outliers_constant_integer: index of the outliers value to use
        noise_constant_integer: index of the noise value to use
        color: color of the curve, None to use the next default color

    Raises:
        ValueError: if variable_to_show is unknown
    """

    type_data_by_variable = {"accuracy": "accuracy", "distance": "distance_to_original"}
    if variable_to_show not in type_data_by_variable:
        raise ValueError("variable_to_show must be 'accuracy' or 'distance', got {!r}".format(variable_to_show))
    type_data = type_data_by_variable[variable_to_show]
    # both quantities are read from the first element of the stored result
    place_tuple = 0

    # statistics over the runs for each x value
    list_res_y = []
    list_res_std = []
    for elem in x:
        alg_res = dict_results[elem][type_data][name_in_dict][place_tuple]
        runs = alg_res[noise_constant_integer,outliers_constant_integer,:]
        list_res_y.append(np.mean(runs))
        list_res_std.append(np.std(runs))

    list_res_y = np.array(list_res_y)
    list_res_std = np.array(list_res_std)

    # Only force the color when one is given so that matplotlib
    # keeps cycling through its default colors otherwise
    color_kwargs = {} if color is None else {"color": color}
    plt.plot(x, list_res_y, label=label, alpha=1, **color_kwargs)
    plt.fill_between(x, list_res_y-list_res_std, list_res_y+list_res_std, alpha=0.2, **color_kwargs)

In [None]:
import os

# For every (outliers, noise) couple, plot the results of the multigraph
# algorithms as a function of the mix rate and save the figure.

# Define some variables for the plot
name_dataset = "big_high_noise_prec"
nb_vertices = 85
nb_graphs = 25
method = "mix"
folder_res = method + "_" + name_dataset
folder_path = os.path.join("plots",folder_res)
path_to_data = "results/big_high_noise_mix_prec.pickle"

data_to_show = "accuracy"
if data_to_show == "accuracy":
    name_dataset = "accuracy_"+name_dataset
    name_to_show = "Accuracy"
elif data_to_show =="distance":
    name_dataset = "distance_"+name_dataset
    name_to_show = "Distance to KerGM"

# The context manager closes the file once the results are loaded.
# NOTE(review): pickle.load can execute arbitrary code, only load result
# files that were produced by our own scripts.
with open(path_to_data,"rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Create the plot sub folder (and the "plots" folder) if it does not exist
os.makedirs(folder_path, exist_ok=True)


# Assign all the necessary variables: once the parameter correspondence
# is removed, the remaining keys are the x values of the curves
dict_parameters_correspondence = dict_results.pop("parameter_correspondence")
x = sorted(dict_results)

# (key in the results and legend label, color) of each curve
curves_to_plot = [
    ("mSync", "tab:orange"),
    ("mALS", "tab:red"),
    ("cao_cst_o", "tab:purple"),
]

print("======================================")
print("================ NOISE ===============")
print("======================================")


for i_outliers, nb_outliers in enumerate(dict_parameters_correspondence["outliers"]):
    for j_noise, noise_value in enumerate(dict_parameters_correspondence["noise"]):

        outliers_constant_integer = i_outliers
        noise_constant_integer = j_noise
        percent_outliers = int(nb_outliers / (nb_vertices + nb_outliers) * 100)
        noise_variance = int(noise_value)

        plt.figure(figsize=(10,6))
        for algorithm_name, curve_color in curves_to_plot:
            plot_one_curve_mix(dict_results,
                               algorithm_name,
                               algorithm_name,
                               x,
                               data_to_show,
                               outliers_constant_integer,
                               noise_constant_integer,
                               curve_color
                              )

        plt.xlabel("% Of matching done with good guess")
        if data_to_show == "accuracy":
            plt.ylabel("Accuracy")
            plt.ylim(-0.1,1.1)
        elif data_to_show == "distance":
            plt.ylabel("Difference with the results using full KerGM")
            plt.ylim(-0.1,1.1)
        plt.title(name_to_show + " vs Mix Rate (with {:0d}% outliers, {:0d} Noise variance, {:0d} graphs, pairwise method {})".format(percent_outliers,
                                                                                                                                      noise_variance,
                                                                                                                                      nb_graphs,
                                                                                                                                      method))
        plt.legend()
        plt.savefig(os.path.join(folder_path,data_to_show+"_vs_mix_"+str(name_dataset)+"_"+str(int(nb_outliers))+","+str(noise_variance)+","+str(nb_graphs)+","+method+".png"))
        plt.show()

# Statistics when using different numbers of graphs

In [None]:
def plot_one_curve_subgraphs(dict_results, name_in_dict, label, x, variable_to_show="accuracy", outliers_constant_integer = 0, noise_constant_integer=0, color=None):
    """
    Plot the mean and std of the result of one algorithm - changed for different numbers of subgraphs
    given some constraints on the parameters

    Arguments:
        dict_results: dict read as dict_results[elem][name_in_dict][place]
    for each elem of x, where place is 0 for "accuracy" and 1 for "time";
    the value is indexed as [noise, outliers, run]
        name_in_dict: key of the algorithm in dict_results
        label: label of the curve in the legend
        x: keys of dict_results, also used as x values of the curve
        variable_to_show: "accuracy" or "time"
        outliers_constant_integer: index of the outliers value to use
        noise_constant_integer: index of the noise value to use
        color: color of the curve, None to use the next default color

    Raises:
        ValueError: if variable_to_show is unknown
    """

    place_by_variable = {"accuracy": 0, "time": 1}
    if variable_to_show not in place_by_variable:
        raise ValueError("variable_to_show must be 'accuracy' or 'time', got {!r}".format(variable_to_show))
    place_tuple = place_by_variable[variable_to_show]

    # statistics over the runs for each x value
    list_res_y = []
    list_res_std = []
    for elem in x:
        alg_res = dict_results[elem][name_in_dict][place_tuple]
        runs = alg_res[noise_constant_integer,outliers_constant_integer,:]
        list_res_y.append(np.mean(runs))
        list_res_std.append(np.std(runs))

    list_res_y = np.array(list_res_y)
    list_res_std = np.array(list_res_std)

    # Only force the color when one is given so that matplotlib
    # keeps cycling through its default colors otherwise
    color_kwargs = {} if color is None else {"color": color}
    plt.plot(x, list_res_y, label=label, alpha=1, **color_kwargs)
    plt.fill_between(x, list_res_y-list_res_std, list_res_y+list_res_std, alpha=0.2, **color_kwargs)

In [None]:
import os

# For every (outliers, noise) couple, plot the results of the multigraph
# algorithms as a function of the number of graphs and save the figure.

# Define some variables for the plot
name_dataset = "big_high_noise_subgraphs"
nb_vertices = 85
#nb_graphs = 10
method = "KerGM"
folder_res = "sub_graphs" + "_" + name_dataset
folder_path = os.path.join("plots",folder_res)
path_to_data = "results/big_high_noise_sub_graphs.pickle"

data_to_show = "accuracy"
if data_to_show == "accuracy":
    name_dataset = "accuracy_"+name_dataset
    name_to_show = "Accuracy"
elif data_to_show =="time":
    name_dataset = "time_"+name_dataset
    name_to_show = "Time"

# The context manager closes the file once the results are loaded.
# NOTE(review): pickle.load can execute arbitrary code, only load result
# files that were produced by our own scripts.
with open(path_to_data,"rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Create the plot sub folder (and the "plots" folder) if it does not exist
os.makedirs(folder_path, exist_ok=True)


# Assign all the necessary variables: once the parameter correspondence
# is removed, the remaining keys are the x values of the curves
dict_parameters_correspondence = dict_results.pop("parameter_correspondence")
x = sorted(dict_results)

# (key in the results and legend label, color) of each curve
curves_to_plot = [
    ("mSync", "tab:orange"),
    ("mALS", "tab:red"),
    ("cao_cst_o", "tab:purple"),
]

print("======================================")
print("================ NOISE ===============")
print("======================================")


for i_outliers, nb_outliers in enumerate(dict_parameters_correspondence["outliers"]):
    for j_noise, noise_value in enumerate(dict_parameters_correspondence["noise"]):

        outliers_constant_integer = i_outliers
        noise_constant_integer = j_noise
        percent_outliers = int(nb_outliers / (nb_vertices + nb_outliers) * 100)
        noise_variance = int(noise_value)

        plt.figure(figsize=(10,6))
        for algorithm_name, curve_color in curves_to_plot:
            plot_one_curve_subgraphs(dict_results,
                                     algorithm_name,
                                     algorithm_name,
                                     x,
                                     data_to_show,
                                     outliers_constant_integer,
                                     noise_constant_integer,
                                     curve_color
                                    )

        plt.xlabel("Number of graphs used")
        if data_to_show == "accuracy":
            plt.ylabel("Accuracy")
            plt.ylim(-0.1,1.1)
        elif data_to_show == "time":
            plt.ylabel("Time")
            #plt.ylim(-0.1,1.1)
        plt.title(name_to_show + " vs nb of graphs (with {:0d}% outliers, {:0d} Noise variance, pairwise method {})".format(percent_outliers,
                                                                                                                            noise_variance,
                                                                                                                            method))
        plt.legend()
        plt.savefig(os.path.join(folder_path,data_to_show+"_vs_graphs_"+str(name_dataset)+"_"+str(int(nb_outliers))+","+str(noise_variance)+","+method+".png"))
        plt.show()

## Plot labeling results on accuracy

In [None]:
# load the data
path_to_data = "results/des_gros_test_label.pickle"

# The context manager closes the file once the results are loaded.
# NOTE(review): pickle.load can execute arbitrary code, only load result
# files that were produced by our own scripts.
with open(path_to_data,"rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

In [None]:
def plot_one_curve(dict_parameters, x, result_tensor, label, outliers_constant_integer = None, noise_constant_integer=None):
    """
    Plot the mean result, with a +/- one standard deviation band, while
    fixing either the outliers index or the noise index.

    dict_parameters is accepted so existing calls keep working; it is not read.
    x holds the horizontal-axis values, one per row of the selected slice.
    result_tensor is indexed as [noise, outliers, run].
    label is the legend entry of the curve.
    outliers_constant_integer, when given, fixes the second axis and takes
    precedence over noise_constant_integer, which fixes the first axis.

    Raises ValueError when neither index is given: indexing a NumPy array
    with None would add an axis instead of selecting a slice.
    """

    if outliers_constant_integer is not None:
        matrix_to_use = result_tensor[:,outliers_constant_integer,:]
    elif noise_constant_integer is not None:
        matrix_to_use = result_tensor[noise_constant_integer,:,:]
    else:
        raise ValueError(
            "plot_one_curve needs outliers_constant_integer or noise_constant_integer"
        )

    # Statistics across the runs (last axis of the selected slice)
    y_mean = matrix_to_use.mean(1)
    print(y_mean)
    y_std = matrix_to_use.std(1)

    # Mean curve plus a shaded one-standard-deviation band
    plt.plot(x, y_mean, label=label)
    plt.fill_between(x, y_mean - y_std, y_mean + y_std, alpha=0.2)
    
    

In [None]:
import os

# Define some variables for the plot
name_dataset = "big_high_noise_labelling_dbscan_precision"
nb_vertices = 85
nb_graphs = 25
folder_res = name_dataset
folder_path = os.path.join("plots",folder_res)

# Load the mALS baseline: first element of the "mALS" entry of the pickle.
# NOTE: pickle can execute arbitrary code while loading; only open result
# files produced by this project.
path_to_baseline = "results/big_high_noise_prec.pickle"
with open(path_to_baseline, "rb") as pickle_in:
    dict_results = pickle.load(pickle_in)
result_baseline_mALS = dict_results["mALS"][0]

# Load the labelling obtained from dbscan (rebinds dict_results)
path_to_data = "results/big_high_noise_clustering_precision.pickle"
with open(path_to_data, "rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Create the plot sub folder, and the parent "plots" folder, if missing
os.makedirs(folder_path, exist_ok=True)


# Assign all the necessary variables
dict_parameters_correspondence = dict_results["parameters_correspondence"]
# NOTE(review): this tensor is plotted below under the "DBSCAN" label but is
# read from the "mALS" key - confirm that is the key used when it was saved.
result_tensor = dict_results["mALS"]


print("======================================")
print("================ NOISE ===============")
print("======================================")


# The x axis is the same for every figure: the list of noise values
x = dict_parameters_correspondence["noise"]

# One figure per outliers value
for i_outliers, nb_outliers in enumerate(dict_parameters_correspondence["outliers"]):

    outliers_constant_integer = i_outliers
    percent_outliers = int(nb_outliers / (nb_vertices + nb_outliers) * 100)
    plt.figure(figsize=(10,6))

    # plot mALS baseline
    plot_one_curve(dict_parameters_correspondence,
                   x,
                   result_baseline_mALS,
                   label="mALS",
                   outliers_constant_integer=outliers_constant_integer,
                  )

    # plot the dbscan labelling result
    plot_one_curve(dict_parameters_correspondence,
                   x,
                   result_tensor,
                   label="DBSCAN",
                   outliers_constant_integer=outliers_constant_integer,
                  )

    plt.xlabel("Noise variance")
    plt.ylabel("Precision")
    plt.ylim(-0.1,1.1)

    plt.title("Precision vs Noise Variance (with {:0d}% outliers,)".format(percent_outliers))
    plt.legend()
    # Save before show(): the figure must still be current when it is written
    plt.savefig(os.path.join(folder_path,"Precision"+"_vs_noise_"+str(nb_outliers)+".png"))
    plt.show()

# Plot effect of parameters on labelling results

In [None]:
def plot_one_curve(dict_parameters, x, result_tensor, label, outliers_constant_integer = None, noise_constant_integer=None):
    """
    Plot the mean result, with a +/- one standard deviation band, while
    fixing either the outliers index or the noise index.

    dict_parameters is accepted so existing calls keep working; it is not read.
    x holds the horizontal-axis values, one per row of the selected slice.
    result_tensor is indexed as [noise, outliers, run].
    label is the legend entry of the curve.
    outliers_constant_integer, when given, fixes the second axis and takes
    precedence over noise_constant_integer, which fixes the first axis.

    Raises ValueError when neither index is given: indexing a NumPy array
    with None would add an axis instead of selecting a slice.
    """

    if outliers_constant_integer is not None:
        matrix_to_use = result_tensor[:,outliers_constant_integer,:]
    elif noise_constant_integer is not None:
        matrix_to_use = result_tensor[noise_constant_integer,:,:]
    else:
        raise ValueError(
            "plot_one_curve needs outliers_constant_integer or noise_constant_integer"
        )

    # Statistics across the runs (last axis of the selected slice)
    y_mean = matrix_to_use.mean(1)
    print(y_mean)
    y_std = matrix_to_use.std(1)

    # Mean curve plus a shaded one-standard-deviation band
    plt.plot(x, y_mean, label=label)
    plt.fill_between(x, y_mean - y_std, y_mean + y_std, alpha=0.2)

In [None]:
import os

# Define some variables for the plot
name_dataset = "big_high_noise_labelling_dbscan_params_prec"
nb_vertices = 85
nb_graphs = 25
folder_res = name_dataset
folder_path = os.path.join("plots",folder_res)

# Load the mALS baseline: first element of the "mALS" entry of the pickle.
# NOTE: pickle can execute arbitrary code while loading; only open result
# files produced by this project.
# NOTE(review): the curves below are labelled "Precision" but this baseline
# file name carries no "_prec" suffix - confirm it holds precision values.
path_to_baseline = "results/big_high_noise.pickle"
with open(path_to_baseline, "rb") as pickle_in:
    dict_results = pickle.load(pickle_in)
result_baseline_mALS = dict_results["mALS"][0]

# Load the labelling obtained from dbscan (rebinds dict_results)
path_to_data = "results/big_high_noise_clustering_params_prec.pickle"
with open(path_to_data, "rb") as pickle_in:
    dict_results = pickle.load(pickle_in)

# Create the plot sub folder, and the parent "plots" folder, if missing
os.makedirs(folder_path, exist_ok=True)


# Assign all the necessary variables
dict_parameters_correspondence = dict_results["parameters_correspondence"]
# Indexed below as [noise, outliers, eps, minsamp, run]
result_tensor = dict_results["DBSCAN"]


print("======================================")
print("================ NOISE ===============")
print("======================================")


# The x axis is the same for every figure: the list of noise values
x = dict_parameters_correspondence["noise"]

for i_outliers, nb_outliers in enumerate(dict_parameters_correspondence["outliers"]):

    outliers_constant_integer = i_outliers
    percent_outliers = int(nb_outliers / (nb_vertices + nb_outliers) * 100)

    # One figure per min_samples value; index 0 is skipped
    for i_minsamp in range(1,len(dict_parameters_correspondence["minsamp"])):

        minsamp_val = dict_parameters_correspondence["minsamp"][i_minsamp]

        plt.figure(figsize=(10,6))

        # plot mALS baseline
        plot_one_curve(dict_parameters_correspondence,
                       x,
                       result_baseline_mALS,
                       label="mALS",
                       outliers_constant_integer=outliers_constant_integer,
                      )

        # One dbscan curve for every other eps value, starting at index 1
        for i_eps in range(1,len(dict_parameters_correspondence["eps"]),2):

            eps_val = dict_parameters_correspondence["eps"][i_eps]

            plot_one_curve(dict_parameters_correspondence,
                           x,
                           result_tensor[:,:,i_eps,i_minsamp,:],
                           label="DBSCAN eps="+str(eps_val)+" min_samp="+str(minsamp_val),
                           outliers_constant_integer=outliers_constant_integer,
                          )

        plt.xlabel("Noise variance")
        plt.ylabel("Precision")
        plt.ylim(-0.1,1.1)

        plt.title("Precision vs Noise Variance (with {:0d}% outliers,)".format(percent_outliers))
        plt.legend()
        # NOTE: i_eps is the last eps index plotted above, so the "_eps_" part
        # of the file name is the same for every figure; it is kept so that
        # existing file names do not change.
        plt.savefig(os.path.join(folder_path,"Precision"+"_vs_noise_"+str(nb_outliers)+"_eps_"+str(i_eps)+"_minsamp_"+str(i_minsamp)+".png"))
        plt.show()

In [None]:
104*3

In [None]:
result_tensor

## Function to create plots from different experiments

In [None]:
# Load the two experiments that are combined in the plots below.
# NOTE: pickle can execute arbitrary code while loading; only open result
# files produced by this project.
with open("results/big_low_noise.pickle","rb") as pickle_in:
    dict_result_2 = pickle.load(pickle_in)

with open("results/big_high_noise.pickle","rb") as pickle_in:
    dict_result_3 = pickle.load(pickle_in)
print(dict_result_2['parameter_correspondence'])

dict_list = [dict_result_2, dict_result_3]

In [None]:
def _collect_experiment_statistics(list_dict_result, name_algorithms, x_param, static_param_value):
    """ Average, over the experiments, the per-experiment mean and std of
        every algorithm at every value of x_param
    """

    static_param = "outliers" if x_param == "noise" else "noise"

    # algorithm name -> x value -> one entry per experiment containing that x
    means = {alg_name: {} for alg_name in name_algorithms}
    stds = {alg_name: {} for alg_name in name_algorithms}

    for dict_result in list_dict_result:

        dict_parameters_correspondence = dict_result["parameter_correspondence"]

        # Integer index of the static parameter (ValueError if it is absent)
        static_param_integer = dict_parameters_correspondence[static_param].index(static_param_value)

        for variable_param_i, variable_param_val in enumerate(dict_parameters_correspondence[x_param]):

            for alg_name in name_algorithms:

                res_alg = dict_result[alg_name]

                # Result tensors are indexed as [noise, outliers, run]
                if x_param == "noise":
                    runs = res_alg[variable_param_i,static_param_integer,:]
                else:
                    runs = res_alg[static_param_integer,variable_param_i,:]

                means[alg_name].setdefault(variable_param_val, []).append(np.mean(runs))
                stds[alg_name].setdefault(variable_param_val, []).append(np.std(runs))

    # Where several experiments share an x value, keep the mean of their values
    y_final = {alg_name: {x_val: np.mean(values) for x_val, values in per_x.items()}
               for alg_name, per_x in means.items()}
    err_final = {alg_name: {x_val: np.mean(values) for x_val, values in per_x.items()}
                 for alg_name, per_x in stds.items()}

    return y_final, err_final


def plot_from_different_experiments(list_dict_result, x_param="noise", static_param_value=0, name_dataset="full", nb_vertices=90):
    """ Get the result of different experiments and combine
        them to create a plot for one parameter throughout the experiments

        list_dict_result : result dicts, each holding a
            "parameter_correspondence" entry (lists of "noise" and "outliers"
            values) and one tensor per algorithm indexed [noise, outliers, run]
        x_param : "noise" puts the noise on the x axis, any other value
            puts the outliers on it
        static_param_value : value at which the other parameter is held; it
            must be present in every experiment (ValueError otherwise)
        name_dataset : used in the name of the saved figure
        nb_vertices : number of non-outlier vertices used to turn a number of
            outliers into a percentage (defaults to the former constant 90)

        Returns (y_final, err_final), both of the form
        {algorithm name: {x value: averaged value}}, keyed by the raw
        parameter values.
    """

    name_algorithms = ["KerGM","mALS","mSync","cao_cst_o"]

    y_final, err_final = _collect_experiment_statistics(list_dict_result,
                                                        name_algorithms,
                                                        x_param,
                                                        static_param_value)

    # We plot everything of interest
    plt.figure(figsize=(10,6))

    for alg_name in name_algorithms:

        # Sorted so that x values merged from several experiments form one
        # left-to-right curve
        x_values = sorted(y_final[alg_name])
        y_list = np.array([y_final[alg_name][x_val] for x_val in x_values])
        err_list = np.array([err_final[alg_name][x_val] for x_val in x_values])

        if x_param == "noise":
            x_list = x_values
        else:
            # The axis is labelled in percent, so convert before plotting
            x_list = [int(x_val/(nb_vertices + x_val) * 100) for x_val in x_values]

        # The algorithm name is the legend entry
        plt.plot(x_list, y_list, label=alg_name)
        plt.fill_between(x_list, y_list-err_list, y_list+err_list, alpha=0.2)

    if x_param == "noise":
        percent_outliers = int(static_param_value/(nb_vertices + static_param_value) * 100)
        plt.xlabel("Noise variance")
        plt.title("Accuracy vs Noise Variance (with {:0d}% outliers)".format(percent_outliers))
        name_to_save = "plots/Accuracy_vs_noise_"+name_dataset+"_"+str(static_param_value)+".png"

    else:
        plt.xlabel("% of outliers")
        plt.title("Accuracy vs Outliers (with noise variance = "+str(static_param_value)+")")
        name_to_save = "plots/Accuracy_vs_outliers_"+name_dataset+"_"+str(static_param_value)+".png"

    plt.ylabel("Accuracy")
    plt.ylim(-0.2,1.1)
    plt.legend()
    # savefig does not create missing folders
    os.makedirs(os.path.dirname(name_to_save), exist_ok=True)
    plt.savefig(name_to_save)
    plt.show()


    return y_final, err_final
                        
# Example: outliers on the x axis, with the noise held at the value 20.
# NOTE(review): dict_results is whatever an earlier cell left in scope, and the
# function reads its "parameter_correspondence" key - confirm the dict has it.
a, b = plot_from_different_experiments([dict_results],"outliers",20)
a

In [None]:
# Example: combine the experiments of dict_list with the noise on the x axis,
# holding the outliers at the value 25, then display the averaged means.
a,b = plot_from_different_experiments(dict_list,"noise",25)
a

In [None]:
import numpy as np

# Scratch check: 11 evenly spaced values from 0 to 1
np.linspace(0,1,11)

In [None]:
# Scratch check: the fractions 0/num ... 9/num built without numpy
num = 10
[i/num for i in range(0,10)]

In [None]:
import networkx as nx
import os
import pickle

In [None]:
# Rebinds path to one simulated run folder (absolute, machine-specific location)
path = "/home/rohit/PhD_Work/stage_nathan/data/simu_graph/noise_1000,outliers_0/0"

In [None]:
# Load graph number 0 of the run; the context manager closes the file.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(os.path.join(path,"graphs","graph_"+str(0)+".gpickle"),'rb') as graph_file:
    graph_1 = pickle.load(graph_file)

In [None]:
# Print the version of the Python interpreter running this notebook
from platform import python_version
print(python_version())

In [None]:
# Toy data: three parallel lists used to try out sorting zipped tuples
a = [3,1,7,5,]
b = ['l','m','n','o']
c = ['p','q','r','s']

In [None]:
# Group the three lists element-wise, then sort the tuples in descending order.
# Tuples compare element by element; the values of `a` are all distinct here,
# so they alone decide the order.
zipped = zip(a, b, c)
zipped = sorted(zipped,reverse = True)
zipped

In [None]:
# Split the sorted tuples back into the second and third columns,
# printing every tuple on the way
a1 = []
a2 = []
for l,m,n in zipped:
    print(l,m,n)
    a1.append(m)
    a2.append(n)

In [None]:
a1

In [None]:
a2

In [None]:
# NOTE(review): ground_truth is not defined in the cells nearby - it presumably
# comes from a cell further up; confirm it is in scope before running.
ground_truth[19][19]

In [None]:
os.listdir(path)

In [None]:
# Print the path of every run folder: one sub folder of `path` per parameter
# set, one sub folder per run inside it.
# path_to_run keeps its last value after the loop and is reused by the next cell.
# NOTE(review): this expects `path` to be the dataset root, not a single run
# folder - check which value `path` currently holds.
for parameters_name in os.listdir(path):
    for run_name in os.listdir(os.path.join(path,parameters_name)):
        path_to_run = os.path.join(path,parameters_name,run_name)
        print(path_to_run)

In [None]:
# "graphs" sub folder of the last run visited by the listing loop
path_to_graphs = os.path.join(path_to_run,"graphs")

In [None]:
import networkx as nx
# nx.read_gpickle was removed in networkx 3.0; a .gpickle file is a plain
# pickle, so it is loaded with pickle directly.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(os.path.join(path_to_graphs, "graph_0.gpickle"), "rb") as graph_file:
    g = pickle.load(graph_file)

In [None]:
path_to_graphs

In [None]:
import pickle

In [None]:
print(pickle.format_version)

In [None]:
nx.__version__