In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import project_path

In [3]:
import glob
import os
import tempfile
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from src.plotting import make_result_heatmap, add_lines

In [4]:
# Display the registry that maps experiment ids to a short description.
pd.read_csv('../experiments.csv')

Unnamed: 0,id,Type of experiment
0,0,Local experiments
1,1589802894,SBM full tree
2,1590579968,SBM
3,1590672352,SBM test nb cuts
4,1590679038,SBM 20 cuts keep 80%
5,1590585263,Mindsets (old)
6,1590663805,Mindsets
7,1590676940,Mindsets 100-50% 20 questions
8,1590906539,Mindsets 100-50% 40 questions
9,1590686580,Mindsets 0->40 questions


In [5]:
# Id of the experiment to plot. It selects the input/output folders and is
# compared (as a string) by the `if experiment_id == ...` guards of the plot cells.
experiment_id = '1591102901'

In [6]:
# Directory that is walked for the result files of the selected experiment.
path_in = Path(f'../output/{experiment_id}').resolve()
# Scratch file that receives the concatenated results. It lives in the
# platform's temporary directory rather than a hard-coded '/tmp', so the
# notebook also runs where '/tmp' does not exist or TMPDIR points elsewhere.
path_temp = (Path(tempfile.gettempdir()) / f'{experiment_id}.csv').resolve()
# Directory that receives the figures; created on demand.
path_out = Path(f'../plots/{experiment_id}').resolve()
path_out.mkdir(parents=True, exist_ok=True)

In [7]:
# Merge every file found under `path_in` into the single CSV `path_temp`.
# The first line of the first non-empty file is kept as the header; the first
# line of every other file is dropped.
is_first_file = True
with open(path_temp, "wb") as output_file:
    for subdir, dirs, files in os.walk(path_in):
        # os.walk yields directories and files in arbitrary order. Sorting
        # `dirs` in place (honoured by the default top-down walk) and `files`
        # makes the row order of the merged file reproducible.
        dirs.sort()
        for file in sorted(files):
            input_path = os.path.join(subdir, file)
            with open(input_path, "rb") as input_file:
                header = input_file.readline()
                rows = input_file.read()

            # A zero-byte file has neither header nor rows. Skip it so it
            # cannot consume the "first file" slot and leave the output
            # without a header line.
            if not header:
                continue

            if is_first_file:
                is_first_file = False
                output_file.write(header if header.endswith(b"\n") else header + b"\n")

            # Terminate the last row so it is not fused with the first row
            # of the next file when the input lacks a trailing newline.
            if rows:
                output_file.write(rows if rows.endswith(b"\n") else rows + b"\n")

In [13]:
# Load the merged results, drop the index stored in the first CSV column and
# give the two tuning parameters readable column names.
full_df = (
    pd.read_csv(path_temp, index_col=0)
    .reset_index(drop=True)
    .rename(columns={'lb_f': 'Lower bound', 'nb_cuts': 'Number of cuts'})
)

# Number of distinct values taken by q.
len(pd.unique(full_df['q']))

9

In [36]:
block_size = 100
xs = np.arange(1, 21)
# Grid values k * log(block_size) / block_size for k in `xs`, rounded to two
# decimals. Built from `xs` so the grid length is defined in one place only.
i = (np.log(block_size) / block_size * xs).round(2)

# Not used by the visible cells; kept so the module-level name stays available.
idx = np.arange(len(xs))

# mask[a, b] is True exactly when NOT (i[a] < 2 * i[b]). Broadcasting a column
# vector against a row vector evaluates all pairs at once and replaces the
# element-by-element Python loop.
mask = ~(i[:, None] < 2 * i[None, :])

# Transpose, then order rows from the largest to the smallest grid value and
# columns from the smallest to the largest.
theory_df = (
    pd.DataFrame(mask, columns=i, index=i)
    .T
    .sort_index(ascending=False)
    .sort_index(axis=1, ascending=True)
)
values_theory = theory_df.to_numpy()
mask[:, 4]

[[False, False, False, False, False, False, True, True, True],
 [False, False, False, False, False, True, True, True, True],
 [False, False, False, False, False, True, True, True, True],
 [False, False, False, False, True, True, True, True, True],
 [False, False, False, False, True, True, True, True, True],
 [False, False, False, True, True, True, True, True, True],
 [False, False, False, True, True, True, True, True, True],
 [False, False, True, True, True, True, True, True, True],
 [False, False, True, True, True, True, True, True, True],
 [False, True, True, True, True, True, True, True, True]]

# SBM

In [40]:
if experiment_id == '1591102901':
    # One figure per lower-bound value: heatmap over (p, q) of the absolute
    # value of the per-cell mean 'Adjusted Rand Score'.
    experiments = [group for _key, group in full_df.groupby(['Lower bound'])]
    for experiment in experiments:
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{lower_bound}"

        columns = ['p', 'q', 'Adjusted Rand Score']
        x_col, y_col, value_col = columns
        data = (
            experiment[columns]
            .groupby([x_col, y_col], as_index=False)
            .mean()
            .abs()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_col,
            y_column=y_col,
            values_column=value_col,
            x_label='within-cluster connectivity $p$',
            y_label='between-cluster connectivity $q$',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        # Release the figure; many are created in this loop.
        plt.close(fig)

In [11]:
if experiment_id == '1589802894':
    # One figure per lower-bound value: heatmap over (p, q) of the mean 'ars',
    # followed by add_lines(values_theory, ...) — presumably the theory
    # boundary overlay; confirm in src.plotting.
    experiments = [frame for _lb, frame in full_df.groupby(['Lower bound'])]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        lb_value = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{lb_value}_full_tree"

        columns = ['p', 'q', 'ars']
        per_cell = experiment[columns].groupby(['p', 'q'], as_index=False)
        data = per_cell.mean()

        heatmap_options = dict(
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='within-cluster connectivity $p$',
            y_label='between-cluster connectivity $q$',
        )
        make_result_heatmap(data, ax, **heatmap_options)
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

In [12]:
if experiment_id == '1590679038':
    # One figure per (q, p) pair: heatmap of the mean 'Adjusted Rand Score'
    # over lower bound x number of cuts.
    experiments = [x for _, x in full_df.groupby(['q', 'p'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        name = f"SBM_cuts_p_{experiment['p'].iloc[0]}_q_{experiment['q'].iloc[0]}"

        columns = ['Lower bound', 'Number of cuts', 'Adjusted Rand Score']
        data = experiment[columns].groupby(['Lower bound', 'Number of cuts'], as_index=False).mean()

        # Save the aggregated table next to its figure under `path_out`. A bare
        # relative file name would land in whatever directory the kernel
        # happens to run in, away from every other output of this notebook.
        data.to_csv(f'{path_out / name}.csv', index=False)

        make_result_heatmap(data, ax, x_column=columns[0], y_column=columns[1], values_column=columns[2])
        add_lines(values_theory, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        # Release the figure; many are created in this loop.
        plt.close(fig)

id = 1589802894

In [23]:
if experiment_id == '1589802894':
    # Larger rendering (22.5 x 15) of the per-lower-bound (p, q) heatmap of the
    # mean 'ars', with plain-text axis labels.
    # NOTE(review): an earlier cell guarded by the same id saves to the same
    # file name, so this cell overwrites that file.
    experiments = [subset for _group_key, subset in full_df.groupby(['Lower bound'])]
    for experiment in experiments:
        bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{bound}_full_tree"

        columns = ['p', 'q', 'ars']
        x_name, y_name, score_name = columns
        data = experiment[columns].groupby([x_name, y_name], as_index=False).mean()

        fig, ax = plt.subplots(figsize=(22.5, 15))
        make_result_heatmap(
            data,
            ax,
            x_column=x_name,
            y_column=y_name,
            values_column=score_name,
            x_label='within-cluster probability p',
            y_label='between-cluster probability q',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

id = 1590925112

In [14]:
if experiment_id == '1590925112':
    # 'size' = the first run of digits that is followed by a comma in the
    # 'block_sizes' string, converted to int.
    first_block = full_df['block_sizes'].str.extract(r'(\d+),')
    full_df['size'] = first_block.astype(int)

    # One figure per (p, q, lower bound): mean 'Adjusted Rand Score' over
    # block size x agreement.
    experiments = [part for _keys, part in full_df.groupby(['p', 'q', 'Lower bound'])]
    for experiment in experiments:
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        lb_value = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_lb_f_{lb_value}"

        columns = ['size', 'agreement', 'Adjusted Rand Score']
        x_col, y_col, value_col = columns
        data = experiment[columns].groupby([x_col, y_col], as_index=False).mean()

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_col,
            y_column=y_col,
            values_column=value_col,
            x_label='Block size $| V_i |$',
            y_label='Agreement $a$',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590928658

In [15]:
if experiment_id == '1590928658':
    # 'size' = the first run of digits followed by a comma in 'block_sizes'.
    # NOTE(review): this column is not used by the heatmap below — confirm it
    # is still needed.
    extracted = full_df['block_sizes'].str.extract(r'(\d+),')
    full_df['size'] = extracted.astype(int)

    # One figure per (p, q, lower bound): mean 'Adjusted Rand Score' over
    # percentile of orders x agreement.
    experiments = [chunk for _combo, chunk in full_df.groupby(['p', 'q', 'Lower bound'])]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        first_row = experiment.iloc[0]
        name = f"SBM_bound_theory_p_{experiment['p'].iloc[0]}_q_{experiment['q'].iloc[0]}_lb_f_{experiment['Lower bound'].iloc[0]}"

        columns = ['percentile_orders', 'agreement', 'Adjusted Rand Score']
        axis_columns = columns[:2]
        data = experiment[columns].groupby(axis_columns, as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='percentile of orders considered',
            y_label='agreement',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590935943

In [16]:
# Distinct values of 'percentile_orders' present in the loaded results.
full_df['percentile_orders'].unique()

array([100.])

In [17]:
if experiment_id == '1590935943':
    # One figure per (p, q, percentile of orders): mean 'Adjusted Rand Score'
    # over lower bound x number of cuts.
    experiments = [group for _key, group in full_df.groupby(['p', 'q', 'percentile_orders'])]
    for experiment in experiments:
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        percentile = experiment['percentile_orders'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_per_{percentile}"

        columns = ['Lower bound', 'Number of cuts', 'Adjusted Rand Score']
        x_col, y_col, value_col = columns
        data = experiment[columns].groupby([x_col, y_col], as_index=False).mean()

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_col,
            y_column=y_col,
            values_column=value_col,
            x_label='lower bound',
            y_label='Number of cuts',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590934402

In [18]:
if experiment_id == '1590934402':
    # 'size' = the first run of digits that follows ", " in the 'block_sizes'
    # string, converted to int.
    after_comma = full_df['block_sizes'].str.extract(r', (\d+)')
    full_df['size'] = after_comma.astype(int)

    # One figure per (p, q, percentile of orders): mean 'Adjusted Rand Score'
    # over block size x lower bound.
    experiments = [frame for _keys, frame in full_df.groupby(['p', 'q', 'percentile_orders'])]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        tag_p = experiment['p'].iloc[0]
        tag_q = experiment['q'].iloc[0]
        tag_per = experiment['percentile_orders'].iloc[0]
        name = f"SBM_bound_theory_p_{tag_p}_q_{tag_q}_per_{tag_per}"

        columns = ['size', 'Lower bound', 'Adjusted Rand Score']
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        heatmap_options = dict(
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='block size $| V_i |$',
            y_label='lower bound',
        )
        make_result_heatmap(data, ax, **heatmap_options)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1591003301

In [19]:
# NOTE(review): the heading above this cell reads "id = 1591003301" while the
# guard below tests '1591014530' — confirm which id is intended.
if experiment_id == '1591014530':
    # One figure per (p, q): mean 'Adjusted Rand Score' over agreement x
    # percentile of orders.
    experiments = [subset for _pq, subset in full_df.groupby(['p', 'q'])]
    for experiment in experiments:
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}"

        columns = ['agreement', 'percentile_orders', 'Adjusted Rand Score']
        x_col, y_col, value_col = columns
        data = experiment[columns].groupby([x_col, y_col], as_index=False).mean()

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_col,
            y_column=y_col,
            values_column=value_col,
            x_label='agreement $a$',
            y_label='percentile of orders',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

In [20]:
# Display the loaded results frame (pandas truncates long frames in the output).
full_df

Unnamed: 0,agreement,ars,block_sizes,dataset_name,dataset_type,Lower bound,max_order,Number of cuts,order_best,p,percentile_orders,preprocessing_name,q,seed,unique_id
0,50.0,0.000000,"[100, 100]",sbm,graph,0.2,5160.0,50.0,3315.0,0.28,100.0,fid_mat,0.83,42.0,1589802894
1,50.0,0.000000,"[100, 100]",sbm,graph,0.3,7616.0,50.0,6087.0,0.60,100.0,fid_mat,0.92,42.0,1589802894
2,50.0,0.000000,"[100, 100]",sbm,graph,0.4,5311.0,50.0,4722.0,0.18,100.0,fid_mat,0.87,42.0,1589802894
3,50.0,0.000000,"[100, 100]",sbm,graph,0.4,4887.0,50.0,4252.0,0.28,100.0,fid_mat,0.69,42.0,1589802894
4,50.0,0.590879,"[100, 100]",sbm,graph,0.4,7805.0,50.0,7604.0,0.87,100.0,fid_mat,0.78,42.0,1589802894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,50.0,0.459964,"[100, 100]",sbm,graph,0.3,4590.0,50.0,3542.0,0.55,100.0,fid_mat,0.37,42.0,1589802894
11996,50.0,1.000000,"[100, 100]",sbm,graph,0.4,512.0,50.0,512.0,0.55,100.0,fid_mat,0.05,42.0,1589802894
11997,50.0,1.000000,"[100, 100]",sbm,graph,0.3,5292.0,50.0,3217.0,0.74,100.0,fid_mat,0.32,42.0,1589802894
11998,50.0,0.000000,"[100, 100]",sbm,graph,0.2,824.0,50.0,531.0,0.09,100.0,fid_mat,0.14,42.0,1589802894


In [21]:
if experiment_id == '1591037064':
    # One figure per (p, q): mean 'Adjusted Rand Score' over agreement x
    # percentile_orders. The "_lb_0.2" suffix of the file name is a fixed
    # string, not read from the data.
    experiments = [part for _pair, part in full_df.groupby(['p', 'q'])]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(20, 13))

        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_lb_0.2"

        columns = ['agreement', 'percentile_orders', 'Adjusted Rand Score']
        axis_columns = columns[:2]
        data = experiment[columns].groupby(axis_columns, as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='agreement $a$',
            y_label='percentile of costs',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

In [22]:
# Mean of every numeric column per 'percentile_orders', restricted to rows with
# agreement == 20. `numeric_only=True` is explicit because the frame also holds
# string columns: pandas >= 2.0 no longer drops them silently and raises a
# TypeError when asked to average them.
full_df[full_df['agreement'] == 20].groupby('percentile_orders').mean(numeric_only=True)

Unnamed: 0_level_0,agreement,ars,Lower bound,max_order,Number of cuts,order_best,p,q,seed,unique_id
percentile_orders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
