In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import project_path

In [4]:
import glob
import os

from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from src.plotting import make_result_heatmap, add_lines

In [5]:
# Show the registry of experiments stored next to the notebook directory
# (one row per experiment: its id and a short description of its type).
pd.read_csv('../experiments.csv')

Unnamed: 0,id,Type of experiment
0,0,Local experiments
1,1589802894,SBM full tree
2,1590579968,SBM
3,1590672352,SBM test nb cuts
4,1590679038,SBM 20 cuts keep 80%
5,1590585263,Mindsets (old)
6,1590663805,Mindsets
7,1590676940,Mindsets 100-50% 20 questions
8,1590906539,Mindsets 100-50% 40 questions
9,1590686580,Mindsets 0->40 questions


In [17]:
# Id of the experiment to analyse. It selects the ../output/<id> input folder
# and the ../plots/<id> output folder, and decides which of the
# `if experiment_id == ...` plotting cells below actually runs.
experiment_id = '1591037064'

In [18]:
# Locations derived from the selected experiment id:
#   path_in   - folder holding the raw result files of the experiment
#   path_temp - scratch CSV into which all result files are merged
#   path_out  - folder receiving the generated plots (created on demand)
path_in = (Path('..') / 'output' / f'{experiment_id}').resolve()
path_temp = (Path('/tmp') / f'{experiment_id}.csv').resolve()
path_out = (Path('..') / 'plots' / f'{experiment_id}').resolve()
path_out.mkdir(parents=True, exist_ok=True)

In [19]:
# Merge every file found below path_in into a single CSV at path_temp.
#
# The first non-empty file is copied verbatim, header line included. For every
# later file the first line is dropped, which assumes that all files start with
# the same one-line CSV header.
#
# Files are processed in sorted order so the merged CSV is identical from run
# to run (os.walk does not guarantee a stable order).
result_paths = sorted(
    os.path.join(subdir, file)
    for subdir, _dirs, files in os.walk(path_in)
    for file in files
)

header_written = False
with open(path_temp, "wb") as output_file:
    for input_path in result_paths:
        with open(input_path, "rb") as input_file:
            if header_written:
                # Skip the repeated header; the default avoids a StopIteration
                # when the file is completely empty.
                next(input_file, None)
            content = input_file.read()

        if not content:
            # Nothing to copy (empty file, or a file holding only its header).
            continue

        header_written = True
        output_file.write(content)
        if not content.endswith(b"\n"):
            # Keep the last row of this file from being glued to the first
            # row of the next one.
            output_file.write(b"\n")

In [20]:
# Load the merged results, discard the per-file index stored in the first CSV
# column in favour of a fresh 0..n-1 index, and give two columns readable
# names for use as plot labels.
full_df = (
    pd.read_csv(path_temp, index_col=0)
    .reset_index(drop=True)
    .rename(columns={'lb_f': 'Lower bound', 'nb_cuts': 'Number of cuts'})
)

In [21]:
# Build the boolean "theory" grid that is overlaid on the p/q heatmaps.
block_size = 100
xs = np.arange(1, 21)
# Grid levels: log(block_size) / block_size times 1..20, rounded to 2 decimals.
i = (xs * (np.log(block_size) / block_size)).round(2)

idx = np.arange(xs.size)

# mask[a, b] is True exactly when level a is NOT smaller than twice level b,
# evaluated for all pairs at once via broadcasting.
mask = ~(i[:, np.newaxis] < 2 * i[np.newaxis, :])

# Label both axes with the levels, transpose, then order rows from the largest
# level to the smallest and columns from the smallest to the largest.
theory_df = pd.DataFrame(mask, index=i, columns=i).T
theory_df = theory_df.sort_index(ascending=False)
theory_df = theory_df.sort_index(axis=1, ascending=True)
values_theory = theory_df.to_numpy()
mask[:, 4]

array([False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

# SBM

In [10]:
# Experiment 1590579968 (listed as "SBM" in experiments.csv): for every
# 'Lower bound' value, draw a heatmap of the mean Adjusted Rand Score over the
# (p, q) grid, overlay values_theory and save the figure as PDF in path_out.
if experiment_id == '1590579968':
    experiments = [x for _, x in full_df.groupby(['Lower bound'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(22.5, 15))

        # All rows of a group share one lower bound, so the first row names the file.
        name = f"SBM_bound_theory_lb_f_{experiment['Lower bound'].iloc[0]}"

        columns = ['p', 'q', 'Adjusted Rand Score']
        # Average the score over all rows that share the same (p, q) pair.
        data = experiment[columns].groupby(['p', 'q'], as_index=False).mean()

        make_result_heatmap(data, ax, x_column=columns[0], y_column=columns[1], values_column=columns[2], x_label='within-cluster connectivity $p$', y_label='between-cluster connectivity $q$')
        # Overlay the theory grid once; a second, identical call was redundant.
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        # Free the figure; many figures are created in this loop.
        plt.close(fig)

In [71]:
# Experiment 1589802894 (listed as "SBM full tree" in experiments.csv): one
# heatmap per 'Lower bound' value showing the mean of the 'ars' column over the
# (p, q) grid, with values_theory overlaid; saved as SVG in path_out.
if experiment_id == '1589802894':
    for _, experiment in full_df.groupby(['Lower bound']):
        # Every row of the group carries the same lower bound.
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{lower_bound}_full_tree"

        x_column, y_column, values_column = 'p', 'q', 'ars'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='within-cluster connectivity $p$',
            y_label='between-cluster connectivity $q$',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

In [72]:
# Experiment 1590679038 (listed as "SBM 20 cuts keep 80%" in experiments.csv):
# for each (q, p) pair, a heatmap of the mean Adjusted Rand Score over
# 'Lower bound' x 'Number of cuts'. The averaged table is also written as CSV.
if experiment_id == '1590679038':
    for _, experiment in full_df.groupby(['q', 'p']):
        # p and q are constant inside a group; the first row supplies them.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_cuts_p_{p_value}_q_{q_value}"

        x_column, y_column, values_column = 'Lower bound', 'Number of cuts', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        # NOTE(review): this CSV is written to the current working directory,
        # not to path_out like the figure -- confirm that this is intended.
        data.to_csv(f'{name}.csv', index=False)

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
        )
        add_lines(values_theory, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1589802894

In [73]:
# Experiment 1589802894 again, this time exported as PDF and with
# "probability" axis labels: one heatmap per 'Lower bound' value of the mean
# 'ars' column over the (p, q) grid, with values_theory overlaid.
if experiment_id == '1589802894':
    for _, experiment in full_df.groupby(['Lower bound']):
        # Every row of the group carries the same lower bound.
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{lower_bound}_full_tree"

        x_column, y_column, values_column = 'p', 'q', 'ars'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='within-cluster probability p',
            y_label='between-cluster probability q',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590925112

In [74]:
# Experiment 1590925112: for each (p, q, lower bound) combination, a heatmap of
# the mean Adjusted Rand Score over block size x agreement, with values_theory
# overlaid; saved as PDF in path_out.
if experiment_id == '1590925112':
    # 'size' = the first integer that is followed by a comma in block_sizes
    # (e.g. 100 for the string "[100, 100]").
    full_df['size'] = full_df['block_sizes'].str.extract(r'(\d+),').astype(int)

    for _, experiment in full_df.groupby(['p', 'q', 'Lower bound']):
        # The grouping keys are constant inside a group; read them from row 0.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_lb_f_{lower_bound}"

        x_column, y_column, values_column = 'size', 'agreement', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='Block size $| V_i |$',
            y_label='Agreement $a$',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590928658

In [75]:
# Experiment 1590928658: for each (p, q, lower bound) combination, a heatmap of
# the mean Adjusted Rand Score over percentile_orders x agreement, with
# values_theory overlaid; saved as PDF in path_out.
if experiment_id == '1590928658':
    # 'size' = the first integer that is followed by a comma in block_sizes.
    # NOTE(review): this column is added to full_df but not used by the plot
    # in this cell -- confirm whether it is still needed.
    full_df['size'] = full_df['block_sizes'].str.extract(r'(\d+),').astype(int)

    for _, experiment in full_df.groupby(['p', 'q', 'Lower bound']):
        # The grouping keys are constant inside a group; read them from row 0.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_lb_f_{lower_bound}"

        x_column, y_column, values_column = 'percentile_orders', 'agreement', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='percentile of orders considered',
            y_label='agreement',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590935943

In [76]:
# List the distinct percentile_orders values present in the loaded results.
full_df['percentile_orders'].unique()

array([ 80, 100])

In [77]:
# Experiment 1590935943: for each (p, q, percentile_orders) combination, a
# heatmap of the mean Adjusted Rand Score over 'Lower bound' x 'Number of cuts';
# saved as PDF in path_out.
if experiment_id == '1590935943':
    for _, experiment in full_df.groupby(['p', 'q', 'percentile_orders']):
        # The grouping keys are constant inside a group; read them from row 0.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        percentile = experiment['percentile_orders'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_per_{percentile}"

        x_column, y_column, values_column = 'Lower bound', 'Number of cuts', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='lower bound',
            y_label='Number of cuts',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1590934402

In [78]:
# Experiment 1590934402: for each (p, q, percentile_orders) combination, a
# heatmap of the mean Adjusted Rand Score over block size x 'Lower bound';
# saved as PDF in path_out.
if experiment_id == '1590934402':
    # 'size' = the first integer that follows ", " in block_sizes, i.e. the
    # second entry of a string such as "[100, 100]".
    full_df['size'] = full_df['block_sizes'].str.extract(r', (\d+)').astype(int)

    for _, experiment in full_df.groupby(['p', 'q', 'percentile_orders']):
        # The grouping keys are constant inside a group; read them from row 0.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        percentile = experiment['percentile_orders'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}_per_{percentile}"

        x_column, y_column, values_column = 'size', 'Lower bound', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='block size $| V_i |$',
            y_label='lower bound',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

id = 1591003301 (note: the cell below is guarded by experiment id 1591014530; confirm which id is intended)

In [79]:
# Experiment 1591014530: for each (p, q) pair, a heatmap of the mean Adjusted
# Rand Score over agreement x percentile_orders; saved as PDF in path_out.
if experiment_id == '1591014530':
    for _, experiment in full_df.groupby(['p', 'q']):
        # p and q are constant inside a group; the first row supplies them.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}"

        x_column, y_column, values_column = 'agreement', 'percentile_orders', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='agreement $a$',
            y_label='percentile of orders',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

In [11]:
# Display the merged results frame to check which columns and values were loaded.
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,block_sizes,p,q,Number of cuts,Lower bound,Adjusted Rand Score
0,40,sbm,fid_mat,20,40,1591029988,graph,"[100, 100]",0.3,0.4,20,0.2,-0.000883
1,70,sbm,fid_mat,40,90,1591029988,graph,"[100, 100]",0.6,0.4,20,0.2,0.000000
2,70,sbm,fid_mat,25,90,1591029988,graph,"[100, 100]",0.3,0.4,20,0.2,-0.000084
3,70,sbm,fid_mat,20,10,1591029988,graph,"[100, 100]",0.3,0.1,20,0.2,0.165366
4,100,sbm,fid_mat,30,10,1591029988,graph,"[100, 100]",0.3,0.1,20,0.2,0.173644
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,10,sbm,fid_mat,45,40,1591029988,graph,"[100, 100]",0.6,0.1,20,0.2,1.000000
3596,50,sbm,fid_mat,25,20,1591029988,graph,"[100, 100]",0.3,0.1,20,0.2,0.740963
3597,50,sbm,fid_mat,35,30,1591029988,graph,"[100, 100]",0.6,0.1,20,0.2,0.750032
3598,70,sbm,fid_mat,10,70,1591029988,graph,"[100, 100]",0.6,0.4,20,0.2,0.475605


In [23]:
# Experiment 1591037064 (the id currently selected at the top of the notebook):
# for each (p, q) pair, a heatmap of the mean Adjusted Rand Score over
# agreement x percentile_orders; saved as PDF in path_out.
if experiment_id == '1591037064':
    for _, experiment in full_df.groupby(['p', 'q']):
        # p and q are constant inside a group; the first row supplies them.
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_bound_theory_p_{p_value}_q_{q_value}"

        x_column, y_column, values_column = 'agreement', 'percentile_orders', 'Adjusted Rand Score'
        data = (
            experiment[[x_column, y_column, values_column]]
            .groupby([x_column, y_column], as_index=False)
            .mean()
        )

        fig, ax = plt.subplots(figsize=(22.5, 15))
        make_result_heatmap(
            data,
            ax,
            x_column=x_column,
            y_column=y_column,
            values_column=values_column,
            x_label='agreement $a$',
            y_label='percentile of orders',
        )
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

In [32]:
# Per percentile_orders value, the mean of every numeric column over the rows
# with agreement == 20.
# numeric_only=True leaves out the string columns (dataset_name, block_sizes,
# ...); without it pandas >= 2.0 raises a TypeError instead of dropping them.
full_df[full_df['agreement'] == 20].groupby('percentile_orders').mean(numeric_only=True)

Unnamed: 0_level_0,seed,agreement,unique_id,p,q,Number of cuts,Lower bound,Adjusted Rand Score
percentile_orders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,55,20,1591037064,0.3,0.1,20,0.2,0.571191
20,55,20,1591037064,0.3,0.1,20,0.2,0.725839
30,55,20,1591037064,0.3,0.1,20,0.2,0.675591
40,55,20,1591037064,0.3,0.1,20,0.2,0.550799
50,55,20,1591037064,0.3,0.1,20,0.2,0.520976
60,55,20,1591037064,0.3,0.1,20,0.2,0.493462
70,55,20,1591037064,0.3,0.1,20,0.2,0.532705
80,55,20,1591037064,0.3,0.1,20,0.2,0.536526
90,55,20,1591037064,0.3,0.1,20,0.2,0.531514
100,55,20,1591037064,0.3,0.1,20,0.2,0.543566
