In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import project_path

In [3]:
import glob
import os

from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from src.plotting import make_result_heatmap, add_lines

In [43]:
# Display the experiment registry: one row per experiment id with a short
# description of what that run was about.
pd.read_csv('../experiments.csv')

Unnamed: 0,id,Type of experiment
0,0,Local experiments
1,1589802894,SBM full tree
2,1590579968,SBM
3,1590672352,SBM test nb cuts
4,1590679038,SBM 20 cuts keep 80%
5,1590585263,Mindsets (old)
6,1590663805,Mindsets
7,1590676940,Mindsets 100-50% 20 questions
8,1590906539,Mindsets 100-50% 40 questions
9,1590686580,Mindsets 0->40 questions


In [44]:
# Id of the experiment to process. It selects the input/output directories and,
# because every plotting cell is guarded by `if experiment_id == '<id>'`, it also
# decides which of the plotting cells below actually runs.
experiment_id = '1591047201'

In [45]:
# Locations derived from the selected experiment id (all made absolute).
path_in = Path(f'../output/{experiment_id}').resolve()  # directory tree holding the raw result files
path_temp = Path(f'/tmp/{experiment_id}.csv').resolve()  # scratch CSV that receives the merged results
path_out = Path(f'../plots/{experiment_id}').resolve()  # figures are saved here
# Create the plot directory (and missing parents); no error if it already exists.
path_out.mkdir(parents=True, exist_ok=True)

In [46]:
# Merge every result file found under path_in into the single CSV at path_temp.
# The header row is kept from the first non-empty file only; for every other
# file the first line is dropped (all files are assumed to share that header).
is_first_file = True
with open(path_temp, "wb") as output_file:
    for subdir, dirs, files in os.walk(path_in):
        # os.walk honours in-place edits of `dirs` (top-down mode), so sorting
        # here and below makes the row order reproducible across runs/machines.
        dirs.sort()
        for file in sorted(files):
            input_path = os.path.join(subdir, file)
            with open(input_path, "rb") as input_file:
                header = input_file.readline()
                if not header:
                    # Empty file: nothing to copy, and it must not use up the
                    # "first file" slot (otherwise no header is ever written).
                    continue
                rows = input_file.read()

            chunk = header + rows if is_first_file else rows
            is_first_file = False
            if chunk:
                output_file.write(chunk)
                if not chunk.endswith(b"\n"):
                    # Keep the next file's first row from being glued onto
                    # the last row of this one.
                    output_file.write(b"\n")

In [47]:
# Load the merged results. The first CSV column is read as the index and then
# thrown away in favour of a fresh 0..N-1 index.
full_df = pd.read_csv(path_temp, index_col=0).reset_index(drop=True)


In [48]:
# Derive the columns the plots are built on.
#
# The derived columns are written under their final names directly instead of
# creating a temporary 'size' column and renaming it: renaming 'size' -> 'n/k'
# on a second execution of this cell would add a *second* 'n/k' column and
# break the computation of 'n'. Written this way the cell can be re-run safely.
full_df = full_df.rename(columns={"noise": "p"})

# Number of mindsets = number of comma-separated entries in 'mindset_sizes'.
full_df['n_mindsets'] = full_df['mindset_sizes'].str.count(',') + 1

# Size of one mindset: the first number in 'mindset_sizes' that is followed by a comma.
full_df['n/k'] = full_df['mindset_sizes'].str.extract(r'(\d+),', expand=False).astype(int)

# Total number of points, assuming all mindsets have the size extracted above.
full_df['n'] = full_df['n/k'] * full_df['n_mindsets']

In [49]:
def get_mask_constant_a(df, k):
    """Build the boolean overlay mask for a (noise p) x (total size n) grid.

    The agreement is not read from the data: it is fixed to one third of the
    cluster size, ``a = (n / k) / 3``.  A cell is set when all three of
    ``p < 1 / (k + 3)``, ``a > p * n`` and ``a < (1 - 3 * p) * n / k`` hold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'p'`` and ``'n/k'``.
    k : int
        Number of mindsets; used to scale ``'n/k'`` up to the total size ``n``.

    Returns
    -------
    numpy.ndarray
        Boolean array with one row per distinct ``p`` (descending order) and
        one column per distinct total size ``n`` (ascending order).
    """
    noise_levels = np.sort(df['p'].unique())[::-1]
    totals = np.sort(df['n/k'].unique() * k)

    # Column vector of p against row vector of n: broadcasting evaluates the
    # inequalities for every (p, n) pair at once.
    p_col = noise_levels[:, np.newaxis]
    n_row = totals[np.newaxis, :]
    cluster_size = n_row / k
    agreement = cluster_size / 3

    return (
        (p_col < 1 / (k + 3))
        & (agreement > p_col * n_row)
        & (agreement < (1 - 3 * p_col) * cluster_size)
    )

id = 1590676940

In [50]:
# Experiment 1590676940: one heatmap of the mean Adjusted Rand Score over the
# (cluster size, noise) grid per parameter combination, saved as PDF.
if experiment_id == '1590676940':

    # One frame per (nb_useless, n_mindsets, percentile_orders) setting.
    experiments = [x for _, x in full_df.groupby(['nb_useless', 'n_mindsets', 'percentile_orders'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        # The grouping keys are constant within a group, so the first row is representative.
        n_mindset = experiment['n_mindsets'].iloc[0]
        name = f"Mindset_theory_n_mindsets_{n_mindset}_per_{experiment['percentile_orders'].iloc[0]}_useless_{experiment['nb_useless'].iloc[0]}"

        # Average the score over all rows that share the same (n/k, p) cell.
        columns = ['n/k', 'p', 'Adjusted Rand Score']
        data = experiment[columns].groupby(['n/k', 'p'], as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='size of the clusters n/k',
            y_label='Noise parameter p',
        )

        # This cell used to call `get_mask`, which is neither defined nor
        # imported in this notebook (NameError on a fresh kernel). The mask
        # builder for a (p, size) grid is `get_mask_constant_a`.
        mask = get_mask_constant_a(experiment, n_mindset)
        add_lines(mask, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)  # release the figure; many are created in this loop

id = 1590906539

In [51]:
# Experiment 1590906539: one heatmap of the mean Adjusted Rand Score over the
# (mindset size, noise) grid per parameter combination, saved as SVG.
if experiment_id == '1590906539':

    # One frame per (nb_useless, n_mindsets, percentile_orders) setting.
    experiments = [
        group
        for _key, group in full_df.groupby(['nb_useless', 'n_mindsets', 'percentile_orders'])
    ]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        # The grouping keys are constant within a group, so the first row is representative.
        n_mindset = experiment['n_mindsets'].iloc[0]
        name = (
            f"Mindset_theory_n_mindsets_{n_mindset}"
            f"_per_{experiment['percentile_orders'].iloc[0]}"
            f"_useless_{experiment['nb_useless'].iloc[0]}"
        )

        # Average the score over all rows that share the same (n/k, p) cell.
        columns = ['n/k', 'p', 'Adjusted Rand Score']
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='mindset size $|V_i|$',
            y_label='Noise $p$',
        )

        # Overlay the outline of the region selected by the constant-agreement mask.
        mask = get_mask_constant_a(experiment, n_mindset)
        add_lines(mask, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)  # release the figure; many are created in this loop

id = 1590686580

In [52]:
# Inspect the merged results together with the derived columns (n_mindsets, n/k, n).
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,mindset_sizes,nb_questions,nb_useless,p,Adjusted Rand Score,n_mindsets,n/k,n
0,80,mindsets,features,66,70,1591047201,discrete,"[40, 40]",10,20,0.1,0.000000,2,40,80
1,60,mindsets,features,10,90,1591047201,discrete,"[40, 40]",10,40,0.1,0.009678,2,40,80
2,20,mindsets,features,55,40,1591047201,discrete,"[40, 40]",10,10,0.2,0.000000,2,40,80
3,10,mindsets,features,10,90,1591047201,discrete,"[40, 40]",20,10,0.1,0.496734,2,40,80
4,60,mindsets,features,32,80,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6617,70,mindsets,features,21,70,1591047201,discrete,"[40, 40]",10,10,0.1,0.901253,2,40,80
6618,10,mindsets,features,21,20,1591047201,discrete,"[40, 40]",20,40,0.2,0.382817,2,40,80
6619,60,mindsets,features,32,30,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
6620,90,mindsets,features,32,50,1591047201,discrete,"[40, 40]",40,10,0.1,0.901253,2,40,80


In [53]:
# Experiment 1590919210 (unbalanced mindsets): heatmap of the mean Adjusted Rand
# Score over the (second mindset size, noise) grid, saved as PDF.
if experiment_id == '1590919210':

    # 'size' = the first number that follows a ', ' separator in 'mindset_sizes',
    # i.e. the second entry of the size list. Note that this adds a column to full_df.
    full_df['size'] = full_df['mindset_sizes'].str.extract(r', (\d+)').astype(int)
    experiments = [
        group
        for _key, group in full_df.groupby(['nb_useless', 'n_mindsets', 'percentile_orders'])
    ]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        n_mindset = experiment['n_mindsets'].iloc[0]
        # NOTE(review): the file name does not depend on the group, so if there
        # is more than one group each figure overwrites the previous one.
        name = "Mindset_theory_unbalanced"

        # Average the score over all rows that share the same (size, p) cell.
        columns = ['size', 'p', 'Adjusted Rand Score']
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='size of the smallest mindset $| V_2 |$',
            y_label='noise $p$',
        )

        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)  # release the figure once it is on disk

In [54]:
# Experiment 1590686580: one heatmap of the mean Adjusted Rand Score over the
# (number of questions, noise) grid per parameter combination, saved as PDF.
if experiment_id == '1590686580':

    # One frame per (nb_useless, n_mindsets, percentile_orders) setting.
    experiments = [
        group
        for _key, group in full_df.groupby(['nb_useless', 'n_mindsets', 'percentile_orders'])
    ]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(30, 20))

        # The grouping keys are constant within a group, so the first row is representative.
        n_mindset = experiment['n_mindsets'].iloc[0]
        name = (
            f"Mindset_theory_n_mindsets_{n_mindset}"
            f"_per_{experiment['percentile_orders'].iloc[0]}"
            f"_useless_{experiment['nb_useless'].iloc[0]}"
        )

        # Average the score over all rows that share the same (nb_questions, p) cell.
        columns = ['nb_questions', 'p', 'Adjusted Rand Score']
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='number of questions $m$',
            y_label='noise $p$',
        )

        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)  # release the figure; many are created in this loop

In [55]:
def get_mask_variable_a(df, k):
    """Build the boolean mask over a (cluster size n/k) x (agreement a) grid.

    For the single noise level ``p`` found in ``df`` a cell is set when all of
    ``p < 1 / (k + 3)``, ``a > p * n`` and ``a < (1 - 3 * p) * n / k`` hold,
    with ``n = (n/k) * k``.

    Changes with respect to the previous version:

    * ``p`` is extracted as a scalar. Before, the whole ``unique()`` array was
      combined with ``and``, which only worked when it held exactly one value
      (and then relied on a size-1 array being treated as a scalar).
    * ``n`` is computed from ``k`` instead of a hard-coded factor 2; results
      for ``k == 2`` are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'p'``, ``'n/k'`` and ``'agreement'``.
    k : int
        Number of mindsets.

    Returns
    -------
    numpy.ndarray
        Boolean array with one row per distinct ``n/k`` (ascending order) and
        one column per distinct agreement (descending order).
        NOTE(review): rows are indexed by ``n/k`` here, whereas
        ``get_mask_constant_a`` indexes rows by the heatmap's y quantity; the
        only caller has ``add_lines`` commented out - confirm the expected
        orientation before re-enabling it.

    Raises
    ------
    ValueError
        If ``df`` contains more than one distinct value of ``p``.
    """
    noise_levels = df['p'].unique()
    if len(noise_levels) > 1:
        raise ValueError(
            "get_mask_variable_a expects a single noise level p, "
            f"got {len(noise_levels)} distinct values"
        )

    n_ks = np.sort(df['n/k'].unique())
    As = np.sort(df['agreement'].unique())[::-1]
    mask = np.zeros((len(n_ks), len(As)), dtype=bool)

    if len(noise_levels) == 0:
        # Empty input: nothing to evaluate.
        return mask

    p = noise_levels[0]
    if not p < 1 / (k + 3):
        # The noise condition does not depend on the grid cell: all False.
        return mask

    # Column vector of n/k against row vector of a: broadcasting evaluates the
    # two remaining inequalities for every grid cell at once.
    cluster_size = n_ks[:, np.newaxis]
    agreement = As[np.newaxis, :]
    total_size = cluster_size * k

    mask[:] = (agreement > p * total_size) & (agreement < (1 - 3 * p) * cluster_size)
    return mask

In [56]:
# Experiments 1590906222 / 1591017278: one heatmap of the mean Adjusted Rand
# Score over the (mindset size, agreement) grid per noise level, saved as SVG.
if experiment_id in ('1590906222', '1591017278'):
    # One frame per noise level p.
    experiments = [group for _key, group in full_df.groupby(['p'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        # Values of the group's first row are used to label the output file.
        n_mindset = experiment['n_mindsets'].iloc[0]
        name = (
            f"Mindset_theory_n_mindsets_{n_mindset}"
            f"_noise_{experiment['p'].iloc[0]}"
        )

        # Average the score over all rows that share the same (n/k, agreement)
        # cell; missing means are replaced by 0.
        columns = ['n/k', 'agreement', 'Adjusted Rand Score']
        data = (
            experiment[columns]
            .groupby(columns[:2], as_index=False)
            .mean()
            .fillna(0)
        )

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label=r'mindset size $| V_i |$',
            y_label='Agreement $a$',
        )

        # The mask is only printed for inspection; drawing it is currently disabled.
        mask = get_mask_variable_a(experiment, n_mindset)
        print(mask)
        #add_lines(mask, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)  # release the figure; one is created per noise level

In [57]:
# Inspect the results frame again after the plotting cells above.
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,mindset_sizes,nb_questions,nb_useless,p,Adjusted Rand Score,n_mindsets,n/k,n
0,80,mindsets,features,66,70,1591047201,discrete,"[40, 40]",10,20,0.1,0.000000,2,40,80
1,60,mindsets,features,10,90,1591047201,discrete,"[40, 40]",10,40,0.1,0.009678,2,40,80
2,20,mindsets,features,55,40,1591047201,discrete,"[40, 40]",10,10,0.2,0.000000,2,40,80
3,10,mindsets,features,10,90,1591047201,discrete,"[40, 40]",20,10,0.1,0.496734,2,40,80
4,60,mindsets,features,32,80,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6617,70,mindsets,features,21,70,1591047201,discrete,"[40, 40]",10,10,0.1,0.901253,2,40,80
6618,10,mindsets,features,21,20,1591047201,discrete,"[40, 40]",20,40,0.2,0.382817,2,40,80
6619,60,mindsets,features,32,30,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
6620,90,mindsets,features,32,50,1591047201,discrete,"[40, 40]",40,10,0.1,0.901253,2,40,80


In [58]:
# Same inspection of the results frame as in the previous cell.
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,mindset_sizes,nb_questions,nb_useless,p,Adjusted Rand Score,n_mindsets,n/k,n
0,80,mindsets,features,66,70,1591047201,discrete,"[40, 40]",10,20,0.1,0.000000,2,40,80
1,60,mindsets,features,10,90,1591047201,discrete,"[40, 40]",10,40,0.1,0.009678,2,40,80
2,20,mindsets,features,55,40,1591047201,discrete,"[40, 40]",10,10,0.2,0.000000,2,40,80
3,10,mindsets,features,10,90,1591047201,discrete,"[40, 40]",20,10,0.1,0.496734,2,40,80
4,60,mindsets,features,32,80,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6617,70,mindsets,features,21,70,1591047201,discrete,"[40, 40]",10,10,0.1,0.901253,2,40,80
6618,10,mindsets,features,21,20,1591047201,discrete,"[40, 40]",20,40,0.2,0.382817,2,40,80
6619,60,mindsets,features,32,30,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
6620,90,mindsets,features,32,50,1591047201,discrete,"[40, 40]",40,10,0.1,0.901253,2,40,80


In [59]:
# Experiment 1590928392: one heatmap of the mean Adjusted Rand Score over the
# (percentile of orders, agreement) grid per value of nb_useless, saved as PDF.
if experiment_id == '1590928392':
    # One frame per value of nb_useless.
    experiments = [group for _key, group in full_df.groupby(['nb_useless'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        # Here `n_mindset` holds the raw 'mindset_sizes' entry of the group's
        # first row (not a count); that text is what ends up in the file name.
        n_mindset = experiment['mindset_sizes'].iloc[0]
        name = (
            f"Mindset_theory_n_mindsets_{n_mindset}"
            f"_nb_useless_{experiment['nb_useless'].iloc[0]}"
        )

        # Average the score over all rows that share the same
        # (percentile_orders, agreement) cell; missing means are replaced by 0.
        columns = ['percentile_orders', 'agreement', 'Adjusted Rand Score']
        data = (
            experiment[columns]
            .groupby(columns[:2], as_index=False)
            .mean()
            .fillna(0)
        )

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='percentile of orders considered',
            y_label='agreement',
        )

        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)  # release the figure; one is created per group

1591003606

In [60]:
# Inspect the results frame before the final plotting cell.
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,mindset_sizes,nb_questions,nb_useless,p,Adjusted Rand Score,n_mindsets,n/k,n
0,80,mindsets,features,66,70,1591047201,discrete,"[40, 40]",10,20,0.1,0.000000,2,40,80
1,60,mindsets,features,10,90,1591047201,discrete,"[40, 40]",10,40,0.1,0.009678,2,40,80
2,20,mindsets,features,55,40,1591047201,discrete,"[40, 40]",10,10,0.2,0.000000,2,40,80
3,10,mindsets,features,10,90,1591047201,discrete,"[40, 40]",20,10,0.1,0.496734,2,40,80
4,60,mindsets,features,32,80,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6617,70,mindsets,features,21,70,1591047201,discrete,"[40, 40]",10,10,0.1,0.901253,2,40,80
6618,10,mindsets,features,21,20,1591047201,discrete,"[40, 40]",20,40,0.2,0.382817,2,40,80
6619,60,mindsets,features,32,30,1591047201,discrete,"[40, 40]",40,40,0.1,0.807589,2,40,80
6620,90,mindsets,features,32,50,1591047201,discrete,"[40, 40]",40,10,0.1,0.901253,2,40,80


In [61]:
# Experiments 1591046047 / 1591047201: one heatmap of the mean Adjusted Rand
# Score over the (percentile of orders, agreement) grid per combination of
# noise, number of questions and number of useless questions, saved as SVG.
if experiment_id in ('1591046047', '1591047201'):
    # One frame per (p, nb_questions, nb_useless) setting.
    experiments = [
        group
        for _key, group in full_df.groupby(['p', 'nb_questions', 'nb_useless'])
    ]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(22.5, 15))

        # Raw 'mindset_sizes' entry of the group's first row (not used in the file name).
        n_mindset = experiment['mindset_sizes'].iloc[0]
        # The grouping keys are constant within a group, so the first row is representative.
        name = (
            f"Mindset_a_vs_max_noise_{experiment['p'].iloc[0]}"
            f"_nb_questions_{experiment['nb_questions'].iloc[0]}"
            f"_nb_useless_{experiment['nb_useless'].iloc[0]}"
        )

        # Average the score over all rows that share the same
        # (percentile_orders, agreement) cell; missing means are replaced by 0.
        columns = ['percentile_orders', 'agreement', 'Adjusted Rand Score']
        data = (
            experiment[columns]
            .groupby(columns[:2], as_index=False)
            .mean()
            .fillna(0)
        )

        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            y_label=r'agreement $a$',
            x_label=r'percentile of orders considered',
        )

        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)  # release the figure; many are created in this loop