In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import project_path

In [3]:
import glob
import os
import tempfile

from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from src.plotting import make_result_heatmap, add_lines

In [4]:
# Display the catalogue of experiment runs: each row pairs a run id with a short
# description of the experiment type. The ids listed here are the values that
# `experiment_id` is set to further down.
pd.read_csv('../experiments.csv')

Unnamed: 0,id,Type of experiment
0,0,Local experiments
1,1590579968,SBM
2,1590672352,SBM test nb cuts
3,1590679038,SBM 20 cuts keep 80%
4,1590585263,Mindsets (old)
5,1590663805,Mindsets
6,1590669671,Mindsets better resolution
7,1590676940,Mindsets better resolution 100-50%
8,1590680669,Mindsets better resolution 80%
9,1590686580,Mindsets 0->40 questions


In [9]:
# Id of the experiment run to process. It names the input directory
# (../output/<id>), the plot directory (../plots/<id>) and the temporary merged
# CSV, and the plotting cells compare against it to decide whether to run.
experiment_id = '1590579968'

In [10]:
# Directory holding the raw result files of the selected experiment.
path_in = Path(f'../output/{experiment_id}').resolve()
# Scratch file that receives the merged results. It lives in the platform's
# temporary directory instead of a hard-coded '/tmp', so the notebook also runs
# on systems where '/tmp' does not exist (or where TMPDIR points elsewhere).
path_temp = (Path(tempfile.gettempdir()) / f'{experiment_id}.csv').resolve()
# Directory the figures are saved to; created (with parents) if missing.
path_out = Path(f'../plots/{experiment_id}').resolve()
path_out.mkdir(parents=True, exist_ok=True)

In [11]:
# Concatenate every file found under path_in into the single CSV at path_temp.
# The header line is written once (taken from the first non-empty file); for all
# other files the first line is dropped and only the data rows are appended.
is_first_file = True
with open(path_temp, "wb") as output_file:
    for subdir, _dirs, files in os.walk(path_in):
        for file in files:
            input_path = os.path.join(subdir, file)
            with open(input_path, "rb") as input_file:
                header = input_file.readline()
                body = input_file.read()

            if not header:
                # Empty file: nothing to merge. (Skipping the header with next()
                # would raise StopIteration here, and an empty *first* file
                # would leave the merged CSV without a header.)
                continue

            if is_first_file:
                is_first_file = False
                output_file.write(header if header.endswith(b"\n") else header + b"\n")

            if body:
                # Guarantee a trailing newline so the last row of this file is
                # not fused with the first row of the next one.
                output_file.write(body if body.endswith(b"\n") else body + b"\n")

In [12]:
# Load the merged results, throw away the stored row index in favour of a fresh
# 0..n-1 index, and give the two swept parameters the column names that the
# plotting cells refer to.
full_df = (
    pd.read_csv(path_temp, index_col=0)
    .reset_index(drop=True)
    .rename(columns={'lb_f': 'Lower bound', 'nb_cuts': 'Number of cuts'})
)
# Show which q values occur in this run.
full_df['q'].unique()

array([0.51, 0.78, 0.64, 0.69, 0.37, 0.09, 0.87, 0.46, 0.55, 0.32, 0.14,
       0.23, 0.05, 0.18, 0.92, 0.6 , 0.28, 0.74, 0.83, 0.41])

# SBM

### Plot SBM for 2 blocks with theoretical bounds (id = 1590579968) and the cuts sweep (id = 1590679038)

In [36]:
# Boolean table that is later handed to add_lines() as the theory overlay.
block_size = 100
xs = np.arange(1, 21)
# Grid values: the first 20 multiples of log(block_size) / block_size, rounded
# to two decimals.
i = (np.log(block_size) / block_size * xs).round(2)

idx = np.arange(len(xs))
# mask[r, c] is True wherever i[r] is NOT smaller than 2 * i[c]. Comparing a
# column of i against a row of 2 * i fills the whole table in one step.
mask = ~(i[:, np.newaxis] < 2 * i[np.newaxis, :])

# Label both axes with the grid values, transpose, then order the rows from
# largest to smallest label and the columns from smallest to largest.
theory_df = pd.DataFrame(mask, index=i, columns=i).T
theory_df = theory_df.sort_index(ascending=False)
theory_df = theory_df.sort_index(axis=1, ascending=True)
values_theory = theory_df.to_numpy()
mask[:, 4]

array([False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [30]:
if experiment_id == '1590579968':
    # One figure per 'Lower bound' value: the Adjusted Rand Score averaged over
    # all rows sharing the same (p, q), drawn as a heatmap with the theory
    # overlay on top.
    columns = ['p', 'q', 'Adjusted Rand Score']
    for _, experiment in full_df.groupby(['Lower bound']):
        lower_bound = experiment['Lower bound'].iloc[0]
        name = f"SBM_bound_theory_lb_f_{lower_bound}"

        # Collapse repeated runs of the same (p, q) into their mean score.
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
            x_label='in-group probability p',
            y_label='out-group probability q',
        )
        add_lines(values_theory, ax, plot_first=True)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        # Release the figure so looping over many groups does not pile up memory.
        plt.close(fig)

In [None]:
if experiment_id == '1590679038':
    # One figure per (q, p) pair: the Adjusted Rand Score averaged over all rows
    # sharing the same ('Lower bound', 'Number of cuts'), drawn as a heatmap.
    columns = ['Lower bound', 'Number of cuts', 'Adjusted Rand Score']
    for _, experiment in full_df.groupby(['q', 'p']):
        p_value = experiment['p'].iloc[0]
        q_value = experiment['q'].iloc[0]
        name = f"SBM_cuts_p_{p_value}_q_{q_value}"

        # Collapse repeated runs of the same parameter pair into their mean score.
        data = experiment[columns].groupby(columns[:2], as_index=False).mean()

        # NOTE(review): the relative file name puts this CSV in the current
        # working directory, not in path_out next to the PDF - confirm intended.
        data.to_csv(f'{name}.csv', index=False)

        fig, ax = plt.subplots(figsize=(15, 10))
        make_result_heatmap(
            data,
            ax,
            x_column=columns[0],
            y_column=columns[1],
            values_column=columns[2],
        )
        add_lines(values_theory, ax)
        ax.set_axisbelow(True)
        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        # Release the figure so looping over many groups does not pile up memory.
        plt.close(fig)

### Plot SBM for 2/3 blocks and unbalanced id = 1587919855

In [76]:
# Heatmaps of the mean 'homogeneity' over (p, q) for the 2/3-block and unbalanced
# SBM run: one figure per (block_sizes, 'Lower bound', agreement) combination.
# The run is selected through `experiment_id`, the only selector this notebook
# defines (`experiment_name` does not exist and raised NameError on a fresh run).
if experiment_id == '1587919855':

    # Keep only the 'fid_mat' preprocessing. This narrows the notebook-wide
    # full_df, so cells executed afterwards see the filtered frame as well.
    full_df = full_df[full_df['preprocessing_name'] == 'fid_mat']
    # The 'lb_f' column is called 'Lower bound' after the rename done at load time.
    experiments = [x for _, x in full_df.groupby(['block_sizes', 'Lower bound', 'agreement'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(16.18, 10))

        title = f"SBM with {experiment['block_sizes'].iloc[0]} blocks using nb_cuts = 50, lb_f = {experiment['Lower bound'].iloc[0]} and a = {experiment['agreement'].astype(int).iloc[0]}"
        name = f"block_sizes_{experiment['block_sizes'].iloc[0]}_a_{experiment['agreement'].iloc[0]}_lb_f_{experiment['Lower bound'].iloc[0]}"

        # NOTE(review): assumes the results of this run carry a 'homogeneity'
        # column - confirm against the experiment's CSV files.
        columns = ['p', 'q', 'homogeneity']
        # Average repeated runs of the same (p, q), as the other plotting cells
        # do before handing data to the heatmap helper.
        data = experiment[columns].groupby(['p', 'q'], as_index=False).mean()
        # Call the helper the way the other plotting cells do (axes second,
        # columns by keyword); the title is set on the axes directly.
        make_result_heatmap(data, ax, x_column=columns[0], y_column=columns[1], values_column=columns[2])
        ax.set_title(title)

        fig.tight_layout()

        fig.savefig(f'{path_out / name}.pdf')
        plt.close(fig)

### Plot SBM for 4 blocks id = 1587918440

In [27]:
# Heatmaps of the mean 'homogeneity' over (p, q) for the 4-block SBM run: one
# figure per (block_sizes, 'Lower bound', agreement) combination.
# The run is selected through `experiment_id`, the only selector this notebook
# defines (`experiment_name` does not exist and raised NameError on a fresh run).
if experiment_id == '1587918440':

    # Keep only the 'fid_mat' preprocessing. This narrows the notebook-wide
    # full_df, so cells executed afterwards see the filtered frame as well.
    full_df = full_df[full_df['preprocessing_name'] == 'fid_mat']
    # The 'lb_f' column is called 'Lower bound' after the rename done at load time.
    experiments = [x for _, x in full_df.groupby(['block_sizes', 'Lower bound', 'agreement'])]
    for experiment in experiments:

        fig, ax = plt.subplots(figsize=(15, 10))

        title = f"SBM with {experiment['block_sizes'].iloc[0]} blocks using nb_cuts = 50, lb_f = {experiment['Lower bound'].iloc[0]} and a = {experiment['agreement'].astype(int).iloc[0]}"
        name = f"block_sizes_{experiment['block_sizes'].iloc[0]}_a_{experiment['agreement'].iloc[0]}_lb_f_{experiment['Lower bound'].iloc[0]}"

        # NOTE(review): assumes the results of this run carry a 'homogeneity'
        # column - confirm against the experiment's CSV files.
        columns = ['p', 'q', 'homogeneity']
        # Average repeated runs of the same (p, q), as the other plotting cells
        # do before handing data to the heatmap helper.
        data = experiment[columns].groupby(['p', 'q'], as_index=False).mean()
        # Call the helper the way the other plotting cells do (axes second,
        # columns by keyword); the title is set on the axes directly.
        make_result_heatmap(data, ax, x_column=columns[0], y_column=columns[1], values_column=columns[2])
        ax.set_title(title)

        fig.tight_layout()

        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

In [78]:
# Mean Adjusted Rand Score for every (block_sizes, p, q) combination, rounded to
# two decimals. The score column is selected *before* aggregating: taking the
# mean of the whole frame would also try to average the string columns
# (dataset_name, preprocessing_name, ...), which raises TypeError on
# pandas >= 2.0, and would needlessly average every other numeric column.
full_df.groupby(['block_sizes', 'p', 'q'])['Adjusted Rand Score'].mean().round(2)

block_sizes           p    q  
[100, 100, 100, 100]  0.3  0.1    0.53
                           0.4   -0.00
                      0.6  0.1    0.54
                           0.4    0.51
[100, 100, 100]       0.3  0.1    0.61
                           0.4   -0.00
                      0.6  0.1    0.97
                           0.4    0.67
[50, 75, 100, 100]    0.3  0.1    0.60
                           0.4   -0.00
                      0.6  0.1    0.77
                           0.4    0.61
[50, 75, 100]         0.3  0.1    0.90
                           0.4   -0.01
                      0.6  0.1    0.95
                           0.4    0.62
Name: Adjusted Rand Score, dtype: float64

In [73]:
# Display the current result table for a visual check; the rendered output shows
# the first and last rows with the middle elided.
full_df

Unnamed: 0,seed,dataset_name,preprocessing_name,agreement,percentile_orders,unique_id,dataset_type,block_sizes,p,q,Number of cuts,Lower bound,Adjusted Rand Score
0,20,sbm,fid_mat,50,80,1590690879,graph,"[100, 100, 100]",0.6,0.1,20,0.2,0.941444
1,50,sbm,fid_mat,50,80,1590690879,graph,"[100, 100, 100]",0.6,0.1,20,0.2,1.000000
2,30,sbm,fid_mat,50,80,1590690879,graph,"[100, 100, 100]",0.3,0.1,20,0.2,0.920756
3,10,sbm,fid_mat,25,80,1590690879,graph,"[50, 75, 100]",0.3,0.1,20,0.2,0.900316
4,30,sbm,fid_mat,25,80,1590690879,graph,"[50, 75, 100]",0.6,0.4,20,0.2,0.436824
...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,10,sbm,fid_mat,25,80,1590690879,graph,"[50, 75, 100, 100]",0.6,0.1,20,0.2,0.929641
155,50,sbm,fid_mat,50,80,1590690879,graph,"[100, 100, 100, 100]",0.6,0.1,20,0.2,0.349608
156,40,sbm,fid_mat,25,80,1590690879,graph,"[50, 75, 100, 100]",0.6,0.1,20,0.2,0.903654
157,40,sbm,fid_mat,25,80,1590690879,graph,"[50, 75, 100, 100]",0.3,0.1,20,0.2,0.611306
