In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own code (e.g. src.plotting) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that produced them.
%matplotlib inline

In [2]:
import project_path

In [3]:
import glob
import os

from pathlib import Path
from itertools import product

import numpy as np
import pandas as pd

from altair_saver import save

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from src.plotting import make_homogeneity_plot, make_full_plot, graphic_settings, make_result_heatmap, add_lines

In [4]:
# Show the experiment registry (id and description of each run); the ids listed
# here are the values that `experiment_name` can be set to further down.
pd.read_csv('../experiments.csv')

Unnamed: 0,id,Type of experiment
0,0,Local experiments
1,1587919855,p vs q algorithms for SBM with 50 cuts
2,1587918440,p vs q algorithms for SBM with 50 cuts for 4 ...
3,1587804052,p vs q algorithms for SBM with 50 cuts for 5 ...
4,1587703699,KnnBlobs experiments with different k and 50 ...
5,1586946592,SBM experiments
6,1587639002,SBM with 100 cuts fixed p and q testing diffe...
7,1587621343,KnnBlobs with 100 cuts testing different a
8,1587629564,KL vs MF algorithms on SBM
9,1587632158,SBM with fixed p and q and different numbers ...


In [28]:
# Id of the experiment to plot. It selects the input/output sub-folders and
# decides which of the guarded plotting cells below actually runs.
experiment_name = '1587919855'

In [29]:
# Absolute folder the result files of the selected experiment are read from.
path_in = Path(f'../output/{experiment_name}').resolve()
# Absolute folder the generated figures are written to; created (with any
# missing parents) if absent, and left untouched if it already exists.
path_out = Path(f'../plots/{experiment_name}').resolve()
path_out.mkdir(parents=True, exist_ok=True)

In [30]:
# Load every file found under `path_in` (searched recursively) and stack the
# rows into a single frame with a fresh 0..n-1 index.
#
# The per-file frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0, and calling it in a loop
# re-copies all previously accumulated rows on every iteration.
frames = [
    pd.read_csv(os.path.join(subdir, file), index_col=0)
    for subdir, _, files in os.walk(path_in)
    for file in files
]
# `pd.concat` raises on an empty list, so fall back to an empty frame when no
# result file exists (the same outcome the append-based loop produced).
full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [31]:
# Inspect the combined results (one row per run, across all loaded files).
full_df

Unnamed: 0,agreement,block_sizes,completeness,dataset_name,dataset_type,homogeneity,lb_f,max_order,nb_cuts,order_best,p,percentile_orders,preprocessing_name,q,seed,unique_id,v_measure_score
0,35.0,"[70, 100]",1.000000,sbm,graph,1.000000,0.3,2212.0,50.0,1575.0,0.40,100.0,fid_mat,0.20,42.0,1587919855,1.000000
1,35.0,"[70, 100]",1.000000,sbm,graph,1.000000,0.4,1250.0,50.0,271.0,0.30,100.0,fid_mat,0.04,42.0,1587919855,1.000000
2,25.0,"[50, 100]",0.079530,sbm,graph,0.059687,0.2,1611.0,50.0,979.0,0.10,100.0,karnig_lin,0.60,42.0,1587919855,0.068195
3,35.0,"[70, 100]",0.002786,sbm,graph,0.002123,0.2,1216.0,50.0,601.0,0.05,100.0,fid_mat,0.30,42.0,1587919855,0.002409
4,50.0,"[100, 100]",1.000000,sbm,graph,1.000000,0.4,1573.0,50.0,635.0,0.80,100.0,karnig_lin,0.05,42.0,1587919855,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,50.0,"[100, 100, 100]",0.000135,sbm,graph,0.000062,0.2,14388.0,50.0,9234.0,0.20,100.0,fid_mat,0.90,42.0,1587919855,0.000085
6927,50.0,"[100, 100, 100]",1.000000,sbm,graph,1.000000,0.4,4933.0,50.0,1480.0,0.60,100.0,fid_mat,0.03,42.0,1587919855,1.000000
6928,50.0,"[100, 100, 100]",1.000000,sbm,graph,1.000000,0.3,1910.0,50.0,690.0,0.60,100.0,karnig_lin,0.03,42.0,1587919855,1.000000
6929,50.0,"[100, 100, 100]",1.000000,sbm,graph,1.000000,0.4,1366.0,50.0,1160.0,0.30,100.0,karnig_lin,0.04,42.0,1587919855,1.000000


# SBM

### Plot SBM for 2 blocks with theoretical bounds id = 1588572854

In [32]:
# Boolean grid that is later drawn on top of the two-block heatmap
# (presumably the "theoretical bounds" named in the section heading — confirm).
# A cell (a, b) with a >= b is True when |sqrt(a) - sqrt(b)| >= sqrt(2);
# every cell with a < b stays False.
block_size = 100
xs = np.arange(1, 21)

# Axis labels: the multiples 1..20 of log(block_size) / block_size, rounded to 2 decimals.
scale = np.log(block_size) / block_size
i = (scale * np.arange(1, 21)).round(2)

idx = np.arange(len(xs))

# Broadcast instead of looping over all pairs: rows follow a, columns follow b.
roots = np.sqrt(xs)
gap = np.abs(roots[:, None] - roots[None, :])
mask = (xs[:, None] >= xs[None, :]) & (gap >= np.sqrt(2))

# Transpose, then order rows by descending label and columns by ascending label.
theory_df = (
    pd.DataFrame(mask, index=i, columns=i)
    .T
    .sort_index(ascending=False)
    .sort_index(axis=1, ascending=True)
)
values_theory = theory_df.to_numpy()

In [33]:
# Heatmaps with the overlay from `values_theory`; runs only when experiment
# 1588572854 is selected, otherwise the whole cell is a no-op.
if experiment_name == '1588572854':
    # Keep the 'fid_mat' rows only. Note that this rebinds `full_df` itself.
    fid_mat_rows = full_df['preprocessing_name'] == 'fid_mat'
    full_df = full_df[fid_mat_rows]

    # One figure per (block_sizes, lb_f) combination.
    experiments = [group for _, group in full_df.groupby(['block_sizes', 'lb_f'])]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        # The first row of the group supplies the values used for labelling.
        block_sizes = experiment['block_sizes'].iloc[0]
        lb_f = experiment['lb_f'].iloc[0]
        agreement = experiment['agreement'].astype(int).iloc[0]

        title = f"SBM with {block_sizes} blocks using nb_cuts = 50, lb_f = {lb_f} and a = {agreement}"
        name = f"block_sizes_{block_sizes}_lb_f_{lb_f}"

        heatmap_data = experiment[['p', 'q', 'homogeneity']]
        make_result_heatmap(heatmap_data, title, ax)
        # NOTE(review): presumably draws the boundary encoded in `values_theory`
        # on the same axes — confirm against src.plotting.add_lines.
        add_lines(values_theory, ax)

        fig.tight_layout()

        # Save as SVG in the experiment's plot folder, then release the figure.
        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

### Plot SBM for 2/3 blocks and unbalanced id = 1587919855

In [34]:
# Heatmaps for experiment 1587919855; the cell is a no-op for any other
# selected experiment.
if experiment_name == '1587919855':
    # Keep the 'fid_mat' rows only. Note that this rebinds `full_df` itself.
    fid_mat_rows = full_df['preprocessing_name'] == 'fid_mat'
    full_df = full_df[fid_mat_rows]

    # One figure per (block_sizes, lb_f, agreement) combination.
    experiments = [
        group for _, group in full_df.groupby(['block_sizes', 'lb_f', 'agreement'])
    ]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        # The first row of the group supplies the values used for labelling.
        block_sizes = experiment['block_sizes'].iloc[0]
        lb_f = experiment['lb_f'].iloc[0]
        # The title shows the agreement cast to int; the file name keeps the raw value.
        agreement_int = experiment['agreement'].astype(int).iloc[0]
        agreement_raw = experiment['agreement'].iloc[0]

        title = f"SBM with {block_sizes} blocks using nb_cuts = 50, lb_f = {lb_f} and a = {agreement_int}"
        name = f"block_sizes_{block_sizes}_a_{agreement_raw}_lb_f_{lb_f}"

        heatmap_data = experiment[['p', 'q', 'homogeneity']]
        make_result_heatmap(heatmap_data, title, ax)

        fig.tight_layout()

        # Save as SVG in the experiment's plot folder, then release the figure.
        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)

### Plot SBM for 4 blocks id = 1587918440

In [27]:
# Heatmaps for experiment 1587918440; the cell is a no-op for any other
# selected experiment.
if experiment_name == '1587918440':
    # Keep the 'fid_mat' rows only. Note that this rebinds `full_df` itself.
    fid_mat_rows = full_df['preprocessing_name'] == 'fid_mat'
    full_df = full_df[fid_mat_rows]

    # One figure per (block_sizes, lb_f, agreement) combination.
    experiments = [
        group for _, group in full_df.groupby(['block_sizes', 'lb_f', 'agreement'])
    ]
    for experiment in experiments:
        fig, ax = plt.subplots(figsize=(15, 10))

        # The first row of the group supplies the values used for labelling.
        block_sizes = experiment['block_sizes'].iloc[0]
        lb_f = experiment['lb_f'].iloc[0]
        # The title shows the agreement cast to int; the file name keeps the raw value.
        agreement_int = experiment['agreement'].astype(int).iloc[0]
        agreement_raw = experiment['agreement'].iloc[0]

        title = f"SBM with {block_sizes} blocks using nb_cuts = 50, lb_f = {lb_f} and a = {agreement_int}"
        name = f"block_sizes_{block_sizes}_a_{agreement_raw}_lb_f_{lb_f}"

        heatmap_data = experiment[['p', 'q', 'homogeneity']]
        make_result_heatmap(heatmap_data, title, ax)

        fig.tight_layout()

        # Save as SVG in the experiment's plot folder, then release the figure.
        fig.savefig(f'{path_out / name}.svg')
        plt.close(fig)