# Test Parallel ExIFFI

Host capri Davide

    Hostname capri.dei.unipd.it
    User p1026u27

In [1]:
import sys
import numpy as np
import pandas as pd
from tqdm import trange
from append_dir import append_dirname
append_dirname('ExIFFI')
from utils.utils import partition_data
from utils.feature_selection import *
#from plot import *
#from simulation_setup import *
from models import *
from models.Extended_IF import *
from models.Extended_DIFFI_parallel import *
from models.Extended_DIFFI_original import *
import math
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler
import time

import os
import pickle 
from scipy.io import loadmat
from glob import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

## Set up file paths

In [15]:
path = os.getcwd()
path = os.path.dirname(path)
path_real = os.path.join(path, "data", "real")
mat_files_real = glob(os.path.join(path_real, "*.mat"))
mat_file_names_real = {os.path.basename(x).split(".")[0]: x for x in mat_files_real}
csv_files_real = glob(os.path.join(path_real, "*.csv"))
csv_file_names_real = {os.path.basename(x).split(".")[0]: x for x in csv_files_real}
dataset_names = list(mat_file_names_real.keys()) + list(csv_file_names_real.keys())
mat_file_names_real.update(csv_file_names_real)
dataset_paths = mat_file_names_real.copy()

## Utility Functions

Drop Duplicates from the loaded dataset 

In [16]:
def drop_duplicates(X, y):
    S = np.c_[X, y]
    S = pd.DataFrame(S).drop_duplicates().to_numpy()
    X, y = S[:, :-1], S[:, -1]
    return X, y

Load dataset coming from a `.mat` file 

In [17]:
def load_data(path):
    data = loadmat(path)
    X, y = data["X"], data["y"]
    y = np.hstack(y)
    X, y = drop_duplicates(X, y)
    return X, y

Load dataset coming from a `.csv` file

In [18]:
def load_data_csv(path):
    data = pd.read_csv(path, index_col=0)
    if "Unnamed: 0" in data.columns:
        data = data.drop(columns=["Unnamed: 0"])

    X = data[data.columns[data.columns != "Target"]]
    y = data["Target"]

    X, y = drop_duplicates(X, y)

    return X, y

Load the data (with `load_data` or with `load_data_csv`), scale the data and split it into train and test set obtaining `X_train`, `X_test` that will be passed to `compute_imps`. 

In [19]:
def pre_process(path):
    extension = os.path.splitext(path)[1]

    if extension == ".csv":
        X, y = load_data_csv(path)
    elif extension == ".mat":
        X, y = load_data(path)
    else:
        raise ValueError("Extension not supported")

    X_train, X_test = partition_data(X, y)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_test = np.r_[X_train, X_test]

    return X_train, X_test

Compute the Global Importance of a given dataset `n_runs` times. At the end a matrix with shape `(n_runs, n_features)` is returned. Each row contains the global importance of the features for a given run.

In [20]:
def compute_imps(model, X_train, X_test, n_runs):

    X_test=np.r_[X_train,X_test]

    imps = np.zeros(shape=(n_runs, X_train.shape[1]))
    for i in tqdm(range(n_runs)):
        model.fit(X_train)
        imps[i, :] = model.Global_importance(
            X_test, calculate=True, overwrite=False, depth_based=False
        )

    return imps

### `test_exiffi`

This is the function called in the `main` of `test_parallel.py` used to do the experiments on the CAPRI HPC server. For a given set of datasets it computes the global importance `n_runs` times using `Extended_DIFFI_parallel` or `Extended_DIFFI_original`and saves the importances matrices, the time stats obtained and the test arguments in a `.npz` file.

#### `test_exiffi` Parameters

- `X_train`: the train set
- `X_test`: the test set
- `savedir`: directory where to save the results in `.npz` format
- `n_runs`: number of runs to do
- `seed`: random seed to obtain reproducibile results and compare the importances matrices obtaind from the parallel and the serial version of the algorithm (they must be the same to certify the correctness of the parallel version)
- `parallel`: Boolean variable used to choose between the parallel and the serial version of the algorithm
- `n_cores`: Number of threads to use in the parallel version of the algorithm. This coincides with the number of cores set with the `--cpus-per-task` options in the `.job` file
- `num_trees`: Number of trees used by ExIFFI. The higher the more complex and more computationally expensive the algorithm is
- `name`: Name of the dataset

In [8]:
def test_exiffi(
    X_train,
    X_test,
    savedir,
    n_runs=10,
    seed=None,
    parallel=False,
    n_cores=2,
    num_trees=300,
    name="",
):
    args_to_avoid = ["X_train", "X_test", "savedir", "args_to_avoid", "args"]
    args = dict()
    for k, v in locals().items():
        if k in args_to_avoid:
            continue
        args[k] = v

    ex_time = []
    ex_imps = {}

    for i in trange(n_runs):
        seed = None if seed is None else seed + i

        if parallel:
            EDIFFI = Extended_DIFFI_parallel(
                n_trees=num_trees, max_depth=100, subsample_size=256, plus=1, seed=seed
            )
            EDIFFI.set_num_processes(n_cores, n_cores)
        else:
            EDIFFI = Extended_DIFFI_original(
                n_trees=num_trees, max_depth=100, subsample_size=256, plus=1, seed=seed
            )

        start = time.time()
        imps = compute_imps(EDIFFI, X_train, X_test, 10)
        ex_imps["Execution " + str(i)] = imps
        end = time.time()
        ex_time.append(end - start)

    # print(ex_imps)
    time_stat = {"mean": np.mean(ex_time), "std": np.std(ex_time)}
    filename = "test_stat_parallel.npz" if parallel else "test_stat_serial.npz"
    t = time.localtime()
    current_time = time.strftime("%d-%m-%Y_%H-%M-%S", t)
    filename = current_time + "_" + name + "_" + filename

    # if dir does not exist, create it
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    filepath = os.path.join(savedir, filename)

    np.savez(
        filepath,
        execution_time_stat=time_stat,
        importances_matrix=ex_imps,
        arguments=args,
    )

## Load Data

## Wine Dataset

In [9]:
name='wine'
X,y=load_data(dataset_paths[name])
X_train,X_test=partition_data(X,y)
X.shape,y.shape

((129, 13), (129,))

### Serial ExIFFI

In [13]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=False,
    n_cores=12,
    num_trees=10,
    name=name,
)

100%|██████████| 10/10 [00:01<00:00,  5.02it/s]
100%|██████████| 1/1 [00:01<00:00,  2.00s/it]


### Parallel ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=True,
    n_cores=12,
    num_trees=200,
    name=name,
)

## Ionosphere Dataset

In [12]:
name='ionosphere'
X,y=load_data(dataset_paths[name])
X_train,X_test=partition_data(X,y)
X.shape,y.shape

((350, 33), (350,))

### Serial ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=False,
    n_cores=12,
    num_trees=10,
    name=name,
)

### Parallel ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=True,
    n_cores=12,
    num_trees=200,
    name=name,
)

## Moodify Dataset

In [26]:
name='moodify'
X,y=load_data_csv(dataset_paths[name])
X_train,X_test=partition_data(X,y)
X.shape,y.shape

((276260, 11), (276260,))

# Test Results 

## Parallel

In [9]:
stats=np.load('26-01-2024_17-42-35_test_stat_parallel_7000.npz',allow_pickle=True)
data_parallel=stats['importances_matrix'].tolist()
time_data_parallel=stats['execution_time_stat']
arguments_parallel=stats['arguments'].tolist()

In [12]:
print(arguments_parallel.keys())

args_to_avoid = ["X_train", "X_test", "X"]
for key in arguments_parallel.keys():
    if key not in args_to_avoid:
        print(key,arguments_parallel[key])

dict_keys(['X_train', 'X_test', 'X', 'n_runs', 'seed', 'parallel', 'n_cores'])
n_runs 2
seed None
parallel True
n_cores 8


In [33]:
time_data_parallel

array({'mean': 3.36362202167511, 'std': 0.23129910849918273}, dtype=object)

In [34]:
data_parallel.keys()

dict_keys(['Execution 0', 'Execution 1', 'Execution 2', 'Execution 3', 'Execution 4', 'Execution 5', 'Execution 6', 'Execution 7', 'Execution 8', 'Execution 9'])

## Serial

In [3]:
stats=np.load('test_stat_serial.npz',allow_pickle=True)
data_serial=stats['importances_matrix'].tolist()
time_data_serial=stats['execution_time_stat']

FileNotFoundError: [Errno 2] No such file or directory: 'test_stat_serial.npz'

In [30]:
time_data_serial

array({'mean': 3.5636572360992433, 'std': 0.5714168745774968},
      dtype=object)

In [31]:
data_serial.keys()

dict_keys(['Execution 0', 'Execution 1', 'Execution 2', 'Execution 3', 'Execution 4', 'Execution 5', 'Execution 6', 'Execution 7', 'Execution 8', 'Execution 9'])

Check if `data_parallel` and `data_serial` are equal


In [36]:
for k in data_serial.keys():
    print(np.sum(data_serial[k]-data_parallel[k]))

-5.995204332975845e-14
1.532107773982716e-13
-3.4638958368304884e-13
7.327471962526033e-14
5.306866057708248e-13
-3.774758283725532e-13
-3.197442310920451e-13
1.3522516439934407e-12
-1.2434497875801753e-13
-1.0769163338864018e-12


## Results Thyroid

### Parallel

In [9]:
path_to_load = (
    "../capri_code/results/npz/28-01-2024_17-45-18_annthyroid_test_stat_parallel.npz"
)

stats = np.load(path_to_load, allow_pickle=True)

display(stats['execution_time_stat'])
display(stats['arguments'].tolist())

array({'mean': 89.01771640777588, 'std': 0.0}, dtype=object)

{'n_runs': 1,
 'seed': 120,
 'parallel': True,
 'n_cores': 2,
 'num_trees': 10,
 'name': 'annthyroid',
 'args_to_avoid': ['X_train', 'X_test', 'savedir'],
 'args': {...}}

# All results

We use the script `process_results.py` to read the stats of the experiments from the `.npz` files and display them on a `pd.DataFrame` that can be saved as a `.csv` file.

In [2]:
from append_dir import append_dirname
append_dirname('ExIFFI')

from capri_code.process_results import load_stats, display_stats, compute_cpu_efficiency


results_dirpath = "../../container/job4/results/"

stats = load_stats(results_dirpath, use_pkl=True)

for i, row in stats.iterrows():
    n_cores = max([row["n_cores_fit"], row["n_cores_importance"], row["n_cores_anomaly"]])
    stats.loc[i, "cpu_efficiency"] = compute_cpu_efficiency(row["real_time"], row["user_time"], n_cores)
    

# display_stats(stats)
display_stats(stats.groupby("parallel").get_group(True))
display_stats(stats.groupby("parallel").get_group(False))

Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 15:35:08.255912,4,1,1,5,120,True,300,wine,5,36.076435,0.999008,443.505377,11.205296,466.96448,182.849,347.223,10.888,47.474009
2024-01-30 15:40:41.691636,4,1,1,5,120,True,300,glass,5,65.843605,0.978784,421.421711,24.829751,494.87872,331.75,615.674,14.859,46.395931
2024-01-30 15:47:36.088014,1,1,4,5,120,True,300,wine,5,65.135395,1.78364,421.945016,3.925508,428.290048,327.901,340.537,9.254,25.963401
2024-01-30 15:57:14.231006,1,1,4,5,120,True,300,glass,5,115.173798,1.689243,489.580954,6.211631,500.228096,578.169,602.999,12.623,26.073648
2024-01-30 16:50:32.736658,4,1,1,5,120,True,300,cardio,5,380.160016,1.962829,736.371671,33.649745,762.687488,1908.615,2271.178,24.292,29.749033
2024-01-30 18:01:22.290632,1,1,4,5,120,True,300,cardio,5,387.130552,7.122314,599.86133,5.510218,609.349632,1938.39,2174.331,23.124,28.043002
2024-01-30 18:04:00.427470,4,4,4,5,120,True,300,wine,5,30.545294,0.439893,442.673889,12.379361,450.551808,155.403,412.897,29.45,66.423589
2024-01-30 18:08:17.140138,4,4,4,5,120,True,300,glass,5,50.799305,0.744255,528.016835,21.841332,538.07104,257.075,726.041,41.006,70.605952
2024-01-30 18:20:54.680314,4,4,4,5,120,True,300,cardio,5,150.260168,1.874259,633.598607,27.5503,651.247616,757.341,2450.098,59.949,80.878297
2024-01-30 19:31:23.714165,1,1,4,5,120,True,300,cardio,5,399.47297,5.623879,596.438876,5.487201,605.732864,2000.773,2177.073,35.361,27.202899


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 15:34:38.531939,1,1,1,5,120,False,100,wine,5,22.670529,1.310402,326.859817,7.030727,333.770752,120.767,113.376,0.583,93.879951
2024-01-30 15:37:59.648176,1,1,1,5,120,False,100,glass,5,38.201773,1.994781,258.565734,48.463303,355.602432,199.292,192.154,1.794,96.418321
2024-01-30 15:49:56.673645,1,1,1,5,120,False,100,cardio,5,142.643763,8.523221,393.009725,1.83851,395.968512,716.112,712.554,1.992,99.50315
2024-01-30 16:22:03.105891,1,1,1,5,120,False,300,wine,5,65.463616,4.144099,404.238664,2.465636,407.486464,329.567,327.989,1.48,99.52119
2024-01-30 16:31:21.686496,1,1,1,5,120,False,300,glass,5,111.261842,7.624729,470.430024,8.476952,480.997376,558.595,555.479,2.952,99.442172
2024-01-30 17:07:52.333205,1,1,1,5,120,False,300,cardio,5,437.642479,18.793363,591.404073,9.619989,608.17408,2190.869,2184.656,5.743,99.716414
2024-01-30 17:52:08.295689,1,1,1,5,120,False,100,pima,5,73.023921,1.989698,375.667261,2.361366,378.130432,367.396,366.16,1.081,99.663578
2024-01-30 17:56:01.986867,1,1,1,5,120,False,100,breastw,5,46.272488,3.636108,359.480361,1.736259,362.409984,233.615,232.774,0.724,99.640006
2024-01-30 18:00:02.972313,1,1,1,5,120,False,100,ionosphere,5,47.742123,0.766028,389.004493,3.439041,394.592256,240.911,239.895,0.872,99.578267
2024-01-30 18:37:58.826395,1,1,1,5,120,False,100,annthyroid,5,454.719296,30.255202,392.356987,2.78269,396.845056,2275.915,2274.126,1.248,99.921394


In [12]:
dataset_names=stats['name'].unique()
dataset_names

array(['cardio', 'glass', 'pendigits', 'wine', 'diabetes', 'ionosphere',
       'annthyroid', 'shuttle', 'breastw', 'pima'], dtype=object)

In [15]:
for name in dataset_names:
    display_stats(stats.groupby(['parallel',"name"]).get_group((True,name)))

Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 16:50:32.736658,4,1,1,5,120,True,300,cardio,5,380.160016,1.962829,736.371671,33.649745,762.687488,1908.615,2271.178,24.292,29.749033
2024-01-30 18:01:22.290632,1,1,4,5,120,True,300,cardio,5,387.130552,7.122314,599.86133,5.510218,609.349632,1938.39,2174.331,23.124,28.043002
2024-01-30 18:20:54.680314,4,4,4,5,120,True,300,cardio,5,150.260168,1.874259,633.598607,27.5503,651.247616,757.341,2450.098,59.949,80.878297
2024-01-30 19:31:23.714165,1,1,4,5,120,True,300,cardio,5,399.47297,5.623879,596.438876,5.487201,605.732864,2000.773,2177.073,35.361,27.202899
2024-01-30 19:50:03.422493,4,4,4,5,120,True,300,cardio,5,136.002594,2.305052,632.74795,29.538196,647.999488,683.232,2324.94,65.379,85.071396


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 15:40:41.691636,4,1,1,5,120,True,300,glass,5,65.843605,0.978784,421.421711,24.829751,494.87872,331.75,615.674,14.859,46.395931
2024-01-30 15:57:14.231006,1,1,4,5,120,True,300,glass,5,115.173798,1.689243,489.580954,6.211631,500.228096,578.169,602.999,12.623,26.073648
2024-01-30 18:08:17.140138,4,4,4,5,120,True,300,glass,5,50.799305,0.744255,528.016835,21.841332,538.07104,257.075,726.041,41.006,70.605952
2024-01-30 19:38:40.374176,4,4,4,5,120,True,300,glass,5,49.2539,1.624285,517.520425,20.473205,536.113152,248.956,698.099,45.292,70.102649


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 21:11:43.505410,4,4,4,5,120,True,300,pendigits,5,403.024353,39.814432,617.990226,42.473522,653.594624,2018.6,7050.35,81.063,87.317324


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 15:35:08.255912,4,1,1,5,120,True,300,wine,5,36.076435,0.999008,443.505377,11.205296,466.96448,182.849,347.223,10.888,47.474009
2024-01-30 15:47:36.088014,1,1,4,5,120,True,300,wine,5,65.135395,1.78364,421.945016,3.925508,428.290048,327.901,340.537,9.254,25.963401
2024-01-30 18:04:00.427470,4,4,4,5,120,True,300,wine,5,30.545294,0.439893,442.673889,12.379361,450.551808,155.403,412.897,29.45,66.423589
2024-01-30 19:34:29.794001,4,4,4,5,120,True,300,wine,5,36.091074,6.986208,442.984202,10.987222,450.961408,184.554,411.978,35.911,55.807243


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-31 02:40:54.708757,4,4,4,5,120,True,300,diabetes,5,3949.582951,70.271875,597.395374,52.073196,650.469376,19753.084,75955.915,283.161,96.131717


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 20:07:02.168474,4,4,4,5,120,True,300,ionosphere,5,60.510995,1.262842,611.246899,39.035752,655.515648,305.277,855.462,59.38,70.056211


Unnamed: 0_level_0,n_cores_fit,n_cores_importance,n_cores_anomaly,n_runs,seed,parallel,n_trees,name,n_runs_imps,mean_time,std_time,mean_MB,std_MB,max_MB,real_time,user_time,sys_time,cpu_efficiency
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-30 20:38:05.180900,4,4,4,5,120,True,300,annthyroid,5,372.051031,5.036259,573.276488,29.364684,615.780352,1862.89,6965.303,63.04,93.474427


KeyError: (True, 'shuttle')

# Datasets Info

In [2]:
name_samples_features = [
    ("Bimodal", 400, 2),
    ("Xaxis", 1100, 6),
    ("Yaxis", 1100, 6),
    ("Bisect", 1100, 6),
    ("Bisec3D", 1100, 6),
    ("Bisec6D", 1100, 6),
    ("Annthyroid", 7200, 6),
    ("Breastw", 683, 9),
    ("Cardio", 1831, 21),
    ("Glass", 213, 9),
    ("Ionosphere", 351, 33),
    ("Pendigits", 6870, 16),
    ("Pima", 768, 8),
    ("Shuttle", 49097, 9),
    ("Wine", 129, 13),
    ("Diabetes", 85916, 4),
    ("Moodify", 276260, 11),
]

size = [(n, s * f) for n, s, f in name_samples_features]
size.sort(key=lambda x: x[1])  # sort by size

print(f"{'Dataset':<15}{'samp*feat':>10}\n{'':=>25}")
for n, s in size:
    print(f"{n:_<15}{s:_>10}")

Dataset         samp*feat
Bimodal_______________800
Wine_________________1677
Glass________________1917
Pima_________________6144
Breastw______________6147
Xaxis________________6600
Yaxis________________6600
Bisect_______________6600
Bisec3D______________6600
Bisec6D______________6600
Ionosphere__________11583
Cardio______________38451
Annthyroid__________43200
Pendigits__________109920
Diabetes___________343664
Shuttle____________441873
Moodify___________3038860


In [3]:
name_samples_features = [
    ("Annthyroid", 7200, 6),
    ("Breastw", 683, 9),
    ("Cardio", 1831, 21),
    ("Glass", 213, 9),
    ("Ionosphere", 351, 33),
    ("Pendigits", 6870, 16),
    ("Pima", 768, 8),
    ("Shuttle", 49097, 9),
    ("Wine", 129, 13),
    ("Diabetes", 85916, 4),
    ("Moodify", 276260, 11),
]

size = [(n, s * f) for n, s, f in name_samples_features]
size.sort(key=lambda x: x[1])  # sort by size

print(f"{'Dataset':<15}{'samp*feat':>10}\n{'':=>25}")
for n, s in size:
    print(f"{n:_<15}{s:_>10}")

Dataset         samp*feat
Wine_________________1677
Glass________________1917
Pima_________________6144
Breastw______________6147
Ionosphere__________11583
Cardio______________38451
Annthyroid__________43200
Pendigits__________109920
Diabetes___________343664
Shuttle____________441873
Moodify___________3038860
