# Test Parallel ExIFFI

Host capri Davide

    Hostname capri.dei.unipd.it
    User p1026u27

In [1]:
import sys
import numpy as np
import pandas as pd
from tqdm import trange
from append_dir import append_dirname
append_dirname('ExIFFI')
from utils.utils import partition_data
from utils.feature_selection import *
#from plot import *
#from simulation_setup import *
from models import *
from models.Extended_IF import *
from models.Extended_DIFFI_parallel import *
from models.Extended_DIFFI_original import *
import math
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler
import time

import os
import pickle 
from scipy.io import loadmat
from glob import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

## Set up file paths

In [2]:
path = os.getcwd()
path = os.path.dirname(path)
path_real = os.path.join(path, "data", "real")
mat_files_real = glob(os.path.join(path_real, "*.mat"))
mat_file_names_real = {os.path.basename(x).split(".")[0]: x for x in mat_files_real}
csv_files_real = glob(os.path.join(path_real, "*.csv"))
csv_file_names_real = {os.path.basename(x).split(".")[0]: x for x in csv_files_real}
dataset_names = list(mat_file_names_real.keys()) + list(csv_file_names_real.keys())
mat_file_names_real.update(csv_file_names_real)
dataset_paths = mat_file_names_real.copy()

## Utility Functions

Drop Duplicates from the loaded dataset 

In [3]:
def drop_duplicates(X, y):
    S = np.c_[X, y]
    S = pd.DataFrame(S).drop_duplicates().to_numpy()
    X, y = S[:, :-1], S[:, -1]
    return X, y

Load dataset coming from a `.mat` file 

In [4]:
def load_data(path):
    data = loadmat(path)
    X, y = data["X"], data["y"]
    y = np.hstack(y)
    X, y = drop_duplicates(X, y)
    return X, y

Load dataset coming from a `.csv` file

In [5]:
def load_data_csv(path):
    data = pd.read_csv(path, index_col=0)
    if "Unnamed: 0" in data.columns:
        data = data.drop(columns=["Unnamed: 0"])

    X = data[data.columns[data.columns != "Target"]]
    y = data["Target"]

    X, y = drop_duplicates(X, y)

    return X, y

Load the data (with `load_data` or with `load_data_csv`), scale the data and split it into train and test set obtaining `X_train`, `X_test` that will be passed to `compute_imps`. 

In [6]:
def pre_process(path):
    extension = os.path.splitext(path)[1]

    if extension == ".csv":
        X, y = load_data_csv(path)
    elif extension == ".mat":
        X, y = load_data(path)
    else:
        raise ValueError("Extension not supported")

    X_train, X_test = partition_data(X, y)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_test = np.r_[X_train, X_test]

    return X_train, X_test

Compute the Global Importance of a given dataset `n_runs` times. At the end a matrix with shape `(n_runs, n_features)` is returned. Each row contains the global importance of the features for a given run.

In [7]:
def compute_imps(model, X_train, X_test, n_runs):

    X_test=np.r_[X_train,X_test]

    imps = np.zeros(shape=(n_runs, X_train.shape[1]))
    for i in tqdm(range(n_runs)):
        model.fit(X_train)
        imps[i, :] = model.Global_importance(
            X_test, calculate=True, overwrite=False, depth_based=False
        )

    return imps

### `test_exiffi`

This is the function called in the `main` of `test_parallel.py` used to do the experiments on the CAPRI HPC server. For a given set of datasets it computes the global importance `n_runs` times using `Extended_DIFFI_parallel` or `Extended_DIFFI_original`and saves the importances matrices, the time stats obtained and the test arguments in a `.npz` file.

#### `test_exiffi` Parameters

- `X_train`: the train set
- `X_test`: the test set
- `savedir`: directory where to save the results in `.npz` format
- `n_runs`: number of runs to do
- `seed`: random seed to obtain reproducibile results and compare the importances matrices obtaind from the parallel and the serial version of the algorithm (they must be the same to certify the correctness of the parallel version)
- `parallel`: Boolean variable used to choose between the parallel and the serial version of the algorithm
- `n_cores`: Number of threads to use in the parallel version of the algorithm. This coincides with the number of cores set with the `--cpus-per-task` options in the `.job` file
- `num_trees`: Number of trees used by ExIFFI. The higher the more complex and more computationally expensive the algorithm is
- `name`: Name of the dataset

In [8]:
def test_exiffi(
    X_train,
    X_test,
    savedir,
    n_runs=10,
    seed=None,
    parallel=False,
    n_cores=2,
    num_trees=300,
    name="",
):
    args_to_avoid = ["X_train", "X_test", "savedir", "args_to_avoid", "args"]
    args = dict()
    for k, v in locals().items():
        if k in args_to_avoid:
            continue
        args[k] = v

    ex_time = []
    ex_imps = {}

    for i in trange(n_runs):
        seed = None if seed is None else seed + i

        if parallel:
            EDIFFI = Extended_DIFFI_parallel(
                n_trees=num_trees, max_depth=100, subsample_size=256, plus=1, seed=seed
            )
            EDIFFI.set_num_processes(n_cores, n_cores)
        else:
            EDIFFI = Extended_DIFFI_original(
                n_trees=num_trees, max_depth=100, subsample_size=256, plus=1, seed=seed
            )

        start = time.time()
        imps = compute_imps(EDIFFI, X_train, X_test, 10)
        ex_imps["Execution " + str(i)] = imps
        end = time.time()
        ex_time.append(end - start)

    # print(ex_imps)
    time_stat = {"mean": np.mean(ex_time), "std": np.std(ex_time)}
    filename = "test_stat_parallel.npz" if parallel else "test_stat_serial.npz"
    t = time.localtime()
    current_time = time.strftime("%d-%m-%Y_%H-%M-%S", t)
    filename = current_time + "_" + name + "_" + filename

    # if dir does not exist, create it
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    filepath = os.path.join(savedir, filename)

    np.savez(
        filepath,
        execution_time_stat=time_stat,
        importances_matrix=ex_imps,
        arguments=args,
    )

## Load Data

## Wine Dataset

In [9]:
name='wine'
X,y=load_data(dataset_paths[name])
X_train,X_test=partition_data(X,y)
X.shape,y.shape

((129, 13), (129,))

### Serial ExIFFI

In [13]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=False,
    n_cores=12,
    num_trees=10,
    name=name,
)

100%|██████████| 10/10 [00:01<00:00,  5.02it/s]
100%|██████████| 1/1 [00:01<00:00,  2.00s/it]


### Parallel ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=True,
    n_cores=12,
    num_trees=200,
    name=name,
)

## Ionosphere Dataset

In [12]:
name='ionosphere'
X,y=load_data(dataset_paths[name])
X_train,X_test=partition_data(X,y)
X.shape,y.shape

((350, 33), (350,))

### Serial ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=False,
    n_cores=12,
    num_trees=10,
    name=name,
)

### Parallel ExIFFI

In [None]:
test_exiffi(
    X_train=X_train,
    X_test=X_test,
    savedir='../results/npz',
    n_runs=1,
    seed=120,
    parallel=True,
    n_cores=12,
    num_trees=200,
    name=name,
)

# Test Results 

## Parallel

In [9]:
stats=np.load('26-01-2024_17-42-35_test_stat_parallel_7000.npz',allow_pickle=True)
data_parallel=stats['importances_matrix'].tolist()
time_data_parallel=stats['execution_time_stat']
arguments_parallel=stats['arguments'].tolist()

In [12]:
print(arguments_parallel.keys())

args_to_avoid = ["X_train", "X_test", "X"]
for key in arguments_parallel.keys():
    if key not in args_to_avoid:
        print(key,arguments_parallel[key])

dict_keys(['X_train', 'X_test', 'X', 'n_runs', 'seed', 'parallel', 'n_cores'])
n_runs 2
seed None
parallel True
n_cores 8


In [33]:
time_data_parallel

array({'mean': 3.36362202167511, 'std': 0.23129910849918273}, dtype=object)

In [34]:
data_parallel.keys()

dict_keys(['Execution 0', 'Execution 1', 'Execution 2', 'Execution 3', 'Execution 4', 'Execution 5', 'Execution 6', 'Execution 7', 'Execution 8', 'Execution 9'])

## Serial

In [3]:
stats=np.load('test_stat_serial.npz',allow_pickle=True)
data_serial=stats['importances_matrix'].tolist()
time_data_serial=stats['execution_time_stat']

FileNotFoundError: [Errno 2] No such file or directory: 'test_stat_serial.npz'

In [30]:
time_data_serial

array({'mean': 3.5636572360992433, 'std': 0.5714168745774968},
      dtype=object)

In [31]:
data_serial.keys()

dict_keys(['Execution 0', 'Execution 1', 'Execution 2', 'Execution 3', 'Execution 4', 'Execution 5', 'Execution 6', 'Execution 7', 'Execution 8', 'Execution 9'])

Check if `data_parallel` and `data_serial` are equal


In [36]:
for k in data_serial.keys():
    print(np.sum(data_serial[k]-data_parallel[k]))

-5.995204332975845e-14
1.532107773982716e-13
-3.4638958368304884e-13
7.327471962526033e-14
5.306866057708248e-13
-3.774758283725532e-13
-3.197442310920451e-13
1.3522516439934407e-12
-1.2434497875801753e-13
-1.0769163338864018e-12


## Results Thyroid

### Parallel

In [9]:
path_to_load = (
    "../capri_code/results/npz/28-01-2024_17-45-18_annthyroid_test_stat_parallel.npz"
)

stats = np.load(path_to_load, allow_pickle=True)

display(stats['execution_time_stat'])
display(stats['arguments'].tolist())

array({'mean': 89.01771640777588, 'std': 0.0}, dtype=object)

{'n_runs': 1,
 'seed': 120,
 'parallel': True,
 'n_cores': 2,
 'num_trees': 10,
 'name': 'annthyroid',
 'args_to_avoid': ['X_train', 'X_test', 'savedir'],
 'args': {...}}

Execution Time

In [6]:
wine_stats['execution_time_stat']

array({'mean': 22.09242186546326, 'std': 2.2678216673392386}, dtype=object)

In [9]:
wine_stats['arguments']

array({'n_runs': 10, 'seed': 120, 'parallel': True, 'n_cores': 12, 'num_trees': 300, 'name': 'wine', 'args_to_avoid': ['X_train', 'X_test', 'savedir'], 'args': {...}},
      dtype=object)

In [13]:
imp_mat_wine=wine_stats['importances_matrix'].tolist()
imp_mat_wine['Execution 2']

array([[1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235724, 1.37869805, 0.93163375, 1.2656863 ,
        1.18378956, 1.39478296, 1.38035484],
       [1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235724, 1.37869805, 0.93163375, 1.2656863 ,
        1.18378956, 1.39478296, 1.38035484],
       [1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235724, 1.37869805, 0.93163375, 1.2656863 ,
        1.18378956, 1.39478296, 1.38035484],
       [1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235724, 1.37869805, 0.93163375, 1.2656863 ,
        1.18378956, 1.39478296, 1.38035484],
       [1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235724, 1.37869805, 0.93163375, 1.2656863 ,
        1.18378956, 1.39478296, 1.38035484],
       [1.51150254, 0.94012808, 0.77765704, 0.85334869, 0.92756969,
        1.01631053, 1.27235

# All results

We use the script `process_results.py` to read the stats of the experiments from the `.npz` files and display them on a `pd.DataFrame` that can be saved as a `.csv` file.

In [20]:
for data in stats:
    print()
    break

breastw


In [22]:
from append_dir import append_dirname
append_dirname('ExIFFI')

from capri_code.process_results import load_stats, display_stats


results_dirpath = "../capri_code/results/npz/"

stats = load_stats(results_dirpath)

display_stats(stats)
# display_stats(stats.groupby("name").get_group("cardio"))

Unnamed: 0,n_runs,seed,parallel,n_cores,n_trees,name,mean_time,std_time,mean_MB,std_MB,max_MB
0,2,,True,7,10,glass,9.483155,0.089877,338.409062,1.062795,338.743296


In [1]:
import numpy as np
from append_dir import append_dirname
append_dirname('ExIFFI')

from capri_code.process_results import load_stats, display_stats


results_dirpath = "../capri_code/results/npz/new/new"

stats = load_stats(results_dirpath)

# display_stats(stats)
display(stats)

imps_mat = np.array(stats.loc[0, "importances_matrix"])

print("imps_mat.shape", imps_mat.shape)


imp_mat_ex_0 = imps_mat[0]
print("imp_mat_ex_0.shape", imp_mat_ex_0.shape)

# for i in range(len(imp_mat_ex_0)-1):
#     print(imp_mat_ex_0[i] - imp_mat_ex_0[i+1])

imp_mat_ex_1 = imps_mat[1]

# for i in range(len(imp_mat_ex_1)):
#     print(imp_mat_ex_1[i] - imp_mat_ex_0[i])

# print(imps_mat[0] - imps_mat[1])
# print(imps_mat[1] - imps_mat[2])

Unnamed: 0,importances_matrix,n_runs,seed,parallel,n_cores,n_trees,name,mean_time,std_time,mean_MB,std_MB,max_MB,n_runs_imps
0,"[[[0.8197615431146767, 0.9772933701175461, 0.8...",2,123,True,7,100,glass,73.373395,0.33661,415.168922,12.585211,431.443968,
1,"[[[1.6035938204969806, 1.6317114425255999, 1.6...",2,123,True,7,300,breastw,23.554641,1.770169,513.024,25.022464,538.046464,1.0


imps_mat.shape (2, 10, 9)
imp_mat_ex_0.shape (10, 9)


In [18]:
import numpy as np

def func(x):
    return x+10

a = np.ones((3))

# add newaxis to a
a = a[:, np.newaxis]

print(a)
print(a.shape)

output = np.apply_along_axis(func, 1, a)

print("output\n", output)

[[1.]
 [1.]
 [1.]]
(3, 1)
output
 [[11.]
 [11.]
 [11.]]
