# Packing all the diva data

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../../")
sys.path.append("../../../")
from pathlib import Path
from sklearn.metrics import classification_report

In [2]:
from mlem.utilities import load_pickle_bz2, save_pickle_bz2

In [3]:
# Directory that holds the black box and its datasets. The path is relative,
# so it is resolved against the notebook's current working directory.
DATA_PATH = Path("../../../notebooks/datasets/diva/BB_NO_CLUSTERING/BB_DATA")

In [4]:
%%bash
ls ../../../notebooks/datasets/diva/BB_NO_CLUSTERING/BB_DATA

centroids.npz
clusters_representatives_quantiles_5_40.csv
clusters_representatives_quantiles_7_56.csv
clusters_representatives_quantiles_9_72.csv
diva_rf_noclustering.bz2
diva_rf_noclustering_data_nsamples_5.npz
diva_rf_noclustering_data_nsamples_7.npz
diva_rf_noclustering_data_nsamples_9.npz
kmeans_k5.bz2
test.csv
train_clustered_keep01.csv
train.csv
validation.csv
validation_noisy.csv


In [5]:
# Load the black-box model from DATA_PATH.
# NOTE(review): load_pickle_bz2 presumably unpickles a bz2-compressed file;
# unpickling can execute arbitrary code, so only load files from a trusted source.
black_box = load_pickle_bz2(DATA_PATH / "diva_rf_noclustering.bz2")

In [6]:
black_box

Saving the black box in the current folder as `diva_randfor.bz2`.

In [7]:
# Store a copy of the black box under the relative name "diva_randfor.bz2"
# (i.e. in the current working directory).
save_pickle_bz2("diva_randfor.bz2", black_box)

Creating a numpy dictionary with all the useful data

Loading the old dict.

In [7]:
# Open the previously packed archive and list the names of the arrays it holds.
# The archive is kept open on purpose: later cells read arrays from `loaded`.
old_archive_path = DATA_PATH / "diva_rf_noclustering_data_nsamples_5.npz"
loaded = np.load(str(old_archive_path))
print(*loaded.files)

x_train x_test y_train y_test x_test_clustered y_test_clustered


In [8]:
# Sanity check: score the black box on the test split stored in the old archive.
bb_test_predictions = black_box.predict(loaded['x_test'])
print(classification_report(loaded['y_test'], bb_test_predictions))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94      2094
           1       0.85      0.77      0.81       745

    accuracy                           0.90      2839
   macro avg       0.89      0.86      0.87      2839
weighted avg       0.90      0.90      0.90      2839



In [10]:
# Read the clean and the noisy validation sets from DATA_PATH.
valid, valid_noisy = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ("validation.csv", "validation_noisy.csv")
)

In [11]:
# Open the centroids archive and show which arrays it contains.
centroids_path = DATA_PATH / "centroids.npz"
centroids = np.load(centroids_path)
print(*centroids.files)

centroids


In [12]:
# Train/test splits stored in the old archive.
X_train, y_train, X_test, y_test = (
    loaded[key] for key in ('x_train', 'y_train', 'x_test', 'y_test')
)

# Validation sets: every column except 'Target' is a feature, 'Target' is the label.
X_validation = valid.drop(columns='Target').to_numpy()
y_validation = valid['Target'].to_numpy()
X_validation_noisy = valid_noisy.drop(columns='Target').to_numpy()
y_validation_noisy = valid_noisy['Target'].to_numpy()

# on diva there were only two clusters
x_attack_5_per_quantile = loaded['x_test_clustered']
y_attack_5_per_quantile = loaded['y_test_clustered']


In [13]:
valid.columns[:-1].to_list()

['FLG_PRES_RISCOSS',
 'IMP_V_AGG_IVA',
 'SOLVIBILITA',
 'VOL_AFF_DICH',
 'PESO_ADESIONE',
 'TIPO_DICH_ACCERT',
 'IMP_ESISTZ',
 'DETR_IVA_ACC',
 'VAL_ALIQ_MEDIA_ACQ',
 'FLG_PRES_RICORSO',
 'STATO_CONTROLLO',
 'VAL_ALIQ_M_ACQ_IMP',
 'FLG_VC',
 'IMP_V_AGG_IMPON',
 'VAR_RIMBORSO',
 'VAL_ALIQ_MEDIA_VOL',
 'IMP_IMPST_CREDIT',
 'IMP_ACQ_NOIMP',
 'COD_ATTIV_GEN',
 'IVA_OP_IMPON_DIC',
 'IMP_BEN_AMM',
 'IMP_ECC_PREC',
 'FLG_PRES_BILANCIO',
 'imp_tot_pos',
 'VAR_DETRAZIONE',
 'MAG_IMP_RIT_ACC']

In [14]:
# Pack every split plus the feature metadata into one compressed archive.
#
# Feature names are obtained by dropping 'Target' by name, the same way the
# X_validation matrix is built, instead of by position (columns[:-1]).
# This keeps names and matrix columns aligned even if 'Target' is not the
# last column of the CSV.
feature_names = valid.drop('Target', axis=1).columns.to_list()
assert len(feature_names) == X_validation.shape[1], \
    "feature names do not match the number of columns of X_validation"

np.savez_compressed(
    "diva-blackbox-data.npz",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_validation=X_validation,
    y_validation=y_validation,
    X_validation_noisy=X_validation_noisy,
    y_validation_noisy=y_validation_noisy,
    X_attack_5_per_quantile=x_attack_5_per_quantile,
    y_attack_5_per_quantile=y_attack_5_per_quantile,
    # All features are treated as numerical: the categorical list is empty
    # and the mask is False for every feature.
    categorical_features=[],
    numerical_features=feature_names,
    categorical_features_mask=[False] * len(feature_names),
    centroids=centroids['centroids'],
)

# Adding to the data dictionary set of furthest elements

In these cells I extend the data stored in `diva-blackbox-data.npz` with another set, computed by first selecting the furthest elements of the test set
and then iteratively selecting the element furthest from the ones already selected. The extended dictionary is saved as `diva-blackbox-data2.npz`.

In [9]:
import sys
import numpy as np
import pandas as pd

UTILS_RELATIVE_PATH = "../../"
sys.path.append(UTILS_RELATIVE_PATH)
from utils.filtering import extract_points_furthest_distance

In [10]:
# Read every array of the packed archive into a plain, mutable dict.
# np.load on an .npz returns an NpzFile that keeps the underlying zip file
# open; the context manager closes it once all arrays are in memory.
with np.load('diva-blackbox-data.npz') as archive:
    loaded = {name: archive[name] for name in archive.files}

X_test = loaded['X_test']
y_test = loaded['y_test']

# Build a DataFrame with the test features and append the labels as a
# 'Target' column, so features and labels stay together row by row.
test_df = pd.DataFrame(X_test)
test_df['Target'] = y_test
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,Target
0,-0.658649,-0.000487,0.228899,0.021833,-0.449675,0.477425,-0.082859,-0.077551,0.769873,-0.542534,...,-0.036188,-0.071309,-0.058972,-0.01883,-0.075926,0.828543,-0.139304,0.18425,-0.035737,0
1,-0.658649,-0.017945,0.228899,-0.050791,-0.449675,0.477425,-0.082859,-0.077551,0.781589,-0.542534,...,-0.036188,2.000086,0.151528,-0.025852,-0.075926,0.828543,-0.139304,-0.183469,-0.035737,1
2,-0.658649,-0.014663,-0.737083,-0.042018,-0.449675,0.477425,-0.082859,-0.026768,0.322698,1.843203,...,-0.009822,1.769931,0.154897,-0.020344,-0.075926,0.828543,-0.139304,-0.158994,-0.035737,0
3,-0.658649,-0.039351,1.194882,-0.050732,-0.449675,0.477425,-0.082859,-0.07698,0.785494,1.843203,...,-0.036188,1.539776,0.095825,-0.02636,-0.075926,0.828543,-0.139304,-0.200442,-0.035737,1
4,1.51826,0.006973,0.228899,-0.046487,-0.449675,-1.161895,-0.082859,-0.077551,0.685905,1.843203,...,-0.036171,-0.761774,0.184564,-0.026024,-0.075926,-1.206937,-0.139304,-0.215383,-0.035737,0


Extracting $100$ points.

In [11]:
# Number of points to extract from the test set.
n_far_points = 100
points_far = extract_points_furthest_distance(test_df, n_far_points)

In [12]:
# Split the extracted points back into a feature matrix and a label vector.
X_distance_separated = points_far.drop(columns='Target').to_numpy()
y_distance_separated = points_far['Target'].to_numpy()

In [13]:
# Register the new set in the data dictionary under its two keys.
loaded.update(
    X_distance_separated=X_distance_separated,
    y_distance_separated=y_distance_separated,
)

In [14]:
# Write every entry of `loaded` to a new archive. The file name differs from
# the archive that was read ('diva-blackbox-data.npz'), so that file is not
# overwritten. Note: np.savez stores the arrays uncompressed.
np.savez('diva-blackbox-data2.npz', **loaded)