In [1]:
import os
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import RocCurveDisplay, auc, mean_squared_error, roc_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from scipy.interpolate import interp1d

from label_flip_revised.utils import create_dir

In [2]:
# Project root: the parent of the directory the notebook is executed from.
PATH_ROOT = Path.cwd().parent
print(PATH_ROOT)

/home/lukec/workspace/diva_01


In [3]:
# Output location `<PATH_ROOT>/results_plot`, kept as a plain string.
path_output = os.fspath(PATH_ROOT / 'results_plot')
print(path_output)
# NOTE(review): create_dir is a project helper; presumably it creates the
# directory when it does not exist yet -- confirm in label_flip_revised.utils.
create_dir(path_output)

/home/lukec/workspace/diva_01/results_plot


In [4]:
# Load one result table from this workspace to inspect its shape, the base
# dataset names it covers, and the available columns.
path_falfa_csv_ = os.path.join(PATH_ROOT, 'results', 'real', 'real_falfa_nn_db.csv')
df = pd.read_csv(path_falfa_csv_)
print(df.shape)

# Distinct values of the 'Data.Base' column.
datanames = df['Data.Base'].unique()
print(datanames)

columns = df.columns
print(columns)

(90, 38)
['breastcancer_std' 'texture_subset_std' 'ringnorm_subset_std'
 'australian_std' 'cmc_std' 'yeast_subset_std' 'abalone_subset_std'
 'phoneme_subset_std' 'banknote_std' 'htru2_subset_std']
Index(['Data', 'F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4', 'N1', 'N2', 'N2 SD',
       'N3', 'N3 SD', 'N4', 'N4 SD', 'T1', 'T1 SD', 'LSC', 'L1', 'L2', 'L3',
       'T2', 'T3', 'T4', 'C1', 'C2', 'Density', 'ClsCoef', 'Hubs', 'Hubs SD',
       'Path.Train', 'Path.Poison', 'Path.Test', 'Rate', 'Train.Clean',
       'Test.Clean', 'Train.Poison', 'Test.Poison', 'Data.Base'],
      dtype='object')


In [5]:
# Sibling workspace directories diva_01 ... diva_05, each read below for its
# own copy of the result CSVs.
workspace_names = ['diva_{:02d}'.format(idx) for idx in range(1, 6)]
print(workspace_names)

['diva_01', 'diva_02', 'diva_03', 'diva_04', 'diva_05']


In [6]:
# Maps the raw 'Data.Base' identifiers found in the result CSVs to the display
# names used in the generated tables.
NAME_MAP = dict(
    abalone_subset_std='Abalone',
    australian_std='Australian',
    banknote_std='Banknote',
    breastcancer_std='Breastcancer',
    cmc_std='CMC',
    htru2_subset_std='HTRU2',
    phoneme_subset_std='Phoneme',
    ringnorm_subset_std='Ringnorm',
    texture_subset_std='Texture',
    yeast_subset_std='Yeast',
)

# Attack identifiers; each one selects a `real_<attack>_db.csv` result file.
ATTACKS = [
    'falfa_nn',
    'alfa_svm',
    'rand_svm',
    'poison_svm',
]

In [7]:
# Poisoning rate whose rows are extracted from every result file.
RATE = 0.1

# For every workspace and every attack, read that attack's result CSV, keep the
# rows recorded at RATE, and store the clean-minus-poisoned score difference
# (scaled by 100) for the train and test columns.
# NOTE(review): the scores are presumably accuracies, making the differences
# percentage points -- confirm against the code that writes these CSVs.
tables = []
for workspace in workspace_names:
    for att in ATTACKS:
        path_csv_ = os.path.join(PATH_ROOT.parent, workspace, 'results', 'real', f'real_{att}_db.csv')
        df_ = pd.read_csv(path_csv_)
        # Match the rate with a tolerance instead of `==`: the value comes from
        # parsing CSV text, and an exact float comparison would silently drop
        # rows whose parsed rate differs from RATE in the last bits.
        is_rate_ = np.isclose(df_['Rate'], RATE)
        subset_ = df_[is_rate_][['Data.Base', 'Train.Clean', 'Test.Clean', 'Train.Poison', 'Test.Poison']]
        n_ = subset_.shape[0]

        data = {
            'Dataset': subset_['Data.Base'],
            'Poisoning Rate': np.array([RATE] * n_),
            'Attack': np.array([att] * n_),
            'Train': (subset_['Train.Clean'] - subset_['Train.Poison']) * 100,
            'Test': (subset_['Test.Clean'] - subset_['Test.Poison']) * 100,
        }
        tables.append(pd.DataFrame(data))
# One long table: a row per (workspace, attack, dataset) at the chosen rate.
results = pd.concat(tables, ignore_index=True)

In [8]:
# Display the combined table built in the previous cell (bare last expression
# so the notebook renders it).
results

Unnamed: 0,Dataset,Poisoning Rate,Attack,Train,Test
0,breastcancer_std,0.1,falfa_nn,2.197802,11.403509
1,texture_subset_std,0.1,falfa_nn,6.750000,8.500000
2,ringnorm_subset_std,0.1,falfa_nn,10.625000,3.000000
3,australian_std,0.1,falfa_nn,6.159420,7.971014
4,cmc_std,0.1,falfa_nn,2.546689,3.050847
...,...,...,...,...,...
195,htru2_subset_std,0.1,poison_svm,4.345911,-0.250000
196,phoneme_subset_std,0.1,poison_svm,3.437500,0.750000
197,ringnorm_subset_std,0.1,poison_svm,3.458987,2.250000
198,texture_subset_std,0.1,poison_svm,1.173021,0.500000


In [9]:
def toPivot(table):
    """Reshape a long per-attack table into one row per dataset.

    Rows whose 'Poisoning Rate' equals the module-level RATE are pivoted so
    that each attack's 'Test' value becomes its own column, rounded to one
    decimal place.

    Parameters
    ----------
    table : pandas.DataFrame
        Needs the columns 'Dataset', 'Poisoning Rate', 'Attack' and 'Test'.
        Presumably holds one row per (Dataset, Attack) pair at RATE, as the
        pivot requires -- verify against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dataset', 'Rand', 'PoisSVM', 'ALFA', 'FALFA' (in that order),
        with 'Dataset' translated through NAME_MAP.
    """
    # Raw attack identifier -> column header; the dict order fixes the column order.
    attack_labels = {
        'rand_svm': 'Rand',
        'poison_svm': 'PoisSVM',
        'alfa_svm': 'ALFA',
        'falfa_nn': 'FALFA',
    }

    at_rate = table[table['Poisoning Rate'] == RATE]
    wide = at_rate.pivot(index='Dataset', columns=['Attack'], values=['Test']).round(1)
    # The pivot yields ('Test', <attack>) column tuples; keep only the attack name.
    wide.columns = [key[-1] for key in wide.columns.to_flat_index()]
    wide = wide.reset_index()[['Dataset', *attack_labels]]
    wide = wide.rename(columns=attack_labels)
    return wide.assign(Dataset=wide['Dataset'].map(NAME_MAP))

In [10]:
# Aggregate the repeated measurements of every (dataset, rate, attack)
# combination: mean in df_diff, standard deviation in df_diff_SD.
group_keys_ = ['Dataset', 'Poisoning Rate', 'Attack']
grouped_runs_ = results.groupby(group_keys_)
df_diff = grouped_runs_.mean().reset_index()
df_diff_SD = grouped_runs_.std().reset_index()

In [11]:
# Wide (dataset x attack) views of the mean and the standard deviation tables.
pivot_diff, pivot_sd = (toPivot(stats) for stats in (df_diff, df_diff_SD))

In [12]:
# Build the LaTeX table: every attack cell becomes "$<mean>\pm<sd>$", taken
# from the already rounded pivot tables.
attack_cols_ = ['Rand', 'PoisSVM', 'ALFA', 'FALFA']
output_pivot = pd.DataFrame()
output_pivot['Dataset'] = pivot_diff['Dataset']
# r'\pm' is a raw string: in a normal literal '\p' is an invalid escape
# sequence, which Python reports with a warning and plans to make an error.
# The resulting text is unchanged.
output_pivot[attack_cols_] = (
    '$' + pivot_diff[attack_cols_].astype(str)
    + r'\pm' + pivot_sd[attack_cols_].astype(str) + '$'
)
# escape=False keeps the '$' and backslashes intact in the emitted LaTeX.
print(output_pivot.to_latex(index=False, float_format='%.1f', multicolumn=True, escape=False))

\begin{tabular}{lllll}
\toprule
     Dataset &         Rand &      PoisSVM &         ALFA &        FALFA \\
\midrule
     Abalone &  $0.8\pm0.7$ &  $1.8\pm0.8$ &  $9.5\pm1.9$ &  $7.7\pm1.7$ \\
  Australian &  $0.7\pm0.5$ &  $4.5\pm3.9$ &  $4.9\pm4.0$ &  $8.3\pm3.8$ \\
    Banknote &  $1.4\pm2.3$ &  $1.1\pm1.1$ & $10.9\pm2.5$ & $10.3\pm2.9$ \\
Breastcancer &  $2.5\pm0.7$ &  $5.3\pm4.6$ &  $7.2\pm2.0$ &  $9.1\pm2.7$ \\
         CMC & $-0.2\pm0.7$ & $15.1\pm4.7$ &  $3.5\pm3.0$ &  $5.7\pm3.3$ \\
       HTRU2 &  $0.7\pm0.3$ &  $0.7\pm1.3$ &  $9.2\pm3.1$ &  $9.4\pm2.4$ \\
     Phoneme &  $3.5\pm2.9$ &  $0.9\pm2.1$ &  $6.8\pm0.7$ & $11.6\pm2.1$ \\
    Ringnorm &  $0.1\pm0.3$ &  $1.7\pm0.5$ &  $3.2\pm2.5$ &  $6.4\pm2.9$ \\
     Texture &  $0.5\pm1.1$ &  $1.2\pm0.8$ &  $7.9\pm4.6$ &  $4.9\pm3.9$ \\
       Yeast & $-0.2\pm1.6$ &  $1.9\pm3.8$ & $10.4\pm4.9$ &  $2.3\pm4.6$ \\
\bottomrule
\end{tabular}

