In [1]:
import os
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import RocCurveDisplay, auc, mean_squared_error, roc_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from scipy.interpolate import interp1d

from label_flip_revised.utils import create_dir

In [2]:
PATH_ROOT = Path(os.getcwd()).absolute().parent
print(PATH_ROOT)

/home/lukec/workspace/diva_01


In [3]:
path_output = os.path.join(PATH_ROOT, 'results_plot')
print(path_output)
create_dir(path_output)

/home/lukec/workspace/diva_01/results_plot


In [4]:
df = pd.read_csv(os.path.join(PATH_ROOT, 'results', 'real', 'real_falfa_nn_db.csv'))

print(df.shape)

(90, 38)


In [5]:
datanames = df['Data.Base'].unique()
print(datanames)

['breastcancer_std' 'texture_subset_std' 'ringnorm_subset_std'
 'australian_std' 'cmc_std' 'yeast_subset_std' 'abalone_subset_std'
 'phoneme_subset_std' 'banknote_std' 'htru2_subset_std']


In [6]:
print(df.columns)

Index(['Data', 'F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4', 'N1', 'N2', 'N2 SD',
       'N3', 'N3 SD', 'N4', 'N4 SD', 'T1', 'T1 SD', 'LSC', 'L1', 'L2', 'L3',
       'T2', 'T3', 'T4', 'C1', 'C2', 'Density', 'ClsCoef', 'Hubs', 'Hubs SD',
       'Path.Train', 'Path.Poison', 'Path.Test', 'Rate', 'Train.Clean',
       'Test.Clean', 'Train.Poison', 'Test.Poison', 'Data.Base'],
      dtype='object')


In [7]:
workspace_names = [f'diva_{i:02d}' for i in range(1,6)]
workspace_names

['diva_01', 'diva_02', 'diva_03', 'diva_04', 'diva_05']

In [8]:
attacks = ['falfa_nn', 'alfa_svm', 'rand_svm', 'poison_svm']

In [9]:
NAME_MAP = {
    'abalone_subset_std': 'Abalone',
    'australian_std': 'Australian',
    'banknote_std': 'Banknote',
    'breastcancer_std': 'Breastcancer',
    'cmc_std': 'CMC',
    'htru2_subset_std': 'HTRU2',
    'phoneme_subset_std': 'Phoneme',
    'ringnorm_subset_std': 'Ringnorm',
    'texture_subset_std': 'Texture',
    'yeast_subset_std': 'Yeast'
}

In [10]:
for workspace in workspace_names:
    tables = []
    for att in attacks:
        df_ = pd.read_csv(os.path.join(PATH_ROOT.parent, workspace, 'results', 'real', f'real_{att}_db.csv'))
        tables.append(df_[['Data.Base', 'Train.Clean', 'Test.Clean']].groupby('Data.Base').mean().reset_index())
results = pd.concat(tables, ignore_index=True).groupby('Data.Base').mean().reset_index()
SDs = pd.concat(tables, ignore_index=True).groupby('Data.Base').std().reset_index()

In [11]:
results.to_csv(os.path.join(path_output, 'table_real_acc.csv'))
SDs.to_csv(os.path.join(path_output, 'table_real_acc_sd.csv'))

In [12]:
results[['Train.Clean', 'Test.Clean']] = (results[['Train.Clean', 'Test.Clean']] * 100).round(1)
results['Data.Base'] = results['Data.Base'].map(NAME_MAP)
results


Unnamed: 0,Data.Base,Train.Clean,Test.Clean
0,Abalone,79.9,76.5
1,Australian,91.5,81.9
2,Banknote,100.0,100.0
3,Breastcancer,99.3,95.0
4,CMC,79.9,77.5
5,HTRU2,94.8,92.6
6,Phoneme,89.7,85.6
7,Ringnorm,99.4,97.8
8,Texture,100.0,99.8
9,Yeast,73.5,65.8


In [13]:
SDs[['Train.Clean', 'Test.Clean']] = (SDs[['Train.Clean', 'Test.Clean']] * 100).round(1)
SDs['Data.Base'] = SDs['Data.Base'].map(NAME_MAP)
SDs


Unnamed: 0,Data.Base,Train.Clean,Test.Clean
0,Abalone,0.7,0.5
1,Australian,3.1,2.1
2,Banknote,0.0,0.0
3,Breastcancer,0.2,2.5
4,CMC,2.8,0.6
5,HTRU2,0.5,0.9
6,Phoneme,6.3,1.3
7,Ringnorm,0.4,1.1
8,Texture,0.0,0.5
9,Yeast,4.7,1.6


In [15]:
output_acc = pd.DataFrame()
output_acc['Data'] = results['Data.Base']
output_acc[['Train', 'Test']] = '$' + results[['Train.Clean', 'Test.Clean']].astype(str) + '\pm' + SDs[['Train.Clean', 'Test.Clean']].astype(str) + '$'
output_acc

Unnamed: 0,Data,Train,Test
0,Abalone,$79.9\pm0.7$,$76.5\pm0.5$
1,Australian,$91.5\pm3.1$,$81.9\pm2.1$
2,Banknote,$100.0\pm0.0$,$100.0\pm0.0$
3,Breastcancer,$99.3\pm0.2$,$95.0\pm2.5$
4,CMC,$79.9\pm2.8$,$77.5\pm0.6$
5,HTRU2,$94.8\pm0.5$,$92.6\pm0.9$
6,Phoneme,$89.7\pm6.3$,$85.6\pm1.3$
7,Ringnorm,$99.4\pm0.4$,$97.8\pm1.1$
8,Texture,$100.0\pm0.0$,$99.8\pm0.5$
9,Yeast,$73.5\pm4.7$,$65.8\pm1.6$


In [17]:
print(output_acc.to_latex(index=False, float_format='%.1f', column_format='lcc', escape=False))

\begin{tabular}{lcc}
\toprule
        Data &         Train &          Test \\
\midrule
     Abalone &  $79.9\pm0.7$ &  $76.5\pm0.5$ \\
  Australian &  $91.5\pm3.1$ &  $81.9\pm2.1$ \\
    Banknote & $100.0\pm0.0$ & $100.0\pm0.0$ \\
Breastcancer &  $99.3\pm0.2$ &  $95.0\pm2.5$ \\
         CMC &  $79.9\pm2.8$ &  $77.5\pm0.6$ \\
       HTRU2 &  $94.8\pm0.5$ &  $92.6\pm0.9$ \\
     Phoneme &  $89.7\pm6.3$ &  $85.6\pm1.3$ \\
    Ringnorm &  $99.4\pm0.4$ &  $97.8\pm1.1$ \\
     Texture & $100.0\pm0.0$ &  $99.8\pm0.5$ \\
       Yeast &  $73.5\pm4.7$ &  $65.8\pm1.6$ \\
\bottomrule
\end{tabular}



In [None]:
RATES = [0.1, 0.2, 0.3]

tables = []
for workspace in workspace_names:
    for att in attacks:
        df_ = pd.read_csv(os.path.join(PATH_ROOT.parent, workspace, 'results', 'real', f'real_{att}_db.csv'))
        for rate in RATES:
            subset_ =  df_[df_['Rate'] == rate][['Data.Base', 'Train.Clean', 'Test.Clean', 'Train.Poison', 'Test.Poison']]
            n_ = subset_.shape[0]

            data = {
                'Dataset': subset_['Data.Base'],
                'Poisoning Rate': np.array([rate] * n_),
                'Attack': np.array([att] * n_),
                'Train': (subset_['Train.Clean'] - subset_['Train.Poison']) * 100,
                'Test': (subset_['Test.Clean'] - subset_['Test.Poison']) * 100,
            }
            tables.append(pd.DataFrame(data))
results = pd.concat(tables, ignore_index=True)


In [None]:
table = results.groupby(['Dataset', 'Poisoning Rate', 'Attack']).mean().reset_index()
table.to_csv(os.path.join(path_output, 'table_real_dif.csv'), index=False)

tab_SD = results.groupby(['Dataset', 'Poisoning Rate', 'Attack']).std().reset_index()
tab_SD.to_csv(os.path.join(path_output, 'table_real_dif_sd.csv'), index=False)

In [None]:
RATE = 0.3

def toPivot(table):
    tab_pivot = table[table['Poisoning Rate'] == RATE].pivot(index='Dataset', columns=['Attack'], values=['Test']).round(1)
    tab_pivot.columns = [c[-1] for c in tab_pivot.columns.to_flat_index()]
    tab_pivot = tab_pivot.reset_index()
    tab_pivot = tab_pivot[['Dataset', 'rand_svm', 'poison_svm', 'alfa_svm', 'falfa_nn']]
    tab_pivot.columns = ['Dataset', 'Rand', 'PoisSVM', 'ALFA', 'FALFA']
    tab_pivot['Dataset'] = tab_pivot['Dataset'].map(NAME_MAP)
    return tab_pivot


In [None]:
tab_pivot = toPivot(table)
tab_pivot

In [None]:
sd_pivot = toPivot(tab_SD)
sd_pivot

In [None]:
output_pivot = pd.DataFrame()
output_pivot['Dataset'] = tab_pivot['Dataset']
output_pivot[['Rand', 'PoisSVM', 'ALFA', 'FALFA']] = '$' + tab_pivot[['Rand', 'PoisSVM', 'ALFA', 'FALFA']].astype(str) + '\pm' + sd_pivot[['Rand', 'PoisSVM', 'ALFA', 'FALFA']].astype(str) + '$'
print(output_pivot.to_latex(index=False, float_format='%.1f', multicolumn=True, escape=False))