In [1]:
import os
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import RocCurveDisplay, auc, mean_squared_error, roc_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from scipy.interpolate import interp1d

from label_flip_revised.utils import create_dir

In [2]:
# Project root: the parent of the directory the notebook is launched from.
PATH_ROOT = Path.cwd().absolute().parent
print(PATH_ROOT)

/home/lukec/workspace/diva_01


In [3]:
# Directory (kept as a plain string) that receives the tables written by this notebook.
path_output = os.fspath(PATH_ROOT / 'results_plot')
print(path_output)
create_dir(path_output)

/home/lukec/workspace/diva_01/results_plot


In [4]:
# Load one result table only to discover the dataset names and the column layout.
path_sample = os.path.join(PATH_ROOT, 'results', 'real', 'real_falfa_nn_db.csv')
df_sample = pd.read_csv(path_sample)
print(df_sample.shape, '\n')

# Distinct base-dataset names, alphabetically ordered.
datanames = sorted(df_sample['Data.Base'].unique())
print(*datanames, sep=',\n')
print()

columns = df_sample.columns
print(*columns, sep=',\n')

(90, 38) 

abalone_subset_std,
australian_std,
banknote_std,
breastcancer_std,
cmc_std,
htru2_subset_std,
phoneme_subset_std,
ringnorm_subset_std,
texture_subset_std,
yeast_subset_std

Data,
F1,
F1 SD,
F1v,
F2,
F3,
F4,
N1,
N2,
N2 SD,
N3,
N3 SD,
N4,
N4 SD,
T1,
T1 SD,
LSC,
L1,
L2,
L3,
T2,
T3,
T4,
C1,
C2,
Density,
ClsCoef,
Hubs,
Hubs SD,
Path.Train,
Path.Poison,
Path.Test,
Rate,
Train.Clean,
Test.Clean,
Train.Poison,
Test.Poison,
Data.Base


In [5]:
# Feature columns fed to the regressor, grouped by name prefix.
# The concatenation order is the column order used for X.
cols_cm = (
    ['F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4']
    + ['N1', 'N2', 'N2 SD', 'N3', 'N3 SD', 'N4', 'N4 SD']
    + ['T1', 'T1 SD', 'LSC']
    + ['L1', 'L2', 'L3']
    + ['T2', 'T3', 'T4']
    + ['C1', 'C2']
    + ['Density', 'ClsCoef', 'Hubs', 'Hubs SD']
)

# Regression target column.
col_y = 'Test.Poison'

print(len(cols_cm))

28


In [6]:
# Attack identifiers; each one is substituted into the `real_{att}_db.csv`
# file name and stored in the 'Attack' column of the loaded frames.
attacks = [
    'falfa_nn',
    'alfa_svm',
    'rand_svm',
    'poison_svm',
]

In [7]:
# Workspace directory names: diva_01 ... diva_05.
workspace_names = ['diva_{:02d}'.format(number) for number in range(1, 6)]
workspace_names

['diva_01', 'diva_02', 'diva_03', 'diva_04', 'diva_05']

In [8]:
def load_dataframes(workspace):
    """Load and stack the per-attack result tables of one workspace.

    For every name in the module-level ``attacks`` list this reads
    ``<PATH_ROOT.parent>/<workspace>/results/real/real_<attack>_db.csv`` and
    tags its rows with the attack name in an ``Attack`` column.

    Parameters
    ----------
    workspace : str
        Name of the workspace directory located next to ``PATH_ROOT``.

    Returns
    -------
    pandas.DataFrame
        All attack tables concatenated under a fresh 0..n-1 index, with an
        extra ``Prediction`` column initialised to ``0.``.
    """
    dir_real = os.path.join(PATH_ROOT.parent, workspace, 'results', 'real')
    tagged_frames = [
        pd.read_csv(os.path.join(dir_real, f'real_{att}_db.csv')).assign(Attack=att)
        for att in attacks
    ]
    df = pd.concat(tagged_frames, ignore_index=True)
    # Placeholder that is filled in later with the regressor's predictions.
    df['Prediction'] = 0.
    return df

In [9]:
# One stacked result frame per workspace, keyed by workspace name.
df_dict = dict()
for wname in workspace_names:
    df = load_dataframes(wname)
    df_dict.update({wname: df})

In [10]:
# Number of alpha candidates sampled per workspace; passed as `n_iter` to RandomizedSearchCV.
N_ITER_SEARCH = 100

In [11]:
# Seed for the randomised search. Without it the alpha candidates drawn from
# the log-uniform distribution, and hence the selected alphas, change on
# every run of the notebook.
SEED_SEARCH = 1234

# Best Ridge regularisation strength per workspace, rounded to 4 decimals.
alphas = {}

# Search space: alpha drawn log-uniformly from [1e-4, 10].
param = {'alpha': loguniform(1e-4, 10)}
for wname in workspace_names:
    X = df_dict[wname][cols_cm]
    y = df_dict[wname][col_y]
    regressor = linear_model.Ridge()
    param_search = RandomizedSearchCV(
        regressor,
        param_distributions=param,
        n_iter=N_ITER_SEARCH,
        n_jobs=-1,
        random_state=SEED_SEARCH,
    )
    param_search.fit(X, y)

    # `best_params_` holds the winning sampled values directly.
    alphas[wname] = np.round(param_search.best_params_['alpha'], 4)

print(alphas)

{'diva_01': 0.0034, 'diva_02': 0.036, 'diva_03': 0.0284, 'diva_04': 0.0109, 'diva_05': 0.0119}


In [12]:
# Positional index of the 'Prediction' column, looked up on the first workspace's frame.
first_frame = df_dict[workspace_names[0]]
IDX_PRED = first_frame.columns.get_indexer(['Prediction'])
print(IDX_PRED)

[39]


In [13]:
# Leave-one-dataset-out evaluation. For each workspace and each held-out
# dataset, a Ridge regressor is fitted on the rows of every other dataset
# (restricted to the 'falfa_nn' and 'rand_svm' attacks) and then used to
# predict `col_y` for all attacks on the held-out dataset. The predictions
# are written into the 'Prediction' column of the workspace frame in place.
train_attacks = ['falfa_nn', 'rand_svm']

for wname in workspace_names:
    df_ = df_dict[wname]
    alpha_ = alphas[wname]
    is_train_attack = df_['Attack'].isin(train_attacks)
    for dname in datanames:
        is_held_out = df_['Data.Base'] == dname

        # Anything other than the held-out dataset, training attacks only
        df_train = df_[~is_held_out & is_train_attack]
        X_train = df_train[cols_cm]
        y_train = df_train[col_y]

        regressor = linear_model.Ridge(alpha=alpha_)
        regressor.fit(X_train, y_train)

        for att in attacks:
            is_test = is_held_out & (df_['Attack'] == att)
            X_test = df_.loc[is_test, cols_cm]

            # Cap predictions at 1.
            # NOTE(review): only the upper bound is clipped; values below 0
            # pass through unchanged -- confirm that this is intended.
            pred = np.minimum(regressor.predict(X_test), 1.0)

            # Assign by boolean mask and column label. This does not rely on
            # the frame having a 0..n-1 index, nor on 'Prediction' sitting at
            # the same column position in every workspace.
            df_.loc[is_test, 'Prediction'] = pred

In [14]:
# Attack identifier -> label shown in the result tables.
ATTACK_DICT = dict(
    falfa_nn='FALFA',
    alfa_svm='ALFA',
    rand_svm='SLN',
    poison_svm='PoisSVM',
)

# 'Data.Base' value -> dataset label shown in the result tables.
DATA_DICT = dict(
    abalone_subset_std='Abalone',
    australian_std='Australian',
    banknote_std='Banknote',
    breastcancer_std='Breastcancer',
    cmc_std='CMC',
    htru2_subset_std='HTRU2',
    phoneme_subset_std='Phoneme',
    ringnorm_subset_std='Ringnorm',
    texture_subset_std='Texture',
    yeast_subset_std='Yeast',
)

In [15]:
# RMSE between the target column and the stored predictions for every
# (dataset, attack) pair, averaged over all workspaces. The table is also
# written to `real_rmse.csv` inside `path_output`.
data_rmse = {
    'Dataset': [],
    'Attack': [],
    'RMSE': [],
}
for dname in datanames:
    for att in attacks:
        RMSEs = []
        for wname in workspace_names:
            df_ = df_dict[wname]
            # Select the rows once and read both columns from the same subset.
            df_subset = df_[(df_['Attack'] == att) & (df_['Data.Base'] == dname)]
            # Score against `col_y`, the same target the regressor was fitted
            # on, instead of repeating the column name as a literal.
            y_true = df_subset[col_y]
            pred = df_subset['Prediction']
            RMSEs.append(np.sqrt(mean_squared_error(y_true, pred)))
        data_rmse['Dataset'].append(DATA_DICT[dname])
        data_rmse['Attack'].append(ATTACK_DICT[att])
        data_rmse['RMSE'].append(np.mean(RMSEs))
df_rmse = pd.DataFrame(data_rmse)
path_rmse = os.path.join(path_output, 'real_rmse.csv')
df_rmse.to_csv(path_rmse, index=False)
print(f'Save to: {path_rmse}')

df_rmse

Save to: /home/lukec/workspace/diva_01/results_plot/real_rmse.csv


Unnamed: 0,Dataset,Attack,RMSE
0,Abalone,FALFA,0.082573
1,Abalone,ALFA,0.114757
2,Abalone,SLN,0.071744
3,Abalone,PoisSVM,0.058351
4,Australian,FALFA,0.108992
5,Australian,ALFA,0.114566
6,Australian,SLN,0.071057
7,Australian,PoisSVM,0.064454
8,Banknote,FALFA,0.146025
9,Banknote,ALFA,0.089204


In [16]:
# Mean RMSE per attack across datasets, laid out as a single row with one
# column per attack. 'RMSE' is selected explicitly: `df_rmse` also holds the
# string column 'Dataset', and averaging it raises TypeError on pandas >= 2.0.
df_mean = (
    df_rmse
    .groupby('Attack')[['RMSE']]
    .mean()
    .transpose()
    .reset_index(drop=True)
)
# Label the row so it can be appended below the per-dataset rows.
df_mean['Dataset'] = 'Mean'
df_mean

Attack,ALFA,FALFA,PoisSVM,SLN,Dataset
0,0.126795,0.117924,0.108055,0.094544,Mean


In [18]:
# Standard deviation of the RMSE per attack across datasets, rounded to 3
# decimals. 'RMSE' is selected explicitly so the string column 'Dataset' is
# never passed to `.std()`, which raises TypeError on pandas >= 2.0.
df_sd = df_rmse.groupby('Attack')[['RMSE']].std().transpose().round(3)
df_sd

Attack,ALFA,FALFA,PoisSVM,SLN
RMSE,0.054,0.039,0.064,0.039


In [19]:
# Wide table: one row per dataset, one column per attack, with the 'Mean' row
# appended at the bottom.
df_rmse_pivot = df_rmse.pivot(index='Dataset', columns='Attack', values='RMSE').reset_index()
# 'Dataset' is already a regular column here, so no further reset_index() is
# needed; a second one would only add a stray 'index' column.
df_rmse_full = pd.concat([df_rmse_pivot, df_mean], ignore_index=True)
# Fix the column order used in the exported table.
df_rmse_full = df_rmse_full[['Dataset', 'SLN', 'PoisSVM', 'ALFA', 'FALFA']]
df_rmse_full = df_rmse_full.rename(columns={'Dataset': 'Dataset/Attack'})
df_rmse_full

Attack,Dataset/Attack,SLN,PoisSVM,ALFA,FALFA
0,Abalone,0.071744,0.058351,0.114757,0.082573
1,Australian,0.071057,0.064454,0.114566,0.108992
2,Banknote,0.125771,0.174844,0.089204,0.146025
3,Breastcancer,0.06765,0.065132,0.117239,0.113384
4,CMC,0.101471,0.250192,0.250426,0.208706
5,HTRU2,0.055077,0.061504,0.105428,0.094382
6,Phoneme,0.086685,0.066644,0.0846,0.093598
7,Ringnorm,0.191611,0.14871,0.139373,0.13446
8,Texture,0.090455,0.099153,0.066767,0.072702
9,Yeast,0.083917,0.091567,0.18559,0.12442


In [20]:
# Render the final table as LaTeX, without the index and with 3 decimals.
latex_table = df_rmse_full.to_latex(index=False, float_format='%.3f')
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
Dataset/Attack &   SLN &  PoisSVM &  ALFA &  FALFA \\
\midrule
       Abalone & 0.072 &    0.058 & 0.115 &  0.083 \\
    Australian & 0.071 &    0.064 & 0.115 &  0.109 \\
      Banknote & 0.126 &    0.175 & 0.089 &  0.146 \\
  Breastcancer & 0.068 &    0.065 & 0.117 &  0.113 \\
           CMC & 0.101 &    0.250 & 0.250 &  0.209 \\
         HTRU2 & 0.055 &    0.062 & 0.105 &  0.094 \\
       Phoneme & 0.087 &    0.067 & 0.085 &  0.094 \\
      Ringnorm & 0.192 &    0.149 & 0.139 &  0.134 \\
       Texture & 0.090 &    0.099 & 0.067 &  0.073 \\
         Yeast & 0.084 &    0.092 & 0.186 &  0.124 \\
          Mean & 0.095 &    0.108 & 0.127 &  0.118 \\
\bottomrule
\end{tabular}

