In [1]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from label_flip_revised.utils import open_csv, create_dir
from label_flip_revised.simple_nn_model import SimpleModel
from label_flip_revised.torch_utils import evaluate, train_model

In [2]:
# Project root = parent of the current working directory (the notebook is
# expected to be launched from a sub-folder of the project).
PATH_ROOT = Path.cwd().parent
print(PATH_ROOT)

/home/lukec/workspace/label_flip_revised


In [3]:
# (short dataset name, file stem) pairs. Keeping each pair on one line makes it
# obvious which stem belongs to which dataset; the two parallel lists used by
# the rest of the notebook are derived from this single table.
_DATASETS = (
    ('abalone', 'abalone_subset_std'),
    ('australian', 'australian_std'),
    ('banknote', 'banknote_std'),
    ('breastcancer', 'breastcancer_std'),
    ('cmc', 'cmc_std'),
    ('htru2', 'htru2_subset_std'),
    ('phoneme', 'phoneme_subset_std'),
    ('ringnorm', 'ringnorm_subset_std'),
    ('texture', 'texture_subset_std'),
    ('yeast', 'yeast_subset_std'),
)

DATANAMES = [short_name for short_name, _ in _DATASETS]

DATAFILES = [file_stem for _, file_stem in _DATASETS]

In [4]:
# Sanity check: list the per-dataset NN score files found under results/real.
score_pattern = os.path.join(PATH_ROOT, 'results', 'real', '*_nn_flfa_score.csv')
path_scores = np.sort(glob(score_pattern))
print(len(path_scores))
print(path_scores[:5])

10
['/home/lukec/workspace/label_flip_revised/results/real/abalone_subset_std_nn_flfa_score.csv'
 '/home/lukec/workspace/label_flip_revised/results/real/australian_std_nn_flfa_score.csv'
 '/home/lukec/workspace/label_flip_revised/results/real/banknote_std_nn_flfa_score.csv'
 '/home/lukec/workspace/label_flip_revised/results/real/breastcancer_std_nn_flfa_score.csv'
 '/home/lukec/workspace/label_flip_revised/results/real/cmc_std_nn_flfa_score.csv']


In [5]:
def append_cm(dataname, datafilename, df, method='nn', path_root=None, max_rate=0.41):
    """Append one dataset's measure rows (clean + poisoned) and scores to ``df``.

    Reads three CSV files below ``<path_root>/results``:

    * ``<folder>/<dataname>_<tag>_poison.csv`` - one row per poisoned file; the
      rate is parsed from the last ``_``-separated token of the file stem in
      the ``Data`` column.
    * ``real_cm_clean.csv`` - the row whose ``Data`` equals
      ``<datafilename>_clean_train.csv`` is used as the rate-0 row.
    * ``real/<datafilename>_<suffix>_score.csv`` - must provide the columns
      ``rate``, ``train`` and ``test``.

    Parameters
    ----------
    dataname : str
        Short dataset name; used in the poison-file name and written into the
        ``Data`` column of the result.
    datafilename : str
        File stem used to find the clean row and the score file.
    df : pandas.DataFrame
        Table accumulated so far (may be empty). It is not modified.
    method : {'nn', 'svm', 'rand_nn', 'rand_svm'}
        Selects the input folders / file suffixes.
    path_root : path-like, optional
        Directory containing ``results/``. Defaults to the notebook-level
        ``PATH_ROOT``.
    max_rate : float, optional
        Poisoned rows with a rate above this value are dropped. The default
        0.41 is the cutoff the notebook has always used (presumably "up to
        0.40" with some slack for float parsing - TODO confirm).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new rows appended (fresh 0..n-1 index).

    Raises
    ------
    ValueError
        If ``method`` is not supported.
    IndexError
        If the assembled table has fewer columns than the expected names.
    """
    print(dataname, datafilename)

    # method -> (results sub-folder, poison-file tag, data/output sub-folder,
    #            score-file suffix). One table instead of three if/elif chains.
    method_table = {
        'nn': ('real_nn', 'nn', 'alfa_nn', 'nn_flfa'),
        'svm': ('real_svm', 'svm', 'alfa', 'svm_alfa'),
        'rand_nn': ('real_random', 'random', 'random', 'nn_random'),
        'rand_svm': ('real_random', 'random', 'random', 'svm_random'),
    }
    if method not in method_table:
        raise ValueError('Unsupported method!')
    cm_folder, cm_tag, data_folder, score_suffix = method_table[method]

    root = PATH_ROOT if path_root is None else path_root
    results_dir = os.path.join(root, 'results')

    # Poisoned rows: parse the rate from the file name and keep rates <= max_rate.
    df_cm = pd.read_csv(os.path.join(results_dir, cm_folder, f'{dataname}_{cm_tag}_poison.csv'))
    df_cm['Rate'] = [float(Path(name).stem.split('_')[-1]) for name in df_cm['Data']]
    # .copy() so the column assignment below writes to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    df_cm = df_cm[df_cm['Rate'] <= max_rate].copy()
    df_cm['Filepath'] = df_cm['Data'].map(
        lambda name: os.path.join('data', 'output', data_folder, name))

    # Clean (rate 0) row for this dataset.
    df_cm_clean = pd.read_csv(os.path.join(results_dir, 'real_cm_clean.csv'))
    df_cm_clean = df_cm_clean[df_cm_clean['Data'] == f'{datafilename}_clean_train.csv'].copy()
    df_cm_clean['Rate'] = 0.
    df_cm_clean['Filepath'] = df_cm_clean['Data'].map(
        lambda name: os.path.join('data', 'output', 'train', name))
    df_cm = pd.concat([df_cm_clean, df_cm], ignore_index=True)

    # The scores below are attached by row position against a rate-sorted score
    # table, so the measure rows must be in ascending rate order as well.
    # A stable sort keeps the clean row first and is a no-op when the poison
    # file is already ordered by rate.
    df_cm = df_cm.sort_values('Rate', kind='mergesort', ignore_index=True)

    # Replace the per-file name by the dataset name.
    df_cm['Data'] = dataname

    # Drop columns that contain no value at all.
    df_cm = df_cm.dropna(axis='columns', how='all')

    # Attach accuracies: row i receives the scores of the i-th smallest rate.
    # Score rows beyond the number of measure rows are ignored by the
    # index-aligned assignment.
    path_score = os.path.join(results_dir, 'real', f'{datafilename}_{score_suffix}_score.csv')
    df_score = pd.read_csv(path_score)
    df_score = df_score.sort_values(['rate'], kind='mergesort', ignore_index=True)
    df_cm['Train'] = df_score['train']
    df_cm['Test'] = df_score['test']

    # Rename columns by position. NOTE: 'N3 ' carries a trailing space; other
    # cells select the column under exactly that name, so it is kept as is.
    COL_NAMES = [
        'Data',
        'F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4', 'N1',
        'N2', 'N2 SD', 'N3 ', 'N3 SD', 'N4', 'N4 SD', 'T1', 'T1 SD', 'LSC',
        'L1', 'L2', 'L3', 'T2', 'T3', 'T4', 'C1', 'C2', 'Density', 'ClsCoef',
        'Hubs', 'HubsSD',
        'Rate', 'Filepath', 'Train', 'Test',
    ]
    if len(df_cm.columns) < len(COL_NAMES):
        raise IndexError(
            f'Expected at least {len(COL_NAMES)} columns for {dataname!r}, '
            f'got {len(df_cm.columns)}')
    # zip stops after len(COL_NAMES) entries; any extra columns keep their name.
    df_cm = df_cm.rename(columns=dict(zip(df_cm.columns, COL_NAMES)))

    return pd.concat([df, df_cm], ignore_index=True)

In [6]:
# Column order of the exported tables: identifiers, then the measure columns,
# then the train/test scores.
# NOTE: 'N3 ' intentionally keeps its trailing space - it has to match the
# column name assigned inside append_cm.
cols = (
    ['Data', 'Filepath', 'Rate']
    + ['F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4']
    + ['N1', 'N2', 'N2 SD', 'N3 ', 'N3 SD', 'N4', 'N4 SD']
    + ['T1', 'T1 SD', 'LSC']
    + ['L1', 'L2', 'L3']
    + ['T2', 'T3', 'T4', 'C1', 'C2']
    + ['Density', 'ClsCoef', 'Hubs', 'HubsSD']
    + ['Train', 'Test']
)

In [7]:
# Build the combined table for method 'nn' over all datasets and save it as
# results/real_cmeasures_nn.csv.
method = 'nn'
path_output = os.path.join(PATH_ROOT, 'results', f'real_cmeasures_{method}.csv')

df = pd.DataFrame()
for data, filename in zip(DATANAMES, DATAFILES):
    # append_cm returns the accumulated table with this dataset's rows added.
    df = append_cm(data, filename, df, method=method)

# Put the columns into the export order before writing.
df = df.loc[:, cols]
df.to_csv(path_output, index=False)

abalone abalone_subset_std
australian australian_std
banknote banknote_std
breastcancer breastcancer_std
cmc cmc_std
htru2 htru2_subset_std
phoneme phoneme_subset_std
ringnorm ringnorm_subset_std
texture texture_subset_std
yeast yeast_subset_std


In [8]:
# Build the combined table for method 'svm' over all datasets and save it as
# results/real_cmeasures_svm.csv.
method = 'svm'
path_output = os.path.join(PATH_ROOT, 'results', f'real_cmeasures_{method}.csv')

df = pd.DataFrame()
for data, filename in zip(DATANAMES, DATAFILES):
    # append_cm returns the accumulated table with this dataset's rows added.
    df = append_cm(data, filename, df, method=method)

# Put the columns into the export order before writing.
df = df.loc[:, cols]
df.to_csv(path_output, index=False)

abalone abalone_subset_std
australian australian_std
banknote banknote_std
breastcancer breastcancer_std
cmc cmc_std
htru2 htru2_subset_std
phoneme phoneme_subset_std
ringnorm ringnorm_subset_std
texture texture_subset_std
yeast yeast_subset_std


In [9]:
# Build the combined table for method 'rand_nn' over all datasets and save it
# as results/real_cmeasures_rand_nn.csv.
method = 'rand_nn'
path_output = os.path.join(PATH_ROOT, 'results', f'real_cmeasures_{method}.csv')

df = pd.DataFrame()
for data, filename in zip(DATANAMES, DATAFILES):
    # append_cm returns the accumulated table with this dataset's rows added.
    df = append_cm(data, filename, df, method=method)

# Put the columns into the export order before writing.
df = df.loc[:, cols]
df.to_csv(path_output, index=False)

abalone abalone_subset_std
australian australian_std
banknote banknote_std
breastcancer breastcancer_std
cmc cmc_std
htru2 htru2_subset_std
phoneme phoneme_subset_std
ringnorm ringnorm_subset_std
texture texture_subset_std
yeast yeast_subset_std


In [10]:
# Build the combined table for method 'rand_svm' over all datasets and save it
# as results/real_cmeasures_rand_svm.csv.
method = 'rand_svm'
path_output = os.path.join(PATH_ROOT, 'results', f'real_cmeasures_{method}.csv')

df = pd.DataFrame()
for data, filename in zip(DATANAMES, DATAFILES):
    # append_cm returns the accumulated table with this dataset's rows added.
    df = append_cm(data, filename, df, method=method)

# Put the columns into the export order before writing.
df = df.loc[:, cols]
df.to_csv(path_output, index=False)

abalone abalone_subset_std
australian australian_std
banknote banknote_std
breastcancer breastcancer_std
cmc cmc_std
htru2 htru2_subset_std
phoneme phoneme_subset_std
ringnorm ringnorm_subset_std
texture texture_subset_std
yeast yeast_subset_std
