In [1]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from label_flip_revised.utils import (create_dir, open_csv, open_json)

In [2]:
# Project root is taken to be the parent of the current working directory
# (i.e. the folder above the one the kernel was started from).
PATH_ROOT = Path.cwd().absolute().parent
print(PATH_ROOT)

/home/lukec/workspace/label_flip_revised


In [3]:
# NOTE: `cardiotocography` is removed!

# Every dataset name carries the same `_std` suffix, so only the stems are
# spelled out here and the suffix is appended once.
datanames = [
    f'{stem}_std'
    for stem in (
        'abalone_subset',
        'australian',
        'banknote',
        'breastcancer',
        'cmc',
        'htru2_subset',
        'phoneme_subset',
        'ringnorm_subset',
        'texture_subset',
        'yeast_subset',
    )
]

print(len(datanames))

10


In [4]:
# Poison rates 0.00, 0.05, ..., 0.40 as two-decimal strings. Stepping over
# integer percentages avoids relying on a floating-point stop value.
poison_rate = ['{:.2f}'.format(percent / 100) for percent in range(0, 41, 5)]
poison_rate

['0.00', '0.05', '0.10', '0.15', '0.20', '0.25', '0.30', '0.35', '0.40']

In [5]:
# Glob pattern for every CSV under data/output/train, then the matching
# paths in sorted order (glob itself gives no ordering guarantee).
path_train = str(Path(PATH_ROOT) / 'data' / 'output' / 'train' / '*.csv')
datapath_train = np.sort(glob(path_train))
print(len(datapath_train))
datapath_train

11


array(['/home/lukec/workspace/label_flip_revised/data/output/train/abalone_subset_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/australian_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/banknote_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/breastcancer_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/cardiotocography_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/cmc_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/htru2_subset_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/phoneme_subset_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/ringnorm_subset_std_clean_train.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/train/texture_subset_std_

In [6]:
# Glob pattern for every CSV under data/output/test, then the matching
# paths in sorted order (glob itself gives no ordering guarantee).
path_test = str(Path(PATH_ROOT) / 'data' / 'output' / 'test' / '*.csv')
datapath_test = np.sort(glob(path_test))
print(len(datapath_test))
datapath_test

11


array(['/home/lukec/workspace/label_flip_revised/data/output/test/abalone_subset_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/australian_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/banknote_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/breastcancer_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/cardiotocography_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/cmc_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/htru2_subset_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/phoneme_subset_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/ringnorm_subset_std_clean_test.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/test/texture_subset_std_clean_test.csv',
  

In [7]:
# Sorted paths of every CSV under data/output/alfa; only the first ten are
# displayed to keep the cell output short.
path_alfa = str(Path(PATH_ROOT) / 'data' / 'output' / 'alfa' / '*.csv')
datapath_alfa = np.sort(glob(path_alfa))
print(len(datapath_alfa))
datapath_alfa[:10]

99


array(['/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.05.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.10.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.15.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.20.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.25.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.30.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.35.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.40.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa/abalone_subset_std_rbf_ALFA_0.45.csv',
       '/home/lukec/workspace/label_flip_revised/data/o

In [8]:
# Sorted paths of every CSV under data/output/alfa_nn (referred to as "flfa"
# in the variable names below); only the first ten are displayed.
path_flfa = str(Path(PATH_ROOT) / 'data' / 'output' / 'alfa_nn' / '*.csv')
datapath_flfa = np.sort(glob(path_flfa))
print(len(datapath_flfa))
datapath_flfa[:10]

95


array(['/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.05.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.10.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.15.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.20.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.25.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.30.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.35.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.40.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/alfa_nn/abalone_subset_std_nn_ALFA_0.45.csv',
       '/home/lukec/workspace/label_f

In [9]:
# Sorted paths of every CSV under data/output/random; only the first ten are
# displayed to keep the cell output short.
path_random = str(Path(PATH_ROOT) / 'data' / 'output' / 'random' / '*.csv')
datapath_random = np.sort(glob(path_random))
print(len(datapath_random))
datapath_random[:10]

88


array(['/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.05.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.10.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.15.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.20.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.25.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.30.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.35.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/abalone_subset_std_random_0.40.csv',
       '/home/lukec/workspace/label_flip_revised/data/output/random/australian_std_random_0.05.csv',
       '/home/lukec/workspace/label_flip_revised/data/outpu

In [10]:
# SVM scores on ALFA-poisoned training labels.
# For every dataset: fit one SVC per poison rate, record train/test accuracy,
# and write the scores to `<dataname>_svm_alfa_score.csv` under results/real.
path_output = os.path.join(PATH_ROOT, 'results', 'real')
create_dir(path_output)

for dataname in datanames:
    print(dataname)
    acc_train = []
    acc_test = []

    # SVC keyword arguments are read from the JSON stored next to the ALFA outputs.
    path_json_param = os.path.join(PATH_ROOT, 'data', 'output', 'alfa', dataname + '_svm.json')
    svm_param = open_json(path_json_param)

    path_data_test = os.path.join(PATH_ROOT, 'data', 'output', 'test', dataname + '_clean_test.csv')
    X_test, y_test, _ = open_csv(path_data_test)

    # Candidate training files for this dataset: the clean set plus every ALFA file.
    path_data_train_list = [f for f in datapath_train if dataname in f] + [f for f in datapath_alfa if dataname in f]
    print(len(path_data_train_list))

    for i, p in enumerate(poison_rate):
        # Select the file by its rate suffix instead of by list position.
        # Positional pairing silently shifts every later rate onto the wrong
        # file as soon as one rate file is missing (the ALFA folder also holds
        # rates, e.g. 0.45, that `poison_rate` never reaches).
        suffix = '_clean_train.csv' if float(p) == 0.0 else f'_{p}.csv'
        matches = [f for f in path_data_train_list if f.endswith(suffix)]
        if not matches:
            raise FileNotFoundError(f'No training file ending with "{suffix}" found for {dataname}.')
        if len(matches) > 1:
            raise ValueError(f'Ambiguous training files for {dataname} at rate {p}: {matches}')

        X_train, y_train, _ = open_csv(matches[0])
        clf = SVC(**svm_param)
        clf.fit(X_train, y_train)
        # Train accuracy is measured against the (possibly poisoned) training
        # labels; test accuracy against the clean test set loaded above.
        acc_train.append(clf.score(X_train, y_train))
        acc_test.append(clf.score(X_test, y_test))
        # NOTE: `p` is a fraction (e.g. 0.05), although the message prints it with a `%` sign.
        print(f'Accuracy on {p}% poison data train: {acc_train[i] * 100:.2f} test: {acc_test[i] * 100:.2f}')
    
    results = {
        'rate': poison_rate,
        'train': acc_train,
        'test': acc_test,
    }
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(path_output, f'{dataname}_svm_alfa_score.csv'), index=False)



abalone_subset_std
10
Accuracy on 0.00% poison data train: 81.44 test: 78.75
Accuracy on 0.05% poison data train: 78.56 test: 77.00
Accuracy on 0.10% poison data train: 78.00 test: 69.50
Accuracy on 0.15% poison data train: 79.88 test: 67.00
Accuracy on 0.20% poison data train: 78.00 test: 59.50
Accuracy on 0.25% poison data train: 78.62 test: 57.25
Accuracy on 0.30% poison data train: 77.75 test: 53.75
Accuracy on 0.35% poison data train: 81.69 test: 51.00
Accuracy on 0.40% poison data train: 78.88 test: 43.75
australian_std
10
Accuracy on 0.00% poison data train: 87.14 test: 90.58
Accuracy on 0.05% poison data train: 80.43 test: 85.51
Accuracy on 0.10% poison data train: 84.60 test: 68.12
Accuracy on 0.15% poison data train: 83.88 test: 70.29
Accuracy on 0.20% poison data train: 64.49 test: 44.20
Accuracy on 0.25% poison data train: 69.57 test: 44.20
Accuracy on 0.30% poison data train: 74.46 test: 44.20
Accuracy on 0.35% poison data train: 79.53 test: 44.20
Accuracy on 0.40% poison 

In [11]:
# SVM scores on randomly flipped training labels.
# For every dataset: fit one SVC per poison rate, record train/test accuracy,
# and write the scores to `<dataname>_svm_random_score.csv` under results/real.
path_output = os.path.join(PATH_ROOT, 'results', 'real')
create_dir(path_output)

for dataname in datanames:
    print(dataname)
    acc_train = []
    acc_test = []

    # SVC keyword arguments are read from the JSON stored next to the ALFA outputs.
    path_json_param = os.path.join(PATH_ROOT, 'data', 'output', 'alfa', dataname + '_svm.json')
    svm_param = open_json(path_json_param)

    path_data_test = os.path.join(PATH_ROOT, 'data', 'output', 'test', dataname + '_clean_test.csv')
    X_test, y_test, _ = open_csv(path_data_test)

    # Candidate training files for this dataset: the clean set plus every random-flip file.
    path_data_train_list = [f for f in datapath_train if dataname in f] + [f for f in datapath_random if dataname in f]

    for i, p in enumerate(poison_rate):
        # Select the file by its rate suffix instead of by list position.
        # Positional pairing silently shifts every later rate onto the wrong
        # file as soon as one rate file is missing or an extra one appears.
        suffix = '_clean_train.csv' if float(p) == 0.0 else f'_{p}.csv'
        matches = [f for f in path_data_train_list if f.endswith(suffix)]
        if not matches:
            raise FileNotFoundError(f'No training file ending with "{suffix}" found for {dataname}.')
        if len(matches) > 1:
            raise ValueError(f'Ambiguous training files for {dataname} at rate {p}: {matches}')

        X_train, y_train, _ = open_csv(matches[0])
        clf = SVC(**svm_param)
        clf.fit(X_train, y_train)
        # Train accuracy is measured against the (possibly poisoned) training
        # labels; test accuracy against the clean test set loaded above.
        acc_train.append(clf.score(X_train, y_train))
        acc_test.append(clf.score(X_test, y_test))
        # NOTE: `p` is a fraction (e.g. 0.05), although the message prints it with a `%` sign.
        print(f'Accuracy on {p}% poison data train: {acc_train[i] * 100:.2f} test: {acc_test[i] * 100:.2f}')
    
    results = {
        'rate': poison_rate,
        'train': acc_train,
        'test': acc_test,
    }
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(path_output, f'{dataname}_svm_random_score.csv'), index=False)

abalone_subset_std
Accuracy on 0.00% poison data train: 81.44 test: 78.75
Accuracy on 0.05% poison data train: 79.00 test: 78.00
Accuracy on 0.10% poison data train: 74.12 test: 79.00
Accuracy on 0.15% poison data train: 72.00 test: 77.50
Accuracy on 0.20% poison data train: 69.56 test: 78.75
Accuracy on 0.25% poison data train: 65.00 test: 77.75
Accuracy on 0.30% poison data train: 63.12 test: 77.50
Accuracy on 0.35% poison data train: 60.44 test: 74.00
Accuracy on 0.40% poison data train: 57.56 test: 75.00
australian_std
Accuracy on 0.00% poison data train: 87.14 test: 90.58
Accuracy on 0.05% poison data train: 82.61 test: 86.96
Accuracy on 0.10% poison data train: 79.71 test: 86.23
Accuracy on 0.15% poison data train: 76.99 test: 85.51
Accuracy on 0.20% poison data train: 70.11 test: 86.23
Accuracy on 0.25% poison data train: 67.75 test: 86.23
Accuracy on 0.30% poison data train: 65.04 test: 87.68
Accuracy on 0.35% poison data train: 59.42 test: 86.96
Accuracy on 0.40% poison data t

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from label_flip_revised.simple_nn_model import SimpleModel
from label_flip_revised.torch_utils import evaluate, train_model

In [13]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU and say so.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print('Running on CPU!')


In [14]:
# Hyperparameters for the neural-network experiments.
BATCH_SIZE = 128    # Mini-batch size used by the data loaders.
HIDDEN_LAYER = 128  # Number of hidden neurons in a hidden layer.
MAX_EPOCHS = 400    # Number of training iterations (epochs).
LR = 1e-3           # Learning rate.

In [15]:
# Scores for Neural Networks
# For every dataset: train (or load a cached) SimpleModel per poison rate on
# the labels from data/output/alfa_nn, record train/test accuracy, and write
# the scores to `<dataname>_nn_flfa_score.csv` under results/real.
path_output = os.path.join(PATH_ROOT, 'results', 'real')
print(path_output)
create_dir(os.path.join(path_output, 'torch'))

for dataname in datanames:
    print(dataname)
    acc_train = []
    acc_test = []

    path_clean_train = os.path.join(PATH_ROOT, 'data', 'output', 'train', dataname + '_clean_train.csv')
    path_clean_test = os.path.join(PATH_ROOT, 'data', 'output', 'test', dataname + '_clean_test.csv')
    X_train, y_train, _ = open_csv(path_clean_train)
    X_test, y_test, _ = open_csv(path_clean_test)
    n_features = X_train.shape[1]

    # Preprocessing: the scaler is fitted on the training features only and
    # then applied to the test features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    dataset_test = TensorDataset(torch.from_numpy(X_test).type(torch.float32),
                                 torch.from_numpy(y_test).type(torch.int64))
    dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=True)
    
    # Candidate label files for this dataset: the clean set plus every alfa_nn file.
    path_data_train_list = [f for f in datapath_train if dataname in f] + [f for f in datapath_flfa if dataname in f]

    for i, p in enumerate(poison_rate):
        # Select the file by its rate suffix instead of by list position.
        # Positional pairing silently shifts every later rate onto the wrong
        # file as soon as one rate file is missing (the alfa_nn folder also
        # holds rates, e.g. 0.45, that `poison_rate` never reaches).
        suffix = '_clean_train.csv' if float(p) == 0.0 else f'_{p}.csv'
        matches = [f for f in path_data_train_list if f.endswith(suffix)]
        if not matches:
            raise FileNotFoundError(f'No training file ending with "{suffix}" found for {dataname}.')
        if len(matches) > 1:
            raise ValueError(f'Ambiguous training files for {dataname} at rate {p}: {matches}')

        # Only the labels are taken from the selected file; the features are the
        # scaled clean training features from above.
        # NOTE(review): assumes the rows are in the same order as the clean
        # training file - confirm against the attack scripts.
        _, y_train, _ = open_csv(matches[0])
        dataset_train = TensorDataset(torch.from_numpy(X_train).type(torch.float32),
                                  torch.from_numpy(y_train).type(torch.int64))
        dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

        model = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.8)
        loss_fn = nn.CrossEntropyLoss()

        # Reuse a previously saved model for this dataset/rate when one exists;
        # otherwise train from scratch and save the weights.
        path_model = os.path.join(path_output, 'torch', dataname + f'_SimpleNN_flfa_{p}.torch')
        if os.path.exists(path_model):
            model.load_state_dict(torch.load(path_model, map_location=device))
        else:
            train_model(model, dataloader_train, optimizer, loss_fn, device, MAX_EPOCHS)
            torch.save(model.state_dict(), path_model)

        acc_poison, _ = evaluate(
            dataloader_train, model, loss_fn, device)
        acc_te, _ = evaluate(
            dataloader_test, model, loss_fn, device)

        acc_train.append(acc_poison)
        acc_test.append(acc_te)
        # NOTE: `p` is a fraction (e.g. 0.05), although the message prints it with a `%` sign.
        print(f'Accuracy on {p}% poison data train: {acc_train[i] * 100:.2f} test: {acc_test[i] * 100:.2f}')
    
    results = {
        'rate': poison_rate,
        'train': acc_train,
        'test': acc_test,
    }
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(path_output, f'{dataname}_nn_flfa_score.csv'), index=False)

/home/lukec/workspace/label_flip_revised/results/real
abalone_subset_std
Accuracy on 0.00% poison data train: 75.06 test: 74.00
Accuracy on 0.05% poison data train: 70.50 test: 73.50
Accuracy on 0.10% poison data train: 72.19 test: 60.00
Accuracy on 0.15% poison data train: 71.56 test: 55.25
Accuracy on 0.20% poison data train: 77.44 test: 55.00
Accuracy on 0.25% poison data train: 75.00 test: 50.00
Accuracy on 0.30% poison data train: 80.00 test: 50.50
Accuracy on 0.35% poison data train: 85.00 test: 50.00
Accuracy on 0.40% poison data train: 90.00 test: 50.00
australian_std
Accuracy on 0.00% poison data train: 84.42 test: 82.61
Accuracy on 0.05% poison data train: 72.64 test: 62.32
Accuracy on 0.10% poison data train: 76.27 test: 77.54
Accuracy on 0.15% poison data train: 70.47 test: 55.80
Accuracy on 0.20% poison data train: 64.49 test: 44.20
Accuracy on 0.25% poison data train: 80.43 test: 55.80
Accuracy on 0.30% poison data train: 74.64 test: 44.20
Accuracy on 0.35% poison data tr

In [16]:
# NN with random noise
# For every dataset: train (or load a cached) SimpleModel per poison rate on
# the randomly flipped labels, record train/test accuracy, and write the
# scores to `<dataname>_nn_random_score.csv` under results/real.
path_output = os.path.join(PATH_ROOT, 'results', 'real')
print(path_output)
create_dir(os.path.join(path_output, 'torch'))

for dataname in datanames:
    print(dataname)
    acc_train = []
    acc_test = []

    path_clean_train = os.path.join(PATH_ROOT, 'data', 'output', 'train', dataname + '_clean_train.csv')
    path_clean_test = os.path.join(PATH_ROOT, 'data', 'output', 'test', dataname + '_clean_test.csv')
    X_train, y_train, _ = open_csv(path_clean_train)
    X_test, y_test, _ = open_csv(path_clean_test)
    n_features = X_train.shape[1]

    # Preprocessing: the scaler is fitted on the training features only and
    # then applied to the test features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    dataset_test = TensorDataset(torch.from_numpy(X_test).type(torch.float32), torch.from_numpy(y_test).type(torch.int64))
    dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=True)
    
    # Candidate label files for this dataset: the clean set plus every random-flip file.
    path_data_train_list = [f for f in datapath_train if dataname in f] + [f for f in datapath_random if dataname in f]

    for i, p in enumerate(poison_rate):
        # Select the file by its rate suffix instead of by list position.
        # Positional pairing silently shifts every later rate onto the wrong
        # file as soon as one rate file is missing or an extra one appears.
        suffix = '_clean_train.csv' if float(p) == 0.0 else f'_{p}.csv'
        matches = [f for f in path_data_train_list if f.endswith(suffix)]
        if not matches:
            raise FileNotFoundError(f'No training file ending with "{suffix}" found for {dataname}.')
        if len(matches) > 1:
            raise ValueError(f'Ambiguous training files for {dataname} at rate {p}: {matches}')

        # Only the labels are taken from the selected file; the features are the
        # scaled clean training features from above.
        # NOTE(review): assumes the rows are in the same order as the clean
        # training file - confirm against the attack scripts.
        _, y_train, _ = open_csv(matches[0])
        dataset_train = TensorDataset(torch.from_numpy(X_train).type(torch.float32), torch.from_numpy(y_train).type(torch.int64))
        dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

        model = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.8)
        loss_fn = nn.CrossEntropyLoss()

        # Reuse a previously saved model for this dataset/rate when one exists;
        # otherwise train from scratch and save the weights.
        path_model = os.path.join(path_output, 'torch', dataname + f'_SimpleNN_random_{p}.torch')
        if os.path.exists(path_model):
            model.load_state_dict(torch.load(path_model, map_location=device))
        else:
            train_model(model, dataloader_train, optimizer, loss_fn, device, MAX_EPOCHS)
            torch.save(model.state_dict(), path_model)

        acc_poison, _ = evaluate(
            dataloader_train, model, loss_fn, device)
        acc_te, _ = evaluate(
            dataloader_test, model, loss_fn, device)

        acc_train.append(acc_poison)
        acc_test.append(acc_te)
        # NOTE: `p` is a fraction (e.g. 0.05), although the message prints it with a `%` sign.
        print(f'Accuracy on {p}% poison data train: {acc_train[i] * 100:.2f} test: {acc_test[i] * 100:.2f}')
    
    results = {
        'rate': poison_rate,
        'train': acc_train,
        'test': acc_test,
    }
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(path_output, f'{dataname}_nn_random_score.csv'), index=False)

/home/lukec/workspace/label_flip_revised/results/real
abalone_subset_std
Accuracy on 0.00% poison data train: 75.25 test: 73.75
Accuracy on 0.05% poison data train: 72.62 test: 73.25
Accuracy on 0.10% poison data train: 70.25 test: 74.25
Accuracy on 0.15% poison data train: 68.25 test: 73.75
Accuracy on 0.20% poison data train: 64.81 test: 72.25
Accuracy on 0.25% poison data train: 62.06 test: 72.25
Accuracy on 0.30% poison data train: 59.19 test: 73.50
Accuracy on 0.35% poison data train: 58.56 test: 72.50
Accuracy on 0.40% poison data train: 55.00 test: 72.50
australian_std
Accuracy on 0.00% poison data train: 84.60 test: 87.68
Accuracy on 0.05% poison data train: 81.88 test: 87.68
Accuracy on 0.10% poison data train: 78.80 test: 85.51
Accuracy on 0.15% poison data train: 73.37 test: 79.71
Accuracy on 0.20% poison data train: 70.65 test: 78.26
Accuracy on 0.25% poison data train: 68.66 test: 81.88
Accuracy on 0.30% poison data train: 62.86 test: 76.09
Accuracy on 0.35% poison data tr