In [1]:
import os
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from numpy.core.defchararray import find
from pandas.api.types import CategoricalDtype
from scipy import stats
from sklearn import linear_model, preprocessing
from sklearn.metrics import (RocCurveDisplay, auc, confusion_matrix,
                             mean_squared_error, roc_curve)
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.utils.fixes import loguniform
from torch.utils.data import DataLoader, TensorDataset

from label_flip_revised.simple_nn_model import SimpleModel
from label_flip_revised.torch_utils import evaluate, train_model
from label_flip_revised.utils import create_dir, open_csv


In [2]:
# Project root: the parent of the current working directory.
PATH_ROOT = Path.cwd().absolute().parent
print(PATH_ROOT)

/home/lukec/workspace/label_flip_revised


In [3]:
# Every CSV directly under <PATH_ROOT>/results/synth_nn, in lexicographic
# order of the full path (so "..._10.csv" sorts before "..._2.csv").
path_cm = glob(os.path.join(PATH_ROOT, 'results', 'synth_nn', '*.csv'))
path_cm.sort()
print(len(path_cm))
print(path_cm[:5])

54
['/home/lukec/workspace/label_flip_revised/results/synth_nn/synth_nn_poison_0.csv', '/home/lukec/workspace/label_flip_revised/results/synth_nn/synth_nn_poison_1.csv', '/home/lukec/workspace/label_flip_revised/results/synth_nn/synth_nn_poison_10.csv', '/home/lukec/workspace/label_flip_revised/results/synth_nn/synth_nn_poison_11.csv', '/home/lukec/workspace/label_flip_revised/results/synth_nn/synth_nn_poison_12.csv']


In [4]:
# Read every CSV listed in `path_cm` and stack them into one frame.
# All frames are concatenated in a single call: growing `df_cm` with
# `pd.concat` inside the loop re-copies the accumulated rows on every
# iteration and also concatenates against an empty frame.
_frames = [pd.read_csv(p) for p in path_cm]
# `pd.concat` raises on an empty list, so fall back to an empty frame when no
# CSV was found (the same result the incremental loop would have produced).
df_cm = pd.concat(_frames, ignore_index=True) if _frames else pd.DataFrame()

In [5]:
# Add rates
# The rate is parsed from the last '_'-separated token of each file stem in
# `Data` (for a name ending in `_nn_ALFA_0.05.csv` that token is `0.05`).
rates = [float(Path(d).stem.split('_')[-1]) for d in df_cm['Data'].to_list()]
df_cm['Rate'] = rates
# Keep only rows whose rate is at most 0.41.
# `.copy()` detaches the filtered rows from the unfiltered frame, so adding or
# overwriting columns on `df_cm` afterwards does not raise pandas'
# SettingWithCopyWarning (assignment into a slice of another DataFrame).
df_cm = df_cm[df_cm['Rate'] <= 0.41].copy()
print(df_cm.shape)

(2397, 37)


In [6]:
# Add filepath
# At this point `Data` still holds the poisoned CSV file name, so the
# project-relative path must be built before `Data` is shortened.
df_cm['Filepath'] = df_cm['Data'].apply(lambda x: os.path.join('data', 'synth', 'alfa_nn', x))
# Reduce `Data` to the part before the '_nn_ALFA_' marker. Splitting on the
# marker (instead of slicing off len('_nn_ALFA_0.05.csv') characters) gives the
# same result for names like '<data>_nn_ALFA_0.05.csv', does not depend on how
# many characters the rate token has, and leaves a name without the marker
# unchanged rather than truncating it.
df_cm['Data'] = df_cm['Data'].apply(lambda x: x.rsplit('_nn_ALFA_', 1)[0])

In [7]:
# Load the measures of the clean datasets: every file under
# <PATH_ROOT>/results/synth_svm whose name contains 'clean'.
path_clean = sorted(glob(os.path.join(PATH_ROOT, 'results', 'synth_svm', '*clean*')))
# Concatenate once instead of growing the frame inside the loop (which
# re-copies all accumulated rows on every iteration).
_clean_frames = [pd.read_csv(p) for p in path_clean]
df_cm_clean = pd.concat(_clean_frames, ignore_index=True) if _clean_frames else pd.DataFrame()

# Clean data is recorded with a poison rate of 0.
df_cm_clean['Rate'] = 0
# The dataset name is the part of `Data` before the first '.'.
_clean_names = df_cm_clean['Data'].apply(lambda x: x.split('.')[0])
df_cm_clean['Filepath'] = _clean_names.apply(
    lambda name: os.path.join('data', 'synth', 'train', '{}_clean_train.csv'.format(name)))
df_cm_clean['Data'] = _clean_names

# Clean rows go first, followed by the poisoned rows already in `df_cm`.
df_cm = pd.concat([df_cm_clean, df_cm], ignore_index=True)

In [8]:
# Remove NA
# A column is kept when at least one of its rows is not NA; only columns that
# are NA everywhere are dropped.
print('# of columns before removing NA:', len(df_cm.columns) - 1)  # Name does not count
_has_value = df_cm.notna().any(axis=0)
cols_not_na = [col for col, keep in zip(df_cm.columns, _has_value) if keep]
df_cm = df_cm.loc[:, cols_not_na]
print('# of columns after removing NA:', len(df_cm.columns) - 1)  # Name does not count
print(cols_not_na)

# of columns before removing NA: 37
# of columns after removing NA: 30
['Data', 'overlapping.F1.mean', 'overlapping.F1.sd', 'overlapping.F1v.mean', 'overlapping.F2.mean', 'overlapping.F3.mean', 'overlapping.F4.mean', 'neighborhood.N1', 'neighborhood.N2.mean', 'neighborhood.N2.sd', 'neighborhood.N3.mean', 'neighborhood.N3.sd', 'neighborhood.N4.mean', 'neighborhood.N4.sd', 'neighborhood.T1.mean', 'neighborhood.T1.sd', 'neighborhood.LSC', 'linearity.L1.mean', 'linearity.L2.mean', 'linearity.L3.mean', 'dimensionality.T2', 'dimensionality.T3', 'dimensionality.T4', 'balance.C1', 'balance.C2', 'network.Density', 'network.ClsCoef', 'network.Hubs.mean', 'network.Hubs.sd', 'Rate', 'Filepath']


In [9]:
# Order rows by dataset name, then by rate, and renumber the index 0..n-1.
df_cm = df_cm.sort_values(by=['Data', 'Rate']).reset_index(drop=True)
# Every row of a dataset (whatever its rate) points at the same clean test file.
df_cm['Testpath'] = [
    os.path.join('data', 'synth', 'test', f'{name}_clean_test.csv')
    for name in df_cm['Data']
]

In [10]:
# Accuracy columns start at 0.0 and are filled in later by `update_score`;
# a later cell lists the rows where `Train` is still 0.
for _score_col in ('Train', 'Test'):
    df_cm[_score_col] = 0.0

In [11]:
# Short column names, assigned to the columns of `df_cm` by POSITION: the
# k-th entry replaces the name of the k-th column, so the order here must
# match the current column order of `df_cm`.
# NOTE(review): 'N3 ' carries a trailing space; it looks like a typo but is
# kept as-is because it determines the column name written to the output CSV.
COL_NAMES = [
    'Data', 'F1', 'F1 SD', 'F1v', 'F2', 'F3', 'F4', 'N1',
    'N2', 'N2 SD', 'N3 ', 'N3 SD', 'N4', 'N4 SD', 'T1', 'T1 SD', 'LSC',
    'L1', 'L2', 'L3', 'T2', 'T3', 'T4', 'C1', 'C2', 'Density', 'ClsCoef',
    'Hubs', 'HubsSD', 'Rate', 'Filepath', 'Testpath', 'Train', 'Test',
]
new_names_map = {
    df_cm.columns[pos]: short_name
    for pos, short_name in enumerate(COL_NAMES)
}
df_cm = df_cm.rename(columns=new_names_map)

In [12]:
def update_score(df_score, rate, df_cm):
    """Copy train/test scores from `df_score` into the matching rows of `df_cm`.

    For every row of `df_score`, the rows of `df_cm` whose `Data` equals the
    score row's `data` and whose `Rate` equals `rate` get their `Train` and
    `Test` columns overwritten. `df_cm` is modified in place.

    Parameters
    ----------
    df_score : pandas.DataFrame
        Must have the columns 'data', 'train' and 'test'. Its index may be
        anything; rows are read by value, not by label.
    rate : float or str
        Poison rate to match against `df_cm['Rate']`. It is converted with
        `float()`, so a formatted string such as '0.05' is accepted.
    df_cm : pandas.DataFrame
        Must have the columns 'Data' and 'Rate'; 'Train' and 'Test' are written.

    Returns
    -------
    None
    """
    # `Rate` holds floats. Without this conversion a string rate compares
    # unequal to every row and the function silently updates nothing.
    rate = float(rate)
    is_rate = df_cm['Rate'] == rate
    # Iterate over the values rather than `df_score.loc[i, ...]` with a
    # positional `i`, which only works for a default 0..n-1 index.
    for data, train, test in zip(df_score['data'], df_score['train'], df_score['test']):
        rows = is_rate & (df_cm['Data'] == data)
        df_cm.loc[rows, 'Train'] = train
        df_cm.loc[rows, 'Test'] = test

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU and say so.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print('Running on CPU!')

# Hyper-parameters shared by every model trained in this notebook.
HIDDEN_LAYER = 128  # Passed as `hidden_dim` to SimpleModel.
LR = 0.001  # Learning rate.
MAX_EPOCHS = 400  # Number of iteration for training.
BATCH_SIZE = 128  # Default batch size of `get_dataloader`.

In [14]:
def get_dataloader(X, y, batch_size=BATCH_SIZE, shuffle=True):
    """Wrap a pair of NumPy arrays in a PyTorch ``DataLoader``.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs; converted to a float32 tensor.
    y : numpy.ndarray
        Targets; converted to an int64 tensor.
    batch_size : int
        Number of samples per batch (defaults to ``BATCH_SIZE``).
    shuffle : bool
        Forwarded to ``DataLoader``.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader over a ``TensorDataset`` of ``(X, y)`` pairs.
    """
    inputs = torch.from_numpy(X).to(torch.float32)
    targets = torch.from_numpy(y).to(torch.int64)
    return DataLoader(
        TensorDataset(inputs, targets),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [15]:
# Load the scores stored in results/synth_clean_score.csv and write them into
# the rows of `df_cm` whose Rate is 0.
_path_clean_score = os.path.join(PATH_ROOT, 'results', 'synth_clean_score.csv')
df_score_clean = pd.read_csv(_path_clean_score)
update_score(df_score_clean, rate=0, df_cm=df_cm)

In [16]:
# Output locations: score CSVs go to <PATH_ROOT>/results, trained model
# weights to its 'torch' sub-directory.
path_output = os.path.join(PATH_ROOT, 'results')
path_model = os.path.join(PATH_ROOT, 'results', 'torch')
create_dir(path_model)

In [17]:
# Preview the first five rows of the assembled table.
df_cm.head(5)

Unnamed: 0,Data,F1,F1 SD,F1v,F2,F3,F4,N1,N2,N2 SD,...,C2,Density,ClsCoef,Hubs,HubsSD,Rate,Filepath,Testpath,Train,Test
0,f04_i02_r00_c01_w6_1,0.81073,0.372452,0.0542,0.334868,0.504,0.4245,0.03,0.200008,0.109163,...,0.077664,0.848518,0.406686,0.766283,0.30288,0.0,data/synth/alfa_nn/f04_i02_r00_c01_w6_1_clean_...,data/synth/test/f04_i02_r00_c01_w6_1_clean_tes...,0.985,0.994
1,f04_i02_r00_c01_w6_1,0.871642,0.236221,0.157394,0.478438,0.762,0.686,0.082,0.262551,0.115698,...,0.020594,0.862773,0.400181,0.770124,0.299065,0.05,data/synth/alfa_nn/f04_i02_r00_c01_w6_1_nn_ALF...,data/synth/test/f04_i02_r00_c01_w6_1_clean_tes...,0.0,0.0
2,f04_i02_r00_c01_w6_1,0.898315,0.188025,0.250773,0.56529,0.69,0.636,0.126,0.264643,0.140696,...,0.278243,0.865011,0.415929,0.642544,0.312808,0.1,data/synth/alfa_nn/f04_i02_r00_c01_w6_1_nn_ALF...,data/synth/test/f04_i02_r00_c01_w6_1_clean_tes...,0.0,0.0
3,f04_i02_r00_c01_w6_1,0.924654,0.092116,0.302377,0.436307,0.69,0.614,0.115,0.279178,0.123644,...,0.019025,0.87006,0.382698,0.761814,0.292958,0.15,data/synth/alfa_nn/f04_i02_r00_c01_w6_1_nn_ALF...,data/synth/test/f04_i02_r00_c01_w6_1_clean_tes...,0.0,0.0
4,f04_i02_r00_c01_w6_1,0.932818,0.08896,0.343645,0.512272,0.631,0.585,0.107,0.261546,0.134748,...,0.532007,0.863499,0.437701,0.639688,0.248479,0.2,data/synth/alfa_nn/f04_i02_r00_c01_w6_1_nn_ALF...,data/synth/test/f04_i02_r00_c01_w6_1_clean_tes...,0.0,0.0


In [27]:
# Train (or load) one SimpleModel per poisoned dataset and record its accuracy.
#
# Rates are kept as 2-decimal strings because they are embedded in file names;
# they are converted to float wherever they are compared with df_cm['Rate'].
RATES = [f'{r:.2f}' for r in np.arange(0.05, 0.41, 0.05)]
print(RATES)
postfix_torch = '_SimpleNN.torch'

for rate in RATES:
    print(f'Current poison rate: {rate}...')
    # df_cm['Rate'] stores floats; comparing it with the string `rate` would
    # match no row, so every lookup below uses this numeric value.
    rate_value = float(rate)
    # NOTE(review): the score cache is named 'synth_falfa_*' while the data
    # directory and model files use ALFA naming - confirm this file cannot
    # collide with the cache of a different experiment.
    path_score = os.path.join(PATH_ROOT, 'results', f'synth_falfa_{rate}_score.csv')
    if os.path.exists(path_score):
        # Scores for this rate were saved by an earlier run; reuse them.
        df_score = pd.read_csv(path_score)
    else:
        acc_train = []
        acc_test = []
        datanames = []

        for i in df_cm[df_cm['Rate'] == rate_value].index:
            # `i` is an index LABEL, so the row is fetched with label-based
            # `.loc` (positional `.iloc` only agrees for a 0..n-1 index).
            row = df_cm.loc[i]
            dataname = row['Data']
            datanames.append(dataname)

            path_train = os.path.join(PATH_ROOT, row['Filepath'])
            path_test = os.path.join(PATH_ROOT, row['Testpath'])

            X_po, y_po, _ = open_csv(path_train)
            X_test, y_test, _ = open_csv(path_test)
            n_features = X_po.shape[1]

            dataloader_poison = get_dataloader(X_po, y_po, shuffle=True)
            dataloader_test = get_dataloader(X_test, y_test, shuffle=False)

            model = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
            optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.8)
            loss_fn = nn.CrossEntropyLoss()

            # Reuse saved weights when present; otherwise train and save them.
            dataname_poison = '{}_nn_ALFA_{}{}'.format(dataname, rate, postfix_torch)
            _path_model = os.path.join(path_model, dataname_poison)
            if os.path.exists(_path_model):
                model.load_state_dict(torch.load(_path_model, map_location=device))
            else:
                train_model(model, dataloader_poison, optimizer, loss_fn, device, MAX_EPOCHS)
                torch.save(model.state_dict(), _path_model)

            # "Train" accuracy is measured on the poisoned training data,
            # "test" accuracy on the clean test file.
            acc_po, _ = evaluate(dataloader_poison, model, loss_fn, device)
            acc_te, _ = evaluate(dataloader_test, model, loss_fn, device)
            acc_train.append(acc_po)
            acc_test.append(acc_te)

            # Progress report for every row whose index label is a multiple of 10.
            if i % 10 == 0:
                print(f'[{dataname}] Acc train: {acc_po*100:.2f} test: {acc_te*100:.2f}')
        results = {
            'data': datanames,
            'train': acc_train,
            'test': acc_test,
        }
        df_score = pd.DataFrame(results)
        # Write to the same path that was checked above, so the cache test and
        # the cache file can never drift apart.
        df_score.to_csv(path_score, index=False)
    # Both branches end here, so cached and freshly computed scores are
    # written into df_cm with the same numeric rate.
    update_score(df_score, rate_value, df_cm)


['0.05', '0.10', '0.15', '0.20', '0.25', '0.30', '0.35', '0.40']
Current poison rate: 0.05...
[f04_i02_r01_c01_w5_1] Acc train: 93.10 test: 98.80
[f07_i04_r00_c03_w4_1] Acc train: 80.50 test: 83.40
[f08_i07_r00_c01_w5_1] Acc train: 88.90 test: 93.70
[f10_i05_r00_c02_w4_1] Acc train: 80.80 test: 84.30
[f10_i09_r00_c03_w6_1] Acc train: 80.80 test: 72.90
[f12_i08_r02_c03_w5_1] Acc train: 74.70 test: 79.10
[f13_i07_r03_c03_w6_1] Acc train: 75.20 test: 78.20
[f14_i08_r05_c07_w4_1] Acc train: 71.30 test: 64.50
[f15_i08_r04_c03_w6_1] Acc train: 82.60 test: 79.40
[f15_i15_r00_c08_w6_1] Acc train: 76.20 test: 65.50
[f17_i09_r02_c03_w5_1] Acc train: 77.30 test: 71.70
[f18_i10_r03_c01_w5_1] Acc train: 91.10 test: 86.70
[f18_i18_r00_c01_w5_1] Acc train: 85.80 test: 89.80
[f19_i18_r00_c05_w4_1] Acc train: 73.90 test: 70.20
[f20_i14_r06_c07_w5_1] Acc train: 72.50 test: 66.60
[f21_i14_r07_c04_w6_1] Acc train: 55.00 test: 60.00
[f22_i13_r04_c03_w6_1] Acc train: 71.20 test: 63.80
[f23_i12_r10_c08_w4_1]

In [28]:
# Rows whose train accuracy is still 0; an empty result means every row received a score.
df_cm[df_cm['Train'].eq(0)]

Unnamed: 0,Data,F1,F1 SD,F1v,F2,F3,F4,N1,N2,N2 SD,...,C2,Density,ClsCoef,Hubs,HubsSD,Rate,Filepath,Testpath,Train,Test


In [29]:
# Save the final table (without the index) next to the other results.
_path_cmeasures = os.path.join(PATH_ROOT, 'results', 'synth_cmeasures_nn.csv')
df_cm.to_csv(_path_cmeasures, index=False)