In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to the local `label_flip_revised` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import json
import os
import pathlib

from numpy.core.defchararray import find
from pandas.api.types import CategoricalDtype
from prettytable import PrettyTable
from scipy import stats
from sklearn import linear_model, preprocessing
from sklearn.metrics import (
    RocCurveDisplay,
    auc,
    mean_squared_error,
    roc_curve,
    confusion_matrix,
)
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.utils.fixes import loguniform
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Set the default matplotlib font size for every figure drawn in this notebook.
plt.rcParams.update({"font.size": 14})

In [4]:
# Project root as a plain string: the parent of the current working directory.
PATH_ROOT = os.fspath(pathlib.Path().absolute().parent)
print(PATH_ROOT)

/home/lukec/workspace/label_flip_revised


In [5]:
# Poisoning rates from 0 to 0.40 in steps of 0.05; rounding to two decimals
# removes the floating-point noise that `np.arange` accumulates.
POISON_LVLS = np.arange(0, 0.41, 0.05, dtype=float).round(2)

print("# of different percentage tested for 1 dataset:", POISON_LVLS.size)
POISON_LVLS

# of different percentage tested for 1 dataset: 9


array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 ])

In [6]:
# Locate every C-Measure CSV stored under results/synth_nn.
path_cm = glob.glob(os.path.join(PATH_ROOT, "results", "synth_nn", "*.csv"))
print(f"Found {len(path_cm)} files for C-Measures")

# Read each file into its own frame, then stack all frames row-wise.
df_list = [pd.read_csv(csv_path) for csv_path in path_cm]
df_cm_poison = pd.concat(df_list)
print("df_cm_poison", df_cm_poison.shape)

Found 54 files for C-Measures
df_cm_poison (2619, 36)


In [7]:
# The C-Measures of the clean datasets are split across three CSV files
# (synth_svm_clean_0.csv .. synth_svm_clean_2.csv) under results/synth_svm.
path_cm = []
for i in range(3):
    path_cm.append(os.path.join(PATH_ROOT, "results", "synth_svm", f"synth_svm_clean_{i}.csv"))

# Read the parts and stack them row-wise into one frame.
df_list = list(map(pd.read_csv, path_cm))
df_cm_clean = pd.concat(df_list)
print("df_cm_clean", df_cm_clean.shape)

df_cm_clean (300, 36)


In [8]:
# Drop the columns that are entirely NA in the clean data.
# The "Data" (name) column is not a measure, hence the "- 1" in both counts.
n_cols_before = df_cm_clean.shape[1] - 1
print("# of columns before removing NA:", n_cols_before)

# A column is kept when at least one of its clean-data values is not NA.
has_value = df_cm_clean.notna().any()
cols_not_na = has_value.index[has_value].tolist()
print("# of columns after removing NA:", len(cols_not_na) - 1)

# Apply the same column selection to both frames so their schemas match.
df_cm_clean = df_cm_clean[cols_not_na]
df_cm_poison = df_cm_poison[cols_not_na]

print(",\n".join(sorted(df_cm_clean.columns)))

# of columns before removing NA: 35
# of columns after removing NA: 28
Data,
balance.C1,
balance.C2,
dimensionality.T2,
dimensionality.T3,
dimensionality.T4,
linearity.L1.mean,
linearity.L2.mean,
linearity.L3.mean,
neighborhood.LSC,
neighborhood.N1,
neighborhood.N2.mean,
neighborhood.N2.sd,
neighborhood.N3.mean,
neighborhood.N3.sd,
neighborhood.N4.mean,
neighborhood.N4.sd,
neighborhood.T1.mean,
neighborhood.T1.sd,
network.ClsCoef,
network.Density,
network.Hubs.mean,
network.Hubs.sd,
overlapping.F1.mean,
overlapping.F1.sd,
overlapping.F1v.mean,
overlapping.F2.mean,
overlapping.F3.mean,
overlapping.F4.mean


In [9]:
# Add a `Rate` column right after `Data`. Clean datasets get a rate of 0.
df_cm_clean.insert(1, "Rate", 0.0)

# Poisoned file names end with the rate, e.g. "..._nn_ALFA_0.05.csv" -> 0.05.
rates = []
for poison_name in df_cm_poison["Data"].to_numpy():
    stem = os.path.splitext(poison_name)[0]
    rates.append(float(stem.rsplit("_", 1)[-1]))
df_cm_poison.insert(1, "Rate", rates)

# Stack the clean and poisoned rows into a single frame.
df_cm = pd.concat([df_cm_clean, df_cm_poison])
print("df_cm", df_cm.shape)

df_cm (2919, 30)


In [10]:
# Keep the original file name before the `Data` column is rewritten below.
df_cm.insert(0, "Filename", df_cm["Data"])

# Reduce `Data` to the first six "_"-separated tokens of the file stem, so the
# same dataset at different poison levels shares one consistent name.
dataset_names = []
for filename in df_cm["Data"].to_list():
    tokens = os.path.splitext(filename)[0].split("_")
    dataset_names.append("_".join(tokens[:6]))
df_cm["Data"] = dataset_names

# Order rows by dataset, then by poisoning rate, and rebuild a 0..n-1 index.
df_cm = df_cm.sort_values(by=["Data", "Rate"], axis=0).reset_index(drop=True)

df_cm.head()

Unnamed: 0,Filename,Data,Rate,overlapping.F1.mean,overlapping.F1.sd,overlapping.F1v.mean,overlapping.F2.mean,overlapping.F3.mean,overlapping.F4.mean,neighborhood.N1,...,linearity.L3.mean,dimensionality.T2,dimensionality.T3,dimensionality.T4,balance.C1,balance.C2,network.Density,network.ClsCoef,network.Hubs.mean,network.Hubs.sd
0,f04_i02_r00_c01_w6_1.csv,f04_i02_r00_c01_w6_1,0.0,0.81073,0.372452,0.0542,0.334868,0.504,0.4245,0.03,...,0.0045,0.002,0.002,1.0,0.970657,0.077664,0.848518,0.406686,0.766283,0.30288
1,f04_i02_r00_c01_w6_1_nn_ALFA_0.05.csv,f04_i02_r00_c01_w6_1,0.05,0.871642,0.236221,0.157394,0.478438,0.762,0.686,0.082,...,0.05,0.004,0.004,1.0,0.992482,0.020594,0.862773,0.400181,0.770124,0.299065
2,f04_i02_r00_c01_w6_1_nn_ALFA_0.10.csv,f04_i02_r00_c01_w6_1,0.1,0.898315,0.188025,0.250773,0.56529,0.69,0.636,0.126,...,0.117,0.004,0.004,1.0,0.880065,0.278243,0.865011,0.415929,0.642544,0.312808
3,f04_i02_r00_c01_w6_1_nn_ALFA_0.15.csv,f04_i02_r00_c01_w6_1,0.15,0.924654,0.092116,0.302377,0.436307,0.69,0.614,0.115,...,0.133,0.004,0.004,1.0,0.993061,0.019025,0.87006,0.382698,0.761814,0.292958
4,f04_i02_r00_c01_w6_1_nn_ALFA_0.20.csv,f04_i02_r00_c01_w6_1,0.2,0.932818,0.08896,0.343645,0.512272,0.631,0.585,0.107,...,0.11,0.004,0.004,1.0,0.719924,0.532007,0.863499,0.437701,0.639688,0.248479


In [11]:
# Keep only the rows whose Rate is at most 0.4.
# A 45% poison rate may cause one of the classes to disappear.
df_cm = df_cm.loc[df_cm["Rate"].le(0.4)]

In [33]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from label_flip_revised.utils import open_csv
from label_flip_revised import SimpleModel, train_model, evaluate, create_dir

In [13]:
# Hyper-parameters used when training the classifier.
BATCH_SIZE = 128    # Samples per mini-batch.
HIDDEN_LAYER = 128  # Hidden neurons per hidden layer.
LR = 1e-3           # Learning rate.
MAX_EPOCHS = 300    # Maximum number of training epochs.

In [34]:
# Compute Train acc and test acc
# For each dataset: evaluate the pre-trained clean model, then train one fresh
# model per poisoned training set, save it, and report train/test accuracy.


def _make_loader(X, y):
    """Wrap numpy features `X` and integer labels `y` in a shuffled mini-batch DataLoader."""
    dataset = TensorDataset(torch.from_numpy(X).type(torch.float32),
                            torch.from_numpy(y).type(torch.int64))
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


# NOTE: `torch.cuda.is_available` must be *called*. The bare function object is
# always truthy, which selected "cuda" even on CPU-only machines.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
loss_fn = nn.CrossEntropyLoss()
# All models trained on poisoned data are stored in this directory.
path_poison = os.path.join(PATH_ROOT, "data", "synth", "torch", "poison")

datanames = df_cm['Data'].unique()
# Only the first 5 datasets are processed here.
for dataname in tqdm(datanames[:5]):
    df_subset = df_cm[df_cm["Data"] == dataname]

    # Only 1 clean file, the rest of data are poisoned
    # TODO: The C-Measure is computed from the entire dataset instead of training set.
    # TODO: Recompute C-Measure for the clean training sets!

    # Load clean data
    X_train, y_train, _ = open_csv(os.path.join(PATH_ROOT, "data", "synth", "train", f"{dataname}_clean_train.csv"))
    X_test, y_test, _ = open_csv(os.path.join(PATH_ROOT, "data", "synth", "test", f"{dataname}_clean_test.csv"))

    # Restore the classifier that was previously trained on the clean training set.
    n_features = X_train.shape[1]
    model_clean = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
    path_model = os.path.join(PATH_ROOT, "data", "synth", "torch", f"{dataname}_SimpleNN.torch")
    model_clean.load_state_dict(torch.load(path_model, map_location=device))

    dataloader_train = _make_loader(X_train, y_train)
    dataloader_test = _make_loader(X_test, y_test)
    # `evaluate` is a project helper; its first return value is used as accuracy
    # in [0, 1] -- TODO confirm against label_flip_revised.
    acc_train, _ = evaluate(dataloader_train, model_clean, loss_fn, device)
    acc_test, _ = evaluate(dataloader_test, model_clean, loss_fn, device)
    print(f"[{dataname}] Acc train: {acc_train*100:.2f} test: {acc_test*100:.2f}")
    # TODO: Save results into the dataframe

    for idx in df_subset[df_subset["Rate"] != 0].index:
        data_poison = df_subset.at[idx, "Filename"]
        path_data = os.path.join(PATH_ROOT, "data", "synth", "alfa_nn", data_poison)
        X_poison, y_poison, _ = open_csv(path_data)
        # Sanity checks: poisoning must leave the features untouched and must
        # change at least one label.
        np.testing.assert_array_almost_equal(X_poison, X_train)
        assert not np.array_equal(y_poison, y_train)
        dataloader_poison = _make_loader(X_train, y_poison)

        # Train a fresh model from scratch on the poisoned labels.
        model_poison = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
        optimizer = torch.optim.SGD(model_poison.parameters(), lr=LR, momentum=0.8)
        train_model(model_poison,
                    dataloader_poison,
                    optimizer=optimizer,
                    loss_fn=loss_fn,
                    device=device,
                    max_epochs=MAX_EPOCHS)
        # TODO: Load pre-trained model, if it already exists.
        # Save model
        create_dir(path_poison)
        path_model = os.path.join(path_poison, os.path.splitext(data_poison)[0] + ".torch")
        torch.save(model_poison.state_dict(), path_model)

        # Train accuracy is measured on the poisoned labels, test accuracy on the clean test set.
        acc_poison, _ = evaluate(dataloader_poison, model_poison, loss_fn, device)
        acc_test, _ = evaluate(dataloader_test, model_poison, loss_fn, device)
        print(f"[{data_poison}] Acc train: {acc_poison*100:.2f} test: {acc_test*100:.2f}")
        # TODO: Save results into the dataframe


  0%|          | 0/5 [00:00<?, ?it/s]

[f04_i02_r00_c01_w6_1] Acc train: 98.50 test: 99.40
[f04_i02_r00_c01_w6_1_nn_ALFA_0.05.csv] Acc train: 93.60 test: 99.30
[f04_i02_r00_c01_w6_1_nn_ALFA_0.10.csv] Acc train: 86.40 test: 94.00
[f04_i02_r00_c01_w6_1_nn_ALFA_0.15.csv] Acc train: 87.70 test: 89.90
[f04_i02_r00_c01_w6_1_nn_ALFA_0.20.csv] Acc train: 87.20 test: 77.20
[f04_i02_r00_c01_w6_1_nn_ALFA_0.25.csv] Acc train: 89.40 test: 72.20
[f04_i02_r00_c01_w6_1_nn_ALFA_0.30.csv] Acc train: 90.10 test: 60.00
[f04_i02_r00_c01_w6_1_nn_ALFA_0.35.csv] Acc train: 91.30 test: 59.80


 20%|██        | 1/5 [00:15<01:01, 15.44s/it]

[f04_i02_r00_c01_w6_1_nn_ALFA_0.40.csv] Acc train: 92.50 test: 60.00
[f04_i02_r01_c01_w5_1] Acc train: 98.70 test: 99.10
[f04_i02_r01_c01_w5_1_nn_ALFA_0.05.csv] Acc train: 93.00 test: 98.10
[f04_i02_r01_c01_w5_1_nn_ALFA_0.10.csv] Acc train: 87.50 test: 96.70
[f04_i02_r01_c01_w5_1_nn_ALFA_0.15.csv] Acc train: 85.90 test: 90.90
[f04_i02_r01_c01_w5_1_nn_ALFA_0.20.csv] Acc train: 84.70 test: 85.40
[f04_i02_r01_c01_w5_1_nn_ALFA_0.25.csv] Acc train: 81.20 test: 60.50
[f04_i02_r01_c01_w5_1_nn_ALFA_0.30.csv] Acc train: 78.40 test: 50.00
[f04_i02_r01_c01_w5_1_nn_ALFA_0.35.csv] Acc train: 91.70 test: 57.00


 40%|████      | 2/5 [00:28<00:42, 14.30s/it]

[f04_i02_r01_c01_w5_1_nn_ALFA_0.40.csv] Acc train: 86.80 test: 50.00
[f04_i03_r00_c02_w5_1] Acc train: 89.90 test: 92.40
[f04_i03_r00_c02_w5_1_nn_ALFA_0.05.csv] Acc train: 82.30 test: 90.80
[f04_i03_r00_c02_w5_1_nn_ALFA_0.10.csv] Acc train: 80.50 test: 88.60
[f04_i03_r00_c02_w5_1_nn_ALFA_0.15.csv] Acc train: 78.50 test: 73.60
[f04_i03_r00_c02_w5_1_nn_ALFA_0.20.csv] Acc train: 79.30 test: 72.10
[f04_i03_r00_c02_w5_1_nn_ALFA_0.25.csv] Acc train: 75.00 test: 50.00
[f04_i03_r00_c02_w5_1_nn_ALFA_0.30.csv] Acc train: 89.00 test: 61.20
[f04_i03_r00_c02_w5_1_nn_ALFA_0.35.csv] Acc train: 85.00 test: 50.00


 60%|██████    | 3/5 [00:42<00:27, 13.82s/it]

[f04_i03_r00_c02_w5_1_nn_ALFA_0.40.csv] Acc train: 91.90 test: 51.30
[f04_i03_r01_c01_w6_1] Acc train: 97.80 test: 97.10
[f04_i03_r01_c01_w6_1_nn_ALFA_0.05.csv] Acc train: 91.50 test: 96.00
[f04_i03_r01_c01_w6_1_nn_ALFA_0.10.csv] Acc train: 91.00 test: 92.60
[f04_i03_r01_c01_w6_1_nn_ALFA_0.15.csv] Acc train: 90.90 test: 81.40
[f04_i03_r01_c01_w6_1_nn_ALFA_0.20.csv] Acc train: 92.30 test: 79.10
[f04_i03_r01_c01_w6_1_nn_ALFA_0.25.csv] Acc train: 84.70 test: 59.60
[f04_i03_r01_c01_w6_1_nn_ALFA_0.30.csv] Acc train: 83.70 test: 63.70
[f04_i03_r01_c01_w6_1_nn_ALFA_0.35.csv] Acc train: 94.70 test: 59.60


 80%|████████  | 4/5 [00:55<00:13, 13.65s/it]

[f04_i03_r01_c01_w6_1_nn_ALFA_0.40.csv] Acc train: 94.10 test: 54.90
[f05_i03_r02_c02_w5_1] Acc train: 91.40 test: 91.10
[f05_i03_r02_c02_w5_1_nn_ALFA_0.05.csv] Acc train: 85.50 test: 89.90
[f05_i03_r02_c02_w5_1_nn_ALFA_0.10.csv] Acc train: 80.30 test: 89.30
[f05_i03_r02_c02_w5_1_nn_ALFA_0.15.csv] Acc train: 85.50 test: 72.70
[f05_i03_r02_c02_w5_1_nn_ALFA_0.20.csv] Acc train: 77.40 test: 61.70
[f05_i03_r02_c02_w5_1_nn_ALFA_0.25.csv] Acc train: 84.20 test: 65.10
[f05_i03_r02_c02_w5_1_nn_ALFA_0.30.csv] Acc train: 79.90 test: 49.90
[f05_i03_r02_c02_w5_1_nn_ALFA_0.35.csv] Acc train: 88.90 test: 57.40


100%|██████████| 5/5 [01:10<00:00, 14.15s/it]

[f05_i03_r02_c02_w5_1_nn_ALFA_0.40.csv] Acc train: 89.90 test: 49.90





In [15]:
# Split data into 3 difficulty levels: Hard, Normal, Easy

In [16]:
# Find optimal hyperparameters for regression model

In [17]:
# Plot ROC curve (All difficulty)

In [18]:
# Plot ROC curve (Group by difficulty)

In [19]:
# Confusion Matrix