In [1]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Project root = parent of the directory the notebook is launched from.
# Path.cwd() is already absolute, so no extra normalisation is needed.
PATH_ROOT = Path.cwd().parent
print(PATH_ROOT)

/home/lukec/workspace/label_flip_revised_06


In [3]:
# Names of the datasets referenced by this notebook (10 entries).
# NOTE: `cardiotocography` has been removed from this list.
# NOTE(review): the `_subset` / `_std` suffixes presumably mark subsampled /
# standardized variants -- confirm against the data-preparation code.
# NOTE(review): a later cell rebinds `datanames` to the synthetic dataset
# names, so this list is no longer available once that cell has run.

datanames = [
    'abalone_subset_std',
    'australian_std',
    'banknote_std',
    'breastcancer_std',
    'cmc_std',
    'htru2_subset_std',
    'phoneme_subset_std',
    'ringnorm_subset_std',
    'texture_subset_std',
    'yeast_subset_std'
]

# Sanity check: number of dataset names.
print(len(datanames))

10


## Plot correlations between C-Measures and Test Accuracy on clean data

In [6]:
# Collect every clean-data C-measure CSV under results/synth_svm.
# glob() makes no guarantee about ordering (it depends on the file system),
# and this order determines the row order of everything derived from these
# files, so sort the paths to keep the notebook reproducible across machines.
path_cm_clean = sorted(glob(os.path.join(PATH_ROOT, 'results', 'synth_svm', '*clean*.csv')))
print(path_cm_clean)

['/home/lukec/workspace/label_flip_revised_06/results/synth_svm/synth_svm_clean_0.csv', '/home/lukec/workspace/label_flip_revised_06/results/synth_svm/synth_svm_clean_1.csv', '/home/lukec/workspace/label_flip_revised_06/results/synth_svm/synth_svm_clean_2.csv']


In [7]:
# Read each C-measure CSV once and stack them with a single concat.
# Concatenating inside the loop re-copies all previous rows on every
# iteration, and seeding with an empty DataFrame triggers pandas' warning
# about empty entries in concat on newer versions.
# Each file keeps its own row index, so index labels repeat across files;
# use positional access (`.iloc`) when addressing rows.
_frames = [pd.read_csv(p) for p in path_cm_clean]

# pd.concat raises on an empty list; fall back to an empty frame so the
# cell still runs when no file matched the glob pattern.
df_c_measure = pd.concat(_frames) if _frames else pd.DataFrame()

df_c_measure

Unnamed: 0,Data,overlapping.F1.mean,overlapping.F1.sd,overlapping.F1v.mean,overlapping.F1v.sd,overlapping.F2.mean,overlapping.F2.sd,overlapping.F3.mean,overlapping.F3.sd,overlapping.F4.mean,...,linearity.L3.sd,dimensionality.T2,dimensionality.T3,dimensionality.T4,balance.C1,balance.C2,network.Density,network.ClsCoef,network.Hubs.mean,network.Hubs.sd
0,f04_i02_r00_c01_w6_1.csv,0.810730,0.372452,0.054200,,0.334868,,0.5040,,0.4245,...,,0.0020,0.0020,1.000000,0.970657,0.077664,0.848518,0.406686,0.766283,0.302880
1,f04_i02_r01_c01_w5_1.csv,0.611179,0.449290,0.043539,,0.835243,,0.9165,,0.8680,...,,0.0020,0.0015,0.750000,1.000000,0.000000,0.824310,0.330959,0.814283,0.221256
2,f04_i03_r00_c02_w5_1.csv,0.788140,0.256240,0.115885,,0.744394,,0.8925,,0.8685,...,,0.0020,0.0020,1.000000,1.000000,0.000000,0.860769,0.390880,0.747007,0.305292
3,f04_i03_r01_c01_w6_1.csv,0.429929,0.201707,0.031733,,0.197845,,0.5455,,0.5300,...,,0.0020,0.0015,0.750000,0.972961,0.071823,0.822687,0.307840,0.750106,0.329192
4,f05_i03_r02_c02_w5_1.csv,0.799987,0.201796,0.127439,,0.663490,,0.9220,,0.7945,...,,0.0025,0.0015,0.600000,0.999997,0.000008,0.864595,0.262166,0.819075,0.251342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,f30_i27_r01_c01_w6_1.csv,0.970729,0.049059,0.141101,,0.012527,,0.9685,,0.8175,...,,0.0150,0.0110,0.733333,0.971533,0.075450,0.871958,0.569480,0.726014,0.244403
96,f30_i27_r01_c13_w5_1.csv,0.993482,0.010085,0.550902,,0.027854,,0.9945,,0.9275,...,,0.0150,0.0125,0.833333,0.999994,0.000018,0.893963,0.613341,0.771873,0.243977
97,f30_i28_r02_c05_w5_1.csv,0.985669,0.018146,0.360492,,0.034833,,0.9910,,0.9195,...,,0.0150,0.0125,0.833333,0.999994,0.000018,0.886141,0.585342,0.797491,0.223961
98,f30_i29_r00_c22_w6_1.csv,0.997904,0.002624,0.805732,,0.043719,,0.9935,,0.9240,...,,0.0150,0.0140,0.933333,0.972678,0.072542,0.913843,0.726942,0.609342,0.326444


In [8]:
# Capture the column labels of the combined C-measure frame and print them
# in full (the table preview above elides the middle columns).
cols = df_c_measure.columns
print(cols)

Index(['Data', 'overlapping.F1.mean', 'overlapping.F1.sd',
       'overlapping.F1v.mean', 'overlapping.F1v.sd', 'overlapping.F2.mean',
       'overlapping.F2.sd', 'overlapping.F3.mean', 'overlapping.F3.sd',
       'overlapping.F4.mean', 'overlapping.F4.sd', 'neighborhood.N1',
       'neighborhood.N2.mean', 'neighborhood.N2.sd', 'neighborhood.N3.mean',
       'neighborhood.N3.sd', 'neighborhood.N4.mean', 'neighborhood.N4.sd',
       'neighborhood.T1.mean', 'neighborhood.T1.sd', 'neighborhood.LSC',
       'linearity.L1.mean', 'linearity.L1.sd', 'linearity.L2.mean',
       'linearity.L2.sd', 'linearity.L3.mean', 'linearity.L3.sd',
       'dimensionality.T2', 'dimensionality.T3', 'dimensionality.T4',
       'balance.C1', 'balance.C2', 'network.Density', 'network.ClsCoef',
       'network.Hubs.mean', 'network.Hubs.sd'],
      dtype='object')


## Get scores: train/test accuracy of the saved models on clean data

In [23]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from label_flip_revised.utils import open_csv
from label_flip_revised.simple_nn_model import SimpleModel
from label_flip_revised.torch_utils import evaluate, train_model

In [20]:
def get_dataloader(X, y, batch_size=128, shuffle=True):
    """Wrap NumPy features and labels in a PyTorch ``DataLoader``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array; converted to a ``float32`` tensor.
    y : numpy.ndarray
        Label array; converted to an ``int64`` tensor.
    batch_size : int, default 128
        Number of samples per batch.
    shuffle : bool, default True
        Whether the loader reshuffles the samples on each pass.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader yielding ``(features, labels)`` batches.
    """
    features = torch.from_numpy(X).float()
    labels = torch.from_numpy(y).long()
    return DataLoader(
        TensorDataset(features, labels),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [22]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU
# and say so.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print('Running on CPU!')

In [21]:
# Passed as `hidden_dim` when constructing SimpleModel in the evaluation cell.
# NOTE(review): presumably must match the width the saved weights were
# trained with, since they are loaded via load_state_dict -- confirm.
HIDDEN_LAYER = 128

In [28]:
# Evaluate the saved SimpleNN of every synthetic dataset on its clean
# train/test splits and store the accuracies in results/synth_clean_score.csv.

# File-name suffixes appended to each synthetic dataset name.
postfix_train = '_clean_train.csv'
postfix_clean = '_clean_test.csv'
postfix_torch = '_SimpleNN.torch'

# Root of the synthetic data tree (train/, test/ and torch/ sub-folders).
path_synth = os.path.join(PATH_ROOT, 'data', 'synth')

acc_train = []
acc_test = []
# NOTE: this rebinds `datanames`; the dataset list defined near the top of
# the notebook is replaced by the synthetic dataset names collected here.
datanames = []

# Constructed without arguments, so it does not depend on the dataset:
# build it once instead of once per iteration.
loss_fn = nn.CrossEntropyLoss()

# Iterate the 'Data' column directly; `iloc[i]['Data']` would build a whole
# row Series just to read one field.
for i, data_file in enumerate(df_c_measure['Data']):
    # 'f04_i02_r00_c01_w6_1.csv' -> 'f04_i02_r00_c01_w6_1'
    dataname = data_file.split('.')[0]
    datanames.append(dataname)

    # Third return value of open_csv is not needed here.
    X_train, y_train, _ = open_csv(os.path.join(path_synth, 'train', f'{dataname}{postfix_train}'))
    X_test, y_test, _ = open_csv(os.path.join(path_synth, 'test', f'{dataname}{postfix_clean}'))
    n_features = X_train.shape[1]

    # Evaluation only, so a fixed sample order is fine.
    dataloader_train = get_dataloader(X_train, y_train, shuffle=False)
    dataloader_test = get_dataloader(X_test, y_test, shuffle=False)

    # Rebuild the architecture and restore the saved weights for this dataset.
    model = SimpleModel(n_features, hidden_dim=HIDDEN_LAYER, output_dim=2).to(device)
    path_model = os.path.join(path_synth, 'torch', f'{dataname}{postfix_torch}')
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # produced by this project.
    model.load_state_dict(torch.load(path_model, map_location=device))

    # First element of evaluate()'s result is used as the accuracy; the
    # second is discarded (presumably the loss -- confirm in torch_utils).
    _acc_train, _ = evaluate(dataloader_train, model, loss_fn, device)
    _acc_test, _ = evaluate(dataloader_test, model, loss_fn, device)

    acc_train.append(_acc_train)
    acc_test.append(_acc_test)

    # Report every 10th dataset to keep the cell output short.
    if i % 10 == 0:
        print(f'[{dataname}] Acc train: {_acc_train*100:.2f} test: {_acc_test*100:.2f}')

results = {
    'data': datanames,
    'train': acc_train,
    'test': acc_test,
}
df = pd.DataFrame(results)
path_output = os.path.join(PATH_ROOT, 'results')
# Make sure the target directory exists so the write cannot fail on a
# fresh checkout.
os.makedirs(path_output, exist_ok=True)
df.to_csv(os.path.join(path_output, 'synth_clean_score.csv'), index=False)

[f04_i02_r00_c01_w6_1] Acc train: 98.50 test: 99.40
[f06_i06_r00_c02_w5_1] Acc train: 84.30 test: 80.90
[f08_i06_r02_c02_w5_1] Acc train: 71.40 test: 73.60
[f09_i09_r00_c03_w5_1] Acc train: 76.40 test: 75.60
[f10_i08_r02_c07_w6_1] Acc train: 60.00 test: 60.00
[f12_i06_r04_c01_w4_1] Acc train: 97.70 test: 96.80
[f12_i12_r00_c01_w5_1] Acc train: 95.40 test: 96.20
[f14_i08_r04_c07_w5_1] Acc train: 72.40 test: 72.10
[f14_i14_r00_c10_w6_1] Acc train: 59.70 test: 59.80
[f15_i15_r00_c08_w6_1] Acc train: 60.00 test: 60.10
[f17_i09_r03_c05_w6_1] Acc train: 68.90 test: 70.70
[f18_i11_r00_c06_w4_1] Acc train: 59.70 test: 59.70
[f18_i18_r00_c02_w4_1] Acc train: 78.00 test: 79.70
[f19_i18_r00_c16_w5_1] Acc train: 55.70 test: 52.70
[f20_i14_r06_c08_w4_1] Acc train: 60.00 test: 60.00
[f21_i15_r00_c13_w5_1] Acc train: 64.50 test: 60.10
[f22_i14_r00_c06_w4_1] Acc train: 59.70 test: 59.60
[f23_i13_r04_c04_w5_1] Acc train: 70.20 test: 67.80
[f23_i21_r02_c20_w6_1] Acc train: 60.10 test: 60.00
[f24_i19_r01