In [1]:
import sys
sys.path.append("/home/dfischer/masterarbeit/src/")

from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl
import _pickle as pickle

from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs

import dtreeviz
import logging
# to suppress messages when plotting trees
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

2023-04-27 22:20:58.110897: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-27 22:20:58.216571: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/teradata/client/14.10/lib
2023-04-27 22:20:58.216591: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-27 22:20:58.763755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open sh

In [2]:
# open file

data_folder = Path("../../data/DryBeanDataset/")
model_folder = Path("../../models/DryBeanDataset")
file_to_open = data_folder / "Dry_Bean_Dataset.xlsx"

In [3]:
# read and prepare data

data = pd.read_excel(file_to_open)

labels_dict = {key:value for (value,key) in enumerate(data["Class"].unique())}
data["Class"] = data["Class"].map(labels_dict)
#data.Class.astype("category").cat.codes

# need feature matrix X and labels labels for xgboost
labels = data["Class"]
X = data.drop(["Class"],axis=1,inplace=False)

num_cols = list(data.columns[:16])
cat_cols = list(data.columns[16])

In [4]:
# prepare smaller dataset with only first num_labels classes of beans

old_classes = [0,1,3,4,5,6]
new_class = 2

# compute number of old labels used
num_labels = len(old_classes)

# relabel for XGBoost
labels = helper_funcs.relabel(labels, old_classes, new_class)

data_small = X[labels < num_labels]
labels_small = labels[labels < num_labels]

# attempt to retrain with new data
data_update = X[labels == num_labels]
labels_update = labels[labels == num_labels]

# also train a model with all the data availale for comparison
data_full = pd.concat([data_small, data_update])
labels_full = pd.concat([labels_small, labels_update])

In [5]:
# Defining the training parameters
batch_size = 500
epochs = 100+1
learning_rate = 2e-4
beta_1 = 0.5
beta_2 = 0.9

ctgan_args = ModelParameters(batch_size=batch_size,
                             lr=learning_rate,
                             betas=(beta_1, beta_2))

train_args = TrainParameters(epochs=epochs)

In [6]:
synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
synth.fit(data=data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)

2023-04-27 22:21:02.232565: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-04-27 22:21:02.232596: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: gpu-server
2023-04-27 22:21:02.232602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: gpu-server
2023-04-27 22:21:02.232683: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 520.56.6
2023-04-27 22:21:02.232706: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 520.56.6
2023-04-27 22:21:02.232711: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 520.56.6
2023-04-27 22:21:29.534502: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with one

Epoch: 0 | critic_loss: 0.17191067337989807 | generator_loss: -0.9156926870346069
Epoch: 1 | critic_loss: -1.4283976554870605 | generator_loss: -0.761842668056488
Epoch: 2 | critic_loss: -1.5370874404907227 | generator_loss: -0.5966498851776123
Epoch: 3 | critic_loss: -1.5672193765640259 | generator_loss: 1.073550820350647
Epoch: 4 | critic_loss: -2.5383293628692627 | generator_loss: 2.1424639225006104
Epoch: 5 | critic_loss: -3.9207956790924072 | generator_loss: 3.551234483718872
Epoch: 6 | critic_loss: -4.756309509277344 | generator_loss: 4.070196628570557
Epoch: 7 | critic_loss: -5.963504791259766 | generator_loss: 4.606064796447754
Epoch: 8 | critic_loss: -7.768545150756836 | generator_loss: 4.980936050415039
Epoch: 9 | critic_loss: -8.151101112365723 | generator_loss: 5.341215133666992
Epoch: 10 | critic_loss: -8.854833602905273 | generator_loss: 5.521848201751709
Epoch: 11 | critic_loss: -11.654335021972656 | generator_loss: 6.0599164962768555
Epoch: 12 | critic_loss: -12.6772699

In [7]:
# create synthetical dataset of the same size as original
dataset_synth = synth.sample(len(data))

In [8]:
labels_synth = dataset_synth["Class"]
data_synth = dataset_synth.drop(["Class"],axis=1,inplace=False)

In [9]:
# split data into train- and test-data

X_train_small, X_test_small, y_train_small, y_test_small = skl.model_selection.train_test_split(data_small, 
                                                    labels_small,
                                                    test_size=.2)

X_train_synth, X_test_synth, y_train_synth, y_test_synth = skl.model_selection.train_test_split(data_synth, 
                                                    labels_synth,
                                                    test_size=.2)

In [10]:
# specify DMatrices

dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

dtrain_synth = xgb.DMatrix(X_train_synth, label=y_train_synth)
dtest_synth = xgb.DMatrix(X_test_synth, label=y_test_synth)

# specify some parameters
num_models = 1
proportion_of_old_data = [i*0.1 for i in range(1,10)]

# specify paramters for XGBoost
num_round = 100
early_stopping_rounds = num_round*.1
max_depth = 3
eta = .1

param_small = {'max_depth': max_depth, 'eta': eta, 'objective': 'multi:softprob', "num_class": num_labels}
param_small['nthread'] = 4
param_small['eval_metric'] = 'mlogloss'

evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

In [11]:
# training model with fewer labels
bst_small = xgb.train(param_small,
                      dtrain_small,
                      num_round,
                      evals=evallist_small,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=25)

bst_small.save_model(fname=model_folder / 'fewer_class_model.json')

[0]	train-mlogloss:1.56737	eval-mlogloss:1.56855
[25]	train-mlogloss:0.32334	eval-mlogloss:0.33876
[50]	train-mlogloss:0.20089	eval-mlogloss:0.22924
[75]	train-mlogloss:0.16895	eval-mlogloss:0.21131
[99]	train-mlogloss:0.15316	eval-mlogloss:0.20598


In [12]:
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_small.predict(dtest_small), axis=1), y_test_small))

Accuracy on test data:  0.9251336898395722


In [13]:
# split data into train- and test-data

X_train_update, X_test_update, y_train_update, y_test_update = skl.model_selection.train_test_split(data_update,
                                                                                                    labels_update,
                                                                                                    test_size=.2)

X_train_full, X_test_full, y_train_full, y_test_full = skl.model_selection.train_test_split(data_full,
                                                                                            labels_full,
                                                                                            test_size=.2)

In [14]:
# specify DMatrices

dtrain_update = xgb.DMatrix(X_train_update, label=y_train_update)
dtest_update = xgb.DMatrix(X_test_update, label=y_test_update)

dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest_full = xgb.DMatrix(X_test_full, label=y_test_full)


# specify paramters for XGBoost
param_update = {'max_depth': max_depth,
                'eta': eta,
                'objective': 'multi:softprob',
                "num_class": num_labels+1}
param_update['nthread'] = 4
param_update['eval_metric'] = 'mlogloss'

evallist_update = [(dtrain_update, 'train'), (dtest_update, 'eval')]


param_full = {'max_depth': max_depth,
              'eta': eta,
              'objective': 'multi:softprob',
              "num_class": num_labels+1}
param_full['nthread'] = 4
param_full['eval_metric'] = 'mlogloss'

evallist_full = [(dtrain_full, 'train'), (dtest_full, 'eval')]

In [15]:
# training a model with all the training data

bst_full = xgb.train(param_full,
                     dtrain_full,
                     num_round,
                     evals=evallist_full,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=25)

[0]	train-mlogloss:1.68416	eval-mlogloss:1.69019
[25]	train-mlogloss:0.32455	eval-mlogloss:0.36716
[50]	train-mlogloss:0.19310	eval-mlogloss:0.24378
[75]	train-mlogloss:0.16037	eval-mlogloss:0.21732
[99]	train-mlogloss:0.14447	eval-mlogloss:0.20930


In [16]:
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_full.predict(dtest_full), axis=1), y_test_full))

Accuracy on test data:  0.9206757253029747


In [1]:
bst_synth = xgb.train(param_full,
                     dtrain_synth,
                     num_round,
                     evals=evallist_full,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=25)

NameError: name 'xgb' is not defined

In [18]:
# accuracy of synth-model on actual data

print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_synth.predict(dtest_full), axis=1), y_test_full))

Accuracy on test data:  0.08409842085934631


In [19]:
# accuracy of actual model on synth data

print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_full.predict(dtest_synth), axis=1), y_test_synth))

Accuracy on test data:  0.1421226588321704


In [None]:
# training model with fewer labels
bst_synth = xgb.train(param_small,
                      dtrain_synth,
                      num_round,
                      evals=evallist_small,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=25)

In [None]:
critical_old = []
critical_new = []
critical_mixed = []
critical_full = []

for proportion in proportion_of_old_data:
    print(f"Current target proportion of old data in use: {proportion}")

    # get critical data
    critical_data, critical_data_labels = data_selection.get_samples_nearest_neighbors(data_small,
                                                                                          labels_small,
                                                                                          data_update,
                                                                                          ratio_return_total = proportion,
                                                                                          normalization="min_max",
                                                                                          alpha=alpha,
                                                                                          remove_duplicates=False)



    # concatenate with data for new class
    critical_data = pd.concat([critical_data, data_update])
    critical_data_labels = pd.concat([critical_data_labels, labels_update])

    # train a model with the new class and the critical data
    critical_old_tmp = 0
    critical_new_tmp = 0
    critical_mixed_tmp = 0
    critical_full_tmp = 0

    for i in range(num_models):
        X_train_critical, X_test_critical, y_train_critical, y_test_critical = skl.model_selection.train_test_split(critical_data,
                                                                                                                    critical_data_labels,
                                                                                                                    test_size=.2)

        dtrain_critical = xgb.DMatrix(X_train_critical, label=y_train_critical)
        dtest_critical = xgb.DMatrix(X_test_critical, label=y_test_critical)

        # updating the model with the new class
        bst_critical = xgb.train(param_update,
                                  dtrain_critical,
                                  num_round,
                                  evals=evallist_update,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=False,
                                  xgb_model=model_folder/"fewer_class_model.json")

        critical_old_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_small), axis=1), y_test_small)
        critical_new_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_update), axis=1), y_test_update)
        critical_mixed_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_critical), axis=1), y_test_critical)
        critical_full_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_full), axis=1), y_test_full)

    critical_old.append(critical_old_tmp/num_models)
    critical_new.append(critical_new_tmp/num_models)
    critical_mixed.append(critical_mixed_tmp/num_models)
    critical_full.append(critical_full_tmp/num_models)

13611