In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl
import _pickle as pickle

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs

import dtreeviz
import logging
# to suppress messages when plotting trees
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [3]:
# open file

data_folder = Path("../../../data/DryBeanDataset/")
model_folder = Path("../../../models/DryBeanDataset")
file_to_open = data_folder / "Dry_Bean_Dataset.xlsx"

In [4]:
# read and prepare data

data = pd.read_excel(file_to_open)

labels_dict = {key:value for (value,key) in enumerate(data["Class"].unique())}
data["Class"] = data["Class"].map(labels_dict)
#data.Class.astype("category").cat.codes

# need feature matrix X and labels labels for xgboost
labels = data["Class"]
X = data.drop(["Class"],axis=1,inplace=False)

In [5]:
class_distributions.label_proportions(labels)

6    0.260525
5    0.193667
0    0.148924
4    0.141650
3    0.119756
1    0.097127
2    0.038351
Name: Class, dtype: float64

In [6]:
# prepare smaller dataset with only first num_labels classes of beans

old_classes = [0,1,3,4,5,6]
new_class = 2

# compute number of old labels used
num_labels = len(old_classes)

# relabel for XGBoost
labels = helper_funcs.relabel(labels, old_classes, new_class)

data_small = X[labels < num_labels]
labels_small = labels[labels < num_labels]

# attempt to retrain with new data
new_class_data = X[labels == num_labels]
new_class_labels = labels[labels == num_labels]

# also train a model with all the data availale for comparison
data_full = pd.concat([data_small, new_class_data])
labels_full = pd.concat([labels_small, new_class_labels])

In [7]:
# split data into train- and test-data

X_train_small, X_test_small, y_train_small, y_test_small = skl.model_selection.train_test_split(data_small, 
                                                    labels_small,
                                                    test_size=.2)

In [8]:
# specify DMatrices

dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

# specify some parameters
num_models = 10
proportion_of_old_data = [i*0.1 for i in range(1,10)]

# specify paramters for XGBoost
num_round = 100
num_round_full = 2*num_round   # a more apt comparison I think
early_stopping_rounds = num_round*.1
max_depth = 3
eta = .1

param_small = {'max_depth': max_depth, 'eta': eta, 'objective': 'multi:softprob', "num_class": num_labels}
param_small['nthread'] = 4
param_small['eval_metric'] = 'mlogloss'

evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

In [9]:
# training model with fewer labels
bst_small = xgb.train(param_small,
                      dtrain_small,
                      num_round,
                      evals=evallist_small,
                      #early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=25)

bst_small.save_model(fname=model_folder / 'small_model.json')

[0]	train-mlogloss:1.56619	eval-mlogloss:1.57072
[25]	train-mlogloss:0.32024	eval-mlogloss:0.35110
[50]	train-mlogloss:0.19905	eval-mlogloss:0.23705
[75]	train-mlogloss:0.16812	eval-mlogloss:0.21243
[99]	train-mlogloss:0.15333	eval-mlogloss:0.20453


In [10]:
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_small.predict(dtest_small), axis=1), y_test_small))

Accuracy on test data:  0.9243697478991597


In [12]:
bst_small.get_fscore()

{'Area': 173.0,
 'Perimeter': 256.0,
 'MajorAxisLength': 162.0,
 'MinorAxisLength': 191.0,
 'AspectRation': 170.0,
 'ConvexArea': 179.0,
 'Extent': 293.0,
 'Solidity': 322.0,
 'roundness': 410.0,
 'Compactness': 384.0,
 'ShapeFactor1': 385.0,
 'ShapeFactor2': 99.0,
 'ShapeFactor4': 569.0}

In [14]:
bst_small_df = bst_small.trees_to_dataframe()
data_selection.important_features_by_class(bst_small_df, 6)

[('Compactness', 0.8503294711809524),
 ('ShapeFactor1', 0.0061881079500000005),
 ('ConvexArea', 56172.95238095238),
 ('Compactness', 0.7600958175390071),
 ('MajorAxisLength', 285.557258140625),
 ('ConvexArea', 38900.9125)]

In [41]:
# split data into train- and test-data

X_train_new_class, X_test_new_class, y_train_new_class, y_test_new_class = skl.model_selection.train_test_split(new_class_data,
                                                                                                                new_class_labels,
                                                                                                                test_size=.3,
                                                                                                                random_state=2)

X_train_full, X_test_full, y_train_full, y_test_full = skl.model_selection.train_test_split(data_full,
                                                                                            labels_full,
                                                                                            test_size=.2)

In [35]:
# specify DMatrices

# only to check performance on the newly added data
dtrain_new_class = xgb.DMatrix(new_class_data, label=new_class_labels)

dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest_full = xgb.DMatrix(X_test_full, label=y_test_full)

In [36]:
# parameters for data_full model

param_full = {'max_depth': max_depth,
              'eta': eta,
              'objective': 'multi:softprob',
              "num_class": num_labels+1}
param_full['nthread'] = 4
param_full['eval_metric'] = 'mlogloss'

evallist_full = [(dtrain_full, 'train'), (dtest_full, 'eval')]

In [13]:
# training a model with all the training data

bst_full = xgb.train(param_full,
                     dtrain_full,
                     num_round_full,
                     evals=evallist_full,
                     #early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=25)

bst_full.save_model(fname=model_folder / 'small_model_full.json')

[0]	train-mlogloss:1.68470	eval-mlogloss:1.68676
[25]	train-mlogloss:0.32986	eval-mlogloss:0.35262
[50]	train-mlogloss:0.19646	eval-mlogloss:0.22863
[75]	train-mlogloss:0.16380	eval-mlogloss:0.20446
[100]	train-mlogloss:0.14744	eval-mlogloss:0.19596
[125]	train-mlogloss:0.13644	eval-mlogloss:0.19356
[150]	train-mlogloss:0.12723	eval-mlogloss:0.19273
[175]	train-mlogloss:0.11931	eval-mlogloss:0.19247
[199]	train-mlogloss:0.11242	eval-mlogloss:0.19160


In [14]:
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_full.predict(dtest_full), axis=1), y_test_full))

Accuracy on test data:  0.9283878075651855


In [16]:
# parameters for update model (the same as for full model, but just in case I want to ever change them)

param_update = {'max_depth': max_depth,
                'eta': eta,
                'objective': 'multi:softprob',
                "num_class": num_labels+1}
param_update['nthread'] = 4
param_update['eval_metric'] = 'mlogloss'

In [17]:
random_old = []
random_new = []
random_mixed = []
random_full = []


for proportion in proportion_of_old_data:
    print(f"Current target proportion of old data in use: {proportion}")
    
    random_old_tmp = 0
    random_new_tmp = 0
    random_mixed_tmp = 0
    random_full_tmp = 0

    for _ in range(num_models):

        _, old_data_part, _, old_y_part = skl.model_selection.train_test_split(data_small,
                                                                               labels_small,
                                                                               test_size=proportion)
        

        data_update = pd.concat([old_data_part, new_class_data])
        labels_update = pd.concat([old_y_part, new_class_labels])

        X_train_update, X_test_update, y_train_update, y_test_update = skl.model_selection.train_test_split(data_update,
                                                                                                            labels_update,
                                                                                                            test_size=.2)

        # create DMatrices

        dtrain_update = xgb.DMatrix(X_train_update, label=y_train_update)
        dtest_update = xgb.DMatrix(X_test_update, label=y_test_update)
        
        evallist_update = [(dtrain_update, 'train'), (dtest_update, 'eval')]

        # train model
        bst_update = xgb.train(param_update,
                                  dtrain_update,
                                  num_round,
                                  evals=evallist_update,
                                  #early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=False,
                                  xgb_model=model_folder/"small_model.json")

        random_old_tmp += skl.metrics.accuracy_score(np.argmax(bst_update.predict(dtest_small), axis=1), y_test_small)
        random_new_tmp += skl.metrics.accuracy_score(np.argmax(bst_update.predict(dtest_update), axis=1), y_test_update)
        random_mixed_tmp += skl.metrics.accuracy_score(np.argmax(bst_update.predict(dtest_update), axis=1), y_test_update)
        random_full_tmp += skl.metrics.accuracy_score(np.argmax(bst_update.predict(dtest_full), axis=1), y_test_full)
        
    random_old.append(random_old_tmp/num_models)
    random_new.append(random_new_tmp/num_models)    
    random_mixed.append(random_mixed_tmp/num_models)    
    random_full.append(random_full_tmp/num_models)    

Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2


KeyboardInterrupt: 

In [18]:
critical_old_alphas = {}
critical_new_alphas = {}
critical_mixed_alphas = {}
critical_full_alphas = {}

for alpha in [.2*i for i in range(6)]:

    critical_old = []
    critical_new = []
    critical_mixed = []
    critical_full = []

    for proportion in proportion_of_old_data:
        print(f"Current target proportion of old data in use: {proportion}")

        # get critical data
        critical_data, critical_data_labels = data_selection.get_samples_nearest_neighbors(data_small,
                                                                                              labels_small,
                                                                                              new_class_data,
                                                                                              ratio_return_total = proportion,
                                                                                              normalization="min_max",
                                                                                              alpha=alpha,
                                                                                              remove_duplicates=False)



        # concatenate with data for new class
        critical_data = pd.concat([critical_data, data_update])
        critical_data_labels = pd.concat([critical_data_labels, labels_update])

        # train a model with the new class and the critical data
        critical_old_tmp = 0
        critical_new_tmp = 0
        critical_mixed_tmp = 0
        critical_full_tmp = 0

        for i in range(num_models):
            X_train_critical, X_test_critical, y_train_critical, y_test_critical = skl.model_selection.train_test_split(critical_data,
                                                                                                                        critical_data_labels,
                                                                                                                        test_size=.2)

            dtrain_critical = xgb.DMatrix(X_train_critical, label=y_train_critical)
            dtest_critical = xgb.DMatrix(X_test_critical, label=y_test_critical)

            # updating the model with the new class
            bst_critical = xgb.train(param_update,
                                      dtrain_critical,
                                      num_round,
                                      evals=evallist_update,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose_eval=False,
                                      xgb_model=model_folder/"small_model.json")

            critical_old_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_small), axis=1), y_test_small)
            critical_new_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_update), axis=1), y_test_update)
            critical_mixed_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_critical), axis=1), y_test_critical)
            critical_full_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_full), axis=1), y_test_full)

        critical_old.append(critical_old_tmp/num_models)
        critical_new.append(critical_new_tmp/num_models)
        critical_mixed.append(critical_mixed_tmp/num_models)
        critical_full.append(critical_full_tmp/num_models)
    
    critical_old_alphas[f"{alpha}"] = critical_old
    critical_new_alphas[f"{alpha}"] = critical_new
    critical_mixed_alphas[f"{alpha}"] = critical_mixed
    critical_full_alphas[f"{alpha}"] = critical_full

Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001
Current target proportion of old data in use: 0.7000000000000001
Current target proportion of old data in use: 0.8
Current target proportion of old data in use: 0.9
Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001


KeyboardInterrupt: 

In [None]:
# save the performances
fig = plt.figure()
ax = plt.gca()
ax.set_xlim([0, 1])
ax.set_ylim([0.8, 1])
plt.title(f"NearestNeighbors, minmax, including duplicates, train smallest class")
plt.plot(proportion_of_old_data, random_full, label="model updated with random data")
for key in critical_full_alphas.keys():
    plt.plot(proportion_of_old_data, critical_full_alphas[key], label=f"model updated with critical data (alpha={key:.3})")
    
plt.axhline(skl.metrics.accuracy_score(np.argmax(bst_full.predict(dtest_full), axis=1), y_test_full),
            color = "black",
            linestyle = "--",
            label = "batch training on full data")
plt.xlabel("Percentage of old data used in updating")
plt.ylabel("Accuracy")
plt.legend(loc=4)
plt.savefig(f"NN, minmax, including duplicates, train smallest class.png")
plt.show();