In [23]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn as skl
# `import sklearn` alone does not guarantee that its submodules are loaded;
# import the ones accessed as skl.metrics / skl.model_selection explicitly
import sklearn.metrics
import sklearn.model_selection
import xgboost as xgb

from mylib import class_distributions
from mylib import helper_funcs
from mylib.pipelines import updating_pipeline

In [6]:
# folder that contains the dry bean dataset (relative to this notebook)
data_folder = Path("../../data/DryBeanDataset/")

In [9]:
# Display the proportion of each value of the `Class` column.
# NOTE(review): `data` is only assigned in a later cell (pd.read_csv), so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
class_distributions.label_proportions(data['Class'])

6    0.260525
5    0.193667
0    0.148924
4    0.141650
3    0.119756
1    0.097127
2    0.038351
Name: Class, dtype: float64

In [10]:
# read the complete bean dataset from disk
data = pd.read_csv(data_folder / 'DryBeanDataset.csv')

# XGBoost needs the targets and the features separately:
# the label vector is the `Class` column ...
labels = data["Class"]
# ... and the feature matrix X is every remaining column
X = data.drop(columns=["Class"])

In [25]:
# --- experiment configuration ---

# label of the class that is excluded from the old classes and treated as new
new_class_idx = 6
# name of the update strategy (not read by any code visible in this notebook)
training_method = 'continued_training'

# number of boosting rounds handed to xgb.train
num_round = 10
# value of XGBoost's `max_depth` parameter
max_depth = 3
# number of models trained per setting; results are averaged over them
num_models = 20

In [11]:
# folder in which the trained models are stored; create it if it is missing
model_folder = Path("models/")
model_folder.mkdir(parents=True, exist_ok=True)

In [21]:
# Split the data into the "old" classes (all classes except new_class_idx)
# and the single new class.
# The classes are always derived from the untouched `Class` column, so that
# re-running this cell does not start from labels that a previous run of the
# cell already relabelled.
old_classes = np.setdiff1d(data["Class"].unique(), new_class_idx)
new_class = new_class_idx

# number of old classes
num_labels = len(old_classes)

# relabel for XGBoost
# NOTE(review): the selections below rely on relabel mapping the old classes
# to 0..num_labels-1 and the new class to num_labels, and on it returning a
# new object rather than modifying its input -- confirm in helper_funcs.
labels, relabel_dict = helper_funcs.relabel(data["Class"], old_classes, new_class)

# boolean masks selecting the rows of the old classes and of the new class
is_old_class = labels < num_labels
is_new_class = labels == num_labels

# the "original" training data
data_small = X[is_old_class]
labels_small = labels[is_old_class]

# to check the updated model on all of the old data
dsmall = xgb.DMatrix(data_small, label=labels_small)

# the new class data
new_class_data = X[is_new_class]
new_class_labels = labels[is_new_class]

# only to check performance on the newly added data
dnew_class = xgb.DMatrix(new_class_data, label=new_class_labels)

# the entire training data (old classes first, then the new class)
data_full = pd.concat([data_small, new_class_data])
labels_full = pd.concat([labels_small, new_class_labels])

# to check the updated model on the full data
dfull = xgb.DMatrix(data_full, label=labels_full)


# some parameters
# 0.1, 0.2, ..., 0.9; i / 10 avoids artefacts such as 0.30000000000000004
proportion_of_old_data = [i / 10 for i in range(1, 10)]
# numbers of boosting rounds tried for the update; currently a single value,
# extend this list to vary it
num_round_update = [num_round]
eta = .1

# settings shared by the original (small) model and the updated model
common_params = {'max_depth': max_depth,
                 'eta': eta,
                 'objective': 'multi:softprob',
                 'nthread': 4,
                 'eval_metric': 'mlogloss'}

# parameters for small model: it only knows the old classes
param_small = {**common_params, "num_class": num_labels}

# parameters for update model: one additional class. Kept as a separate
# dictionary so that it can be changed independently of param_small.
param_update = {**common_params, "num_class": num_labels + 1}

In [26]:
# Continued-training experiment: train a model on the old classes, continue
# training it on the new-class data only, and record the accuracy of the
# updated model on the old data, the new data, the update data and all data.


def _accuracy(booster, dmatrix, y_true):
    """Return the accuracy of ``booster`` on ``dmatrix`` measured against ``y_true``.

    The predicted class of a row is the arg-max over the columns of
    ``booster.predict(dmatrix)``.
    """
    y_pred = np.argmax(booster.predict(dmatrix), axis=1)
    return skl.metrics.accuracy_score(y_true, y_pred)


# these dictionaries are filled with the results and later pickled
# (key: number of update rounds, value: one entry per proportion)
old_data_mean_results = dict()
old_data_std_results = dict()
new_data_mean_results = dict()
new_data_std_results = dict()
update_data_mean_results = dict()
update_data_std_results = dict()
full_data_mean_results = dict()
full_data_std_results = dict()

# seeded generator for the train/test split seeds, so that re-running this
# cell reproduces the same results
rng = np.random.default_rng(0)

# The update uses the new-class data only; no old data is mixed in. The update
# DMatrix is therefore the same for every model and is built once up front.
data_update = new_class_data
labels_update = new_class_labels
dtrain_update = xgb.DMatrix(data_update, label=labels_update)

# the original model is written to this file and the update continues from it
small_model_path = model_folder / 'small_model.json'

# the update routine
for num_round_updt in num_round_update:

    # xgb.train needs an integer number of boosting rounds.
    # NOTE(review): the TypeError recorded in this cell's output is consistent
    # with a stale non-integer entry in num_round_update -- confirm by
    # re-running the parameter cells in order.
    if not isinstance(num_round_updt, (int, np.integer)):
        raise TypeError(
            "every entry of num_round_update must be an integer, "
            f"got {num_round_updt!r}"
        )

    # initialize arrays where results are stored
    num_proportions = len(proportion_of_old_data)
    old_data_mean = np.zeros(num_proportions)
    old_data_std = np.zeros(num_proportions)
    new_data_mean = np.zeros(num_proportions)
    new_data_std = np.zeros(num_proportions)
    update_data_mean = np.zeros(num_proportions)
    update_data_std = np.zeros(num_proportions)
    full_data_mean = np.zeros(num_proportions)
    full_data_std = np.zeros(num_proportions)

    for proportion_num, proportion in enumerate(proportion_of_old_data):
        # NOTE: `proportion` is only reported here; the update data does not
        # depend on it, so every proportion repeats the same experiment with
        # fresh train/test splits.
        print(f"Current target proportion of old data in use: {proportion}")

        # initialize arrays where temporary results are stored
        old_data_tmp = np.zeros(num_models)
        new_data_tmp = np.zeros(num_models)
        update_data_tmp = np.zeros(num_models)
        full_data_tmp = np.zeros(num_models)

        for model_num in range(num_models):

            # training the original model

            seed = int(rng.integers(0, 2**31 - 1))
            # split original data into train- and test-data
            X_train_small, X_test_small, y_train_small, y_test_small = skl.model_selection.train_test_split(
                data_small,
                labels_small,
                test_size=.2,
                random_state=seed)

            # specify DMatrices
            dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
            dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

            evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

            bst_small = xgb.train(param_small,
                                  dtrain_small,
                                  num_round,
                                  evals=evallist_small,
                                  verbose_eval=False)

            bst_small.save_model(fname=small_model_path)

            # update model: continue training from the saved small model
            bst_update = xgb.train(param_update,
                                   dtrain_update,
                                   num_round_updt,
                                   verbose_eval=False,
                                   xgb_model=small_model_path)

            # accuracy of the updated model on the four evaluation sets
            old_data_tmp[model_num] = _accuracy(bst_update, dsmall, labels_small)
            new_data_tmp[model_num] = _accuracy(bst_update, dnew_class, new_class_labels)
            update_data_tmp[model_num] = _accuracy(bst_update, dtrain_update, labels_update)
            full_data_tmp[model_num] = _accuracy(bst_update, dfull, labels_full)

        # mean and standard deviation over the num_models repetitions
        old_data_mean[proportion_num] = old_data_tmp.mean()
        old_data_std[proportion_num] = old_data_tmp.std()
        new_data_mean[proportion_num] = new_data_tmp.mean()
        new_data_std[proportion_num] = new_data_tmp.std()
        update_data_mean[proportion_num] = update_data_tmp.mean()
        update_data_std[proportion_num] = update_data_tmp.std()
        full_data_mean[proportion_num] = full_data_tmp.mean()
        full_data_std[proportion_num] = full_data_tmp.std()

    old_data_mean_results[num_round_updt] = old_data_mean
    old_data_std_results[num_round_updt] = old_data_std
    new_data_mean_results[num_round_updt] = new_data_mean
    new_data_std_results[num_round_updt] = new_data_std
    update_data_mean_results[num_round_updt] = update_data_mean
    update_data_std_results[num_round_updt] = update_data_std
    full_data_mean_results[num_round_updt] = full_data_mean
    full_data_std_results[num_round_updt] = full_data_std

Current target proportion of old data in use: 0.1


TypeError: 'tuple' object cannot be interpreted as an integer