In [1]:
import sys
sys.path.append("/home/dfischer/masterarbeit/src/")

from pathlib import Path
import json

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl
import _pickle as pickle

import lib.class_distributions as class_distributions
import lib.data_selection as data_selection
import lib.helper_funcs as helper_funcs
import lib.json_manipulation as json_manipulation

import dtreeviz
import logging
# to suppress messages when plotting trees
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
# open file

data_folder = Path("../../data/DryBeanDataset/")
model_folder = Path("../../models/DryBeanDataset")
file_to_open = data_folder / "Dry_Bean_Dataset.xlsx"

In [3]:
# read and prepare data

data = pd.read_excel(file_to_open)

labels_dict = {key:value for (value,key) in enumerate(data["Class"].unique())}
data["Class"] = data["Class"].map(labels_dict)
#data.Class.astype("category").cat.codes

# need feature matrix X and labels labels for xgboost
labels = data["Class"]
X = data.drop(["Class"],axis=1,inplace=False)

In [4]:
# prepare smaller dataset with only first num_labels classes of beans

old_classes = [0,1,3,4,5,6]
new_class = 2

# compute number of old labels used
num_labels = len(old_classes)

# relabel for XGBoost
labels = helper_funcs.relabel(labels, old_classes, new_class)

data_small = X[labels < num_labels]
labels_small = labels[labels < num_labels]

# attempt to retrain with new data
data_update = X[labels == num_labels]
labels_update = labels[labels == num_labels]

# also train a model with all the data availale for comparison
data_full = pd.concat([data_small, data_update])
labels_full = pd.concat([labels_small, labels_update])

In [5]:
# split data into train- and test-data

X_train_small, X_test_small, y_train_small, y_test_small = skl.model_selection.train_test_split(data_small, 
                                                    labels_small,
                                                    test_size=.2)

In [6]:
# specify DMatrices

dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

# specify some parameters
num_models = 10
proportion_of_old_data = [i*0.1 for i in range(1,10)]

# specify paramters for XGBoost
num_round = 1000
early_stopping_rounds = num_round*.1
max_depth = 3
eta = .1

param_small = {'max_depth': max_depth, 'eta': eta, 'objective': 'multi:softprob', "num_class": num_labels}
param_small['nthread'] = 4
param_small['eval_metric'] = 'mlogloss'

evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

In [8]:
# training model with fewer labels
bst_small = xgb.train(param_small,
                      dtrain_small,
                      num_round,
                      evals=evallist_small,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=False)

bst_small.save_model('fewer_class_model.json')

In [9]:
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_small.predict(dtest_small), axis=1), y_test_small))

Accuracy on test data:  0.9194041252864782


In [10]:
# split data into train- and test-data

X_train_update, X_test_update, y_train_update, y_test_update = skl.model_selection.train_test_split(data_update,
                                                                                                    labels_update,
                                                                                                    test_size=.2)

X_train_full, X_test_full, y_train_full, y_test_full = skl.model_selection.train_test_split(data_full,
                                                                                            labels_full,
                                                                                            test_size=.2)

In [11]:
# specify DMatrices

dtrain_update = xgb.DMatrix(X_train_update, label=y_train_update)
dtest_update = xgb.DMatrix(X_test_update, label=y_test_update)

dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest_full = xgb.DMatrix(X_test_full, label=y_test_full)


# specify paramters for XGBoost
param_update = {'max_depth': max_depth,
                'eta': eta,
                'objective': 'multi:softprob',
                "num_class": num_labels+1}
param_update['nthread'] = 4
param_update['eval_metric'] = 'mlogloss'

evallist_update = [(dtrain_update, 'train'), (dtest_update, 'eval')]


param_full = {'max_depth': max_depth,
              'eta': eta,
              'objective': 'multi:softprob',
              "num_class": num_labels+1}
param_full['nthread'] = 4
param_full['eval_metric'] = 'mlogloss'

evallist_full = [(dtrain_full, 'train'), (dtest_full, 'eval')]

In [12]:
_, old_data_part, _, old_y_part = skl.model_selection.train_test_split(data_small,
                                                                        labels_small,
                                                                        test_size=0.3)

data_update2 = pd.concat([old_data_part, data_update])
labels_update2 = pd.concat([old_y_part, labels_update])

X_train_update2, X_test_update2, y_train_update2, y_test_update2 = skl.model_selection.train_test_split(data_update2,
                                                                                                        labels_update2,
                                                                                                        test_size=.2)

# create DMatrices

dtrain_update2 = xgb.DMatrix(X_train_update2, label=y_train_update2)
dtest_update2 = xgb.DMatrix(X_test_update2, label=y_test_update2)

# train model
bst_update2 = xgb.train(param_update,
                          dtrain_update2,
                          num_round,
                          evals=evallist_update,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False,
                          xgb_model="fewer_class_model.json")

bst_update2.save_model(fname='updated_model.json')

print("Accuracy on full data: ", skl.metrics.accuracy_score(np.argmax(bst_update2.predict(dtest_full), axis=1), y_test_full))

Accuracy on full data:  0.9489533602644142


In [13]:
bst_test = json_manipulation.create_modified_model("updated_model.json", num_labels)
print("Accuracy on test data: ", skl.metrics.accuracy_score(np.argmax(bst_test.predict(dtest_full), axis=1), y_test_full))

Accuracy on test data:  0.7462357693720162
