In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl
import _pickle as pickle

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs
from mylib.db import preprocessing
from mylib.db import constants

import dtreeviz
import logging
# to suppress messages when plotting trees
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
# Date window and limit handed to the preprocessor.
from_date = "2022-06-01"
to_date = "2022-06-30"
n = 100000

# Categorical-column removal and normalization are switched off at construction
# time; they are triggered explicitly later, once the label column (which is one
# of the categorical columns) has been read.
car_df_unprocessed = preprocessing.Preprocessor(
    from_date,
    to_date,
    limit=n,
    verbose=False,
    remove_cat_columns=False,
    normalization=False,
)

Preprocessing successful.


In [3]:
# First capture the column to be predicted. This has to happen before the
# categorical columns are removed below, because 'region_type' is one of them.
labels = car_df_unprocessed.car_df["region_type"]

# Now remove all categorical columns and normalize. The return values of both
# calls are discarded and the result is read back from `.car_df`, so they
# presumably modify the preprocessor in place -- re-running this cell without
# re-creating the preprocessor will therefore not start from the raw data.
car_df_unprocessed.remove_columns(constants.CATEGORICAL_COLUMNS)
car_df_unprocessed.normalize()
X = car_df_unprocessed.car_df

# Sanity check: features and labels must have the same number of rows.
print(X.shape)
print(labels.shape)

Removing column 'age_segment'
Removing column 'region_type'
Removing column 'emp_liable'
Removing column 'hou_fam_structure'
Removing column 'hou_aff_new_products'
Removing column 'hou_aff_prices'
Removing column 'subs_hand_ind'
Removing column 'bnt_vvl_lng'
Removing column 'article_status_enc'
Removing column 'tech_generation'
Removing column 'known_article'
Removing column 'equal_to_recently_sold'
(99689, 151)
(99689,)


In [4]:
# Rows whose label is missing (NaN) or unknown (-1) cannot be used for training.
nan_index = labels.index[np.isnan(labels)]
unknown_index = labels.index[labels == -1]

# Remove those rows from the labels and from the features alike.
labels = labels.drop(nan_index).drop(unknown_index)
X = X.drop(nan_index).drop(unknown_index)

# Features and labels must still be aligned row by row.
assert (labels.index == X.index).all()

# Shift the remaining label values down by 2 (the class proportions shown
# afterwards list the classes as 0, 1, 2).
# NOTE: not idempotent -- re-running this cell shifts the labels again.
labels = labels.sub(2)

In [5]:
# Display the share of each class among the cleaned labels.
class_distributions.label_proportions(labels)

2    0.367682
1    0.334212
0    0.298106
Name: region_type, dtype: float64

In [6]:
# Class-incremental setup: a "small" dataset that only contains the old classes,
# an "update" dataset that only contains the held-back class, and their union.

old_classes = [0,2]
new_class = 1

# number of classes the initial model is trained on
num_labels = len(old_classes)

# Relabel for XGBoost. The masks below rely on the relabelled old classes being
# smaller than num_labels and the new class being equal to num_labels.
# NOTE: `labels` is overwritten, so this cell should not be re-run on its own.
labels = helper_funcs.relabel(labels, old_classes, new_class)

is_old_class = labels < num_labels
is_new_class = labels == num_labels

# data of the old classes only
data_small, labels_small = X[is_old_class], labels[is_old_class]

# data of the new class, used when attempting to retrain / update the model
data_update, labels_update = X[is_new_class], labels[is_new_class]

# all the data available, used to train a reference model for comparison
data_full = pd.concat([data_small, data_update])
labels_full = pd.concat([labels_small, labels_update])

In [7]:
# Split the old-class data into train and test data (80% / 20%).
# random_state is fixed so that the split -- and with it every number reported
# further down -- is reproducible after "Restart Kernel -> Run All".

(X_train_small,
 X_test_small,
 y_train_small,
 y_test_small) = skl.model_selection.train_test_split(data_small,
                                                      labels_small,
                                                      test_size=.2,
                                                      random_state=42)

In [8]:
# specify DMatrices for the old-class model

dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

# experiment settings
# i / 10 gives the exact decimals 0.1 ... 0.9 (i * 0.1 yields values such as
# 0.30000000000000004, which end up in the printed progress messages)
proportion_of_old_data = [i / 10 for i in range(1, 10)]
# number of models trained (and averaged over) per setting
num_models = 1

# specify parameters for XGBoost
num_round = 100
num_round_full = 200
# 10% of the boosting rounds, as an integer: early_stopping_rounds is a round
# count, and num_round*.1 would produce the float 10.0
early_stopping_rounds = max(1, num_round // 10)
max_depth = 3
eta = .1

param_small = {
    'max_depth': max_depth,
    'eta': eta,
    'objective': 'multi:softprob',
    "num_class": num_labels,
    'nthread': 4,
    'eval_metric': 'mlogloss',
}

# watchlist reported during training of the old-class model
evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

In [9]:
# Train the initial model on the old classes only. Early stopping is
# deliberately not used here: the model runs for the full num_round rounds.
bst_small = xgb.train(
    params=param_small,
    dtrain=dtrain_small,
    num_boost_round=num_round,
    evals=evallist_small,
    verbose_eval=50,
)

# Persist the model; the update experiments continue training from this file.
bst_small.save_model('small_model_region_type.json')

[0]	train-mlogloss:0.61290	eval-mlogloss:0.61315
[50]	train-mlogloss:0.07094	eval-mlogloss:0.07193
[99]	train-mlogloss:0.03932	eval-mlogloss:0.04063


In [10]:
# The model outputs class probabilities (multi:softprob); the predicted class is
# the one with the highest probability.
y_pred_small = np.argmax(bst_small.predict(dtest_small), axis=1)
print("Accuracy on test data: ", skl.metrics.accuracy_score(y_true=y_test_small, y_pred=y_pred_small))

Accuracy on test data:  0.9873469723112316


In [11]:
# Split the new-class data and the full data into train and test data
# (80% / 20% each). random_state is fixed so that both splits are reproducible.

(X_train_update,
 X_test_update,
 y_train_update,
 y_test_update) = skl.model_selection.train_test_split(data_update,
                                                       labels_update,
                                                       test_size=.2,
                                                       random_state=42)

(X_train_full,
 X_test_full,
 y_train_full,
 y_test_full) = skl.model_selection.train_test_split(data_full,
                                                     labels_full,
                                                     test_size=.2,
                                                     random_state=42)

In [12]:
# DMatrices for the new-class-only data ...
dtrain_update = xgb.DMatrix(X_train_update, label=y_train_update)
dtest_update = xgb.DMatrix(X_test_update, label=y_test_update)

# ... and for the full data (old classes + new class)
dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest_full = xgb.DMatrix(X_test_full, label=y_test_full)


# XGBoost parameters for updating the small model: same settings as before,
# but with one additional class
param_update = {
    'max_depth': max_depth,
    'eta': eta,
    'objective': 'multi:softprob',
    "num_class": num_labels+1,
    'nthread': 4,
    'eval_metric': 'mlogloss',
}

evallist_update = [(dtrain_update, 'train'), (dtest_update, 'eval')]


# XGBoost parameters for the reference model trained on all data at once
param_full = {
    'max_depth': max_depth,
    'eta': eta,
    'objective': 'multi:softprob',
    "num_class": num_labels+1,
    'nthread': 4,
    'eval_metric': 'mlogloss',
}

evallist_full = [(dtrain_full, 'train'), (dtest_full, 'eval')]

In [13]:
# Reference model: batch training on all classes at once. Early stopping is not
# used; the model runs for the full num_round_full rounds.

bst_full = xgb.train(
    params=param_full,
    dtrain=dtrain_full,
    num_boost_round=num_round_full,
    evals=evallist_full,
    verbose_eval=25,
)

[0]	train-mlogloss:1.00713	eval-mlogloss:1.00696
[25]	train-mlogloss:0.40138	eval-mlogloss:0.40250
[50]	train-mlogloss:0.31069	eval-mlogloss:0.31309
[75]	train-mlogloss:0.27065	eval-mlogloss:0.27278
[100]	train-mlogloss:0.24724	eval-mlogloss:0.24954
[125]	train-mlogloss:0.23014	eval-mlogloss:0.23272
[150]	train-mlogloss:0.21517	eval-mlogloss:0.21845
[175]	train-mlogloss:0.20159	eval-mlogloss:0.20509
[199]	train-mlogloss:0.18928	eval-mlogloss:0.19304


In [14]:
# Predicted class = index of the highest class probability.
y_pred_full = np.argmax(bst_full.predict(dtest_full), axis=1)
print("Accuracy on test data: ", skl.metrics.accuracy_score(y_true=y_test_full, y_pred=y_pred_full))

Accuracy on test data:  0.9275164113785558


In [15]:
# Baseline experiment: continue training the saved old-class model on the new
# class plus a RANDOM sample of the old data, for several sample sizes.
# Each list holds one (averaged) accuracy per entry of proportion_of_old_data.
random_old = []     # accuracy on the old-class test set
random_new = []     # accuracy on the new-class test set
random_mixed = []   # accuracy on the held-out part of the data trained on
random_full = []    # accuracy on the test set of the full data

# NOTE(review): old_data_part is drawn from all of data_small and data_update is
# used in full, so dtest_small / dtest_update / dtest_full can contain rows the
# updated model was trained on. Only the "mixed" score is a strict hold-out score.

for proportion in proportion_of_old_data:
    print(f"Current target proportion of old data in use: {proportion}")

    # accuracy sums over the num_models repetitions
    random_old_tmp = 0
    random_new_tmp = 0
    random_mixed_tmp = 0
    random_full_tmp = 0

    for seed in range(num_models):

        # Randomly pick the requested share of the old data. The repetition
        # index is used as seed: runs are reproducible, repetitions still differ.
        _, old_data_part, _, old_y_part = skl.model_selection.train_test_split(data_small,
                                                                                labels_small,
                                                                                test_size=proportion,
                                                                                random_state=seed)

        # combine the sampled old data with all data of the new class
        data_update2 = pd.concat([old_data_part, data_update])
        labels_update2 = pd.concat([old_y_part, labels_update])

        (X_train_update2,
         X_test_update2,
         y_train_update2,
         y_test_update2) = skl.model_selection.train_test_split(data_update2,
                                                                labels_update2,
                                                                test_size=.2,
                                                                random_state=seed)

        # create DMatrices
        dtrain_update2 = xgb.DMatrix(X_train_update2, label=y_train_update2)
        dtest_update2 = xgb.DMatrix(X_test_update2, label=y_test_update2)

        # Watchlist built from the split that is actually trained on. Early
        # stopping uses the last entry; the previously used evallist_update
        # monitored dtest_update, whose rows largely overlap X_train_update2.
        evallist_update2 = [(dtrain_update2, 'train'), (dtest_update2, 'eval')]

        # continue training from the saved old-class model
        bst_update2 = xgb.train(param_update,
                                dtrain_update2,
                                num_round,
                                evals=evallist_update2,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=False,
                                xgb_model="small_model_region_type.json")

        random_old_tmp += skl.metrics.accuracy_score(np.argmax(bst_update2.predict(dtest_small), axis=1), y_test_small)
        random_new_tmp += skl.metrics.accuracy_score(np.argmax(bst_update2.predict(dtest_update), axis=1), y_test_update)
        random_mixed_tmp += skl.metrics.accuracy_score(np.argmax(bst_update2.predict(dtest_update2), axis=1), y_test_update2)
        random_full_tmp += skl.metrics.accuracy_score(np.argmax(bst_update2.predict(dtest_full), axis=1), y_test_full)

    # average over the repetitions
    random_old.append(random_old_tmp/num_models)
    random_new.append(random_new_tmp/num_models)
    random_mixed.append(random_mixed_tmp/num_models)
    random_full.append(random_full_tmp/num_models)

Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001
Current target proportion of old data in use: 0.7000000000000001
Current target proportion of old data in use: 0.8
Current target proportion of old data in use: 0.9


In [None]:
# Same update experiment as the random baseline, but the old samples are chosen
# by data_selection.get_samples_nearest_neighbors instead of at random.
# Results are stored per alpha: key = str(alpha), value = one (averaged)
# accuracy per entry of proportion_of_old_data.
critical_old_alphas = {}
critical_new_alphas = {}
critical_mixed_alphas = {}
critical_full_alphas = {}

# NOTE(review): the selected samples come from all of data_small and data_update
# is used in full, so dtest_small / dtest_update / dtest_full can contain rows
# the updated model was trained on. Only the "mixed" score is a strict hold-out score.

for alpha in [.2*i for i in range(5)]:

    critical_old = []
    critical_new = []
    critical_mixed = []
    critical_full = []

    for proportion in proportion_of_old_data:
        print(f"Current target proportion of old data in use: {proportion}")

        # get critical data (presumably the old samples closest to the
        # new-class data; see data_selection for the exact role of alpha)
        critical_data, critical_data_labels = data_selection.get_samples_nearest_neighbors(data_small,
                                                                                              labels_small,
                                                                                              data_update,
                                                                                              ratio_return_total = proportion,
                                                                                              normalization="min_max",
                                                                                              alpha=alpha,
                                                                                              remove_duplicates=False)

        # concatenate with data for new class
        critical_data = pd.concat([critical_data, data_update])
        critical_data_labels = pd.concat([critical_data_labels, labels_update])

        # accuracy sums over the num_models repetitions
        critical_old_tmp = 0
        critical_new_tmp = 0
        critical_mixed_tmp = 0
        critical_full_tmp = 0

        for i in range(num_models):
            # The repetition index is used as seed: runs are reproducible,
            # repetitions still use different splits.
            (X_train_critical,
             X_test_critical,
             y_train_critical,
             y_test_critical) = skl.model_selection.train_test_split(critical_data,
                                                                     critical_data_labels,
                                                                     test_size=.2,
                                                                     random_state=i)

            dtrain_critical = xgb.DMatrix(X_train_critical, label=y_train_critical)
            dtest_critical = xgb.DMatrix(X_test_critical, label=y_test_critical)

            # Watchlist built from the split that is actually trained on. Early
            # stopping uses the last entry; the previously used evallist_update
            # monitored dtest_update, whose rows largely overlap X_train_critical.
            evallist_critical = [(dtrain_critical, 'train'), (dtest_critical, 'eval')]

            # updating the saved old-class model with the new class
            bst_critical = xgb.train(param_update,
                                     dtrain_critical,
                                     num_round,
                                     evals=evallist_critical,
                                     early_stopping_rounds=early_stopping_rounds,
                                     verbose_eval=False,
                                     xgb_model="small_model_region_type.json")

            critical_old_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_small), axis=1), y_test_small)
            critical_new_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_update), axis=1), y_test_update)
            critical_mixed_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_critical), axis=1), y_test_critical)
            critical_full_tmp += skl.metrics.accuracy_score(np.argmax(bst_critical.predict(dtest_full), axis=1), y_test_full)

        # average over the repetitions
        critical_old.append(critical_old_tmp/num_models)
        critical_new.append(critical_new_tmp/num_models)
        critical_mixed.append(critical_mixed_tmp/num_models)
        critical_full.append(critical_full_tmp/num_models)

    critical_old_alphas[f"{alpha}"] = critical_old
    critical_new_alphas[f"{alpha}"] = critical_new
    critical_mixed_alphas[f"{alpha}"] = critical_mixed
    critical_full_alphas[f"{alpha}"] = critical_full

Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001
Current target proportion of old data in use: 0.7000000000000001
Current target proportion of old data in use: 0.8
Current target proportion of old data in use: 0.9
Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001
Current target proportion of old data in use: 0.7000000000000001
Current target proportion of old data in use: 0.8
Current target proportion of old data in use: 0.9
Current 

In [1]:
# Plot accuracy on the full test set against the share of old data used for the
# update, and save the figure as a PNG.
# This cell needs the import cell and all training cells to have been run in the
# current kernel (it uses plt, the result lists and bst_full).
fig, ax = plt.subplots()
ax.set_xlim(0, 1)
ax.set_ylim(0.8, 1)
ax.set_title("NearestNeighbors, minmax, including duplicates, train middle class")

# baseline: old data chosen at random
ax.plot(proportion_of_old_data, random_full, label="model updated with random data")

# one curve per alpha; the keys are strings, so `:.3` keeps their first 3 characters
for key, accuracies in critical_full_alphas.items():
    ax.plot(proportion_of_old_data, accuracies, label=f"model updated with critical data (alpha={key:.3})")

# reference line: model trained on all data at once
batch_accuracy = skl.metrics.accuracy_score(np.argmax(bst_full.predict(dtest_full), axis=1), y_test_full)
ax.axhline(batch_accuracy,
           color="black",
           linestyle="--",
           label="batch training on full data")

ax.set_xlabel("Percentage of old data used in updating")
ax.set_ylabel("Accuracy")
ax.legend(loc=4)
fig.savefig("NN, minmax, including duplicates, train middle class.png")
plt.show();

NameError: name 'plt' is not defined

In [2]:
# All results were below those obtained with randomly selected data. alpha=0.8 performed best.