In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl
import _pickle as pickle

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs
from mylib.db import preprocessing
from mylib.db import constants

import dtreeviz
import logging
# Raise the 'matplotlib.font_manager' logger to CRITICAL so that its messages
# are suppressed when plotting trees.
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
# Date window and row limit passed to the preprocessor.
from_date, to_date = "2022-06-01", "2022-06-30"
n = 100000

# Categorical-column removal and normalization are switched off here
# (both flags False); they are triggered explicitly once the label column
# has been read from the frame.
car_df_unprocessed = preprocessing.Preprocessor(
    from_date,
    to_date,
    limit=n,
    verbose=False,
    remove_cat_columns=False,
    normalization=False,
)

Preprocessing successful.


In [3]:
# Read the prediction target before the categorical columns are removed:
# 'age_segment' is among the columns the removal log reports as dropped.
labels = car_df_unprocessed.car_df["age_segment"]

# Strip all categorical columns, then normalize what remains.
car_df_unprocessed.remove_columns(constants.CATEGORICAL_COLUMNS)
car_df_unprocessed.normalize()
X = car_df_unprocessed.car_df

# Sanity check: print the feature-matrix shape, then the label shape.
for shape in (X.shape, labels.shape):
    print(shape)

Removing column 'age_segment'
Removing column 'region_type'
Removing column 'emp_liable'
Removing column 'hou_fam_structure'
Removing column 'hou_aff_new_products'
Removing column 'hou_aff_prices'
Removing column 'subs_hand_ind'
Removing column 'bnt_vvl_lng'
Removing column 'article_status_enc'
Removing column 'tech_generation'
Removing column 'known_article'
Removing column 'equal_to_recently_sold'
(99693, 151)
(99693,)


In [4]:
# Rows whose label is missing (NaN) or unknown (coded as -1) cannot be used.
# Both index sets are computed from the labels *before* anything is dropped.
missing_mask = np.isnan(labels)
unknown_mask = labels == -1
nan_index = labels[missing_mask].index
unknown_index = labels[unknown_mask].index

# Remove exactly the same rows from the target and from the features.
labels = labels.drop(nan_index).drop(unknown_index)
X = X.drop(nan_index).drop(unknown_index)

# Target and features must still line up row for row.
assert (labels.index == X.index).all()

# Rescale the label values for XGBoost: integer-divide by 10, then shift by -1.
labels = pd.Series(labels // 10 - 1)

In [5]:
# Columns that describe age must not be used as features when the target is
# age_segment, otherwise the model is handed the answer.
#
# The drop returns a new frame instead of mutating X in place, and
# errors="ignore" makes the cell safe to re-run: the original inplace drop
# raised KeyError on a second execution because the columns were already gone.
X = X.drop(columns=['year_of_birth', 'avg_canc_age_segment'], errors="ignore")

In [6]:
# Show the share of each class in `labels`.
class_distributions.label_proportions(labels)

4    0.217170
3    0.191002
2    0.184671
1    0.154560
5    0.134912
6    0.068244
7    0.042054
0    0.007388
Name: age_segment, dtype: float64

In [7]:
# Prepare a smaller dataset that contains only a subset of the classes.
old_classes = [1, 2, 3, 4, 5, 6, 7]
new_class = 0

# Number of "old" classes the first model is trained on.
num_labels = len(old_classes)

# Relabel for XGBoost.
# NOTE(review): the filters below assume relabel() maps the old classes to
# values below num_labels and the new class to num_labels -- confirm in
# helper_funcs.
labels = helper_funcs.relabel(labels, old_classes, new_class)

# Rows belonging to the old classes.
is_old = labels < num_labels
data_small, labels_small = X[is_old], labels[is_old]

# Rows of the held-back class, used later to attempt retraining with new data.
is_new = labels == num_labels
data_update, labels_update = X[is_new], labels[is_new]

# Everything together, to train a comparison model on all the data available.
data_full = pd.concat([data_small, data_update])
labels_full = pd.concat([labels_small, labels_update])

In [8]:
# Split the small dataset into train and test data (80% / 20%).
#
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same metrics, on every run. Without it each
# execution of this cell produced a different split.
X_train_small, X_test_small, y_train_small, y_test_small = skl.model_selection.train_test_split(
    data_small,
    labels_small,
    test_size=.2,
    random_state=42,
)

In [9]:
# Wrap the train/test splits in DMatrices, the container xgb.train consumes.
dtrain_small = xgb.DMatrix(X_train_small, label=y_train_small)
dtest_small = xgb.DMatrix(X_test_small, label=y_test_small)

# Fractions 0.1 .. 0.9 of old data.
proportion_of_old_data = [i*0.1 for i in range(1,10)]

# Parameters for XGBoost.
num_round = 5000
# Integer division keeps this an int (500); `num_round*.1` produced the float
# 500.0, while xgboost documents early_stopping_rounds as an integer.
early_stopping_rounds = num_round // 10
max_depth = 3
eta = .1

# Multi-class setup: 'multi:softprob' with one class per old label.
param_small = {
    'max_depth': max_depth,
    'eta': eta,
    'objective': 'multi:softprob',
    "num_class": num_labels,
    'nthread': 4,
    'eval_metric': 'mlogloss',
}

# maybe adjust for overfitting
#param_small['min_child_weight'] = 2
#param_small['gamma'] = 2
#param_small['subsample'] = 0.5

# Datasets whose metric is reported during training.
evallist_small = [(dtrain_small, 'train'), (dtest_small, 'eval')]

In [None]:
# Train the model on the reduced label set, logging the metric every 50 rounds.
# Early stopping is intentionally left disabled here (commented out).
bst_small = xgb.train(
    params=param_small,
    dtrain=dtrain_small,
    num_boost_round=num_round,
    evals=evallist_small,
    #early_stopping_rounds=early_stopping_rounds,
    verbose_eval=50,
)

# Persist the trained booster next to the notebook.
bst_small.save_model('small_model.json')

[0]	train-mlogloss:1.91070	eval-mlogloss:1.91145
[50]	train-mlogloss:1.54291	eval-mlogloss:1.56024
[100]	train-mlogloss:1.49404	eval-mlogloss:1.52470
[150]	train-mlogloss:1.46655	eval-mlogloss:1.50961
[200]	train-mlogloss:1.44512	eval-mlogloss:1.50037
[250]	train-mlogloss:1.42749	eval-mlogloss:1.49443
[300]	train-mlogloss:1.41216	eval-mlogloss:1.49011
[350]	train-mlogloss:1.39847	eval-mlogloss:1.48644
[400]	train-mlogloss:1.38643	eval-mlogloss:1.48432
[450]	train-mlogloss:1.37507	eval-mlogloss:1.48259
[500]	train-mlogloss:1.36428	eval-mlogloss:1.48117
[550]	train-mlogloss:1.35398	eval-mlogloss:1.47994
[600]	train-mlogloss:1.34398	eval-mlogloss:1.47885
[650]	train-mlogloss:1.33416	eval-mlogloss:1.47798
[700]	train-mlogloss:1.32498	eval-mlogloss:1.47761
[750]	train-mlogloss:1.31587	eval-mlogloss:1.47655
[800]	train-mlogloss:1.30667	eval-mlogloss:1.47596
[850]	train-mlogloss:1.29781	eval-mlogloss:1.47531
[900]	train-mlogloss:1.28970	eval-mlogloss:1.47508
[950]	train-mlogloss:1.28186	eval-

In [None]:
# Predicted class = column index of the largest class probability in each row.
test_pred_small = np.argmax(bst_small.predict(dtest_small), axis=1)
# accuracy_score takes (y_true, y_pred); the value is the same in either order.
print("Accuracy on test data: ", skl.metrics.accuracy_score(y_test_small, test_pred_small))

Accuracy on test data:  0.36866482112436116


In [None]:
# Predicted class = column index of the largest class probability in each row.
train_pred_small = np.argmax(bst_small.predict(dtrain_small), axis=1)
# accuracy_score takes (y_true, y_pred); the value is the same in either order.
print("Accuracy on train data: ", skl.metrics.accuracy_score(y_train_small, train_pred_small))

Accuracy on train data:  0.7773311683126148


In [None]:
# Per-feature importance of the trained booster. get_fscore() does not list
# features that were never used in a split, so those are absent from this dict.
importance_scores = bst_small.get_fscore()

if importance_scores:
    # Threshold = score at the 95% position of the sorted scores; every
    # feature scoring below it is marked for removal.
    sorted_scores = np.sort(np.array(list(importance_scores.values())))
    thresh = sorted_scores[int(len(sorted_scores) * 0.95)]
    del_cols = [name for name, score in importance_scores.items() if score < thresh]

    # Features missing from the importance dict have zero importance and are
    # therefore below any threshold; the original loop silently kept them.
    del_cols.extend(
        name for name in (bst_small.feature_names or [])
        if name not in importance_scores
    )
else:
    # No feature was ever split on: there is nothing to rank, so keep all
    # columns (the original code raised IndexError here).
    thresh = 0
    del_cols = []

In [None]:
X_slim = X.drop(del_cols, axis=1)

# split data into train- and test-data

X_slim_train_small, X_slim_test_small, y_slim_train_small, y_slim_test_small = skl.model_selection.train_test_split(data_small, 
                                                                                                                    labels_small,
                                                                                                                    test_size=.2)

dtrain_slim_small = xgb.DMatrix(X_slim_train_small, label=y_slim_train_small)
dtest_slim_small = xgb.DMatrix(X_slim_test_small, label=y_slim_test_small)

In [None]:
# training model with fewer labels
bst_small = xgb.train(param_small,
                      dtrain_small,
                      num_round,
                      evals=evallist_small,
                      #early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=50)

bst_small.save_model('small_model.json')

[0]	train-mlogloss:1.91070	eval-mlogloss:1.91145
[50]	train-mlogloss:1.54291	eval-mlogloss:1.56024
[100]	train-mlogloss:1.49404	eval-mlogloss:1.52470
[150]	train-mlogloss:1.46655	eval-mlogloss:1.50961
[200]	train-mlogloss:1.44512	eval-mlogloss:1.50037
[250]	train-mlogloss:1.42749	eval-mlogloss:1.49443
[300]	train-mlogloss:1.41216	eval-mlogloss:1.49011
[350]	train-mlogloss:1.39847	eval-mlogloss:1.48644
[400]	train-mlogloss:1.38643	eval-mlogloss:1.48432
[450]	train-mlogloss:1.37507	eval-mlogloss:1.48259
[500]	train-mlogloss:1.36428	eval-mlogloss:1.48117
[550]	train-mlogloss:1.35398	eval-mlogloss:1.47994
[600]	train-mlogloss:1.34398	eval-mlogloss:1.47885
[650]	train-mlogloss:1.33416	eval-mlogloss:1.47798
[700]	train-mlogloss:1.32498	eval-mlogloss:1.47761
[750]	train-mlogloss:1.31587	eval-mlogloss:1.47655
[800]	train-mlogloss:1.30667	eval-mlogloss:1.47596
[850]	train-mlogloss:1.29781	eval-mlogloss:1.47531
[900]	train-mlogloss:1.28970	eval-mlogloss:1.47508
[950]	train-mlogloss:1.28186	eval-

In [None]:
# Predicted class = column index of the largest class probability in each row.
test_pred_slim = np.argmax(bst_small.predict(dtest_slim_small), axis=1)
# accuracy_score takes (y_true, y_pred); the value is the same in either order.
print("Accuracy on test data: ", skl.metrics.accuracy_score(y_slim_test_small, test_pred_slim))

Accuracy on test data:  0.6979344122657581


In [None]:
# Predicted class = column index of the largest class probability in each row.
train_pred_slim = np.argmax(bst_small.predict(dtrain_slim_small), axis=1)
# accuracy_score takes (y_true, y_pred); the value is the same in either order.
print("Accuracy on train data: ", skl.metrics.accuracy_score(y_slim_train_small, train_pred_slim))

Accuracy on train data:  0.6950115793116299
