# Prepare data

In [1]:
from catboost import Pool, CatBoostClassifier
from catboost.datasets import amazon
import pandas as pd

ImportError: cannot import name 'Pool' from 'catboost' (unknown location)

In [3]:
# Load the dataset exposed by `catboost.datasets.amazon`; the call returns two
# frames, unpacked here as the train and test parts.
train_df, test_df = amazon()
# Leave the preview as the last expression so the notebook renders it as a table.
train_df.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [4]:
# Split the frame into the feature matrix (everything except ACTION)
# and the target column (ACTION).
X = train_df.drop(columns=['ACTION'])
y = train_df['ACTION']

In [5]:
# Every column of X is passed to CatBoost as categorical, so list all column positions.
cat_features = list(range(len(X.columns)))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8]


In [6]:
from sklearn.model_selection import train_test_split

# 75% of the rows are used for training, the rest for validation;
# random_state is fixed so the split is the same on every run.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [38]:
from time import time

In [43]:
# Baseline model: only the iteration count, seed and evaluation metric are set;
# every other hyperparameter is left for CatBoost to choose.
simple_model = CatBoostClassifier(iterations=200, random_seed=63, eval_metric='AUC')

# Wall-clock timing of the fit call (this is the reference for later speedup numbers).
start_time = time()
simple_model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    logging_level='Silent',
)
simple_model_fit_time = time() - start_time

# Best AUC reached on the eval_set during training.
simple_model_auc = simple_model.best_score_['validation']['AUC']
print(f'best AUC: {simple_model_auc:.5f}')
print(f'time: {simple_model_fit_time:.3f}s')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best AUC: 0.88850
time: 4.531s


In [49]:
import json

# Report the values the baseline model actually trained with for the
# hyperparameters discussed in the rest of the notebook.
print('Simple model main parameters:')
all_params = simple_model.get_all_params()
main_param_names = (
    'iterations',
    'learning_rate',
    'depth',
    'bootstrap_type',
    'subsample',
    'border_count',
    'l2_leaf_reg',
    'one_hot_max_size',
    'rsm',
    'leaf_estimation_iterations',
    'leaf_estimation_method',
    'max_ctr_complexity',
    'random_strength',
)
for name in main_param_names:
    print(name, all_params[name], sep='=')

Simple model main parameters:
iterations=200
learning_rate=0.14074000716209412
depth=6
bootstrap_type=MVS
subsample=0.800000011920929
border_count=254
l2_leaf_reg=3
one_hot_max_size=2
rsm=1
leaf_estimation_iterations=10
leaf_estimation_method=Newton
max_ctr_complexity=4
random_strength=1


# Speedup training

Here is the list of parameters that are important for speeding up the training.
Note that changing these parameters might decrease the quality.
1. iterations + learning rate
By default we train for 1000 iterations. You can decrease this number, but if you decrease the number of iterations you need to increase the learning rate so that the process converges. By default we set the learning rate depending on the number of iterations and on your dataset, so you might just use the default learning rate. But if you want to tune it, keep in mind: the more iterations you have, the lower the learning rate should be.

3. bootstrap_type
By default we use MVS bootstrap. It is faster to use sampling from a Bernoulli distribution. To enable that, use bootstrap_type='Bernoulli' + subsample={some value < 1}

4. one_hot_max_size
By default we use one-hot encoding only for categorical features with a small number of distinct values. For all other categorical features we calculate statistics. This is expensive, while one-hot encoding is cheap. So you can speed up the training by setting one_hot_max_size to a larger value.

5. rsm
This parameter is very important, because it speeds up the training and does not affect the quality. So you should definitely use it, but only if you have hundreds of features.
If you have only a few features, it's better not to use this parameter.
If you have many features then the rule is the following: you decrease rsm, for example, you set rsm=0.1. With this rsm value the training needs more iterations to converge. Usually you need about 20% more iterations. But each iteration will be 10x faster. So the resulting training time will be faster even though you will have more trees in the resulting model.

6. leaf_estimation_iterations
This parameter is responsible for calculating leaf values after you have already selected tree structure.
If you have only a few features, for example 8 or 10, then this step becomes the bottleneck.
The default value for this parameter depends on the training objective; you can try setting it to 1 or 5, and if you have only a few features, this might speed up the training.

7. max_ctr_complexity
By default CatBoost generates categorical feature combinations in a greedy way.
This is time-consuming; you can disable it by setting max_ctr_complexity=1, or allow only combinations of 2 features by setting max_ctr_complexity=2.
This will speed up the training only if you have categorical features.

8. If you are training the model on GPU, you can try decreasing border_count. This is the number of splits considered for each feature. On GPU it is set to 128 by default (on CPU the default is 254, as the parameter listing above shows), but you can try setting it to 32. In many cases it will not degrade the quality of the model and will speed up the training by a lot.

9. random_strength
Sometimes it can improve the quality, but it can be dangerous with a large number of categorical features.

In [56]:
# Model for the speed-up experiments. The values below are the ones reported by
# simple_model.get_all_params() for the baseline, so as written this cell
# reproduces the baseline; edit individual parameters to see their effect.
fast_model = CatBoostClassifier(
    eval_metric='AUC',
    random_seed=63,
    iterations=200,
    learning_rate=0.14074,
    depth=6,
    bootstrap_type='MVS',
    subsample=0.8,
    border_count=254,
    l2_leaf_reg=3,
    one_hot_max_size=2,
    rsm=1,
    leaf_estimation_iterations=10,
    leaf_estimation_method='Newton',
    max_ctr_complexity=4,
    random_strength=1
)

# Wall-clock timing of the fit call, compared against the baseline fit time below.
start_time = time()
fast_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Silent',
    plot=True
)
fast_model_fit_time = time() - start_time
speedup = simple_model_fit_time / fast_model_fit_time
fast_model_auc = fast_model.best_score_['validation']['AUC']
delta_auc = fast_model_auc - simple_model_auc
print('best AUC: {:.5f}\tdelta: {:.5f}'.format(fast_model_auc, delta_auc))
print('time: {:.3f}s\tspeedup: {:.1f}'.format(fast_model_fit_time, speedup))

# List every parameter whose effective value differs from the baseline.
# The baseline parameters are fetched once instead of on every iteration, and
# looked up with .get() so that a parameter reported only by this model is
# shown as "None -> value" instead of raising KeyError.
simple_params = simple_model.get_all_params()
for name, value in fast_model.get_all_params().items():
    baseline_value = simple_params.get(name)
    if baseline_value != value:
        print(f'{name}: {baseline_value} -> {value}')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best AUC: 0.88850	delta: 0.00000
time: 4.320s	speedup: 1.0


# Increase quality

The parameters listed below are important to get the best quality of the model. Try changing these parameters to improve the quality of the resulting model.

In [59]:
# Model for the quality experiments. The values below are the ones reported by
# simple_model.get_all_params() for the baseline, so as written this cell
# reproduces the baseline; edit individual parameters to see their effect.
best_model = CatBoostClassifier(
    eval_metric='AUC',
    random_seed=63,
    iterations=200,
    learning_rate=0.14074,
    depth=6,
    bootstrap_type='MVS',
    subsample=0.8,
    border_count=254,
    l2_leaf_reg=3,
    one_hot_max_size=2,
    rsm=1,
    leaf_estimation_iterations=10,
    leaf_estimation_method='Newton',
    max_ctr_complexity=4,
    random_strength=1
)

# Wall-clock timing of the fit call.
start_time = time()
best_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Silent',
    plot=True
)
best_model_fit_time = time() - start_time
best_model_auc = best_model.best_score_['validation']['AUC']
delta_auc = best_model_auc - simple_model_auc

print('best AUC: {:.5f}\tdelta: {:.5f}'.format(best_model_auc, delta_auc))
print('time: {:.3f}s'.format(best_model_fit_time))

# List every parameter whose effective value differs from the baseline.
# The baseline parameters are fetched once instead of on every iteration, and
# looked up with .get() so that a parameter reported only by this model is
# shown as "None -> value" instead of raising KeyError.
simple_params = simple_model.get_all_params()
for name, value in best_model.get_all_params().items():
    baseline_value = simple_params.get(name)
    if baseline_value != value:
        print(f'{name}: {baseline_value} -> {value}')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best AUC: 0.88850	delta: 0.00000
time: 4.471s
