# CatBoost Application

## Установка

In [None]:
!pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Загрузка данных

In [None]:
from catboost.datasets import titanic
import numpy as np

# titanic() yields two frames, unpacked here as the train and test sets.
train_df, test_df = titanic()

# Last expression of the cell: display the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Обработка данных

In [None]:
# Per-column count of missing values in the training frame
# (column-wise summation is the DataFrame default).
null_value_stats = train_df.isnull().sum()

# Show only the columns that actually contain missing values.
null_value_stats.loc[null_value_stats != 0]

Unnamed: 0,0
Age,177
Cabin,687
Embarked,2


Заполним пропуски в данных некоторым уникальным значением (есть и другие техники, но здесь для простоты используем эту).

In [None]:
# Replace every missing value with the sentinel -999. The fill is applied to
# the whole frame, so numeric and text columns alike receive the sentinel.
# The result is rebound instead of using inplace=True, so the frames are not
# mutated behind the back of earlier cells and the cell stays idempotent.
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

Разбиваем данные на матрицу объект-признак и вектор с целевой переменной.

In [None]:
# Target vector: the Survived column of the training frame.
y = train_df['Survived']
# Feature matrix: the training frame with the target column removed.
X = train_df.drop(columns='Survived')

Посмотрим на типы признаков.

In [None]:
print(X.dtypes)

# Positional indices of every column whose dtype is not float. These are used
# later as the cat_features argument; note that this selects the integer
# columns as well as the object (text) ones.
categorical_features_indices = np.flatnonzero(X.dtypes != float)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Разобьём данные на обучающую и валидационную выборки.

In [None]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training and the remainder to validation;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# The test frame is used as-is under the name X_test.
X_test = test_df

## Обучение модели

In [None]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [None]:
# Classifier with a fixed seed. Accuracy is requested through custom_loss,
# and logging is silenced at construction time (the fit calls in this
# notebook pass their own logging_level).
model = CatBoostClassifier(
    logging_level='Silent',
    random_seed=42,
    custom_loss=[metrics.Accuracy()],
)

В Google Colab нет возможности отрисовывать динамические графики (насколько мы знаем), поэтому в ячейке ниже параметр `plot=True` закомментирован; чтобы увидеть графики обучения, запустите ноутбук на локальной машине и раскомментируйте его.

In [None]:
# Fit on the training split, using the validation split as the evaluation set.
# logging_level='Verbose' is passed for this call, so per-iteration metrics
# are printed even though the model was constructed with 'Silent'.
model.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_validation, y_validation),
    cat_features=categorical_features_indices,
    logging_level='Verbose',
    # plot=True,  # left disabled; enable when running locally to get the training chart
)

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 58.1ms	remaining: 58.1s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 61.7ms	remaining: 30.8s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 69ms	remaining: 22.9s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 74.1ms	remaining: 18.4s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 80.5ms	remaining: 16s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 86.9ms	remaining: 14.4s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 95.5ms	remaining: 13.5s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 103ms	remaining: 12.7s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 109ms	remaining: 12s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 115ms	remaining: 11.4s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 122ms	remaining: 11s
11:	learn: 0.5423887	te

<catboost.core.CatBoostClassifier at 0x78ceba719240>

## Кросс-валидация

In [None]:
# Start from the classifier's own parameters and set Logloss as the loss function.
cv_params = dict(model.get_params(), loss_function=metrics.Logloss())

# Cross-validate on the full X / y (not only the training split).
cv_data = cv(
    pool=Pool(data=X, label=y, cat_features=categorical_features_indices),
    params=cv_params,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Посмотрим на среднее качество и его разброс по кросс-валидации

In [None]:
# Find the step with the highest mean validation accuracy once and reuse it
# for both the standard deviation lookup and the reported step number.
best_step = np.argmax(cv_data['test-Accuracy-mean'])
best_accuracy = np.max(cv_data['test-Accuracy-mean'])
best_accuracy_std = cv_data['test-Accuracy-std'][best_step]

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    best_accuracy, best_accuracy_std, best_step
))

Best validation accuracy score: 0.83±0.02 on step 355


## Применяем обученную модель

In [None]:
# Class probabilities and hard class labels for the test frame
# (the two calls are independent of each other).
predictions_probs = model.predict_proba(X_test)
predictions = model.predict(X_test)

# Preview the first ten rows of each: labels first, then probabilities.
print(predictions[:10], predictions_probs[:10], sep='\n')

[0 0 0 0 1 0 1 0 1 0]
[[0.85473931 0.14526069]
 [0.76313031 0.23686969]
 [0.88972889 0.11027111]
 [0.87876173 0.12123827]
 [0.3611047  0.6388953 ]
 [0.90513381 0.09486619]
 [0.33434185 0.66565815]
 [0.78468564 0.21531436]
 [0.39429048 0.60570952]
 [0.94047549 0.05952451]]


## Улучшение предсказаний и другие возможности CatBoost

### Early Stopping

In [None]:
# Refit the same model with early_stopping_rounds=30, so training can stop
# before the full iteration budget is used. The validation split is again
# the evaluation set.
model.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_validation, y_validation),
    cat_features=categorical_features_indices,
    early_stopping_rounds=30,
    logging_level='Verbose',  # text output is enabled for this call
    # plot=True,  # left disabled; enable when running locally to get the training chart
)

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 7.64ms	remaining: 7.64s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 11.8ms	remaining: 5.91s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 18.8ms	remaining: 6.25s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 23.9ms	remaining: 5.96s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 30.6ms	remaining: 6.1s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 37.8ms	remaining: 6.26s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 44.6ms	remaining: 6.33s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 51.8ms	remaining: 6.42s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 57.7ms	remaining: 6.36s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 64.3ms	remaining: 6.37s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 71.5ms	remaining: 6.43s
11:	learn: 0

<catboost.core.CatBoostClassifier at 0x78ceba719240>

In [None]:
# Number of trees in the fitted model (displayed as the cell output).
model.tree_count_

284

Получили непереобученную модель, причем не пришлось ждать 1000 итераций!

## Важность признаков

CatBoost поддерживает несколько способов вычисления важности признаков


In [None]:
feature_names = X_train.columns
feature_importances = model.get_feature_importance()

# Pair each importance with its column name and order from most to least
# important; equal scores fall back to reverse name order.
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, feature in ranked_features:
    print('{}: {}'.format(feature, importance))

Sex: 28.377591527551807
Pclass: 17.450379813673287
Parch: 10.276200044515498
Embarked: 8.761954037905873
Cabin: 8.281577549519369
SibSp: 7.950157281933983
Age: 7.842375602284014
Ticket: 5.620556803330715
Fare: 5.4392073392855105
PassengerId: 0.0
Name: 0.0


## Сохранение модели

In [None]:
# Save the model to a file in the current working directory.
model.save_model('catboost_model.dump')

# Load the saved model back into the same object; the trailing semicolon
# suppresses the cell's output.
model.load_model('catboost_model.dump');