In [1]:
from catboost import Pool, CatBoostClassifier
import numpy as np
from emnist import extract_training_samples, list_datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
list_of_datasets = list_datasets()

images, labels = extract_training_samples('letters')
images = np.reshape(images, (images.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.33, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

print(images.shape)

(124800, 784)


In [3]:
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=5, random_state=42, shuffle=True)

train_dataset = Pool(data=X_train,
                     label=y_train)

val_dataset = Pool(data=X_val,
                   label=y_val)

test_dataset = Pool(data=X_test,
                    label=y_test)

model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           loss_function='MultiClass',
                           eval_metric='AUC')

model.fit(train_dataset,
          eval_set=val_dataset)

preds_class = model.predict(test_dataset)

print(classification_report(y_test, preds_class))

0:	test: 0.9177024	best: 0.9177024 (0)	total: 3.86s	remaining: 1h 4m 13s
1:	test: 0.9454550	best: 0.9454550 (1)	total: 7.81s	remaining: 1h 4m 57s
2:	test: 0.9575718	best: 0.9575718 (2)	total: 11.8s	remaining: 1h 5m 19s
3:	test: 0.9653552	best: 0.9653552 (3)	total: 15.7s	remaining: 1h 5m 10s
4:	test: 0.9695595	best: 0.9695595 (4)	total: 19.8s	remaining: 1h 5m 34s
5:	test: 0.9738970	best: 0.9738970 (5)	total: 25.2s	remaining: 1h 9m 30s
6:	test: 0.9767735	best: 0.9767735 (6)	total: 31.5s	remaining: 1h 14m 24s
7:	test: 0.9792494	best: 0.9792494 (7)	total: 38.2s	remaining: 1h 18m 52s
8:	test: 0.9807577	best: 0.9807577 (8)	total: 43.7s	remaining: 1h 20m 15s
9:	test: 0.9827207	best: 0.9827207 (9)	total: 48.1s	remaining: 1h 19m 23s
10:	test: 0.9838430	best: 0.9838430 (10)	total: 52s	remaining: 1h 17m 58s
11:	test: 0.9851535	best: 0.9851535 (11)	total: 56.1s	remaining: 1h 16m 57s
12:	test: 0.9863300	best: 0.9863300 (12)	total: 1m	remaining: 1h 16m 18s
13:	test: 0.9870484	best: 0.9870484 (13)	to

KeyboardInterrupt: 

In [22]:
# from sklearn.model_selection import KFold, cross_val_score
#
# kfold = KFold(n_splits=5, random_state=42, shuffle=True)
#
# # train_dataset = Pool(data=X_train,
# #                      label=y_train)
# #
# # val_dataset = Pool(data=X_val,
# #                    label=y_val)
# #
# # test_dataset = Pool(data=X_test,
# #                     label=y_test)
#
# # model = CatBoostClassifier(iterations=100,
# #                            learning_rate=0.1,
# #                            depth=4)
# #
# # model.fit(train_dataset, eval_set=val_dataset)
#
# # accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
#
# # cv_results = cross_val_score(model, X_train, y_train,
# #                              cv=5, scoring='accuracy', verbose=1)
#
# for train_index, test_index in kfold.split(X_train):
#     train_dataset = Pool(data=X_train[train_index],
#                          label=y_train[train_index])
#
#     eval_dataset = Pool(data=X_train[test_index],
#                         label=y_train[test_index])
#
#     model = CatBoostClassifier(iterations=100,
#                                learning_rate=0.1,
#                                depth=6,
#                                eval_metric='AUC')
#
#     # cv_results = cross_val_score(model, X_train[train_index], y_train,
#     #                              cv=5, scoring='accuracy', verbose=1)
#
#     model.fit(train_dataset,
#               eval_set=eval_dataset)
#
#     train_score = model.score(X_train[train_index], y_train[train_index])  # train (learn) score
#     val_score = model.score(X_train[test_index], y_train[test_index])  # val (test) score
#
#     print(train_score)
#     print(val_score)
#
# # preds_class = model.predict(train_dataset)
#


0:	learn: 2.9361182	test: 2.9381295	best: 2.9381295 (0)	total: 1.19s	remaining: 1m 57s
1:	learn: 2.7231820	test: 2.7239290	best: 2.7239290 (1)	total: 2.39s	remaining: 1m 57s
2:	learn: 2.5561960	test: 2.5587172	best: 2.5587172 (2)	total: 3.57s	remaining: 1m 55s
3:	learn: 2.4295437	test: 2.4345741	best: 2.4345741 (3)	total: 4.76s	remaining: 1m 54s
4:	learn: 2.3138671	test: 2.3204778	best: 2.3204778 (4)	total: 5.94s	remaining: 1m 52s
5:	learn: 2.2141173	test: 2.2223585	best: 2.2223585 (5)	total: 7.15s	remaining: 1m 51s
6:	learn: 2.1289078	test: 2.1402526	best: 2.1402526 (6)	total: 8.33s	remaining: 1m 50s
7:	learn: 2.0485206	test: 2.0614546	best: 2.0614546 (7)	total: 9.53s	remaining: 1m 49s
8:	learn: 1.9768627	test: 1.9917471	best: 1.9917471 (8)	total: 10.7s	remaining: 1m 48s
9:	learn: 1.9087541	test: 1.9245849	best: 1.9245849 (9)	total: 11.9s	remaining: 1m 46s
10:	learn: 1.8440233	test: 1.8603825	best: 1.8603825 (10)	total: 13s	remaining: 1m 45s
11:	learn: 1.7872889	test: 1.8043234	best: 

KeyboardInterrupt: 

In [32]:
from catboost import cv

params = {
    'loss_function': 'MultiClass',
    'iterations': 10,
    'custom_loss': 'AUC',
    # 'random_seed': 63,
    'learning_rate': 0.1,
    'depth': 6
}

cv_data = cv(
    params=params,
    pool=Pool(X_train, label=y_train),
    fold_count=5,  # Разбивка выборки на 5 кусочков
    shuffle=True,  # Перемешаем наши данные
    # partition_random_seed=0,
    # plot=True, # Никуда без визуализатора
    # stratified=True,
    # verbose=False
)

Training on fold [0/5]
0:	learn: 2.9366505	test: 2.9415135	best: 2.9415135 (0)	total: 1.24s	remaining: 11.2s
1:	learn: 2.7262990	test: 2.7351214	best: 2.7351214 (1)	total: 2.53s	remaining: 10.1s
2:	learn: 2.5690359	test: 2.5828843	best: 2.5828843 (2)	total: 3.77s	remaining: 8.81s
3:	learn: 2.4263310	test: 2.4425620	best: 2.4425620 (3)	total: 5.04s	remaining: 7.56s
4:	learn: 2.3088246	test: 2.3273819	best: 2.3273819 (4)	total: 6.29s	remaining: 6.29s
5:	learn: 2.2066776	test: 2.2272465	best: 2.2272465 (5)	total: 7.54s	remaining: 5.03s
6:	learn: 2.1160191	test: 2.1384954	best: 2.1384954 (6)	total: 8.79s	remaining: 3.77s
7:	learn: 2.0300570	test: 2.0549725	best: 2.0549725 (7)	total: 10.1s	remaining: 2.54s
8:	learn: 1.9516631	test: 1.9777278	best: 1.9777278 (8)	total: 11.4s	remaining: 1.27s
9:	learn: 1.8837237	test: 1.9105692	best: 1.9105692 (9)	total: 12.7s	remaining: 0us

bestTest = 1.910569227
bestIteration = 9

Training on fold [1/5]
0:	learn: 2.9332102	test: 2.9402335	best: 2.9402335 (

In [33]:
print(cv_data)

   iterations  test-MultiClass-mean  test-MultiClass-std  \
0           0              2.941866             0.017517   
1           1              2.730312             0.020845   
2           2              2.570300             0.019590   
3           3              2.428370             0.017742   
4           4              2.314739             0.014181   
5           5              2.216314             0.010630   
6           6              2.127217             0.009405   
7           7              2.046088             0.008637   
8           8              1.973362             0.006003   
9           9              1.907233             0.008662   

   train-MultiClass-mean  train-MultiClass-std  test-AUC-mean  test-AUC-std  
0               2.937799              0.016776            NaN           NaN  
1               2.723209              0.017676            NaN           NaN  
2               2.561277              0.015844            NaN           NaN  
3               2.418577   