In [11]:
import catboost as cat
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

In [12]:
# Load the Covertype dataset a single time (fetching it twice just to read
# .data and .target separately loads the whole dataset twice) and keep only
# the first 3000 samples so the demo trains quickly.
SEED = 42  # fixed seed so both splits below are reproducible across runs
covtype = datasets.fetch_covtype()
X = covtype.data[:3000]
y = covtype.target[:3000]

# Hold out 10% as the test set, then split the remainder into train and
# validation parts using train_test_split's default proportions.
X_1, X_test, y_1, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
X_train, X_va, y_train, y_va = train_test_split(X_1, y_1, random_state=SEED)

print(X_train.shape, y_train.shape)
print(np.unique(y_train))  # 7-class classification task

(2025, 54) (2025,)
[1 2 3 4 5 6 7]


In [13]:
# Wrap each split in a catboost Pool.
# Labels are passed as 1-D arrays: label.shape == (num_samples,).
# The test pool is built from features only (no label).
train_dataset = cat.Pool(data=X_train, label=y_train)
val_dataset = cat.Pool(data=X_va, label=y_va)
test_dataset = cat.Pool(data=X_test)

In [14]:
# Training configuration handed to cat.train.
params = dict(
    loss_function="MultiClass",  # multiclass objective
    eval_metric="MultiClass",    # metric reported for the learn/eval data
    allow_writing_files=False,
    thread_count=-1,
    use_best_model=True,
)

# Evaluation pools passed to cat.train; only the validation split is used here.
eval_set = [val_dataset]

In [15]:
# verbose_eval accepts either a bool or an int:
#   True  -> Verbose logging level
#   False -> Silent logging level
#   int   -> Verbose logging level, with the logging period set to that value
# The training data is automatically included in the evaluation output
# alongside the pools given in eval_set.
model = cat.train(
    params=params,
    pool=train_dataset,
    eval_set=eval_set,
    early_stopping_rounds=100,
    verbose_eval=100,  # report every 100th iteration
)

Learning rate set to 0.109511
0:	learn: 1.7206688	test: 1.7205231	best: 1.7205231 (0)	total: 6.9ms	remaining: 6.89s
100:	learn: 0.4144142	test: 0.4891577	best: 0.4891577 (100)	total: 645ms	remaining: 5.74s
200:	learn: 0.2982915	test: 0.4284048	best: 0.4284048 (200)	total: 1.27s	remaining: 5.04s
300:	learn: 0.2292487	test: 0.3958213	best: 0.3958213 (300)	total: 1.9s	remaining: 4.41s
400:	learn: 0.1778267	test: 0.3786860	best: 0.3786860 (400)	total: 2.54s	remaining: 3.79s
500:	learn: 0.1452754	test: 0.3704239	best: 0.3702867 (480)	total: 3.17s	remaining: 3.16s
600:	learn: 0.1206972	test: 0.3665313	best: 0.3662816 (596)	total: 3.81s	remaining: 2.53s
700:	learn: 0.1017670	test: 0.3636204	best: 0.3636204 (700)	total: 4.47s	remaining: 1.9s
800:	learn: 0.0860700	test: 0.3632988	best: 0.3631087 (727)	total: 5.11s	remaining: 1.27s
900:	learn: 0.0744447	test: 0.3642690	best: 0.3620725 (861)	total: 5.76s	remaining: 633ms
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3620724

In [16]:
# cat.train returns a plain CatBoost model whose predict() defaults to
# prediction_type="RawFormulaVal" (raw per-class scores, not probabilities).
# Request probabilities explicitly so each row is a per-class probability
# vector (multiclass); the result shape is (num_samples, num_classes).
# The data argument may be a catboost.Pool, numpy.ndarray or pandas.DataFrame.
model.predict(X_test, prediction_type="Probability").shape

(300, 7)

In [17]:
# Relative importance of each feature (one value per input column).
model.feature_importances_

array([2.31935419e+01, 4.64585182e+00, 3.80216141e+00, 8.23285565e+00,
       6.06224270e+00, 1.34877190e+01, 4.73387238e+00, 4.23806420e+00,
       3.87817690e+00, 1.19013094e+01, 6.48858086e+00, 0.00000000e+00,
       6.60432640e-01, 3.58989832e+00, 1.65751942e-01, 4.36734912e-02,
       5.95060576e-03, 2.06908185e-02, 3.52309506e-02, 3.91919002e-02,
       0.00000000e+00, 0.00000000e+00, 3.92171791e-03, 3.53484571e-01,
       1.79702603e-02, 6.75729301e-01, 1.62520435e-01, 6.96502039e-04,
       0.00000000e+00, 6.88880616e-02, 1.35169205e-01, 2.56026562e-02,
       1.31982092e-02, 3.22951316e-01, 0.00000000e+00, 9.16501260e-02,
       5.16750881e-01, 3.43949656e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 5.05597128e-05, 1.14254147e+00, 1.03268746e+00,
       1.10942845e-02, 2.58456594e-02, 1.15472914e-01, 5.11459877e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.40515823e-02,
       5.01648110e-03, 0.00000000e+00])

In [18]:
# Metric history recorded during training: per-iteration eval_metric values
# for the training data ('learn') and for the eval_set ('validation').
model.evals_result_

{'learn': {'MultiClass': [1.7206687742743951,
   1.5632400982576096,
   1.444019263259081,
   1.3543512135830886,
   1.2738662004599197,
   1.2040555767345635,
   1.1451766748164944,
   1.09495623295215,
   1.0510723772989377,
   1.0107989568710445,
   0.9752537031603185,
   0.9462887373074438,
   0.9199069438155432,
   0.8901491771397736,
   0.8649597484901118,
   0.8396306265718048,
   0.8201594886037958,
   0.7982820719672902,
   0.782171430706151,
   0.7651979638809231,
   0.7481701238735597,
   0.739172615937875,
   0.7252681393096381,
   0.7131505247767835,
   0.7011930463954307,
   0.6920100396725304,
   0.6814356554055312,
   0.6734007628951395,
   0.6637901300965122,
   0.6563924913326911,
   0.6482255429075584,
   0.6423581226378686,
   0.6345681477238067,
   0.6264686054902403,
   0.6199911161173225,
   0.6128268834614365,
   0.6069813189284354,
   0.6019142391131003,
   0.5979313100705191,
   0.5914113042303223,
   0.5871799733404504,
   0.5813358331250142,
   0.57754491477

In [19]:
# Names of the datasets for which a metric history was recorded.
model.evals_result_.keys()

dict_keys(['learn', 'validation'])

In [20]:
# Return the best result for each metric, reported separately for the
# training data ('learn') and for each validation dataset.
model.get_best_score()

{'learn': {'MultiClass': 0.06810154698191939},
 'validation': {'MultiClass': 0.36207249028415106}}