In [52]:
import catboost as cat
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

In [53]:
# Fetch the Covertype dataset a single time and reuse the returned object;
# calling fetch_covtype() once per attribute loads the whole dataset twice.
# Only the first 3000 rows are kept so the example stays fast.
SEED = 42  # fixed seed so Restart & Run All reproduces the same splits
covtype = datasets.fetch_covtype()
X = covtype.data[:3000]
y = covtype.target[:3000]

# Hold out 10% as the test set, then split the remainder into train and
# validation parts (the second call uses train_test_split's default ratio).
X_1, X_test, y_1, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
X_train, X_va, y_train, y_va = train_test_split(X_1, y_1, random_state=SEED)

print(X_train.shape, y_train.shape)
print(np.unique(y_train))  # 7-class classification task

(2025, 54) (2025,)
[1 2 3 4 5 6 7]


In [54]:
# Wrap each split in a catboost Pool. Labels are 1-D arrays of shape
# (num_samples,); the test pool is built from features only, without labels.
train_dataset = cat.Pool(data=X_train, label=y_train)
val_dataset = cat.Pool(data=X_va, label=y_va)
test_dataset = cat.Pool(data=X_test)

In [55]:
# Options handed to cat.train. The same MultiClass objective is used both as
# the training loss and as the metric evaluated on the eval set.
params = dict(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    allow_writing_files=False,
    verbose=False,
    thread_count=-1,
    use_best_model=True,
)

# List of validation pools that is passed to cat.train through its
# eval_set argument; here it holds only the validation split.
eval_set = [val_dataset]

In [56]:
# Fit the booster on the training pool. Metrics are tracked on the pools in
# eval_set, and the training data is automatically included in the evaluation.
model = cat.train(
    params=params,
    pool=train_dataset,
    early_stopping_rounds=100,
    eval_set=eval_set,
)

In [57]:
# Predict a per-class probability vector for every test sample (multiclass).
# cat.train returns a plain CatBoost model whose predict() defaults to raw
# formula values rather than probabilities, so prediction_type="Probability"
# is requested explicitly; the result still has one column per class.
# predict() accepts a catboost.Pool, numpy.ndarray or pandas.DataFrame.
model.predict(X_test, prediction_type="Probability").shape

(300, 7)

In [58]:
# Relative importance of each input feature (one value per feature column).
model.feature_importances_

array([2.04191691e+01, 4.62314344e+00, 3.77997570e+00, 6.97010076e+00,
       5.67701397e+00, 1.36655677e+01, 5.55324840e+00, 4.50286752e+00,
       3.79415761e+00, 1.20666599e+01, 5.95972414e+00, 0.00000000e+00,
       1.99290192e+00, 5.65228330e+00, 1.17066917e-01, 1.01723810e-01,
       6.46239278e-03, 1.20002843e-02, 2.98460234e-02, 1.73517782e-02,
       0.00000000e+00, 0.00000000e+00, 4.80410695e-03, 4.72066567e-01,
       5.39710713e-03, 8.53155197e-01, 1.56133462e-01, 0.00000000e+00,
       0.00000000e+00, 6.67825844e-02, 1.14423794e-01, 5.16908989e-02,
       1.93513938e-02, 4.16459318e-01, 0.00000000e+00, 2.70039661e-02,
       4.78129047e-01, 3.83628535e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.97126633e-04, 1.28709481e+00, 8.87695092e-01,
       9.33896940e-03, 3.02601506e-03, 6.95671256e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.07225935e-02,
       3.58546852e-02, 1.37857864e-03])

In [59]:
# Evaluation history recorded during training, driven by the eval_set and
# eval_metric settings given to the training call: a dict keyed by dataset
# name, then by metric name, holding the list of per-iteration values.
model.evals_result_

{'learn': {'MultiClass': [1.7409457608612011,
   1.5956667632817032,
   1.4777186480471076,
   1.3727577314776207,
   1.3001124685056384,
   1.229965382172256,
   1.1608975790150664,
   1.1047326700302895,
   1.0522618372174537,
   1.0047480691936672,
   0.968307505964647,
   0.9343812481923827,
   0.899279007300713,
   0.8742262770212805,
   0.8496241666295523,
   0.8288908112645602,
   0.8100179540331148,
   0.7899456056473207,
   0.7748794809074558,
   0.7578195542402658,
   0.7431367721933901,
   0.7304574771535762,
   0.7158332291292927,
   0.7044256969584684,
   0.691915898679279,
   0.6799440087931851,
   0.6699578210363265,
   0.6590056626171611,
   0.6505106620958055,
   0.6398255555733567,
   0.6313835296589818,
   0.6240069632218893,
   0.6157907142588618,
   0.607625682625048,
   0.6017355133371869,
   0.5950723832814957,
   0.5903215864848009,
   0.5853663734638535,
   0.5790608772750395,
   0.5708686825650954,
   0.5642167387301174,
   0.5590630435134808,
   0.55331489136

In [60]:
# Top-level keys are the names of the datasets that metrics were recorded for.
model.evals_result_.keys()

dict_keys(['learn', 'validation'])

In [61]:
# Return the best value of each metric for each evaluated dataset, as a
# nested dict keyed by dataset name and then by metric name.
model.get_best_score()

{'learn': {'MultiClass': 0.06887891100767556},
 'validation': {'MultiClass': 0.38961406266809917}}