In [0]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/97/c4/586923de4634f88a31fd1b4966e15707a912b98b6f4566651b5ef58f36b5/catboost-0.20.2-cp36-none-manylinux1_x86_64.whl (63.9MB)
[K     |████████████████████████████████| 63.9MB 63kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.20.2


In [0]:
import numpy as np
from catboost import CatBoost, Pool, datasets
from sklearn.model_selection import train_test_split

In [0]:
# Load the `amazon` dataset from catboost.datasets; only the first returned
# frame is used, the second is discarded.
train_df = datasets.amazon()[0]

# 'ACTION' is the target column; every remaining column becomes a feature.
y = np.array(train_df['ACTION'])
X = np.array(train_df.drop(columns=['ACTION']))
cat_features = np.arange(9)  # indices 0..8: all nine feature columns are categorical

# Hold out 25% of the rows for validation (fixed seed for a repeatable split).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Wrap both splits in CatBoost Pools together with the categorical indices.
validation_pool = Pool(X_validation, y_validation, cat_features=cat_features)
train_pool = Pool(X_train, y_train, cat_features=cat_features)

print(train_pool.shape, validation_pool.shape)

(24576, 9) (8193, 9)


In [0]:
# Baseline: fit on the clean labels, then print the last RMSE value that
# eval_metrics returns for the validation pool.
model_params = {'iterations': 100, 'verbose': False, 'random_seed': 42}
cb = CatBoost(model_params)
cb.fit(train_pool);

baseline_rmse_values = cb.eval_metrics(validation_pool, ['RMSE'])['RMSE']
print(baseline_rmse_values[-1])

0.20728569861391216


In [0]:
# Corrupt 10% of the training labels: pick positions without replacement
# (fixed seed) and replace each label v with 1 - v, i.e. a flip for 0/1 labels.
np.random.seed(42)
n_train = len(y_train)
n_perturbed = int(n_train * 0.1)
perturbed_idxs = np.random.choice(n_train, size=n_perturbed, replace=False)

# Work on a copy so the clean labels in y_train stay untouched.
y_train_noisy = y_train.copy()
flipped_labels = 1 - y_train_noisy[perturbed_idxs]
y_train_noisy[perturbed_idxs] = flipped_labels

# Same features as the clean pool, but paired with the noisy labels.
train_pool_noisy = Pool(X_train, y_train_noisy, cat_features=cat_features)

In [0]:
# Refit the same model object on the noisy labels and print the last RMSE
# value returned for the (unchanged) validation pool.
cb.fit(train_pool_noisy);
noisy_rmse_values = cb.eval_metrics(validation_pool, ['RMSE'])['RMSE']
print(noisy_rmse_values[-1])

0.23639586778659294


In [0]:
# Draw a fixed-seed sample of 500 validation rows (without replacement) and
# wrap it in its own Pool; importances are computed against this sample.
np.random.seed(42)
test_idx = np.random.choice(len(y_validation), size=500, replace=False)
validation_pool_sampled = Pool(
    X_validation[test_idx],
    y_validation[test_idx],
    cat_features=cat_features,
)

# importance_values_sign='Positive': a positive value means the optimized
# metric value increases because of the given train object, so the returned
# indices point at the harmful ("bad") train objects.
indices, scores = cb.get_object_importance(
    validation_pool_sampled,
    train_pool_noisy,
    importance_values_sign='Positive',
)

In [44]:
def train_and_print_score(train_indices, remove_object_count):
    """Refit ``cb`` on a subset of the noisy training data and print validation RMSE.

    Args:
        train_indices: Index or boolean mask applied to ``X_train`` and
            ``y_train_noisy`` to choose the rows used for fitting.
        remove_object_count: Number of dropped objects; used only in the
            printed message.

    Returns:
        Tuple ``(features, labels)`` holding the selected rows.

    Note:
        Relies on the notebook-level names ``cb``, ``X_train``,
        ``y_train_noisy``, ``cat_features`` and ``validation_pool``.
        The shared ``cb`` model is refit on every call.
    """
    # Select the subset once and reuse it for both fitting and the return value.
    kept_features = X_train[train_indices]
    kept_labels = y_train_noisy[train_indices]

    cb.fit(kept_features, kept_labels, cat_features=cat_features)
    # Last RMSE value reported by eval_metrics on the full validation pool.
    final_rmse = cb.eval_metrics(validation_pool, ['RMSE'])['RMSE'][-1]

    template = 'RMSE on validation dataset when {} harmful objects from train are dropped: {}'
    print(template.format(remove_object_count, final_rmse))
    return kept_features, kept_labels

# Remove the train objects listed in `indices` in batches of `batch_size`,
# refitting and printing the validation RMSE after every batch.
batch_size = 250
train_indices = np.ones(X_train.shape[0], dtype=bool)  # True = row is still used
train_and_print_score(train_indices, 0)  # reference run with nothing removed

# One (features, labels) tuple per step: entry k has the first
# (k + 1) * batch_size objects from `indices` removed.
dataframes = []
for removed_so_far in range(batch_size, 2000 + batch_size, batch_size):
    batch = indices[removed_so_far - batch_size:removed_so_far]
    train_indices[batch] = False  # the mask is cumulative across iterations
    dataframes.append(train_and_print_score(train_indices, removed_so_far))





RMSE on validation dataset when 0 harmful objects from train are dropped: 0.23639586778659294
RMSE on validation dataset when 250 harmful objects from train are dropped: 0.2353242014145958
RMSE on validation dataset when 500 harmful objects from train are dropped: 0.23064887642153187
RMSE on validation dataset when 750 harmful objects from train are dropped: 0.2298147245640675
RMSE on validation dataset when 1000 harmful objects from train are dropped: 0.22925333191236252
RMSE on validation dataset when 1250 harmful objects from train are dropped: 0.229903662176537
RMSE on validation dataset when 1500 harmful objects from train are dropped: 0.2308325906972554
RMSE on validation dataset when 1750 harmful objects from train are dropped: 0.23403875662684973
RMSE on validation dataset when 2000 harmful objects from train are dropped: 0.24233303615177418


In [0]:
# Entry 3 is the training subset with 1000 objects dropped (4th batch of 250),
# which is the step with the lowest RMSE in the sweep output above.
best_features, best_labels = dataframes[3]
cb.fit(best_features, best_labels, cat_features=cat_features)
metric_value = cb.eval_metrics(validation_pool, ['RMSE'])['RMSE'][-1]

In [49]:
# Display the validation RMSE computed in the previous cell.
metric_value

0.22925333191236252

0.24233303615177418
