In [1]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/ca/ae/aaff63662f7f5d2af7ec8d61a6f39e78ada9348e5df4f43e665ecc4bea10/catboost-0.21-cp36-none-manylinux1_x86_64.whl (64.0MB)
[K     |████████████████████████████████| 64.0MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.21
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


Data Preparation

In [0]:
from catboost.datasets import amazon
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import Pool

In [0]:
# Load the Amazon dataset bundled with catboost; it comes as a (train, test) pair of frames.
train_df, test_df = amazon()


In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [5]:
# (rows, columns) of the training frame.
train_df.shape

(32769, 10)

In [6]:
# Count missing values per column.
train_df.isnull().sum()

ACTION              0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [7]:
# Report the cardinality of every column (a NaN, if present, counts as its own value).
for col, s in train_df.nunique(dropna=False).items():
    print(f'For column {col} the number of unique valriables is {s}')

For column ACTION the number of unique valriables is 2
For column RESOURCE the number of unique valriables is 7518
For column MGR_ID the number of unique valriables is 4243
For column ROLE_ROLLUP_1 the number of unique valriables is 128
For column ROLE_ROLLUP_2 the number of unique valriables is 177
For column ROLE_DEPTNAME the number of unique valriables is 449
For column ROLE_TITLE the number of unique valriables is 343
For column ROLE_FAMILY_DESC the number of unique valriables is 2358
For column ROLE_FAMILY the number of unique valriables is 67
For column ROLE_CODE the number of unique valriables is 343


In [8]:
# Show which labels occur and how many rows carry each of them.
print(f'Labels are {set(train_df["ACTION"])}')
action = train_df['ACTION']
ones = int((action == 1).sum())
zeros = int((action == 0).sum())
print(f'Number of 1s in a dataset: {ones}, Number of 0s in a dataset: {zeros}')

Labels are {0, 1}
Number of 1s in a dataset: 30872, Number of 0s in a dataset: 1897


In [0]:
# Hold out a stratified 10% of the labelled rows as a validation set.
# NOTE(review): this rebinds train_df to the remaining 90%, so re-running the cell
# without reloading the data splits the already-reduced frame again.
train_df, validation_df = train_test_split(train_df, test_size=0.1, stratify=train_df['ACTION'], random_state=200)


In [0]:
# Separate the ACTION target from the feature columns.
Y = train_df['ACTION']
X = train_df.drop(columns=['ACTION'])






In [11]:
# Preview the feature frame (ACTION removed).
X.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
13192,75943,3100,117916,117917,118599,179731,120238,117887,117973
31386,109084,1483,117961,117962,118840,124886,124887,118643,124888
20396,74557,7504,117961,118300,118783,118321,117906,290919,118322
30156,19310,14551,5110,117954,117895,117899,117897,19721,117900
896,79363,1030,117961,118343,120722,118361,118362,118363,118364


In [0]:
# Positions of the categorical features: every column of X is listed.
categorical_indices = list(range(X.shape[1]))

In [13]:
# Sanity check: selecting by the categorical indices should show the intended columns.
X.iloc[:, categorical_indices].head()


Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
13192,75943,3100,117916,117917,118599,179731,120238,117887,117973
31386,109084,1483,117961,117962,118840,124886,124887,118643,124888
20396,74557,7504,117961,118300,118783,118321,117906,290919,118322
30156,19310,14551,5110,117954,117895,117899,117897,19721,117900
896,79363,1030,117961,118343,120722,118361,118362,118363,118364


For the purposes of this talk, the dataset is very clean: all variables are of the same type (categorical), and there are no missing values to worry about. The only issue is class imbalance (30,872 rows labelled 1 versus 1,897 labelled 0). That can, and should, be addressed during data preparation, using techniques that depend on your problem.

**Modeling**

In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, sum_models


def cv_catboost(X, Y, cat_columns, task_type='GPU'):
    '''
    Train one CatBoost classifier per stratified fold and merge the fold models.

    X - pandas dataframe with the training features
    Y - pandas series with the binary target, aligned with X
    cat_columns - indices of the categorical feature columns (passed to fit as cat_features)
    task_type - device CatBoost trains on; defaults to 'GPU' as before, pass 'CPU' when no GPU is available

    Relies on the notebook-level `validation_df` (features plus an 'ACTION' column) for the
    hold-out AUC that is printed after every fold and for the merged model.

    Returns (final_model, oof_pred_df, models):
    final_model - result of sum_models over the fold models
    oof_pred_df - dataframe with columns 'ID_code' (index values of X) and 'target'
                  (out-of-fold probability taken from the second predict_proba column)
    models - list with the fitted model of every fold
    '''
    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # The hold-out split is the same for every fold, so prepare it once.
    holdout_X = validation_df.drop('ACTION', axis=1)
    holdout_y = validation_df['ACTION']

    # RUN
    oof_preds = np.zeros((len(X), 2))
    models = []

    for fold, (train, valid) in enumerate(cv.split(X, Y)):
        print("VAL fold %s" % fold)
        X_train, Y_train = X.iloc[train], Y.iloc[train]
        X_valid, Y_valid = X.iloc[valid], Y.iloc[valid]

        model = CatBoostClassifier(iterations=2000,
                                   verbose=200,
                                   random_seed=43,
                                   od_type='Iter',
                                   od_wait=100,
                                   train_dir=f'model_cross_val_{fold}',
                                   task_type=task_type,
                                   )

        # Arguments shared by the snapshot attempt and the fallback fit.
        fit_kwargs = dict(eval_set=(X_valid, Y_valid),
                          use_best_model=True,
                          cat_features=cat_columns)

        try:
            model.fit(X_train, Y_train,
                      save_snapshot=True,
                      snapshot_interval=300,
                      snapshot_file=f'snapshot_{fold}',
                      **fit_kwargs)
        except Exception as err:
            # Best-effort fallback: retrain without snapshots, but say why.
            # `except Exception` lets KeyboardInterrupt / SystemExit propagate
            # instead of silently starting a second training run.
            print(f'Fold {fold}: training with snapshots failed ({err!r}); retrying without snapshots')
            model.fit(X_train, Y_train, **fit_kwargs)

        oof_preds[valid] = model.predict_proba(X_valid)
        models.append(model)

        # EVALUATION
        auc = roc_auc_score(holdout_y, np.asarray(model.predict_proba(holdout_X))[:, 1])
        print(f'AUC for fold {fold} is {auc}')

    # NOTE(review): no weights are passed to sum_models, so the library default applies —
    # confirm that the merged model's probabilities are what downstream cells expect.
    final_model = sum_models(models, ctr_merge_policy='LeaveMostDiversifiedTable')  # ('FailIfCtrsIntersects', 'IntersectingCountersAverage')
    final_probs = np.asarray(final_model.predict(holdout_X, prediction_type='Probability'))
    auc = roc_auc_score(holdout_y, final_probs[:, 1])
    print(f'Final model AUC is {auc}')

    # OOF PREDS
    oof_positive = oof_preds[:, 1]
    auc = roc_auc_score(Y, oof_positive)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({'ID_code': X.index.tolist(), 'target': oof_positive})

    return final_model, oof_pred_df, models

In [15]:
import time

# Run the cross-validation and report how many seconds it took.
start = time.time()
final_model, oof_pred_df, models = cv_catboost(X, Y, categorical_indices)
elapsed = time.time() - start
print(elapsed)

VAL fold 0
Learning rate set to 0.040852
0:	learn: 0.6335210	test: 0.6350821	best: 0.6350821 (0)	total: 104ms	remaining: 3m 28s
200:	learn: 0.1516341	test: 0.1407156	best: 0.1407156 (200)	total: 11.3s	remaining: 1m 41s
400:	learn: 0.1450930	test: 0.1385907	best: 0.1385734 (393)	total: 22.2s	remaining: 1m 28s
600:	learn: 0.1395943	test: 0.1378234	best: 0.1378102 (597)	total: 33.1s	remaining: 1m 17s
800:	learn: 0.1363002	test: 0.1373417	best: 0.1373182 (771)	total: 44.2s	remaining: 1m 6s
1000:	learn: 0.1331791	test: 0.1372169	best: 0.1370207 (966)	total: 55s	remaining: 54.9s
bestTest = 0.1370207493
bestIteration = 966
Shrink model to first 967 iterations.
AUC for fold 0 is 0.8640998755391881
VAL fold 1
Learning rate set to 0.040852
0:	learn: 0.6333667	test: 0.6346355	best: 0.6346355 (0)	total: 56.8ms	remaining: 1m 53s
200:	learn: 0.1486551	test: 0.1443204	best: 0.1443204 (200)	total: 11.4s	remaining: 1m 41s
400:	learn: 0.1425145	test: 0.1420261	best: 0.1420261 (400)	total: 22.3s	remainin

Get feature importance

In [0]:
import numpy as np 
from sklearn.metrics import roc_auc_score, accuracy_score


def score(model, X, y):
    '''
    Accuracy of the model's predictions on X measured against y.
    model - object exposing a .predict method
    '''
    return accuracy_score(y, model.predict(X))

def score_auc(model, X, y):
    '''
    ROC AUC of the model on X measured against y.
    The score passed to roc_auc_score is the second entry of every predict_proba row.
    '''
    positive_probs = []
    for class_probs in model.predict_proba(X):
        positive_probs.append(class_probs[1])
    return roc_auc_score(y, positive_probs)

def shuffled_scores(X_orig, y, model, score_func, random_state=None):
    '''
    Score the model once per column of X_orig, each time with that single column shuffled.

    X_orig - pandas dataframe with the features; it is copied, never modified
    y - target values forwarded to score_func
    model - model object forwarded to score_func
    score_func - callable(model, X, y) returning a number
    random_state - optional int seed, or an object with a .permutation method
                   (e.g. np.random.RandomState); None keeps the previous behaviour
                   of drawing from numpy's global random state

    Returns a numpy array with one score per column, in column order.
    '''
    if random_state is None:
        rng = np.random  # legacy global state, still controllable via np.random.seed
    elif hasattr(random_state, 'permutation'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    scores = []
    for col in X_orig.columns:
        X = X_orig.copy()
        X[col] = rng.permutation(X[col].to_numpy())
        # Named so that it does not shadow the module-level score() helper.
        shuffled_score = score_func(model, X, y)
        scores.append(shuffled_score)
    return np.array(scores)

def permutation_feature_importance(X, y, model, score_func, n_iter):
    '''
    Repeat the column-shuffling experiment n_iter times.
    Prints the score on the untouched data and returns it together with a list
    holding, per repetition, the score decrease (base score minus shuffled score)
    of every column.
    '''
    base_score = score_func(model, X, y)
    print('Base score is ', base_score)
    score_decreases = [
        base_score - shuffled_scores(X, y, model, score_func)
        for _ in range(n_iter)
    ]
    return base_score, score_decreases

def get_permutation_importance(X, y, model, score_func, n_iter=5):
    '''
    Call this function to get permutation feature importances.
    X - pandas dataframe with all the features used to train the model
    y - pandas dataframe or series object
    model - Model object used for prediction. Any model objects with predict or .predict_proba methods are acceptable.
    score_func - function to score the model. AUC and ACCURACY are available above
    n_iter - number of times the shuffling is repeated before averaging (default 5, the previously fixed value)

    Returns (base_score, importances): the score on the untouched data and, per feature,
    the score decrease averaged over the n_iter repetitions.
    Raises ValueError when n_iter is smaller than 1, since there would be nothing to average.
    '''
    if n_iter < 1:
        raise ValueError(f'n_iter must be at least 1, got {n_iter}')

    base_score, score_decreases = permutation_feature_importance(X, y, model, score_func, n_iter)
    # Average over repetitions (axis 0) to get one importance per feature.
    permutation_feature_importances = np.mean(score_decreases, axis=0)

    return base_score, permutation_feature_importances

In [17]:
# Permutation importance of the first fold's model, scored by AUC on the hold-out split.
validation_features = validation_df.drop('ACTION', axis=1)
validation_labels = validation_df['ACTION']
base_score, permutation_feature_importances = get_permutation_importance(
    validation_features, validation_labels, models[0], score_auc
)

Base score is  0.8640998755391881


In [0]:
# One row per feature with its permutation importance, most important first.
feat_df = pd.DataFrame({
    'feature': validation_df.drop('ACTION', axis=1).columns.tolist(),
    'scores': permutation_feature_importances,
}).sort_values(by=['scores'], ascending=False)

In [19]:
# Display the features ranked by permutation importance.
feat_df

Unnamed: 0,feature,scores
0,RESOURCE,0.053775
1,MGR_ID,0.032959
6,ROLE_FAMILY_DESC,0.0172
4,ROLE_DEPTNAME,0.014626
3,ROLE_ROLLUP_2,0.013386
7,ROLE_FAMILY,0.00292
2,ROLE_ROLLUP_1,0.002738
8,ROLE_CODE,-0.001033
5,ROLE_TITLE,-0.00234


In [0]:
# Save the merged model to disk.
final_model.save_model('final_catboost_amazon_dataset.bin')

In [21]:
# Display the merged model object to see which class sum_models returned.
final_model

<catboost.core.CatBoost at 0x7fdf53547e10>

In [0]:
import pickle


In [0]:
# Pickle the first fold's model on its own.
with open('models0_model_amazon.pickle', 'wb') as model_file:
    pickle.dump(models[0], model_file)

In [24]:
# Show a single hold-out row as the model sees it (ACTION removed).
validation_df.drop('ACTION', axis=1).head(1)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
24268,37326,25262,117961,118327,118320,118685,311622,308574,118687


In [0]:
# Collect the staged probability predictions of the merged model, stopping at 200 trees.
validation_features = validation_df.drop('ACTION', axis=1)
preds = list(
    final_model.staged_predict(validation_features, prediction_type='Probability', ntree_end=200)
)

In [26]:
# Inspect the staged prediction at index 199 (one probability pair per hold-out row).
preds[199]

array([[0.01272989, 0.98727011],
       [0.02401881, 0.97598119],
       [0.00888811, 0.99111189],
       ...,
       [0.01130833, 0.98869167],
       [0.02171759, 0.97828241],
       [0.01625624, 0.98374376]])

In [27]:
# AUC of the staged prediction at index 199, using the second probability column.
stage_scores = [stage_row[1] for stage_row in preds[199]]
auc = roc_auc_score(validation_df['ACTION'], stage_scores)
print(f'Staged predict Final model AUC is {auc}')

Staged predict Final model AUC is 0.8680220960564676


In [0]:
# Shrink the merged model with the same 200 used as ntree_end for staged_predict.
# NOTE(review): the return value is discarded, so this presumably modifies final_model in place — confirm.
final_model.shrink(200)

In [0]:
# Pickle the merged model after shrinking.
with open('shrink_model_amazon.pickle', 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [30]:
# AUC of the shrunk model's own predictions on the hold-out split; the label now says so
# instead of repeating the "Staged predict" text of the earlier cell.
shrunk_probs = final_model.predict(validation_df.drop('ACTION', axis=1), prediction_type='Probability')
auc = roc_auc_score(validation_df['ACTION'], [r[1] for r in shrunk_probs])
print(f'Shrunk final model AUC is {auc}')

Staged predict Final model AUC is 0.8680220960564676


In [0]:
# Import the threshold / confusion-matrix helpers and wrap the hold-out split in a catboost Pool.
# NOTE(review): the recorded output of this cell is a CatBoostError whose cause is not visible
# here — confirm the cell runs on a fresh kernel before relying on validation_pool.
from catboost.utils import select_threshold, get_confusion_matrix
validation_pool = Pool(validation_df.drop('ACTION', axis=1), validation_df['ACTION'], cat_features=categorical_indices)

CatBoostError: ignored