In [0]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


Data Preparation

In [0]:
from catboost.datasets import amazon
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import Pool

In [0]:
train_df, test_df = amazon()


In [0]:
train_df.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
13192,1,75943,3100,117916,117917,118599,179731,120238,117887,117973
31386,1,109084,1483,117961,117962,118840,124886,124887,118643,124888
20396,1,74557,7504,117961,118300,118783,118321,117906,290919,118322
30156,1,19310,14551,5110,117954,117895,117899,117897,19721,117900
896,1,79363,1030,117961,118343,120722,118361,118362,118363,118364


In [0]:
train_df.shape

(26542, 10)

In [0]:
train_df.isnull().sum()

ACTION              0
RESOURCE            0
MGR_ID              0
ROLE_ROLLUP_1       0
ROLE_ROLLUP_2       0
ROLE_DEPTNAME       0
ROLE_TITLE          0
ROLE_FAMILY_DESC    0
ROLE_FAMILY         0
ROLE_CODE           0
dtype: int64

In [0]:
for col in train_df.columns:
    s = train_df[col].unique().shape[0]
    print(f'For column {col} the number of unique valriables is {s}')

For column ACTION the number of unique valriables is 2
For column RESOURCE the number of unique valriables is 7099
For column MGR_ID the number of unique valriables is 4158
For column ROLE_ROLLUP_1 the number of unique valriables is 128
For column ROLE_ROLLUP_2 the number of unique valriables is 176
For column ROLE_DEPTNAME the number of unique valriables is 445
For column ROLE_TITLE the number of unique valriables is 342
For column ROLE_FAMILY_DESC the number of unique valriables is 2293
For column ROLE_FAMILY the number of unique valriables is 67
For column ROLE_CODE the number of unique valriables is 342


In [0]:
print(f'Labels are {set(train_df["ACTION"])}')
ones = train_df['ACTION'].loc[train_df['ACTION']==1].shape[0]
zeros = train_df['ACTION'].loc[train_df['ACTION']==0].shape[0]
print(f'Number of 1s in a dataset: {ones}, Number of 0s in a dataset: {zeros}')

Labels are {0, 1}
Number of 1s in a dataset: 27785, Number of 0s in a dataset: 1707


In [0]:
train_df, validation_df = train_test_split(train_df, test_size=0.1, stratify=train_df['ACTION'], random_state=200)


In [0]:
X, Y = train_df.drop(['ACTION'], axis=1), train_df['ACTION']






In [0]:
X.head()

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
27111,36923,14811,117961,118343,118344,120812,120812,118638,120814
24603,27665,20696,117876,117877,117878,117879,153408,19721,117880
27407,16636,217,117961,118413,120370,118321,240983,290919,118322
28521,26435,4642,117961,118225,120551,118685,279443,308574,118687
11523,23802,7504,117961,118300,118783,118321,117906,290919,118322


In [0]:
categorical_indices = list(range(0, len(X.columns))) # indices of categorical features

In [0]:
X.iloc[:, categorical_indices].head()


Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
27111,36923,14811,117961,118343,118344,120812,120812,118638,120814
24603,27665,20696,117876,117877,117878,117879,153408,19721,117880
27407,16636,217,117961,118413,120370,118321,240983,290919,118322
28521,26435,4642,117961,118225,120551,118685,279443,308574,118687
11523,23802,7504,117961,118300,118783,118321,117906,290919,118322


Let's inject some random noise into objects

The dataset for the purposes of this talk is very clean. All variables are of the same type, all categorical. No missing values to worry about. The only issue is imbalance of the dataset. That can, and should be treated during data preparation process, with different techniques depending on your problem. 

**Modeling**

In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, sum_models


def cv_catboost(X, Y, cat_columns):
    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # RUN
    c = 0
    oof_preds = np.zeros((len(X), 2))
    models = []

    for train, valid in cv.split(X, Y):
        print("VAL fold %s" % c)
        X_train = X.iloc[train]
        Y_train = Y.iloc[train]
        X_valid = X.iloc[valid]
        Y_valid = Y.iloc[valid]

        model = CatBoostClassifier(iterations=2000, 
                                   verbose=200,
                                   random_seed=43,
                                   od_type='Iter',
                                   od_wait=100,
                                   train_dir=f'model_cross_val_{c}',
                                   task_type='GPU'
                                   )

        try:
            model.fit(X_train, Y_train,
                    eval_set=(X_valid, Y_valid), 
                    use_best_model=True,
                    cat_features=cat_columns, 
                    save_snapshot=True,
                    snapshot_interval=300,
                    snapshot_file=f'snapshot_{c}',
                    )
        except:
            model.fit(X_train, Y_train,
                    eval_set=(X_valid, Y_valid), 
                    use_best_model=True,
                    cat_features=cat_columns,
                    )
        
        oof_preds[valid] = model.predict_proba(X_valid)
        models.append(model)


        # EVALUATION
        auc = roc_auc_score(validation_df['ACTION'], [r[1] for r in model.predict_proba(validation_df.drop('ACTION', axis=1))] )
        print(f'AUC for fold {c} is {auc}')
        c += 1

    final_model = sum_models(models, ctr_merge_policy='LeaveMostDiversifiedTable')
    auc = roc_auc_score(validation_df['ACTION'], [r[1] for r in final_model.predict(validation_df.drop('ACTION', axis=1), prediction_type='Probability')] )
    print(f'Final model AUC is {auc}')
    

    # OOF PREDS
    oof_preds = [row[1] for row in oof_preds]
    auc = roc_auc_score(Y, oof_preds)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(X.index.tolist())
    oof_pred_df['target'] = pd.Series(oof_preds)

    return final_model, oof_pred_df, models

In [0]:
import time
start = time.time()
final_model, oof_pred_df, models = cv_catboost(X, Y, categorical_indices)
print(time.time() - start)

VAL fold 0
Learning rate set to 0.079937
Learning rate set to 0.079937
0:	learn: 0.5839973	test: 0.5808998	best: 0.5808998 (0)	total: 77.9ms	remaining: 2m 35s
200:	learn: 0.1480123	test: 0.1402514	best: 0.1400969 (184)	total: 15.5s	remaining: 2m 18s
400:	learn: 0.1400761	test: 0.1391977	best: 0.1390787 (301)	total: 30.7s	remaining: 2m 2s
bestTest = 0.1390787147
bestIteration = 301
Shrink model to first 302 iterations.
AUC for fold 0 is 0.8930049725489205
VAL fold 1
Learning rate set to 0.079937
Learning rate set to 0.079937
0:	learn: 0.5808591	test: 0.5834231	best: 0.5834231 (0)	total: 77.3ms	remaining: 2m 34s
200:	learn: 0.1460434	test: 0.1530464	best: 0.1530322 (197)	total: 15.6s	remaining: 2m 19s
bestTest = 0.152966012
bestIteration = 205
Shrink model to first 206 iterations.
AUC for fold 1 is 0.8809271289053868
VAL fold 2
Learning rate set to 0.079937
Learning rate set to 0.079937
0:	learn: 0.5810679	test: 0.5811428	best: 0.5811428 (0)	total: 75.9ms	remaining: 2m 31s
200:	learn: 0.

Get feature importance

In [0]:
import numpy as np 
from sklearn.metrics import roc_auc_score, accuracy_score


def score(model, X, y):
    y_pred = model.predict(X)
    return accuracy_score(y, y_pred)

def score_auc(model, X, y):
    y_pred_prob = [i[1] for i in model.predict_proba(X)]
    return roc_auc_score(y, y_pred_prob)

def shuffled_scores(X_orig, y, model, score_func):
    scores = []
    for col in X_orig.columns:
        X = X_orig.copy()
        X[col] = np.random.permutation(X[col])
        score = score_func(model, X, y)
        scores.append(score)
    return np.array(scores)

def permutation_feature_importance(X, y, model, score_func, n_iter):
    base_score = score_func(model, X, y)
    print('Base score is ', base_score)
    score_decreases = []
    for i in range(n_iter):
        scores = shuffled_scores(X, y, model, score_func)
        score_decreases.append(-scores + base_score)
    return base_score, score_decreases

def get_permutation_importance(X, y, model, score_func):
    '''
    Call this function to get permutation feature importances.
    X - pandas dataframe with all the features used to train the model
    y - pandas dataframe or series object 
    model - Model object used for prediction. Any model objects with predict or .predict_proba methods are acceptable. 
    score_func - function to score the model. AUC and ACCURACY are available above
    '''
    
    base_score, score_decreases = permutation_feature_importance(X, y, model, score_func, 5)
    permutation_feature_importances = np.mean(score_decreases, axis=0)

    return base_score, permutation_feature_importances

In [0]:
base_score, permutation_feature_importances = get_permutation_importance(validation_df.drop('ACTION', axis=1), validation_df['ACTION'], models[0], score_auc)

Base score is  0.8930049725489205


In [0]:
feat_df = pd.DataFrame()
feat_df['feature'] = validation_df.drop('ACTION', axis=1).columns.tolist()
feat_df['scores'] = permutation_feature_importances
feat_df.sort_values(by=['scores'], ascending=False, inplace=True)

In [0]:
feat_df

Unnamed: 0,feature,scores
1,MGR_ID,0.050355
0,RESOURCE,0.043026
6,ROLE_FAMILY_DESC,0.02749
4,ROLE_DEPTNAME,0.022763
3,ROLE_ROLLUP_2,0.015799
5,ROLE_TITLE,0.014312
8,ROLE_CODE,0.005224
7,ROLE_FAMILY,0.002234
2,ROLE_ROLLUP_1,0.002184


In [0]:
final_model.save_model('final_catboost_amazon_dataset.bin')

In [0]:
final_model

<catboost.core.CatBoost at 0x7f68768cc0f0>

In [0]:
import pickle


In [0]:
with open('models0_model_amazon.pickle', 'wb') as d_model:
    pickle.dump(models[0], d_model)

In [0]:
validation_df.drop('ACTION', axis=1).head(1)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
12822,42085,4125,118887,118888,121108,120006,311746,118424,120008


In [0]:
preds = list(final_model.staged_predict(validation_df.drop('ACTION', axis=1), prediction_type='Probability', ntree_end=200))

In [0]:
preds[199]

array([[0.01294362, 0.98705638],
       [0.01532217, 0.98467783],
       [0.04291355, 0.95708645],
       ...,
       [0.64081333, 0.35918667],
       [0.01467224, 0.98532776],
       [0.0273299 , 0.9726701 ]])

In [0]:
auc = roc_auc_score(validation_df['ACTION'], [r[1] for r in preds[199]])
print(f'Staged predict Final model AUC is {auc}')

Staged predict Final model AUC is 0.8903777074929137


In [0]:
final_model.shrink(200)

In [0]:
with open('shrink_model_amazon.pickle', 'wb') as d_model:
    pickle.dump(final_model, d_model)

In [0]:
auc = roc_auc_score(validation_df['ACTION'], [r[1] for r in final_model.predict(validation_df.drop('ACTION', axis=1), prediction_type='Probability')])
print(f'Staged predict Final model AUC is {auc}')

Staged predict Final model AUC is 0.8903777074929137
