# CatBoost
(C) 2018 Dariusz Kajtoch

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Load the blood-donation table, show the original (verbose) column labels
# once for reference, then swap in short names used by the rest of the notebook.
short_names = ['Id', 'MonthLast', 'Num', 'Vol', 'MonthFirst', 'Predict']

data = pd.read_csv('./predict_blood_donations.csv')
print(data.columns)

# Positional rename: short_names[i] replaces the i-th original column.
data.columns = short_names
data.head()

Index(['Unnamed: 0', 'Months since Last Donation', 'Number of Donations',
       'Total Volume Donated (c.c.)', 'Months since First Donation',
       'Made Donation in March 2007'],
      dtype='object')


Unnamed: 0,Id,MonthLast,Num,Vol,MonthFirst,Predict
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


In [47]:
# Predictors used by the model. 'Vol' is left out; in the preview rows above
# it is exactly 250 * 'Num', so it would presumably add no new information.
feature_columns = ['MonthLast', 'Num', 'MonthFirst']
X = data[feature_columns]

# Idea kept for reference (not enabled):
#X['Ratio'] = X['MonthLast']/X['MonthFirst']

# Binary target column.
y = data['Predict']

In [54]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from catboost import CatBoostClassifier

# Baseline model: 250 depth-1 trees with a fixed seed, evaluated below with
# stratified 10-fold cross-validation.
clf = CatBoostClassifier(
    n_estimators=250,
    random_state=45,
    learning_rate=0.1,
    loss_function='Logloss',
    max_depth=1,
#    subsample=0.5,
#    colsample_bylevel=1.,
    l2_leaf_reg=1,
    border_count=50,
    verbose=False
)

# No random_state here: without shuffle=True the folds are deterministic and
# the seed is ignored; newer scikit-learn releases raise a ValueError when
# random_state is given while shuffle is False. The splits are unchanged.
skf = StratifiedKFold(n_splits=10)

# Per-fold scores: ROC AUC, log-loss and accuracy.
roc_tab = []
loss_tab = []
acc_tab = []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The same estimator object is refitted on every fold's training part.
    clf.fit(X_train, y_train)

    proba = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)

    # Column 1 of predict_proba is used as the positive-class score.
    roc_tab.append(roc_auc_score(y_test, proba[:, 1]))
    loss_tab.append(log_loss(y_test, proba[:, 1]))
    acc_tab.append(accuracy_score(y_test, y_pred))

In [55]:
# Summarise the cross-validation folds: mean, standard deviation and the
# worst fold (lowest AUC / accuracy, highest log-loss).
auc_stats = (np.mean(roc_tab), np.std(roc_tab), np.min(roc_tab))
loss_stats = (np.mean(loss_tab), np.std(loss_tab), np.max(loss_tab))
acc_stats = (np.mean(acc_tab), np.std(acc_tab), np.min(acc_tab))

print('AUC: %.8f +/- %.8f, min: %.8f' % auc_stats)
print('LogLoss: %.8f +/- %.8f, max: %.8f' % loss_stats)
print('Accuracy: %.8f +/- %.8f, min: %.8f' % acc_stats)

AUC: 0.83656881 +/- 0.10028948, min: 0.64691558
LogLoss: 0.53968656 +/- 0.12428400, max: 0.82770789
Accuracy: 0.76576355 +/- 0.08729790, min: 0.58620690


# Tuning CatBoostClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

# Hyper-parameter grid. Every value must be a list of candidates.
# NOTE: this is 10 * 3 * 6 * 5 * 7 = 6300 combinations, each fitted on
# 10 folds (63,000 fits plus the final refit), so expect a long run time.
params = {
    'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
    'iterations': [250, 100, 500],
    'learning_rate': [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
    'l2_leaf_reg': [0.1, 1, 5, 10, 100],
    'border_count': [32, 5, 10, 20, 50, 100, 200],
}

# thread_count is a fixed setting, not something to search over, so it is
# passed to the estimator instead of the grid (a bare scalar in param_grid
# is rejected by GridSearchCV).
est = CatBoostClassifier(thread_count=4, verbose=False)

# Both metrics are recorded for every candidate; the final model is refitted
# on all data using the candidate with the best (least negative) log-loss.
clf = GridSearchCV(
    est,
    param_grid=params,
    scoring=['roc_auc', 'neg_log_loss'],
    refit='neg_log_loss',
    cv=10,
    verbose=0
)

clf.fit(X, y)