In [1]:
import pandas as pd
import numpy as np

In [2]:
train_dataset = pd.read_csv('data/train.csv', index_col=0)

In [3]:
# train_dataset.drop(columns=['Name', 'Ticket'], inplace=True)
train_dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
null_value_stats = train_dataset.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [5]:
train_dataset.fillna(-999, inplace=True)

In [6]:
X = train_dataset.drop('Survived', axis=1)
y = train_dataset.Survived

In [7]:
print(X.dtypes)

categorical_features_indices = np.where(X.dtypes != float)[0]

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object


In [8]:
categorical_features_indices

array([0, 1, 2, 4, 5, 6, 8, 9], dtype=int64)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

X_test = pd.read_csv('data/test.csv', index_col=0)
# X_test.drop(columns=['Name', 'Ticket'], inplace=True)
X_test.fillna(-999, inplace=True)

In [10]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [11]:
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

In [12]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [13]:
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [16]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.83±0.02 on step 379


In [17]:
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.88124192 0.11875808]
 [0.6382326  0.3617674 ]
 [0.90397479 0.09602521]
 [0.87527912 0.12472088]
 [0.28712948 0.71287052]
 [0.89742968 0.10257032]
 [0.33163663 0.66836337]
 [0.78648208 0.21351792]
 [0.31823349 0.68176651]
 [0.95101029 0.04898971]]


In [18]:
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

In [19]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8117

Best model validation accuracy: 0.8206


In [20]:
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: total: 21.7 s
Wall time: 18.1 s


<catboost.core.CatBoostClassifier at 0x1f01fa55b80>

In [21]:
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: total: 2.91 s
Wall time: 2.3 s


In [22]:
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 500
Simple model validation accuracy: 0.8117

Early-stopped model tree count: 82
Early-stopped model validation accuracy: 0.8161


In [23]:
current_params = params.copy()
current_params.update({
    'iterations': 10
})
model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline);

In [24]:
params_with_snapshot = params.copy()
params_with_snapshot.update({
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose'
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)
params_with_snapshot.update({
    'iterations': 10,
    'learning_rate': 0.1,
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)

0:	learn: 0.7874251	test: 0.7847534	best: 0.7847534 (0)	total: 11.8ms	remaining: 47.3ms
1:	learn: 0.8098802	test: 0.8071749	best: 0.8071749 (1)	total: 41.4ms	remaining: 62.1ms
2:	learn: 0.8143713	test: 0.8116592	best: 0.8116592 (2)	total: 79.4ms	remaining: 52.9ms
3:	learn: 0.8203593	test: 0.8116592	best: 0.8116592 (2)	total: 91.2ms	remaining: 22.8ms
4:	learn: 0.8248503	test: 0.8251121	best: 0.8251121 (4)	total: 135ms	remaining: 0us

bestTest = 0.8251121076
bestIteration = 4

5:	learn: 0.8263473	test: 0.8206278	best: 0.8251121 (4)	total: 156ms	remaining: 82.6ms
6:	learn: 0.8263473	test: 0.8206278	best: 0.8251121 (4)	total: 179ms	remaining: 65.8ms
7:	learn: 0.8278443	test: 0.8206278	best: 0.8251121 (4)	total: 216ms	remaining: 54.2ms
8:	learn: 0.8278443	test: 0.8251121	best: 0.8251121 (4)	total: 248ms	remaining: 28.3ms
9:	learn: 0.8323353	test: 0.8251121	best: 0.8251121 (4)	total: 282ms	remaining: 0us

bestTest = 0.8251121076
bestIteration = 4



In [25]:
class LoglossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers which have only __len__ and __getitem__ defined).
        # weights parameter can be None.
        #
        # To understand what these parameters mean, assume that there is
        # a subset of your dataset that is currently being processed.
        # approxes contains current predictions for this subset,
        # targets contains target values you provided with the dataset.
        #
        # This function should return a list of pairs (der1, der2), where
        # der1 is the first derivative of the loss function with respect
        # to the predicted value, and der2 is the second derivative.
        #
        # In our case, logloss is defined by the following formula:
        # target * log(sigmoid(approx)) + (1 - target) * (1 - sigmoid(approx))
        # where sigmoid(x) = 1 / (1 + e^(-x)).
        
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            e = np.exp(approxes[index])
            p = e / (1 + e)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [26]:
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=LoglossObjective(), 
    eval_metric=metrics.Logloss()
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6816442	total: 1.11s	remaining: 10s
1:	learn: 0.6706701	total: 1.13s	remaining: 4.5s
2:	learn: 0.6612515	total: 1.15s	remaining: 2.68s
3:	learn: 0.6517074	total: 1.17s	remaining: 1.75s
4:	learn: 0.6430129	total: 1.18s	remaining: 1.18s
5:	learn: 0.6354009	total: 1.2s	remaining: 802ms
6:	learn: 0.6272715	total: 1.23s	remaining: 526ms
7:	learn: 0.6196604	total: 1.29s	remaining: 322ms
8:	learn: 0.6132884	total: 1.3s	remaining: 145ms
9:	learn: 0.6069064	total: 1.34s	remaining: 0us


In [27]:
class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        # approxes is a list of indexed containers
        # (containers with only __len__ and __getitem__ defined),
        # one container per approx dimension.
        # Each container contains floats.
        # weight is a one dimensional indexed container.
        # target is float.
        
        # weight parameter can be None.
        # Returns pair (error, weights sum)
        
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))

        return error_sum, weight_sum

In [28]:
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=metrics.Logloss(),
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5432616	total: 718ms	remaining: 6.46s
1:	learn: 0.4832959	total: 740ms	remaining: 2.96s
2:	learn: 0.4478476	total: 760ms	remaining: 1.77s
3:	learn: 0.4407666	total: 767ms	remaining: 1.15s
4:	learn: 0.4236768	total: 787ms	remaining: 787ms
5:	learn: 0.4162290	total: 808ms	remaining: 539ms
6:	learn: 0.4110969	total: 829ms	remaining: 355ms
7:	learn: 0.4084910	total: 836ms	remaining: 209ms
8:	learn: 0.4059603	total: 857ms	remaining: 95.2ms
9:	learn: 0.4001567	total: 877ms	remaining: 0us


In [29]:
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Sex: 57.85231766315812
Pclass: 14.719812594545566
Ticket: 7.199474531995226
Age: 5.027414089754211
Embarked: 3.9411413823589525
Cabin: 3.129016952489596
Parch: 2.9444351585392234
SibSp: 2.6324991145661154
Fare: 2.553888512592951
Name: 0.0


In [30]:
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, [metrics.AUC()], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [31]:
print(eval_metrics['AUC'][:6])

[0.8363659231930236, 0.8363659231930236, 0.8533037061881603, 0.8503689418078149, 0.8487338587959081, 0.8441640114036559]


In [32]:
model1 = CatBoostClassifier(iterations=100, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=100, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

In [33]:
from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [34]:
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

In [35]:
import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric=metrics.Accuracy(),
        random_seed=42,
        verbose=False,
        loss_function=metrics.Logloss(),
    )
    
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params(),
        logging_level='Silent',
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

In [36]:
from numpy.random import RandomState

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


AttributeError: 'numpy.random.mtrand.RandomState' object has no attribute 'integers'

In [37]:
model = CatBoostClassifier(
    l2_leaf_reg=1.0,
    learning_rate=0.0450866712211308,
    iterations=500,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    verbose=False,
    loss_function=metrics.Logloss(),
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

Training on fold [0/3]

bestTest = 0.8383838384
bestIteration = 20

Training on fold [1/3]

bestTest = 0.8518518519
bestIteration = 358

Training on fold [2/3]

bestTest = 0.8181818182
bestIteration = 178



In [38]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.830527497194164


In [39]:
model.fit(X, y, cat_features=categorical_features_indices)

<catboost.core.CatBoostClassifier at 0x1f023ec5760>

In [42]:
X_test.index

Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
       ...
       1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
      dtype='int64', name='PassengerId', length=418)

In [43]:
import pandas as pd
submisstion = pd.DataFrame()
submisstion['PassengerId'] = X_test.index
submisstion['Survived'] = model.predict(X_test)

In [44]:
submisstion.to_csv('submissions/catboost.csv', index=False)