In [25]:
import collections
import datetime
import hyperopt
import itertools
import math
import numpy
import operator
import pandas
import random
from scipy import stats
from sklearn import (
    calibration,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import xgboost

### Read and Filter

In [141]:
# Columns that are always parsed as floats when a dataset is read
# (passed to pandas.read_csv as its `dtype` mapping).
dtype = dict.fromkeys(('smoke', 'alco', 'active', 'ap_hi', 'ap_lo'), float)

def fix_ap(value):
    """Normalise a raw blood-pressure reading.

    The sign is dropped and the magnitude is divided by ten repeatedly until
    it is no greater than 500.

    Parameters
    ----------
    value : number
        Raw reading; may be negative, oversized, NaN or infinite.

    Returns
    -------
    number
        ``abs(value)`` scaled down into the range ``[0, 500]``. NaN and
        infinite inputs are returned unscaled (after ``abs``).
    """
    value = abs(value)
    # inf / 10 is still inf, so without this guard the loop below would never
    # terminate on an infinite input. NaN never entered the loop anyway.
    if not math.isfinite(value):
        return value
    while value > 500.0:
        value /= 10.0
    return value

def read_csv(filename):
    """Load a semicolon-separated dataset and clean/derive its features.

    The ``id`` column is dropped, a ``bmi`` column (``weight / height ** 2``,
    in whatever units the file uses) is added, both blood-pressure columns are
    passed through ``fix_ap``, and the two pressures are swapped on every row
    where ``ap_hi`` is not strictly greater than ``ap_lo``.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. The literal string ``'None'`` is parsed as NaN.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    frame = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype).drop(['id'], axis=1)

    # Optional one-hot encoding; every candidate column is currently disabled,
    # so this call leaves the columns as they are.
    frame = pandas.get_dummies(frame, columns=[
        # 'smoke',
        # 'active',
        # 'alco',
        # 'gender',
        # 'cholesterol',
        # 'gluc',
    ])
    frame = frame.assign(
        bmi=(frame['weight'] / frame['height'] ** 2),
        # aged_smoke_0=(frame['smoke_0.0'] * frame['age']),
        # aged_smoke_1=(frame['smoke_1.0'] * frame['age']),
        # aged_active_0=(frame['active_0.0'] * frame['age']),
        # aged_active_1=(frame['active_1.0'] * frame['age']),
        # aged_alco_0=(frame['alco_0.0'] * frame['age']),
        # aged_alco_1=(frame['alco_1.0'] * frame['age']),
        # smoke_1_active_0=(frame['smoke_1.0'] * frame['active_0.0']),
    )

    # Fix negative and too large values.
    frame['ap_hi'] = frame['ap_hi'].apply(fix_ap)
    frame['ap_lo'] = frame['ap_lo'].apply(fix_ap)

    # Re-order ap_hi and ap_lo.
    # Done column-wise rather than with a row-wise `apply` returning a list:
    # since pandas 0.23 such an apply yields a Series of lists, which cannot be
    # assigned back to two columns. Rows where the comparison is False
    # (including NaN) are swapped, exactly as before.
    ap_hi, ap_lo = frame['ap_hi'], frame['ap_lo']
    already_ordered = ap_hi > ap_lo
    frame = frame.assign(
        ap_hi=ap_hi.where(already_ordered, ap_lo),
        ap_lo=ap_lo.where(already_ordered, ap_hi),
    )

    return frame

In [142]:
train = read_csv('train.csv')

# Split the training frame into the `cardio` target vector and the matrix of
# all remaining columns.
y = train['cardio'].values
X = train.drop('cardio', axis=1).values
print(f'X: {X.shape}', f'y: {y.shape}', sep='\n')

X: (70000, 12)
y: (70000,)


### Hyper-parameter Optimisation

In [39]:
def cv():
    """Build the cross-validation splitter used by the tuning runs.

    Returns a new 10-fold stratified splitter with shuffling enabled and the
    random state fixed to 0.
    """
    splitter = model_selection.StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=0,
    )
    return splitter

In [151]:
# Search range of one tunable hyper-parameter: inclusive bounds and the step
# used to move away from the current value.
Param = collections.namedtuple('Param', ['min', 'max', 'step'])

def optimize_estimator(estimator, X, y, n_iter=10, scoring=None, refit=True, cv=None, **params):
    """Tune numeric hyper-parameters of `estimator` with a greedy step search.

    Every keyword in `params` names an attribute of `estimator` and maps to a
    `Param(min, max, step)`. In each iteration the parameters are visited in
    random order. For each one, the values one `step` below and one `step`
    above the current value are cross-validated (if they lie within
    `[min, max]`), and the better of the two is kept when it beats the best
    score seen so far. The search ends after `n_iter` iterations, or after the
    first iteration in which no parameter changed.

    Parameters
    ----------
    estimator : object
        Model to tune. It is modified in place through getattr/setattr.
    X, y : array-like
        Data passed to `cross_val_score` and, when `refit` is true, to
        `estimator.fit`.
    n_iter : int
        Maximum number of passes over all parameters.
    scoring : str, callable or None
        Forwarded to `cross_val_score`; a larger score is treated as better.
    refit : bool
        Fit the tuned estimator on the whole of `X`, `y` before returning.
    cv : callable, splitter, int or None
        Either a zero-argument factory that returns a fresh splitter for every
        evaluation, or any value `cross_val_score` accepts directly
        (including None for its default).
    **params : Param
        Search range for each attribute to tune.

    Returns
    -------
    object
        The same `estimator` instance, carrying the best values found.
    """
    def make_cv():
        # Only call `cv` when it is a factory. The previous unconditional
        # `cv()` crashed with the default `cv=None`.
        return cv() if callable(cv) else cv

    def evaluate():
        return model_selection.cross_val_score(estimator, X, y, scoring=scoring, cv=make_cv()).mean()

    def set_evaluate(key, value):
        setattr(estimator, key, value)
        return evaluate()

    params = list(params.items())

    print(f'Evaluating current model…')
    current_score = evaluate()
    print(f'Initial score: {current_score}')
    print()

    for i in range(n_iter):
        start_time = time.time()
        early_stop = True

        # Shuffle params to lessen overfitting.
        random.shuffle(params)

        # Tune each parameter.
        for key, param in params:
            current_value = getattr(estimator, key)

            # Neighbouring values that stay inside the allowed range.
            candidates = [
                value
                for value in (current_value - param.step, current_value + param.step)
                if param.min <= value <= param.max
            ]

            # Choose the best value. With no admissible neighbour there is
            # nothing to score (and `max` of an empty list would raise).
            if candidates:
                scores = [(value, set_evaluate(key, value)) for value in candidates]
                best_value, best_score = max(scores, key=operator.itemgetter(1))
                if best_score > current_score:
                    current_value = best_value
                    current_score = best_score
                    early_stop = False
                    print(f'[Iteration {i}] {key} = {current_value}. Score: {current_score}')

            # Restore current value (set_evaluate leaves the last tried one).
            setattr(estimator, key, current_value)

        print(f'[Iteration {i}] Finished in {(time.time() - start_time) / 60.0:.0f} min')

        if early_stop:
            print(f'[Iteration {i}] No changes, stopping')
            break

    print(f'Final score: {current_score}')

    # Fit on the entire dataset if needed.
    if refit:
        print(f'Final fitting…')
        estimator.fit(X, y)
    return estimator

### Predict `smoke`, `alco` and `active`

In [102]:
# Inputs for the helper models: every training column except the three
# lifestyle flags and the `cardio` target.
X_helper = train.drop(['smoke', 'alco', 'active', 'cardio'], axis=1).values

# One target vector per lifestyle flag.
y_smoke, y_alco, y_active = (
    train[flag].values for flag in ('smoke', 'alco', 'active')
)

In [103]:
# Helper classifier for the `smoke` flag, created with a fixed seed.
smoke_estimator = xgboost.XGBClassifier(
    seed=0,
    nthread=2,
)

In [146]:
# Tune the `smoke` helper model on cross-validated log loss. The mapping gives
# the search range (min, max, step) of each tuned attribute.
optimize_estimator(
    estimator=smoke_estimator,
    X=X_helper,
    y=y_smoke,
    n_iter=100,
    scoring='neg_log_loss',
    cv=cv,
    **{
        'colsample_bytree': Param(0.5, 1.0, 0.01),
        'subsample': Param(0.5, 1.0, 0.01),
        'base_score': Param(0.0, 1.0, 0.01),
        'scale_pos_weight': Param(0.0, 1.0, 0.01),
        'reg_lambda': Param(0.0, 1.0, 0.01),
        'reg_alpha': Param(0.0, 1.0, 0.01),
        'gamma': Param(0.0, 1.0, 0.01),
        'learning_rate': Param(0.01, 1.0, 0.01),
        'max_depth': Param(1, 1000, 1),
        'n_estimators': Param(1, 1000, 1),
    }
)

Evaluating current model…
Initial score: -0.23877346981261632

[Iteration 0] 128s
[Iteration 0] No changes
Final score: -0.23877346981261632


XGBClassifier(base_score=0.52, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.13, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=100, nthread=2,
       objective='binary:logistic', reg_alpha=0, reg_lambda=0.99,
       scale_pos_weight=0.99, seed=0, silent=True, subsample=1.0)

In [105]:
# Helper classifier for the `alco` flag, created with a fixed seed.
alco_estimator = xgboost.XGBClassifier(
    seed=0,
    nthread=2,
)

In [147]:
# Tune the `alco` helper model on cross-validated log loss. The mapping gives
# the search range (min, max, step) of each tuned attribute.
optimize_estimator(
    estimator=alco_estimator,
    X=X_helper,
    y=y_alco,
    n_iter=100,
    scoring='neg_log_loss',
    cv=cv,
    **{
        'colsample_bytree': Param(0.5, 1.0, 0.01),
        'subsample': Param(0.5, 1.0, 0.01),
        'base_score': Param(0.0, 1.0, 0.01),
        'scale_pos_weight': Param(0.0, 1.0, 0.01),
        'reg_lambda': Param(0.0, 1.0, 0.01),
        'reg_alpha': Param(0.0, 1.0, 0.01),
        'gamma': Param(0.0, 1.0, 0.01),
        'learning_rate': Param(0.01, 1.0, 0.01),
        'max_depth': Param(1, 1000, 1),
        'n_estimators': Param(1, 1000, 1),
    }
)

Evaluating current model…
Initial score: -0.19227622539807737

[Iteration 0] 206s
[Iteration 0] No changes
Final score: -0.19227622539807737


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=101, nthread=2,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=0.99, seed=0, silent=True, subsample=0.99)

In [107]:
# Helper classifier for the `active` flag, created with a fixed seed.
active_estimator = xgboost.XGBClassifier(
    seed=0,
    nthread=2,
)

In [148]:
# Tune the `active` helper model on cross-validated log loss. The mapping gives
# the search range (min, max, step) of each tuned attribute.
optimize_estimator(
    estimator=active_estimator,
    X=X_helper,
    y=y_active,
    n_iter=100,
    scoring='neg_log_loss',
    cv=cv,
    **{
        'colsample_bytree': Param(0.5, 1.0, 0.01),
        'subsample': Param(0.5, 1.0, 0.01),
        'base_score': Param(0.0, 1.0, 0.01),
        'scale_pos_weight': Param(0.0, 1.0, 0.01),
        'reg_lambda': Param(0.0, 1.0, 0.01),
        'reg_alpha': Param(0.0, 1.0, 0.01),
        'gamma': Param(0.0, 1.0, 0.01),
        'learning_rate': Param(0.01, 1.0, 0.01),
        'max_depth': Param(1, 1000, 1),
        'n_estimators': Param(1, 1000, 1),
    }
)

Evaluating current model…
Initial score: -0.4924262166722723

[Iteration 0] 213s
[Iteration 0] No changes
Final score: -0.4924262166722723


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.0, learning_rate=0.11, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=99, nthread=2,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

### Predict `cardio`

In [109]:
def print_importances(estimator, feature_names=None):
    """Print the feature importances of a fitted estimator, largest first.

    Parameters
    ----------
    estimator : object
        Fitted model exposing `feature_importances_`.
    feature_names : iterable of str, optional
        Names matching the columns the estimator was trained on, in order.
        When omitted, the column names of the global `test` frame are used,
        so `test` must already exist in that case.

    Each line is printed as ``name: importance`` with seven decimals. Names
    and importances are paired positionally; surplus entries on either side
    are ignored.
    """
    # Iterating a DataFrame yields its column labels.
    names = list(test if feature_names is None else feature_names)
    ranked = sorted(
        zip(names, estimator.feature_importances_),
        key=operator.itemgetter(1),
        reverse=True,
    )
    for name, importance in ranked:
        print(f'{name}: {importance:.7f}')

In [110]:
# Main classifier for the `cardio` target, created with a fixed seed.
cardio_estimator = xgboost.XGBClassifier(
    seed=0,
    nthread=2,
)

In [149]:
# Tune the `cardio` model on cross-validated log loss. The mapping gives the
# search range (min, max, step) of each tuned attribute.
optimize_estimator(
    estimator=cardio_estimator,
    X=X,
    y=y,
    n_iter=100,
    scoring='neg_log_loss',
    cv=cv,
    **{
        'colsample_bytree': Param(0.5, 1.0, 0.01),
        'subsample': Param(0.5, 1.0, 0.01),
        'base_score': Param(0.0, 1.0, 0.01),
        'scale_pos_weight': Param(0.0, 1.0, 0.01),
        'reg_lambda': Param(0.0, 1.0, 0.01),
        'reg_alpha': Param(0.0, 1.0, 0.01),
        'gamma': Param(0.0, 1.0, 0.01),
        'learning_rate': Param(0.01, 1.0, 0.01),
        'max_depth': Param(1, 1000, 1),
        'n_estimators': Param(1, 1000, 1),
    }
)

Evaluating current model…
Initial score: -0.5389153323250567

[Iteration 0] base_score = 0.5. Score: -0.5388995685088614
[Iteration 0] max_depth = 4. Score: -0.5387134496618897
[Iteration 0] gamma = 0.02. Score: -0.5387117586461065
[Iteration 0] reg_lambda = 0.99. Score: -0.5387098526465285
[Iteration 0] 326s
[Iteration 1] subsample = 0.98. Score: -0.5387072903563016
[Iteration 1] 322s
[Iteration 2] n_estimators = 100. Score: -0.5387029863095291
[Iteration 2] 324s
[Iteration 3] learning_rate = 0.1. Score: -0.5386959078842235
[Iteration 3] base_score = 0.51. Score: -0.5386556783886562
[Iteration 3] n_estimators = 99. Score: -0.5386424968896206
[Iteration 3] 320s
[Iteration 4] gamma = 0.03. Score: -0.5386423372311822
[Iteration 4] 317s
[Iteration 5] gamma = 0.04. Score: -0.5386348975915766
[Iteration 5] 318s
[Iteration 6] gamma = 0.05. Score: -0.5386347945247948
[Iteration 6] 318s
[Iteration 7] 319s
[Iteration 7] No changes
Final score: -0.5386347945247948


XGBClassifier(base_score=0.51, colsample_bylevel=1, colsample_bytree=0.99,
       gamma=0.05, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=99, nthread=2,
       objective='binary:logistic', reg_alpha=0, reg_lambda=0.99,
       scale_pos_weight=0.99, seed=0, silent=True, subsample=0.98)

In [150]:
test = read_csv('test.csv')
X_helper_test = test.drop(['smoke', 'alco', 'active'], axis=1).values

# Fill missing lifestyle flags with the positive-class probability predicted
# by the matching helper model. The filled column is assigned back explicitly:
# `test[column].fillna(..., inplace=True)` operates on a temporary Series and
# leaves `test` unchanged when pandas Copy-on-Write is active. The predictions
# carry the frame's own index so that fillna aligns them row by row.
for column, helper_estimator in (
    ('smoke', smoke_estimator),
    ('alco', alco_estimator),
    ('active', active_estimator),
):
    predicted = pandas.Series(
        helper_estimator.predict_proba(X_helper_test)[:, 1],
        index=test.index,
    )
    test[column] = test[column].fillna(predicted)

# Write one predicted `cardio` probability per line.
numpy.savetxt('xgboost.txt', cardio_estimator.predict_proba(test.values)[:, 1], fmt='%f')

print_importances(cardio_estimator)

age: 0.2697183
bmi: 0.1478873
ap_hi: 0.1330986
ap_lo: 0.0929577
weight: 0.0809859
height: 0.0795775
cholesterol: 0.0760563
gluc: 0.0514085
active: 0.0253521
smoke: 0.0169014
alco: 0.0133803
gender: 0.0126761
