In [466]:
import collections
import datetime
import hyperopt
import itertools
import math
import numpy
import operator
import pandas
import random
from scipy import stats
from sklearn import (
    calibration,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import xgboost

### Read and Filter

In [467]:
dtype = {'smoke': float, 'alco': float, 'active': float}

def fix_ap(value):
    value = abs(value)
    while value > 500.0:
        value /= 10.0
    return value

def read_csv(filename):
    frame = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype).drop(['id'], axis=1)
    
    frame = pandas.get_dummies(frame, columns=[
        # 'smoke',
        # 'active',
        # 'alco',
        # 'gender',
        # 'cholesterol',
        # 'gluc',
    ])
    frame = frame.assign(
        bmi=(frame['weight'] / frame['height'] ** 2),
        # aged_smoke_0=(frame['smoke_0.0'] * frame['age']),
        # aged_smoke_1=(frame['smoke_1.0'] * frame['age']),
        # aged_active_0=(frame['active_0.0'] * frame['age']),
        # aged_active_1=(frame['active_1.0'] * frame['age']),
        # aged_alco_0=(frame['alco_0.0'] * frame['age']),
        # aged_alco_1=(frame['alco_1.0'] * frame['age']),
        # smoke_1_active_0=(frame['smoke_1.0'] * frame['active_0.0']),
    )
    
    frame['ap_hi'] = frame['ap_hi'].apply(fix_ap)
    frame['ap_lo'] = frame['ap_lo'].apply(fix_ap)
    
    return frame

In [468]:
train = read_csv('train.csv')

X = train.drop('cardio', axis=1).values
y = train['cardio'].values
print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (70000, 12)
y: (70000,)


### Hyper-parameter Optimisation

In [469]:
def cv():
    return model_selection.StratifiedKFold(n_splits=20, shuffle=True, random_state=0)

def make_int_grid(params, param_name, delta):
    n = params.get(param_name, 1)
    return {param_name: list(range(max(1, n - delta), n + delta + 1))}

def make_float_grid(params, param_name, delta, min_, max_, default=0.0):
    x = params.get(param_name, default)
    return {param_name: sorted({max(min_, x - delta), x, min(max_, x + delta)})}

def search(estimator, X, y, params, **param_grid):
    start_time = time.time()
    grid_search = model_selection.GridSearchCV(
        estimator.set_params(**params),
        param_grid,
        scoring='neg_log_loss',
        refit=False,
        cv=cv(),
    ).fit(X, y)
    params.update(grid_search.best_params_)
    print(f'[{(time.time() - start_time):.0f}s] {grid_search.best_score_:.10f} {grid_search.best_params_} | {param_grid}')

def search_xgb(X, y, params):
    estimator = xgboost.XGBClassifier(nthread=2, seed=0)
    for _ in range(100):
        print()
        searches = [
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'colsample_bytree', 0.01, 0.5, 1.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'subsample', 0.01, 0.5, 1.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'base_score', 0.01, 0.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'scale_pos_weight', 0.01, 0.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'reg_lambda', 0.01, 0.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'reg_alpha', 0.01, 0.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'gamma', 0.01, 0.0, 1.0)),
            lambda: search(estimator, X, y, params, **make_int_grid(params, 'max_depth', 1)),
            lambda: search(estimator, X, y, params, **make_int_grid(params, 'n_estimators', 3)),
            lambda: search(estimator, X, y, params, **make_float_grid(params, 'learning_rate', 0.01, 0.01, 1.0, 0.1)),
        ]
        random.shuffle(searches)
        last_params = dict(params)
        for _search in searches:
            _search()
        if params == last_params:
            break
    
    print('---------------------------------------------------')
    print(params)
    return estimator.set_params(**params).fit(X, y)

FloatParam = collections.namedtuple('FloatParam', 'min max tolerance')
IntegerParam = collections.namedtuple('IntegerParam', 'min max')

def optimize_estimator(estimator, X, y, n_iter=10, scoring=None, refit=True, cv=None, **params):
    params = list(params.items())
    for _ in range(n_iter):
        # Shuffle params to lessen overfitting.
        random.shuffle(params)
        # Tune each parameter.
        for key, param in params:
            old_value = getattr(estimator, key)
            pass  # TODO
    # Fit on the entire dataset if needed.
    if refit:
        estimator.fit(X, y)
    return estimator

### Predict `smoke`, `alco` and `active`

In [470]:
X_helper = train.drop(['smoke', 'alco', 'active', 'cardio'], axis=1).values

y_smoke = train['smoke'].values
y_alco = train['alco'].values
y_active = train['active'].values

In [471]:
# smoke_params = {'base_score': 0.53, 'scale_pos_weight': 1.0, 'reg_lambda': 0.98, 'reg_alpha': 0.01, 'gamma': 0.03999999999999997, 'max_depth': 3, 'n_estimators': 79, 'subsample': 0.5, 'colsample_bytree': 0.7799999999999998}
# alco_params = {'base_score': 0.52, 'scale_pos_weight': 1.0, 'reg_lambda': 0.97, 'reg_alpha': 0.0, 'gamma': 0.19, 'max_depth': 3, 'n_estimators': 89, 'colsample_bytree': 1.0, 'subsample': 0.98}
# active_params = {'base_score': 0.52, 'scale_pos_weight': 0.98, 'reg_lambda': 0.99, 'reg_alpha': 0.019999999999999997, 'gamma': 0.16999999999999998, 'max_depth': 5, 'n_estimators': 82, 'colsample_bytree': 0.8899999999999999, 'subsample': 0.99}

In [472]:
smoke_estimator = search_xgb(X_helper, y_smoke, smoke_params)


[58s] -0.2389967753 {'colsample_bytree': 0.7699999999999998} | {'colsample_bytree': [0.7699999999999998, 0.7799999999999998, 0.7899999999999998]}
[56s] -0.2389967753 {'base_score': 0.51} | {'base_score': [0.5, 0.51, 0.52]}
[130s] -0.2389713498 {'n_estimators': 91} | {'n_estimators': [86, 87, 88, 89, 90, 91, 92]}
[57s] -0.2389540547 {'reg_alpha': 0.04} | {'reg_alpha': [0.019999999999999997, 0.03, 0.04]}
[57s] -0.2389540547 {'reg_lambda': 0.99} | {'reg_lambda': [0.98, 0.99, 1.0]}
[38s] -0.2389540547 {'gamma': 0.0} | {'gamma': [0.0, 0.01]}
[57s] -0.2389540547 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99, 1.0]}
[57s] -0.2387819637 {'max_depth': 3} | {'max_depth': [1, 2, 3]}
[76s] -0.2387819637 {'subsample': 0.51} | {'subsample': [0.5, 0.51, 0.52]}
[75s] -0.2387819637 {'learning_rate': 0.11} | {'learning_rate': [0.1, 0.11, 0.12]}

[75s] -0.2387776254 {'base_score': 0.5} | {'base_score': [0.5, 0.51, 0.52]}
[75s] -0.2387776254 {'max_depth': 3} | {'max_depth': [2, 3, 4]}
[75s

[74s] -0.2387588663 {'learning_rate': 0.11} | {'learning_rate': [0.1, 0.11, 0.12]}
[74s] -0.2387588663 {'subsample': 0.51} | {'subsample': [0.5, 0.51, 0.52]}
[75s] -0.2387588663 {'gamma': 0.01} | {'gamma': [0.0, 0.01, 0.02]}

[75s] -0.2387588663 {'colsample_bytree': 0.6699999999999997} | {'colsample_bytree': [0.6699999999999997, 0.6799999999999997, 0.6899999999999997]}
[76s] -0.2387588663 {'learning_rate': 0.11} | {'learning_rate': [0.1, 0.11, 0.12]}
[175s] -0.2387588663 {'n_estimators': 91} | {'n_estimators': [88, 89, 90, 91, 92, 93, 94]}
[74s] -0.2387588663 {'base_score': 0.5} | {'base_score': [0.49, 0.5, 0.51]}
[74s] -0.2387588663 {'max_depth': 3} | {'max_depth': [2, 3, 4]}
[76s] -0.2387588663 {'subsample': 0.51} | {'subsample': [0.5, 0.51, 0.52]}
[78s] -0.2387588663 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99, 1.0]}
[75s] -0.2387588663 {'reg_lambda': 0.98} | {'reg_lambda': [0.97, 0.98, 0.99]}
[76s] -0.2387588663 {'gamma': 0.01} | {'gamma': [0.0, 0.01, 0.02]}
[75s]

In [473]:
alco_estimator = search_xgb(X_helper, y_alco, alco_params)


[53s] -0.1923805964 {'colsample_bytree': 1.0} | {'colsample_bytree': [0.99, 1.0]}
[82s] -0.1923080163 {'subsample': 0.96} | {'subsample': [0.96, 0.97, 0.98]}
[55s] -0.1923070066 {'reg_alpha': 0.01} | {'reg_alpha': [0.0, 0.01]}
[83s] -0.1923070066 {'gamma': 0.12999999999999995} | {'gamma': [0.10999999999999996, 0.11999999999999995, 0.12999999999999995]}
[191s] -0.1922981142 {'n_estimators': 99} | {'n_estimators': [95, 96, 97, 98, 99, 100, 101]}
[84s] -0.1922981142 {'base_score': 0.52} | {'base_score': [0.51, 0.52, 0.53]}
[56s] -0.1922981142 {'reg_lambda': 1.0} | {'reg_lambda': [0.99, 1.0]}
[84s] -0.1922981142 {'max_depth': 3} | {'max_depth': [2, 3, 4]}
[84s] -0.1922981142 {'learning_rate': 0.1} | {'learning_rate': [0.09000000000000001, 0.1, 0.11]}
[83s] -0.1922981142 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99, 1.0]}

[55s] -0.1922981142 {'reg_lambda': 1.0} | {'reg_lambda': [0.99, 1.0]}
[86s] -0.1922981142 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99,

In [474]:
active_estimator = search_xgb(X_helper, y_active, active_params)


[257s] -0.4925650488 {'n_estimators': 81} | {'n_estimators': [81, 82, 83, 84, 85, 86, 87]}
[106s] -0.4924130172 {'reg_lambda': 1.0} | {'reg_lambda': [0.98, 0.99, 1.0]}
[107s] -0.4924130172 {'learning_rate': 0.1} | {'learning_rate': [0.09000000000000001, 0.1, 0.11]}
[105s] -0.4924130172 {'subsample': 0.99} | {'subsample': [0.98, 0.99, 1.0]}
[106s] -0.4924028923 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.97, 0.98, 0.99]}
[103s] -0.4924028923 {'colsample_bytree': 0.7799999999999998} | {'colsample_bytree': [0.7699999999999998, 0.7799999999999998, 0.7899999999999998]}
[107s] -0.4924028923 {'base_score': 0.54} | {'base_score': [0.53, 0.54, 0.55]}
[106s] -0.4923540204 {'gamma': 0.16999999999999998} | {'gamma': [0.16999999999999998, 0.18, 0.19]}
[106s] -0.4923540204 {'reg_alpha': 0.019999999999999997} | {'reg_alpha': [0.009999999999999997, 0.019999999999999997, 0.03]}
[108s] -0.4923540204 {'max_depth': 6} | {'max_depth': [5, 6, 7]}

[105s] -0.4923540204 {'subsample': 0.99} | {'subsa

[91s] -0.4921237814 {'learning_rate': 0.1} | {'learning_rate': [0.09000000000000001, 0.1, 0.11]}
[91s] -0.4921237814 {'base_score': 0.53} | {'base_score': [0.52, 0.53, 0.54]}
[91s] -0.4921237814 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99, 1.0]}
[91s] -0.4921237814 {'gamma': 0.16999999999999998} | {'gamma': [0.15999999999999998, 0.16999999999999998, 0.18]}

[91s] -0.4921237814 {'base_score': 0.53} | {'base_score': [0.52, 0.53, 0.54]}
[91s] -0.4921237814 {'reg_alpha': 0.019999999999999997} | {'reg_alpha': [0.009999999999999997, 0.019999999999999997, 0.03]}
[91s] -0.4921237814 {'scale_pos_weight': 0.99} | {'scale_pos_weight': [0.98, 0.99, 1.0]}
[91s] -0.4921237814 {'subsample': 0.98} | {'subsample': [0.97, 0.98, 0.99]}
[91s] -0.4921237814 {'colsample_bytree': 0.6899999999999997} | {'colsample_bytree': [0.6899999999999997, 0.6999999999999997, 0.7099999999999997]}
[91s] -0.4921237814 {'reg_lambda': 0.99} | {'reg_lambda': [0.98, 0.99, 1.0]}
[91s] -0.4921237814 {'learning_r

### Predict `cardio`

In [475]:
def print_importances(estimator):
    for name, importance in sorted(zip(test, estimator.feature_importances_), key=operator.itemgetter(1), reverse=True):
        print(f'{name}: {importance:.7f}')

In [476]:
# cardio_params = {'base_score': 0.51, 'gamma': 0.18, 'max_depth': 5, 'n_estimators': 80, 'reg_alpha': 0.019999999999999997, 'reg_lambda': 0.99, 'scale_pos_weight': 1.0, 'subsample': 1.0, 'colsample_bytree': 1.0}

In [477]:
cardio_estimator = search_xgb(X, y, cardio_params)
print_importances(cardio_estimator)


[72s] -0.5385640981 {'colsample_bytree': 0.99} | {'colsample_bytree': [0.99, 1.0]}
[106s] -0.5384566930 {'reg_alpha': 0.04} | {'reg_alpha': [0.019999999999999997, 0.03, 0.04]}
[105s] -0.5384566930 {'base_score': 0.52} | {'base_score': [0.51, 0.52, 0.53]}
[105s] -0.5384566930 {'gamma': 0.18} | {'gamma': [0.16999999999999998, 0.18, 0.19]}
[70s] -0.5384566930 {'scale_pos_weight': 1.0} | {'scale_pos_weight': [0.99, 1.0]}
[106s] -0.5384566930 {'max_depth': 5} | {'max_depth': [4, 5, 6]}
[106s] -0.5383636186 {'learning_rate': 0.09000000000000001} | {'learning_rate': [0.09000000000000001, 0.1, 0.11]}
[107s] -0.5383636186 {'reg_lambda': 0.97} | {'reg_lambda': [0.96, 0.97, 0.98]}
[72s] -0.5383636186 {'subsample': 1.0} | {'subsample': [0.99, 1.0]}
[248s] -0.5383282897 {'n_estimators': 79} | {'n_estimators': [74, 75, 76, 77, 78, 79, 80]}

[109s] -0.5383282897 {'learning_rate': 0.09000000000000001} | {'learning_rate': [0.08000000000000002, 0.09000000000000001, 0.1]}
[111s] -0.5383282897 {'colsampl

In [478]:
test = read_csv('test.csv')
X_helper_test = test.drop(['smoke', 'alco', 'active'], axis=1).values

test['smoke'].fillna(pandas.Series(smoke_estimator.predict_proba(X_helper_test)[:, 1]), inplace=True)
test['alco'].fillna(pandas.Series(alco_estimator.predict_proba(X_helper_test)[:, 1]), inplace=True)
test['active'].fillna(pandas.Series(active_estimator.predict_proba(X_helper_test)[:, 1]), inplace=True)

numpy.savetxt(f'xgboost.txt', cardio_estimator.predict_proba(test.values)[:, 1], fmt='%f')