In [25]:
import collections
import datetime
import hyperopt
import itertools
import math
import numpy
import operator
import pandas
import random
from scipy import stats
from sklearn import (
    calibration,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import xgboost

### Read and Filter

In [26]:
# The three lifestyle flags are read as floats so that entries parsed as
# missing (see `na_values` in `read_csv`) can be stored as NaN.
dtype = dict.fromkeys(('smoke', 'alco', 'active'), float)

def fix_ap(value):
    """Rescale a raw pressure reading so that its magnitude is at most 500.

    The sign is dropped, then the value is divided by 10 repeatedly until it
    no longer exceeds 500. `read_csv` applies this to the `ap_hi` and `ap_lo`
    columns.

    Args:
        value: a number (anything supporting `abs`, `>` and `/`).

    Returns:
        The rescaled non-negative value. Values whose magnitude is already
        <= 500 are returned as `abs(value)` without division. NaN is returned
        unchanged (it never compares greater than 500) and an infinite input
        is returned as positive infinity.
    """
    value = abs(value)
    # inf / 10 is still inf, so without this guard the loop below would
    # never terminate on an infinite input.
    if math.isinf(value):
        return value
    while value > 500.0:
        value /= 10.0
    return value

def read_csv(filename):
    """Load a `;`-separated data file and prepare the feature frame.

    Steps, in order:
      1. Parse the file with a header row, treating the literal string
         'None' as missing and applying the module-level `dtype` mapping.
      2. Drop the `id` column.
      3. Run `pandas.get_dummies` with an empty column list (one-hot
         encoding is currently switched off).
      4. Append a `bmi` column computed as `weight / height ** 2` from the
         values exactly as stored in the file (no unit conversion is done).
      5. Pass `ap_hi` and `ap_lo` through `fix_ap`.

    Returns:
        The resulting `pandas.DataFrame`.
    """
    parsed = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype)
    frame = parsed.drop(['id'], axis=1)

    # Columns to one-hot encode. Candidates that were tried and disabled:
    # 'smoke', 'active', 'alco', 'gender', 'cholesterol', 'gluc'.
    # (Age-interaction features built from those dummies were disabled too.)
    one_hot_columns = []
    frame = pandas.get_dummies(frame, columns=one_hot_columns)

    weight_over_height_squared = frame['weight'] / frame['height'] ** 2
    frame = frame.assign(bmi=weight_over_height_squared)

    # Clean both pressure columns with the same rule.
    for pressure_column in ('ap_hi', 'ap_lo'):
        frame[pressure_column] = frame[pressure_column].apply(fix_ap)

    return frame

In [27]:
# Load the labelled data; `cardio` is the target column, every other column
# is a feature.
train = read_csv('train.csv')

y = train['cardio'].values
X = train.drop('cardio', axis=1).values

print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (70000, 12)
y: (70000,)


### Hyper-parameter Optimisation

In [39]:
def cv():
    """Build the cross-validation splitter used throughout the notebook.

    Returns:
        A new `StratifiedKFold` with 10 shuffled folds and `random_state=0`.
    """
    splitter_options = dict(n_splits=10, shuffle=True, random_state=0)
    return model_selection.StratifiedKFold(**splitter_options)

In [65]:
# Search range of one tunable attribute: inclusive lower bound, inclusive
# upper bound, and the increment used when stepping between values.
Param = collections.namedtuple('Param', ['min', 'max', 'step'])

def optimize_estimator(estimator, X, y, n_iter=10, scoring=None, refit=True, cv=None, **params):
    """Greedily tune numeric attributes of `estimator`, one parameter at a time.

    Each keyword in `params` names an attribute of `estimator` and maps it to
    an object with `min`, `max` and `step` attributes (e.g. `Param`). In every
    iteration the parameters are visited in random order. For each parameter
    the two neighbouring values `current - step` and `current + step` that lie
    inside `[min, max]` are scored with cross-validation. If the better
    scoring neighbour beats the current score, the search keeps stepping in
    that direction until the score stops improving or a bound is crossed.
    The estimator is modified in place and progress is printed.

    Args:
        estimator: object accepted by `model_selection.cross_val_score` whose
            tuned attributes can be read and written with getattr/setattr.
        X: features, forwarded to `cross_val_score` (and to `fit` on refit).
        y: targets, forwarded to `cross_val_score` (and to `fit` on refit).
        n_iter: maximum number of passes over all parameters. The search
            stops earlier after a pass in which no parameter improved.
        scoring: forwarded to `cross_val_score`; higher scores are better.
        refit: when true, call `estimator.fit(X, y)` before returning.
        cv: either a zero-argument callable that returns the `cv` argument
            for `cross_val_score` (it is called for every evaluation), or a
            value that is forwarded unchanged (including the default `None`).
        **params: attribute name -> `Param(min, max, step)`.

    Returns:
        The same `estimator` object.
    """
    def evaluate():
        # A factory is called for every evaluation; anything else is
        # forwarded as is, so the default `cv=None` no longer fails on `cv()`.
        folds = cv() if callable(cv) else cv
        return model_selection.cross_val_score(estimator, X, y, scoring=scoring, cv=folds).mean()

    def set_evaluate(key, value):
        setattr(estimator, key, value)
        return evaluate()

    params = list(params.items())

    print(f'Evaluating current model')
    current_score = evaluate()
    print(f'Initial score: {current_score}')

    for i in range(n_iter):
        print()
        print(f'Starting iteration {i}. Current score: {current_score}')
        early_stop = True

        # Shuffle params to lessen overfitting.
        random.shuffle(params)

        # Tune each parameter.
        for key, param in params:
            current_value = getattr(estimator, key)
            print(f'[{key}] Current value: {current_value}')

            # Probe one step in each direction and keep the direction with
            # the best *score* (tuple item 2). Selecting on item 1, the
            # candidate value, would always prefer `+step` when it is in range.
            best_step, best_value, best_score = max([
                (step, current_value + step, set_evaluate(key, current_value + step))
                for step in (-param.step, +param.step)
                if param.min <= current_value + step <= param.max
            ], default=(None, None, None), key=operator.itemgetter(2))

            # Check if we can move further.
            if best_step is None or best_score <= current_score:
                print(f'[{key}] No improvement')
                # Restore current value (probing left a neighbour value set).
                setattr(estimator, key, current_value)
                continue

            # Move further.
            early_stop = False
            print(f'[{key}] Trying step: {best_step}. Got: {best_score}')
            value = best_value
            while True:
                value += best_step
                if not param.min <= value <= param.max:
                    print(f'[{key}] Value outside bounds: {value}')
                    break
                # Evaluate model with the new value.
                score = set_evaluate(key, value)
                print(f'[{key}] Tried {value}: {score}')
                if score <= best_score:
                    # Could not improve further.
                    break
                # This value is better.
                best_score, best_value = score, value

            # Set the best value (the last tried value may be worse or out of bounds).
            current_score = best_score
            setattr(estimator, key, best_value)
            print(f'[{key}] New value: {best_value}. New score: {current_score}')

        if early_stop:
            print()
            print(f'Early stopping')
            break

    print(f'Final score: {current_score}')

    # Fit on the entire dataset if needed.
    if refit:
        estimator.fit(X, y)
    return estimator

### Predict `smoke`, `alco` and `active`

In [60]:
# Features for the helper models: every column of `train` except the three
# lifestyle flags (which become helper targets) and the `cardio` target.
helper_targets = ['smoke', 'alco', 'active']
X_helper = train.drop(helper_targets + ['cardio'], axis=1).values

# One target vector per helper model.
y_smoke, y_alco, y_active = (train[name].values for name in helper_targets)

In [61]:
# Helper classifier for the `smoke` column; 5 threads and a fixed seed,
# everything else left at the library defaults before tuning.
smoke_estimator = xgboost.XGBClassifier(nthread=5, seed=0)

In [66]:
# Search space for the `smoke` helper model: attribute -> Param(min, max, step).
smoke_search_space = dict(
    colsample_bytree=Param(0.5, 1.0, 0.01),
    subsample=Param(0.5, 1.0, 0.01),
    base_score=Param(0.0, 1.0, 0.01),
    scale_pos_weight=Param(0.0, 1.0, 0.01),
    reg_lambda=Param(0.0, 1.0, 0.01),
    reg_alpha=Param(0.0, 1.0, 0.01),
    gamma=Param(0.0, 1.0, 0.01),
    learning_rate=Param(0.01, 1.0, 0.01),
    max_depth=Param(1, 1000, 1),
    n_estimators=Param(1, 1000, 1),
)

# Tune in place against cross-validated negative log loss, then refit.
optimize_estimator(
    smoke_estimator, X_helper, y_smoke,
    scoring='neg_log_loss', cv=cv,
    **smoke_search_space
)

Evaluating current model
Initial score: -0.23886296819062042

Starting iteration 0. Current score: -0.23886296819062042
[colsample_bytree] Current value: 0.99
[colsample_bytree] No improvement
[reg_alpha] Current value: 0.01
[reg_alpha] Trying step: 0.01. Got: -0.23885996958642125
[reg_alpha] Tried 0.03: -0.23886246593361982
[reg_alpha] New value: 0.02. New score: -0.23885996958642125
[scale_pos_weight] Current value: 1
[scale_pos_weight] No improvement
[gamma] Current value: 0
[gamma] No improvement
[reg_lambda] Current value: 1
[reg_lambda] Trying step: -0.01. Got: -0.2388594250855693
[reg_lambda] Tried 0.98: -0.23886098068919287
[reg_lambda] New value: 0.99. New score: -0.2388594250855693
[learning_rate] Current value: 0.1
[learning_rate] No improvement
[n_estimators] Current value: 101
[n_estimators] No improvement
[subsample] Current value: 0.97
[subsample] No improvement
[base_score] Current value: 0.51
[base_score] No improvement
[max_depth] Current value: 3
[max_depth] No impro

XGBClassifier(base_score=0.51, colsample_bylevel=1, colsample_bytree=0.99,
       gamma=0.01, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=101, nthread=5,
       objective='binary:logistic', reg_alpha=0.02, reg_lambda=0.99,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.97)

In [67]:
# Helper classifier for the `alco` column; 5 threads and a fixed seed,
# everything else left at the library defaults before tuning.
alco_estimator = xgboost.XGBClassifier(nthread=5, seed=0)

In [69]:
# Search space for the `alco` helper model: attribute -> Param(min, max, step).
alco_search_space = dict(
    colsample_bytree=Param(0.5, 1.0, 0.01),
    subsample=Param(0.5, 1.0, 0.01),
    base_score=Param(0.0, 1.0, 0.01),
    scale_pos_weight=Param(0.0, 1.0, 0.01),
    reg_lambda=Param(0.0, 1.0, 0.01),
    reg_alpha=Param(0.0, 1.0, 0.01),
    gamma=Param(0.0, 1.0, 0.01),
    learning_rate=Param(0.01, 1.0, 0.01),
    max_depth=Param(1, 1000, 1),
    n_estimators=Param(1, 1000, 1),
)

# Tune in place against cross-validated negative log loss, then refit.
optimize_estimator(
    alco_estimator, X_helper, y_alco,
    scoring='neg_log_loss', cv=cv,
    **alco_search_space
)

Evaluating current model
Initial score: -0.19240139584600163

Starting iteration 0. Current score: -0.19240139584600163
[gamma] Current value: 0
[gamma] No improvement
[learning_rate] Current value: 0.1
[learning_rate] No improvement
[max_depth] Current value: 3
[max_depth] No improvement
[reg_alpha] Current value: 0
[reg_alpha] No improvement
[colsample_bytree] Current value: 1
[colsample_bytree] No improvement
[scale_pos_weight] Current value: 1
[scale_pos_weight] No improvement
[base_score] Current value: 0.5
[base_score] No improvement
[subsample] Current value: 1
[subsample] Trying step: -0.01. Got: -0.19237007084120925
[subsample] Tried 0.98: -0.19234001700012576
[subsample] Tried 0.97: -0.1923226390888372
[subsample] Tried 0.96: -0.19232403579595364
[subsample] New value: 0.97. New score: -0.1923226390888372
[reg_lambda] Current value: 1
[reg_lambda] Trying step: -0.01. Got: -0.1923100056249212
[reg_lambda] Tried 0.98: -0.19233244067110233
[reg_lambda] New value: 0.99. New score

XGBClassifier(base_score=0.51, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=5,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1.0,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.97)

In [70]:
# Helper classifier for the `active` column; 5 threads and a fixed seed,
# everything else left at the library defaults before tuning.
active_estimator = xgboost.XGBClassifier(nthread=5, seed=0)

In [71]:
# Search space for the `active` helper model: attribute -> Param(min, max, step).
active_search_space = dict(
    colsample_bytree=Param(0.5, 1.0, 0.01),
    subsample=Param(0.5, 1.0, 0.01),
    base_score=Param(0.0, 1.0, 0.01),
    scale_pos_weight=Param(0.0, 1.0, 0.01),
    reg_lambda=Param(0.0, 1.0, 0.01),
    reg_alpha=Param(0.0, 1.0, 0.01),
    gamma=Param(0.0, 1.0, 0.01),
    learning_rate=Param(0.01, 1.0, 0.01),
    max_depth=Param(1, 1000, 1),
    n_estimators=Param(1, 1000, 1),
)

# Tune in place against cross-validated negative log loss, then refit.
optimize_estimator(
    active_estimator, X_helper, y_active,
    scoring='neg_log_loss', cv=cv,
    **active_search_space
)

Evaluating current model
Initial score: -0.49276407584835874

Starting iteration 0. Current score: -0.49276407584835874
[scale_pos_weight] Current value: 1
[scale_pos_weight] No improvement
[learning_rate] Current value: 0.1
[learning_rate] Trying step: 0.01. Got: -0.4926561522892731
[learning_rate] Tried 0.12: -0.49268431598130913
[learning_rate] New value: 0.11. New score: -0.4926561522892731
[colsample_bytree] Current value: 1
[colsample_bytree] No improvement
[reg_lambda] Current value: 1
[reg_lambda] No improvement
[n_estimators] Current value: 100
[n_estimators] Trying step: 1. Got: -0.49264919387044215
[n_estimators] Tried 102: -0.49265514594885296
[n_estimators] New value: 101. New score: -0.49264919387044215
[gamma] Current value: 0
[gamma] No improvement
[subsample] Current value: 1
[subsample] No improvement
[reg_alpha] Current value: 0
[reg_alpha] No improvement
[max_depth] Current value: 3
[max_depth] Trying step: 1. Got: -0.4925968661347799
[max_depth] Tried 5: -0.4926068

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.11, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=101, nthread=5,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

### Predict `cardio`

In [72]:
def print_importances(estimator, feature_names=None):
    """Print the feature importances of a fitted estimator, largest first.

    Args:
        estimator: fitted model exposing `feature_importances_`.
        feature_names: iterable of names, in the same order as the columns
            the estimator was fitted on. When omitted, the feature columns of
            `train` (every column except the `cardio` target) are used, i.e.
            the columns `X` is built from. Pass explicit names for models
            fitted on a different column set, such as the helper models.

    Names and importances are paired positionally; one line
    `name: importance` (7 decimal places) is printed per pair.
    """
    if feature_names is None:
        # Take the names from the training frame rather than from `test`,
        # which does not exist until the prediction cell has been run.
        feature_names = train.drop('cardio', axis=1).columns

    ranked = sorted(
        zip(feature_names, estimator.feature_importances_),
        key=operator.itemgetter(1),
        reverse=True,
    )
    for name, importance in ranked:
        print(f'{name}: {importance:.7f}')

In [73]:
# Main classifier for the `cardio` target; 5 threads and a fixed seed,
# everything else left at the library defaults before tuning.
cardio_estimator = xgboost.XGBClassifier(nthread=5, seed=0)

In [74]:
# Search space for the `cardio` model: attribute -> Param(min, max, step).
cardio_search_space = dict(
    colsample_bytree=Param(0.5, 1.0, 0.01),
    subsample=Param(0.5, 1.0, 0.01),
    base_score=Param(0.0, 1.0, 0.01),
    scale_pos_weight=Param(0.0, 1.0, 0.01),
    reg_lambda=Param(0.0, 1.0, 0.01),
    reg_alpha=Param(0.0, 1.0, 0.01),
    gamma=Param(0.0, 1.0, 0.01),
    learning_rate=Param(0.01, 1.0, 0.01),
    max_depth=Param(1, 1000, 1),
    n_estimators=Param(1, 1000, 1),
)

# Up to 100 passes here (the helper models use the default of 10); tuned
# against cross-validated negative log loss, then refit on all of X, y.
optimize_estimator(
    cardio_estimator, X, y,
    n_iter=100, scoring='neg_log_loss', cv=cv,
    **cardio_search_space
)

Evaluating current model
Initial score: -0.5394364373333194

Starting iteration 0. Current score: -0.5394364373333194
[scale_pos_weight] Current value: 1
[scale_pos_weight] No improvement
[learning_rate] Current value: 0.1
[learning_rate] Trying step: 0.01. Got: -0.5393749453345829
[learning_rate] Tried 0.12: -0.539346653216222
[learning_rate] Tried 0.13: -0.5392163866601066
[learning_rate] Tried 0.14: -0.539138600310165
[learning_rate] Tried 0.15000000000000002: -0.5390872039404008
[learning_rate] Tried 0.16000000000000003: -0.5390199100692072
[learning_rate] Tried 0.17000000000000004: -0.5391587002114411
[learning_rate] New value: 0.16000000000000003. New score: -0.5390199100692072
[gamma] Current value: 0
[gamma] Trying step: 0.01. Got: -0.5390128691927909
[gamma] Tried 0.02: -0.5390128691927909
[gamma] New value: 0.01. New score: -0.5390128691927909
[base_score] Current value: 0.5
[base_score] No improvement
[colsample_bytree] Current value: 1
[colsample_bytree] No improvement
[reg

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.04, learning_rate=0.16000000000000003, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=102,
       nthread=5, objective='binary:logistic', reg_alpha=0.01,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [76]:
test = read_csv('test.csv')
# Helper-model inputs: every test column except the three lifestyle flags.
X_helper_test = test.drop(['smoke', 'alco', 'active'], axis=1).values

# Fill missing lifestyle flags with the matching helper model's predicted
# probability of the positive class. The filled column is assigned back to
# the frame: `test[col].fillna(..., inplace=True)` is chained assignment and
# leaves `test` unchanged when pandas Copy-on-Write is active. The predictions
# are indexed like `test` so that rows are matched positionally.
for column, helper_estimator in (
    ('smoke', smoke_estimator),
    ('alco', alco_estimator),
    ('active', active_estimator),
):
    predicted = pandas.Series(helper_estimator.predict_proba(X_helper_test)[:, 1], index=test.index)
    test[column] = test[column].fillna(predicted)

# One predicted probability of `cardio` per line.
numpy.savetxt('xgboost.txt', cardio_estimator.predict_proba(test.values)[:, 1], fmt='%f')

print_importances(cardio_estimator)

age: 0.2669649
bmi: 0.1707681
ap_hi: 0.1431767
height: 0.1066368
weight: 0.0835198
ap_lo: 0.0760626
cholesterol: 0.0559284
gluc: 0.0365399
active: 0.0216257
gender: 0.0141685
smoke: 0.0134228
alco: 0.0111857
