In [193]:
import datetime
import hyperopt
import itertools
import math
import numpy
import operator
import pandas
import random
import requests
from IPython.display import Audio
from sklearn import (
    calibration,
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import tqdm
import xgboost

### Read and Filter

In [207]:
# Column dtypes handed to pandas.read_csv by read_csv(): these three columns
# are parsed as floats (presumably so that missing entries can be held as NaN
# and imputed later -- TODO confirm).
dtype = dict.fromkeys(('smoke', 'alco', 'active'), float)

def fix_ap(value):
    """Return the magnitude of ``value`` scaled down by tens until it is <= 500.

    The sign is discarded, then the number is divided by 10 repeatedly while it
    exceeds 500. Presumably this repairs readings that were entered with a
    stray sign or a misplaced decimal point -- TODO confirm against the data.

    Args:
        value: A single numeric reading (int or float; NaN allowed).

    Returns:
        The adjusted non-negative value. NaN and infinite inputs are returned
        as their absolute value without scaling.
    """
    value = abs(value)
    # An infinite reading stays infinite after any number of divisions, so the
    # loop below would never terminate; NaN never satisfies the comparison.
    if not math.isfinite(value):
        return value
    while value > 500.0:
        value /= 10.0
    return value

def read_csv(filename):
    """Load a ';'-separated data file and derive the modelling frame.

    Steps, in order:
      1. parse the file (literal 'None' cells become missing values, column
         dtypes come from the module-level ``dtype`` mapping) and drop ``id``;
      2. one-hot encode the columns listed in ``one_hot_columns``;
      3. append ``bmi`` = weight / height ** 2;
      4. drop the columns listed in ``dropped_columns``;
      5. pass every ``ap_hi`` value through ``fix_ap``.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the file.

    Returns:
        A new ``pandas.DataFrame`` with the transformations above applied.
    """
    # Both lists are currently empty, which makes steps 2 and 4 no-ops.
    # Earlier experiments one-hot encoded 'smoke', 'active', 'alco', 'gender',
    # 'cholesterol' and 'gluc', added age interactions such as
    # aged_smoke_0 = smoke_0.0 * age (likewise for active/alco, levels 0/1)
    # and smoke_1_active_0 = smoke_1.0 * active_0.0, and then dropped the raw
    # 'smoke', 'alco' and 'active' columns.
    one_hot_columns = []
    dropped_columns = []

    raw = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype)
    encoded = pandas.get_dummies(raw.drop(['id'], axis=1), columns=one_hot_columns)

    body_mass_index = encoded['weight'] / encoded['height'] ** 2
    result = encoded.assign(bmi=body_mass_index).drop(dropped_columns, axis=1)

    result['ap_hi'] = result['ap_hi'].apply(fix_ap)
    return result

In [208]:
train = read_csv('train.csv')

# Target vector first, then the feature matrix made of every other column.
y = train['cardio'].values
X = train.loc[:, train.columns != 'cardio'].values
print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (70000, 12)
y: (70000,)


In [242]:
test = read_csv('test.csv')

# One test matrix per 0/1 combination used to fill the missing 'smoke',
# 'alco' and 'active' values (2 ** 3 = 8 variants, in itertools.product order).
Xs_test = [
    test.assign(
        smoke=test['smoke'].fillna(smoke_fill),
        alco=test['alco'].fillna(alco_fill),
        active=test['active'].fillna(active_fill),
    ).values
    for smoke_fill, alco_fill, active_fill in itertools.product((0, 1), repeat=3)
]


In [210]:
# Preview the first rows of the training frame returned by read_csv().
train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,18393,2,168,62.0,110.0,80,1,1,0.0,0.0,1.0,0,0.002197
1,20228,1,156,85.0,140.0,90,3,1,0.0,0.0,1.0,1,0.003493
2,18857,1,165,64.0,130.0,70,3,1,0.0,0.0,0.0,1,0.002351
3,17623,2,169,82.0,150.0,100,1,1,0.0,0.0,1.0,1,0.002871
4,17474,1,156,56.0,100.0,60,1,1,0.0,0.0,0.0,0,0.002301


### Predict and Save

In [252]:
def save_predictions(estimator, filename, feature_names=None, X_variants=None):
    """Print feature importances and save averaged positive-class probabilities.

    Args:
        estimator: Fitted classifier exposing ``predict_proba``. If it also
            exposes ``feature_importances_`` they are printed, largest first;
            estimators without that attribute skip the report.
        filename: Destination handed to ``numpy.savetxt``; one ``%f``-formatted
            probability is written per row.
        feature_names: Names paired with the importances. Defaults to the
            column names of the notebook-level ``test`` frame.
        X_variants: Sequence of test matrices; column 1 of ``predict_proba``
            is averaged across them. Defaults to the notebook-level
            ``Xs_test`` list.

    Raises:
        ValueError: If there is no test matrix to predict on.
    """
    matrices = list(Xs_test if X_variants is None else X_variants)
    if not matrices:
        raise ValueError('no test matrices to predict on')

    importances = getattr(estimator, 'feature_importances_', None)
    if importances is not None:
        if feature_names is None:
            # Iterating a DataFrame yields its column names.
            feature_names = list(test)
        ranked = sorted(
            zip(feature_names, importances),
            key=operator.itemgetter(1),
            reverse=True,
        )
        for name, importance in ranked:
            print(f'{name}: {importance:.7f}')

    # numpy.vstack needs a real sequence: generators are deprecated since
    # NumPy 1.16 and rejected by newer releases.
    per_variant = [estimator.predict_proba(X_test)[:, 1] for X_test in matrices]
    y_pred = numpy.vstack(per_variant).mean(axis=0)
    numpy.savetxt(filename, y_pred, fmt='%f')

### CV

In [213]:
def cv():
    """Build the cross-validation splitter shared by the searches.

    Returns:
        A fresh ``StratifiedKFold`` with 10 shuffled folds and a fixed
        ``random_state`` of 0.
    """
    splitter_options = {'n_splits': 10, 'shuffle': True, 'random_state': 0}
    return model_selection.StratifiedKFold(**splitter_options)

### XGBoost

In [214]:
def search_xgb(params, **param_grid):
    """Grid-search XGBoost hyper-parameters on the notebook-level ``X``/``y``.

    Args:
        params: Fixed keyword arguments for ``xgboost.XGBClassifier``.
        **param_grid: Parameter name -> list of candidate values to search.

    Returns:
        The fitted ``GridSearchCV`` (scored by negative log loss, no refit).
        A one-line summary with elapsed seconds, best score, best parameters
        and the searched grid is printed as a side effect.
    """
    started_at = time.time()

    classifier = xgboost.XGBClassifier(nthread=5, seed=0, **params)
    searcher = model_selection.GridSearchCV(
        classifier,
        param_grid,
        scoring='neg_log_loss',
        refit=False,
        cv=cv(),
    )
    fitted = searcher.fit(X, y)

    elapsed = time.time() - started_at
    print(f'[{elapsed:.0f}s] {fitted.best_score_:.7f} {fitted.best_params_} | {param_grid}')
    return fitted

In [79]:
# Starting point for the coordinate-wise search below. It must be a real
# assignment (not a string literal) so the notebook runs from a fresh kernel.
# Note that re-running this cell resets any tuning progress held in xgb_params.
xgb_params = {
    'base_score': 0.51,
    'gamma': 0.18,
    'max_depth': 5,
    'n_estimators': 80,
    'reg_alpha': 0.019999999999999997,
    'reg_lambda': 0.99,
    'scale_pos_weight': 1.0,
}

In [215]:
def make_int_grid(params, param_name, delta):
    """Build a one-parameter grid of consecutive integers around the current value.

    Args:
        params: Current parameter mapping; a missing ``param_name`` counts as 1.
        param_name: Name of the integer parameter to vary.
        delta: How far below and above the current value the grid extends.

    Returns:
        ``{param_name: [...]}`` covering ``current - delta`` to
        ``current + delta`` inclusive, with the lower end floored at 1.
    """
    center = params.get(param_name, 1)
    lowest = max(1, center - delta)
    highest = center + delta
    candidates = [candidate for candidate in range(lowest, highest + 1)]
    return {param_name: candidates}

def make_float_grid(params, param_name, delta, min_, max_):
    """Build a one-parameter grid of the current value and its two neighbours.

    Args:
        params: Current parameter mapping; a missing ``param_name`` counts as 0.0.
        param_name: Name of the float parameter to vary.
        delta: Step subtracted from / added to the current value.
        min_: Lower clamp applied to ``current - delta``.
        max_: Upper clamp applied to ``current + delta``.

    Returns:
        ``{param_name: [...]}`` with up to three candidates in the order
        lower, current, upper. Candidates that become equal after clamping
        are listed once, so the grid search does not fit the same setting twice.
    """
    x = params.get(param_name, 0.0)
    candidates = (max(min_, x - delta), x, min(max_, x + delta))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return {param_name: list(dict.fromkeys(candidates))}

# Coordinate-wise tuning: every pass re-searches one parameter at a time
# around its current value (floats clamped to [0.0, 1.0]) and stops once a
# full pass leaves xgb_params unchanged.
float_steps = (
    ('base_score', 0.01),
    ('scale_pos_weight', 0.02),
    ('reg_lambda', 0.01),
    ('reg_alpha', 0.01),
    ('gamma', 0.01),
)
int_steps = (
    ('max_depth', 1),
    ('n_estimators', 3),
)

while True:
    print()
    last_params = dict(xgb_params)
    for param_name, delta in float_steps:
        grid = make_float_grid(xgb_params, param_name, delta, 0.0, 1.0)
        xgb_params.update(search_xgb(xgb_params, **grid).best_params_)
    for param_name, delta in int_steps:
        grid = make_int_grid(xgb_params, param_name, delta)
        xgb_params.update(search_xgb(xgb_params, **grid).best_params_)
    if xgb_params == last_params:
        break


[32s] -0.5384644 {'base_score': 0.51} | {'base_score': [0.5, 0.51, 0.52]}
[34s] -0.5384644 {'scale_pos_weight': 1.0} | {'scale_pos_weight': [0.98, 1.0, 1.0]}
[34s] -0.5384644 {'reg_lambda': 0.99} | {'reg_lambda': [0.98, 0.99, 1.0]}
[33s] -0.5384644 {'reg_alpha': 0.019999999999999997} | {'reg_alpha': [0.009999999999999997, 0.019999999999999997, 0.03]}
[33s] -0.5384644 {'gamma': 0.18} | {'gamma': [0.16999999999999998, 0.18, 0.19]}
[34s] -0.5384644 {'max_depth': 5} | {'max_depth': [4, 5, 6]}
[79s] -0.5384644 {'n_estimators': 80} | {'n_estimators': [77, 78, 79, 80, 81, 82, 83]}


In [253]:
# Refit on the full training set with the tuned parameters, then write the
# averaged test predictions to disk.
final_model = xgboost.XGBClassifier(nthread=5, **xgb_params)
final_model.fit(X, y)
save_predictions(final_model, 'xgboost.txt')

age: 0.2551440
bmi: 0.1627801
ap_hi: 0.1252858
height: 0.1001372
weight: 0.0960220
ap_lo: 0.0759031
cholesterol: 0.0653864
gluc: 0.0480110
active: 0.0301783
smoke: 0.0169182
alco: 0.0132602
gender: 0.0109739
