In [370]:
import datetime
import hyperopt
import numpy
import operator
import pandas
import requests
from IPython.display import Audio
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import xgboost

### Read and Filter

In [428]:
# Column dtype overrides handed to pandas.read_csv by read_csv() below.
# The lifestyle flags are parsed as floats; presumably so that entries given
# as the string 'None' (mapped to NaN on load) can still be represented.
dtype = {
    'smoke': float,
    'alco': float,
    'active': float,
}

def fix_ap(value):
    """Normalise a raw blood-pressure reading.

    The sign is discarded and the magnitude is divided by ten until it is
    no greater than 500, which undoes readings recorded with extra trailing
    digits (e.g. 14000 -> 140.0).

    Args:
        value: A numeric reading (int or float).

    Returns:
        The non-negative, rescaled reading. NaN and infinite inputs are
        returned as-is (after ``abs``) because they cannot be rescaled.
    """
    value = abs(value)
    # The upper bound keeps the loop from spinning forever on +inf
    # (inf / 10 is still inf); NaN fails both comparisons and falls through.
    while 500.0 < value < float('inf'):
        value /= 10.0
    return value

def read_csv(filename):
    """Load a ';'-separated data file and turn it into a feature frame.

    Processing steps, in order:
      1. Parse the file using the module-level ``dtype`` overrides, treating
         the literal string 'None' as missing, and drop the ``id`` column.
      2. Clean ``ap_hi`` with ``fix_ap`` (``ap_lo`` is left untouched).
      3. One-hot encode the categorical columns listed below.

    Args:
        filename: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        pandas.DataFrame in which the encoded columns are replaced by
        ``<column>_<value>`` indicator columns stored as uint8.
    """
    frame = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype).drop(['id'], axis=1)
    frame['ap_hi'] = frame['ap_hi'].apply(fix_ap)
    frame = pandas.get_dummies(
        frame,
        columns=[
            'gender',
            'cholesterol',
            'gluc',
            'smoke',
            'active',
            'alco',
        ],
        # Pin the indicator dtype: the default changed from uint8 to bool in
        # pandas 2.0, and bool columns mixed with numeric ones make
        # ``frame.values`` an object array instead of a float one.
        dtype=numpy.uint8,
    )
    # Candidate engineered features, currently all disabled. With no keyword
    # arguments ``assign`` simply returns a copy of the frame.
    frame = frame.assign(
        # aged_smoke_0=(frame['smoke_0.0'] * frame['age']),
        # aged_smoke_1=(frame['smoke_1.0'] * frame['age']),
        # aged_active_0=(frame['active_0.0'] * frame['age']),
        # aged_active_1=(frame['active_1.0'] * frame['age']),
        # aged_alco_0=(frame['alco_0.0'] * frame['age']),
        # aged_alco_1=(frame['alco_1.0'] * frame['age']),
        # bmi=(frame['weight'] / frame['height'] ** 2),
        # smoke_1_active_0=(frame['smoke_1.0'] * frame['active_0.0']),
        # aged_cholesterol_1=(frame['cholesterol_1'] * frame['age']),
        # aged_cholesterol_2=(frame['cholesterol_2'] * frame['age']),
        # aged_cholesterol_3=(frame['cholesterol_3'] * frame['age']),
        # gender_1_smoke_1=(frame['gender_1'] * frame['smoke_1.0']),
        # gender_2_smoke_1=(frame['gender_2'] * frame['smoke_1.0']),
    )
    return frame

In [438]:
train = read_csv('train.csv')

# `cardio` is the target; every other column of the frame is a feature.
y = train['cardio'].values
X = train.drop(columns=['cardio']).values
print(f'X: {X.shape}', f'y: {y.shape}', sep='\n')

X: (70000, 19)
y: (70000,)


In [439]:
test = read_csv('test.csv')

# Train and test are one-hot encoded independently, and the models consume
# bare positional arrays, so a category missing from either file would
# silently shift the features. Fail loudly instead.
assert list(test.columns) == list(train.columns.drop('cardio')), (
    'test features do not line up with the training features'
)

X_test = test.values
print(f'X_test: {X_test.shape}')

X_test: (30000, 19)


In [440]:
# Preview the first rows of the encoded training frame (features plus `cardio`).
train.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0.0,smoke_1.0,active_0.0,active_1.0,alco_0.0,alco_1.0
0,18393,168,62.0,110.0,80,0,0,1,1,0,0,1,0,0,1,0,0,1,1,0
1,20228,156,85.0,140.0,90,1,1,0,0,0,1,1,0,0,1,0,0,1,1,0
2,18857,165,64.0,130.0,70,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0
3,17623,169,82.0,150.0,100,1,0,1,1,0,0,1,0,0,1,0,0,1,1,0
4,17474,156,56.0,100.0,60,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0


In [441]:
# Preview the first rows of the encoded test frame (features only, no `cardio`).
test.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0.0,smoke_1.0,active_0.0,active_1.0,alco_0.0,alco_1.0
0,18888,154,85.0,130.0,80,1,0,1,0,0,1,0,0,1,0,0,1,1,0
1,19042,170,69.0,130.0,90,0,1,1,0,0,1,0,0,0,0,0,1,1,0
2,20432,160,70.0,120.0,75,1,0,1,0,0,1,0,0,1,0,1,0,1,0
3,18133,185,94.0,130.0,90,0,1,1,0,0,1,0,0,0,0,0,1,1,0
4,16093,170,76.0,120.0,80,0,1,1,0,0,1,0,0,1,0,0,1,1,0


### Predict and Save

In [376]:
def predict_save(estimator, filename):
    """Fit on the full training set, report importances, and save predictions.

    Uses the notebook-level ``X``/``y`` arrays for fitting and ``X_test`` for
    prediction. If the fitted estimator exposes ``feature_importances_`` they
    are printed in descending order, labelled with the columns of ``test``;
    estimators without that attribute (e.g. linear models) skip the report
    instead of failing after the fit.

    Args:
        estimator: Unfitted estimator providing ``fit`` and ``predict_proba``.
        filename: Output path; receives one value per test row, taken from
            the second column of ``predict_proba``.
    """
    estimator.fit(X, y)
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is not None:
        ranked = sorted(
            zip(test.columns, importances),
            key=operator.itemgetter(1),
            reverse=True,
        )
        for name, importance in ranked:
            print(f'{name}: {importance:.7f}')
    numpy.savetxt(filename, estimator.predict_proba(X_test)[:, 1], fmt='%f')

### Objective

In [377]:
def make_objective(make_estimator, **fit_params):
    """Build a loss function over hyper-parameters for ``hyperopt.fmin``.

    Args:
        make_estimator: Callable mapping a sampled parameter dict to an
            unfitted estimator.
        **fit_params: Forwarded to the estimator's ``fit`` through
            ``cross_val_score``.

    Returns:
        A function that takes a parameter dict, cross-validates the resulting
        estimator on the notebook-level ``X``/``y`` and returns the mean log
        loss (lower is better). Each call also prints its duration, the
        score and the parameters.
    """
    def objective(kwargs):
        started_at = time.time()
        estimator = make_estimator(kwargs)
        # Fixed seed so every candidate is scored on the same three folds.
        folds = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
        fold_scores = model_selection.cross_val_score(
            estimator,
            X,
            y,
            cv=folds,
            scoring='neg_log_loss',
            fit_params=fit_params,
        )
        # The scorer reports negated log loss; flip the sign to get a loss.
        score = -fold_scores.mean()
        elapsed_minutes = (time.time() - started_at) / 60.0
        print(f'[{elapsed_minutes:.2f} min] {score:.7f} {kwargs}')
        return score

    return objective

### XGBoost

In [436]:
# xgboost_trials = hyperopt.Trials()

In [445]:
def make_estimator(kwargs):
    """Create an XGBoost classifier from a sampled hyper-parameter dict.

    Args:
        kwargs: Mapping with 'n_estimators' and 'max_depth'. Values may be
            floats (the search logs show e.g. 99.0) and are truncated to int.

    Returns:
        An unfitted ``xgboost.XGBClassifier`` configured with ``nthread=5``.
    """
    integer_params = {
        name: int(kwargs[name])
        for name in ('n_estimators', 'max_depth')
    }
    return xgboost.XGBClassifier(nthread=5, **integer_params)

# Integer-valued search ranges (quniform with step 1) for the two tuned
# XGBoost hyper-parameters.
space = {
    'n_estimators': hyperopt.hp.quniform('n_estimators', 1, 200, 1),
    'max_depth': hyperopt.hp.quniform('max_depth', 1, 15, 1),
}
objective = make_objective(make_estimator, eval_metric='logloss')

# Keep trials accumulated by earlier runs of this cell, but create the
# container when it does not exist yet so that a fresh kernel
# (Restart & Run All) does not fail with a NameError.
if 'xgboost_trials' not in globals():
    xgboost_trials = hyperopt.Trials()

best = hyperopt.fmin(
    objective,
    space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    trials=xgboost_trials,
)
result = xgboost_trials.best_trial['result']['loss']

print()
print(f'Trials: {len(xgboost_trials)}')
print(f'Best: {result:.7f} {best}')

[0.03 min] 0.5424246 {'max_depth': 2.0, 'n_estimators': 99.0}
[0.05 min] 0.5391981 {'max_depth': 4.0, 'n_estimators': 123.0}
[0.09 min] 0.5400571 {'max_depth': 5.0, 'n_estimators': 179.0}
[0.12 min] 0.5482817 {'max_depth': 9.0, 'n_estimators': 127.0}
[0.07 min] 0.5395281 {'max_depth': 4.0, 'n_estimators': 163.0}
[0.12 min] 0.5422347 {'max_depth': 6.0, 'n_estimators': 186.0}
[0.02 min] 0.5443270 {'max_depth': 2.0, 'n_estimators': 67.0}
[0.04 min] 0.5391863 {'max_depth': 4.0, 'n_estimators': 87.0}
[0.02 min] 0.5462328 {'max_depth': 6.0, 'n_estimators': 19.0}
[0.11 min] 0.5520865 {'max_depth': 11.0, 'n_estimators': 90.0}
[0.05 min] 0.5410286 {'max_depth': 7.0, 'n_estimators': 62.0}
[0.04 min] 0.5393177 {'max_depth': 5.0, 'n_estimators': 76.0}
[0.02 min] 0.5428461 {'max_depth': 3.0, 'n_estimators': 43.0}
[0.11 min] 0.5474152 {'max_depth': 9.0, 'n_estimators': 109.0}
[0.01 min] 0.5468341 {'max_depth': 2.0, 'n_estimators': 48.0}
[0.04 min] 0.5392235 {'max_depth': 4.0, 'n_estimators': 84.0}
[

In [446]:
# Refit the best configuration found by the search on all training data and
# write the test predictions to a file named after the best CV loss.
predict_save(make_estimator(best), f'xgboost-{result:.7f}.txt')

age: 0.2612137
ap_hi: 0.1781003
weight: 0.1536939
height: 0.0969657
ap_lo: 0.0903694
cholesterol_1: 0.0382586
cholesterol_3: 0.0310026
active_0.0: 0.0296834
gluc_3: 0.0283641
gluc_1: 0.0184697
alco_0.0: 0.0171504
gender_1: 0.0164908
smoke_0.0: 0.0145119
gluc_2: 0.0131926
cholesterol_2: 0.0125330
gender_2: 0.0000000
smoke_1.0: 0.0000000
active_1.0: 0.0000000
alco_1.0: 0.0000000
