In [306]:
import datetime
import hyperopt
import numpy
import operator
import pandas
import requests
from IPython.display import Audio
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time
import xgboost



### Read and Filter

In [321]:
dtype = {'smoke': float, 'alco': float, 'active': float}

# Columns that are one-hot encoded, in the order their indicator columns are emitted.
_CATEGORICAL_COLUMNS = ['gender', 'cholesterol', 'gluc', 'smoke', 'active', 'alco']

# Indicator columns that the feature engineering in `read_csv` indexes by name.
# Pinning them keeps the schema independent of which levels happen to occur in a file.
_EXPECTED_DUMMIES = [
    'gender_1', 'gender_2',
    'cholesterol_1', 'cholesterol_2', 'cholesterol_3',
    'gluc_1', 'gluc_2', 'gluc_3',
    'smoke_0.0', 'smoke_1.0',
    'active_0.0', 'active_1.0',
    'alco_0.0', 'alco_1.0',
]

def read_csv(filename):
    """Read a ';'-separated data file and return the engineered feature frame.

    Steps:
      1. Parse the CSV, treating the literal string 'None' as missing and
         reading `smoke`, `alco` and `active` as floats (see `dtype`).
      2. Drop the `id` column.
      3. One-hot encode the categorical columns. A row whose categorical value
         is missing gets zeros in all of that column's indicators.
      4. Guarantee that every indicator in `_EXPECTED_DUMMIES` exists (filled
         with 0 when the level never occurs in the file), so the interaction
         features below never hit a KeyError and two files always yield the
         same columns in the same order.
      5. Add age interactions, `bmi` and `smoke_1_active_0`.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Pass-through columns, indicator columns, then the derived features.

    Raises
    ------
    KeyError
        If `id` or one of the numeric columns used below (`age`, `weight`,
        `height`) is absent from the file.
    """
    raw = pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype).drop(['id'], axis=1)
    frame = pandas.get_dummies(raw, columns=_CATEGORICAL_COLUMNS)

    # Rebuild the column list explicitly: untouched columns first, then the
    # fixed indicator schema, then any indicator for an unexpected level.
    # When every expected level is present (and no other) this is exactly the
    # order get_dummies already produced, so the result is unchanged.
    passthrough = [column for column in raw.columns if column not in _CATEGORICAL_COLUMNS]
    extras = [
        column for column in frame.columns
        if column not in passthrough and column not in _EXPECTED_DUMMIES
    ]
    frame = frame.reindex(columns=passthrough + _EXPECTED_DUMMIES + extras, fill_value=0)

    frame = frame.assign(
        aged_smoke_0=(frame['smoke_0.0'] * frame['age']),
        aged_smoke_1=(frame['smoke_1.0'] * frame['age']),
        aged_active_0=(frame['active_0.0'] * frame['age']),
        aged_active_1=(frame['active_1.0'] * frame['age']),
        aged_alco_0=(frame['alco_0.0'] * frame['age']),
        aged_alco_1=(frame['alco_1.0'] * frame['age']),
        # NOTE(review): height is used as stored; if it is in centimetres this
        # is conventional BMI scaled by 1e-4 -- confirm that is intended.
        bmi=(frame['weight'] / frame['height'] ** 2),
        smoke_1_active_0=(frame['smoke_1.0'] * frame['active_0.0']),
        aged_cholesterol_1=(frame['cholesterol_1'] * frame['age']),
        aged_cholesterol_2=(frame['cholesterol_2'] * frame['age']),
        aged_cholesterol_3=(frame['cholesterol_3'] * frame['age']),
    )
    return frame

In [322]:
# Load the training data, then keep only rows whose body measurements and
# blood-pressure readings fall strictly inside the bounds below; rows with any
# remaining missing value are discarded as well.
train = read_csv('train.csv')
train = train.loc[
    train['weight'].gt(40.0) & train['weight'].lt(130.0)
    & train['height'].gt(140.0) & train['height'].lt(190.0)
    & train['ap_hi'].gt(80.0) & train['ap_hi'].lt(250.0)
    & train['ap_lo'].gt(40.0) & train['ap_lo'].lt(250.0)
].dropna()

# Target vector and feature matrix as plain arrays for the estimators.
y = train['cardio'].values
X = train.drop(['cardio'], axis=1).values
print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (68033, 30)
y: (68033,)


In [323]:
# Load the test data through the same pipeline as the training data.
# No row filtering or dropna is applied here, so every test row is kept.
test = read_csv('test.csv')

# Feature matrix for prediction; its column count must match X's.
X_test = test.values
print(f'X_test: {X_test.shape}')

X_test: (30000, 30)


In [324]:
# Preview the first rows of the filtered, feature-engineered training frame.
train.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,...,aged_active_1,aged_alco_0,aged_alco_1,aged_cholesterol_1,aged_cholesterol_2,aged_cholesterol_3,aged_smoke_0,aged_smoke_1,bmi,smoke_1_active_0
0,18393,168,62.0,110,80,0,0,1,1,0,...,18393,18393,0,18393,0,0,18393,0,0.002197,0
1,20228,156,85.0,140,90,1,1,0,0,0,...,20228,20228,0,0,0,20228,20228,0,0.003493,0
2,18857,165,64.0,130,70,1,1,0,0,0,...,0,18857,0,0,0,18857,18857,0,0.002351,0
3,17623,169,82.0,150,100,1,0,1,1,0,...,17623,17623,0,17623,0,0,17623,0,0.002871,0
4,17474,156,56.0,100,60,0,1,0,1,0,...,0,17474,0,17474,0,0,17474,0,0.002301,0


In [325]:
# Preview the first rows of the feature-engineered test frame.
test.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,...,aged_active_1,aged_alco_0,aged_alco_1,aged_cholesterol_1,aged_cholesterol_2,aged_cholesterol_3,aged_smoke_0,aged_smoke_1,bmi,smoke_1_active_0
0,18888,154,85.0,130,80,1,0,1,0,0,...,18888,18888,0,18888,0,0,18888,0,0.003584,0
1,19042,170,69.0,130,90,0,1,1,0,0,...,19042,19042,0,19042,0,0,0,0,0.002388,0
2,20432,160,70.0,120,75,1,0,1,0,0,...,0,20432,0,20432,0,0,20432,0,0.002734,0
3,18133,185,94.0,130,90,0,1,1,0,0,...,18133,18133,0,18133,0,0,0,0,0.002747,0
4,16093,170,76.0,120,80,0,1,1,0,0,...,16093,16093,0,16093,0,0,16093,0,0.00263,0


### Predict and Save

In [326]:
def predict_save(estimator, filename):
    """Fit `estimator` on the training set and save its test-set probabilities.

    Uses the notebook globals `X`, `y` (training data), `X_test` (test
    features) and `test` (for feature names).

    Parameters
    ----------
    estimator
        An unfitted classifier exposing `fit` and `predict_proba`.
    filename : str
        Destination text file; one probability per line, written with '%f'.

    Side effects
    ------------
    Fits `estimator` in place, prints feature importances (highest first)
    when the estimator provides them, and writes `filename`.
    """
    estimator.fit(X, y)

    # Only some estimators expose `feature_importances_`. Skip the report for
    # the others instead of raising before the predictions are written.
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is not None:
        # Names come from the test frame's columns; this assumes they are in
        # the same order as the columns of X -- verify if the pipelines diverge.
        ranked = sorted(zip(test.columns, importances), key=operator.itemgetter(1), reverse=True)
        for name, importance in ranked:
            print(f'{name}: {importance:.7f}')

    # Column 1 of predict_proba is the probability of the positive class.
    numpy.savetxt(filename, estimator.predict_proba(X_test)[:, 1], fmt='%f')

### Objective

In [333]:
def make_objective(make_estimator, **fit_params):
    """Build a hyperopt objective that scores an estimator by cross-validated log loss.

    Parameters
    ----------
    make_estimator : callable
        Called with the sampled hyperparameter dict; must return an unfitted
        classifier.
    **fit_params
        Forwarded to the estimator's `fit` through `cross_val_score`.

    Returns
    -------
    callable
        `objective(kwargs) -> float`: the mean 3-fold log loss (lower is
        better) on the notebook globals `X` and `y`. Each call also prints the
        elapsed minutes, the score and the sampled parameters.
    """
    def objective(kwargs):
        start_time = time.time()
        # Fixed seed: every trial is evaluated on the same three folds, so the
        # losses compared by the optimizer differ because of the
        # hyperparameters rather than because of a fresh random split.
        folds = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
        # scoring='neg_log_loss' yields negated losses; flip the sign so the
        # value can be minimized directly.
        score = -model_selection.cross_val_score(
            make_estimator(kwargs),
            X,
            y,
            cv=folds,
            scoring='neg_log_loss',
            fit_params=fit_params,
        ).mean()
        print(f'[{(time.time() - start_time) / 60.0:.2f} min] {score:.7f} {kwargs}')
        return score
    return objective

### Random Forest

In [304]:
def make_estimator(kwargs):
    """Return an unfitted random forest configured from sampled hyperparameters.

    `kwargs` must contain 'n_estimators' and 'max_depth'; both are truncated
    to int because the search space samples them as floats.
    """
    integer_params = {
        name: int(kwargs[name])
        for name in ('n_estimators', 'max_depth')
    }
    return ensemble.RandomForestClassifier(n_jobs=4, **integer_params)

# Search space: both hyperparameters are sampled uniformly in steps of 1.
space = dict(
    n_estimators=hyperopt.hp.quniform('n_estimators', 1, 100, 1),
    max_depth=hyperopt.hp.quniform('max_depth', 1, 50, 1),
)

# Run 100 TPE evaluations of the cross-validated objective.
best = hyperopt.fmin(
    fn=make_objective(make_estimator),
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
)
best

[0.16 min] 0.5478589 {'max_depth': 16.0, 'n_estimators': 83.0}
[0.04 min] 0.5611649 {'max_depth': 5.0, 'n_estimators': 32.0}
[0.13 min] 0.5650408 {'max_depth': 23.0, 'n_estimators': 62.0}
[0.04 min] 0.5706355 {'max_depth': 4.0, 'n_estimators': 37.0}
[0.04 min] 1.1895872 {'max_depth': 38.0, 'n_estimators': 11.0}
[0.12 min] 0.6085148 {'max_depth': 42.0, 'n_estimators': 54.0}
[0.05 min] 1.0809867 {'max_depth': 47.0, 'n_estimators': 13.0}
[0.07 min] 0.7110677 {'max_depth': 38.0, 'n_estimators': 26.0}
[0.02 min] 0.5708085 {'max_depth': 5.0, 'n_estimators': 4.0}
[0.06 min] 0.5464596 {'max_depth': 13.0, 'n_estimators': 30.0}
[0.09 min] 0.6500366 {'max_depth': 44.0, 'n_estimators': 34.0}
[0.06 min] 0.7161583 {'max_depth': 32.0, 'n_estimators': 21.0}
[0.12 min] 0.5538628 {'max_depth': 19.0, 'n_estimators': 65.0}
[0.10 min] 0.6296145 {'max_depth': 44.0, 'n_estimators': 44.0}
[0.09 min] 0.5740403 {'max_depth': 24.0, 'n_estimators': 38.0}
[0.10 min] 0.5492698 {'max_depth': 16.0, 'n_estimators': 55

{'max_depth': 11.0, 'n_estimators': 56.0}

In [305]:
# Refit a random forest with the best hyperparameters found above and write
# its test-set probabilities to 'random-forest.txt'.
predict_save(make_estimator(best), 'random-forest.txt')

ap_hi: 0.3595507
ap_lo: 0.1899967
age: 0.0760487
aged_smoke_0: 0.0665916
bmi: 0.0523563
cholesterol_3: 0.0458820
cholesterol_1: 0.0430720
weight: 0.0378965
aged_active_1: 0.0368078
aged_active_0: 0.0236854
height: 0.0200185
cholesterol_2: 0.0090780
aged_smoke_1: 0.0073815
gluc_1: 0.0057995
gluc_3: 0.0050410
gender_1: 0.0030363
gluc_2: 0.0028736
gender_2: 0.0028507
active_0.0: 0.0026435
alco_1.0: 0.0022493
active_1.0: 0.0022305
alco_0.0: 0.0020149
smoke_0.0: 0.0015447
smoke_1.0: 0.0013503


### XGBoost

In [343]:
# Trial history for the XGBoost search. It lives in its own cell, so
# re-running the search cell reuses this object rather than creating a new
# one -- presumably so the search can be extended incrementally; confirm.
xgboost_trials = hyperopt.Trials()

In [352]:
def make_estimator(kwargs):
    """Return an unfitted XGBoost classifier configured from sampled hyperparameters.

    `kwargs` must contain 'n_estimators' and 'max_depth'; both are truncated
    to int because the search space samples them as floats.

    NOTE(review): this reuses the name of the random-forest factory defined
    earlier in the notebook and replaces it once this cell runs.
    """
    integer_params = {
        name: int(kwargs[name])
        for name in ('n_estimators', 'max_depth')
    }
    return xgboost.XGBClassifier(nthread=5, **integer_params)

# Search space: both hyperparameters are sampled uniformly in steps of 1.
space = dict(
    n_estimators=hyperopt.hp.quniform('n_estimators', 1, 100, 1),
    max_depth=hyperopt.hp.quniform('max_depth', 1, 20, 1),
)

# The objective passes eval_metric='logloss' through to the estimator's fit.
objective = make_objective(make_estimator, eval_metric='logloss')
best = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=30,
    trials=xgboost_trials,
)

# Lowest loss recorded in the trial history.
result = xgboost_trials.best_trial['result']['loss']

print()
print(f'Trials: {len(xgboost_trials)}')
print(f'Best: {result:.7f} {best}')

[0.08 min] 0.5427101 {'max_depth': 6.0, 'n_estimators': 72.0}
[0.06 min] 0.5417397 {'max_depth': 6.0, 'n_estimators': 55.0}
[0.03 min] 0.5426937 {'max_depth': 5.0, 'n_estimators': 31.0}
[0.02 min] 0.5576714 {'max_depth': 1.0, 'n_estimators': 69.0}
[0.07 min] 0.5416419 {'max_depth': 5.0, 'n_estimators': 82.0}
[0.02 min] 0.5542847 {'max_depth': 7.0, 'n_estimators': 14.0}
[0.04 min] 0.5413521 {'max_depth': 4.0, 'n_estimators': 51.0}
[0.03 min] 0.5428777 {'max_depth': 3.0, 'n_estimators': 52.0}
[0.01 min] 0.5702489 {'max_depth': 1.0, 'n_estimators': 33.0}
[0.01 min] 0.5615349 {'max_depth': 4.0, 'n_estimators': 13.0}

Trials: 30
Best: 0.5413521 {'max_depth': 4.0, 'n_estimators': 51.0}


In [354]:
# Refit XGBoost with the best hyperparameters and write its test-set
# probabilities; the file name embeds the best cross-validated loss.
predict_save(make_estimator(best), f'xgboost-{result:.7f}.txt')

ap_hi: 0.1571238
bmi: 0.1145140
aged_alco_0: 0.0812250
age: 0.0785619
ap_lo: 0.0719041
weight: 0.0705726
aged_cholesterol_3: 0.0572570
aged_smoke_0: 0.0572570
gluc_3: 0.0426099
aged_cholesterol_1: 0.0426099
height: 0.0399467
aged_active_0: 0.0399467
aged_active_1: 0.0359521
cholesterol_1: 0.0252996
aged_cholesterol_2: 0.0226365
cholesterol_3: 0.0186418
gluc_2: 0.0146471
gluc_1: 0.0133156
gender_1: 0.0066578
aged_smoke_1: 0.0053262
aged_alco_1: 0.0039947
gender_2: 0.0000000
cholesterol_2: 0.0000000
smoke_0.0: 0.0000000
smoke_1.0: 0.0000000
active_0.0: 0.0000000
active_1.0: 0.0000000
alco_0.0: 0.0000000
alco_1.0: 0.0000000
smoke_1_active_0: 0.0000000
