In [34]:
import datetime
import hyperopt
import numpy
import pandas
import requests
from IPython.core.display import display, HTML
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)
import time

### Read and Filter

In [86]:
dtype = {'smoke': float, 'alco': float, 'active': float}

def read_csv(filename):
    return pandas \
        .get_dummies(
            pandas.read_csv(filename, sep=';', header=0, na_values='None', dtype=dtype),
            columns=['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'],
        ) \
        .drop(['id', 'ap_lo'], axis=1)

In [89]:
train = read_csv('train.csv')
train = train[
    (train['weight'] > 40.0)
    & (train['weight'] < 130.0)
    & (train['height'] > 140.0)
    & (train['height'] < 190.0)
    & (train['ap_hi'] > 80.0)
    & (train['ap_hi'] < 250.0)
    # & (train['ap_lo'] > 40.0)
    # & (train['ap_lo'] < 250.0)
].dropna()

X = train.drop('cardio', axis=1).values
y = train['cardio'].values
print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (69021, 18)
y: (69021,)


In [90]:
test = read_csv('test.csv')

X_test = test.values
print(f'X_test: {X_test.shape}')

X_test: (30000, 18)


In [91]:
train.head()

Unnamed: 0,age,height,weight,ap_hi,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0.0,smoke_1.0,alco_0.0,alco_1.0,active_0.0,active_1.0
0,18393,168,62.0,110,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1
1,20228,156,85.0,140,1,1,0,0,0,1,1,0,0,1,0,1,0,0,1
2,18857,165,64.0,130,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0
3,17623,169,82.0,150,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1
4,17474,156,56.0,100,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0


In [92]:
test.head()

Unnamed: 0,age,height,weight,ap_hi,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0.0,smoke_1.0,alco_0.0,alco_1.0,active_0.0,active_1.0
0,18888,154,85.0,130,1,0,1,0,0,1,0,0,1,0,1,0,0,1
1,19042,170,69.0,130,0,1,1,0,0,1,0,0,0,0,1,0,0,1
2,20432,160,70.0,120,1,0,1,0,0,1,0,0,1,0,1,0,1,0
3,18133,185,94.0,130,0,1,1,0,0,1,0,0,0,0,1,0,0,1
4,16093,170,76.0,120,0,1,1,0,0,1,0,0,1,0,1,0,0,1


### Predict and Save

In [101]:
def predict_save(estimator, filename):
    estimator.fit(X, y)
    for name, importance in zip(train, estimator.feature_importances_):
        print(f'{name}: {importance:.7f}')
    numpy.savetxt(filename, estimator.predict_proba(X_test)[:, 1], fmt='%f')

### Objective

In [94]:
def make_objective(make_estimator):
    def objective(kwargs):
        start_time = time.time()
        score = -model_selection.cross_val_score(
            make_estimator(kwargs),
            X,
            y,
            cv=model_selection.StratifiedKFold(shuffle=True),
            scoring='neg_log_loss',
        ).mean()
        print(f'[{(time.time() - start_time) / 60.0:.1f} min] {score:.7f} {kwargs}')
        return score
    return objective

### Random Forest

In [95]:
def make_estimator(kwargs):
    return ensemble.RandomForestClassifier(
        n_estimators=int(kwargs['n_estimators']), 
        max_depth=int(kwargs['max_depth']),
    )

space = {
    'n_estimators': hyperopt.hp.quniform('n_estimators', 1, 1000, 1),
    'max_depth': hyperopt.hp.quniform('max_depth', 1, 50, 1),
}
best = hyperopt.fmin(make_objective(make_estimator), space, hyperopt.tpe.suggest, 5)
best

[1.3 min] 0.5551171 {'max_depth': 18.0, 'n_estimators': 537.0}
[2.2 min] 0.5942244 {'max_depth': 43.0, 'n_estimators': 687.0}
[2.1 min] 0.5656297 {'max_depth': 22.0, 'n_estimators': 747.0}
[0.5 min] 0.5991992 {'max_depth': 47.0, 'n_estimators': 162.0}
[1.9 min] 0.5682895 {'max_depth': 23.0, 'n_estimators': 651.0}


{'max_depth': 18.0, 'n_estimators': 537.0}

In [None]:
predict_save(make_estimator(best), 'random-forest.txt')

age: 0.2379431
height: 0.1187979
weight: 0.1436067
ap_hi: 0.3710759
cardio: 0.0064917
gender_1: 0.0064895
gender_2: 0.0325123
cholesterol_1: 0.0073940
cholesterol_2: 0.0260537
cholesterol_3: 0.0076698
gluc_1: 0.0051966
gluc_2: 0.0059060
gluc_3: 0.0048101
smoke_0.0: 0.0048383
smoke_1.0: 0.0045761
alco_0.0: 0.0045918
alco_1.0: 0.0060101
active_0.0: 0.0060362
