In [2]:
import hyperopt
import numpy
import pandas
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)

### Reading

In [3]:
train = pandas \
    .read_csv('train.csv', sep=';', header=0, na_values='None') \
    .drop(['id', 'smoke', 'alco', 'active'], axis=1)
train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,cardio
0,18393,2,168,62.0,110,80,1,1,0
1,20228,1,156,85.0,140,90,3,1,1
2,18857,1,165,64.0,130,70,3,1,1
3,17623,2,169,82.0,150,100,1,1,1
4,17474,1,156,56.0,100,60,1,1,0


In [4]:
test = pandas \
    .read_csv('test.csv', sep=';', header=0, na_values='None') \
    .drop(['id', 'smoke', 'alco', 'active'], axis=1)
test.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc
0,18888,1,154,85.0,130,80,1,1
1,19042,2,170,69.0,130,90,1,1
2,20432,1,160,70.0,120,75,1,1
3,18133,2,185,94.0,130,90,1,1
4,16093,2,170,76.0,120,80,1,1


In [5]:
# test[['smoke', 'alco', 'active']] = test[['smoke', 'alco', 'active']].fillna(value=0.0)
X_test = test.values

### Filtering

In [6]:
filtered_train = train[
    (train['weight'] > 40.0) &
    (train['weight'] < 130.0) &
    (train['height'] > 140.0) &
    (train['height'] < 190.0) &
    (train['ap_hi'] > 80.0) &
    (train['ap_hi'] < 250.0) &
    (train['ap_lo'] > 40.0) &
    (train['ap_lo'] < 250.0)
].dropna()
filtered_train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,cardio
0,18393,2,168,62.0,110,80,1,1,0
1,20228,1,156,85.0,140,90,3,1,1
2,18857,1,165,64.0,130,70,3,1,1
3,17623,2,169,82.0,150,100,1,1,1
4,17474,1,156,56.0,100,60,1,1,0


### Make arrays

In [7]:
X = filtered_train.drop('cardio', axis=1).values
y = filtered_train['cardio'].values
X.shape, y.shape

((68033, 8), (68033,))

### Saving

In [55]:
def predict_save(estimator, filename):
    numpy.savetxt(filename, estimator.fit(X, y).predict_proba(X_test)[:, 1], fmt='%f')

### Objective

In [58]:
def make_objective(make_estimator):
    def objective(kwargs):
        score = -model_selection.cross_val_score(
            make_estimator(kwargs),
            X,
            y,
            cv=model_selection.StratifiedKFold(shuffle=True),
            scoring='neg_log_loss',
        ).mean()
        print(score, kwargs)
        return score
    return objective

### Random forest

In [None]:
def make_estimator(kwargs):
    return ensemble.RandomForestClassifier(
        n_estimators=int(kwargs['n_estimators']), 
        max_depth=int(kwargs['max_depth']),
    )

space = {
    'n_estimators': hyperopt.hp.quniform('n_estimators', 1, 1000, 1),
    'max_depth': hyperopt.hp.quniform('max_depth', 1, 1000, 1),
}
objective = make_objective(make_estimator)
best = hyperopt.fmin(objective, space, hyperopt.tpe.suggest, 25)
objective(best), best

0.630232298033 {'max_depth': 324.0, 'n_estimators': 76.0}
0.599287941728 {'max_depth': 716.0, 'n_estimators': 161.0}
0.551906840042 {'max_depth': 17.0, 'n_estimators': 171.0}
0.590086110223 {'max_depth': 490.0, 'n_estimators': 596.0}
0.590211697569 {'max_depth': 815.0, 'n_estimators': 870.0}
0.597662794508 {'max_depth': 786.0, 'n_estimators': 222.0}
0.589336879566 {'max_depth': 552.0, 'n_estimators': 557.0}
0.588835704501 {'max_depth': 554.0, 'n_estimators': 409.0}
0.590234343168 {'max_depth': 615.0, 'n_estimators': 948.0}
0.592391568597 {'max_depth': 710.0, 'n_estimators': 743.0}
0.608859882058 {'max_depth': 87.0, 'n_estimators': 118.0}


In [56]:
predict_save(make_estimator(best), '00-random-forest.txt')

### SVC

In [12]:
grid_search = model_selection.GridSearchCV(
    svm.SVC(),
    param_grid={
        'C': [1.0],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=2,
).fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] C=1.0 ...........................................................
[CV] C=1.0 ...........................................................
[CV] ............................................ C=1.0, total= 2.8min
[CV] C=1.0 ...........................................................
[CV] ............................................ C=1.0, total= 2.8min
[CV] ............................................ C=1.0, total= 2.8min


[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:  6.7min finished


In [13]:
grid_search.best_score_, grid_search.best_params_

(-14.633041639810525, {'C': 1.0})

### Naive Bayes

In [33]:
model_selection.cross_val_score(naive_bayes.GaussianNB(), X, y, scoring=scoring)

array([ -9.93922383, -10.03365181,  -9.97926246])

### Neighbors

In [31]:
grid_search = model_selection.GridSearchCV(
    neighbors.KNeighborsClassifier(),
    param_grid={
        'n_neighbors': [5, 10, 15, 20, 40, 50, 60, 75, 100],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
-10.1333132386 {'n_neighbors': 50}


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:   11.5s finished


### NN

In [32]:
grid_search = model_selection.GridSearchCV(
    neural_network.MLPClassifier(max_iter=10000),
    param_grid={
        'alpha': [0.001, 0.0001, 0.00001],
        'hidden_layer_sizes': [(100,), (200,), (300,)],
    },
    n_jobs=1,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  1.0min finished


-11.7111743968 {'alpha': 0.0001, 'hidden_layer_sizes': (300,)}


### AdaBoost

In [16]:
grid_search = model_selection.GridSearchCV(
    ensemble.AdaBoostClassifier(),
    param_grid={
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.9, 1.0, 1.1],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:   34.3s finished


-9.37433031998 {'learning_rate': 1.0, 'n_estimators': 100}


In [19]:
predict_save(grid_search.best_estimator_, '01-adaboost.txt')