In [119]:
import numpy
import pandas
from sklearn import ensemble, linear_model, metrics, model_selection, svm

### Reading

In [90]:
train = pandas \
    .read_csv('train.csv', sep=';', header=0, na_values='None') \
    .drop('id', axis=1)
train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [91]:
test = pandas \
    .read_csv('test.csv', sep=';', header=0, na_values='None') \
    .drop('id', axis=1)
test.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18888,1,154,85.0,130,80,1,1,0.0,0.0,1.0
1,19042,2,170,69.0,130,90,1,1,,0.0,1.0
2,20432,1,160,70.0,120,75,1,1,0.0,0.0,0.0
3,18133,2,185,94.0,130,90,1,1,,0.0,1.0
4,16093,2,170,76.0,120,80,1,1,0.0,0.0,1.0


In [106]:
test[['smoke', 'alco', 'active']] = test[['smoke', 'alco', 'active']].fillna(value=0.0)
X_test = test.values

### Filtering

In [92]:
filtered_train = train[
    (train['weight'] > 40.0) &
    (train['weight'] < 130.0) &
    (train['height'] > 140.0) &
    (train['height'] < 190.0) &
    (train['ap_hi'] > 80.0) &
    (train['ap_hi'] < 250.0) &
    (train['ap_lo'] > 40.0) &
    (train['ap_lo'] < 250.0)
].dropna()
filtered_train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### Make arrays

In [81]:
X = filtered_train.drop('cardio', axis=1).values
y = filtered_train['cardio'].values
X.shape, y.shape

((68033, 11), (68033,))

### Metrics

In [86]:
scoring = metrics.make_scorer(metrics.log_loss, greater_is_better=False)

### Random forest

In [98]:
grid_search = model_selection.GridSearchCV(
    ensemble.RandomForestClassifier(),
    param_grid={
        'n_estimators': [10, 15, 20, 25],
        'max_depth': [None, 5, 10, 15, 20],
    },
    n_jobs=5,
    scoring=scoring,
).fit(X, y)

In [100]:
grid_search.best_score_, grid_search.best_params_

(-9.1854808464227737, {'max_depth': 10, 'n_estimators': 20})

In [114]:
y_test = grid_search.best_estimator_ \
    .fit(X, y) \
    .predict(X_test)

In [117]:
numpy.savetxt('00-random-forest.txt', y_test, fmt='%d')

### SVC

In [None]:
grid_search = model_selection.GridSearchCV(
    svm.SVC(),
    param_grid={
        'C': [0.9, 1.0, 1.1],
    },
    n_jobs=5,
    scoring=scoring,
).fit(X, y)

In [None]:
grid_search.best_score_, grid_search.best_params_