In [39]:
import numpy
import pandas
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    neural_network,
    svm,
)

### Reading

In [22]:
train = pandas \
    .read_csv('train.csv', sep=';', header=0, na_values='None') \
    .drop('id', axis=1)
train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [23]:
test = pandas \
    .read_csv('test.csv', sep=';', header=0, na_values='None') \
    .drop('id', axis=1)
test.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18888,1,154,85.0,130,80,1,1,0.0,0.0,1.0
1,19042,2,170,69.0,130,90,1,1,,0.0,1.0
2,20432,1,160,70.0,120,75,1,1,0.0,0.0,0.0
3,18133,2,185,94.0,130,90,1,1,,0.0,1.0
4,16093,2,170,76.0,120,80,1,1,0.0,0.0,1.0


In [24]:
test[['smoke', 'alco', 'active']] = test[['smoke', 'alco', 'active']].fillna(value=0.0)
X_test = test.values

### Filtering

In [25]:
filtered_train = train[
    (train['weight'] > 40.0) &
    (train['weight'] < 130.0) &
    (train['height'] > 140.0) &
    (train['height'] < 190.0) &
    (train['ap_hi'] > 80.0) &
    (train['ap_hi'] < 250.0) &
    (train['ap_lo'] > 40.0) &
    (train['ap_lo'] < 250.0)
].dropna()
filtered_train.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### Make arrays

In [26]:
X = filtered_train.drop('cardio', axis=1).values
y = filtered_train['cardio'].values
X.shape, y.shape

((68033, 11), (68033,))

### Metrics

In [27]:
scoring = metrics.make_scorer(metrics.log_loss, greater_is_better=False)

### Random forest

In [8]:
grid_search = model_selection.GridSearchCV(
    ensemble.RandomForestClassifier(),
    param_grid={
        'n_estimators': [10, 15, 20, 25],
        'max_depth': [None, 5, 10, 15, 20],
    },
    n_jobs=4,
    scoring=scoring,
).fit(X, y)

In [9]:
grid_search.best_score_, grid_search.best_params_

(-9.2083265849336335, {'max_depth': 10, 'n_estimators': 20})

In [10]:
y_test = grid_search.best_estimator_ \
    .fit(X, y) \
    .predict(X_test)

In [11]:
numpy.savetxt('00-random-forest.txt', y_test, fmt='%d')

### SVC

In [12]:
grid_search = model_selection.GridSearchCV(
    svm.SVC(),
    param_grid={
        'C': [1.0],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=2,
).fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] C=1.0 ...........................................................
[CV] C=1.0 ...........................................................
[CV] ............................................ C=1.0, total= 2.8min
[CV] C=1.0 ...........................................................
[CV] ............................................ C=1.0, total= 2.8min
[CV] ............................................ C=1.0, total= 2.8min


[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:  6.7min finished


In [13]:
grid_search.best_score_, grid_search.best_params_

(-14.633041639810525, {'C': 1.0})

In [14]:
def predict_save(estimator, filename):
    numpy.savetxt(filename, estimator.fit(X, y).predict(X_test), fmt='%d')

### Naive Bayes

In [30]:
model_selection.cross_val_score(naive_bayes.GaussianNB(), X, y, scoring=scoring)

array([-9.90724066, -9.99253083, -9.97164783])

### Neighbors

In [36]:
grid_search = model_selection.GridSearchCV(
    neighbors.KNeighborsClassifier(),
    param_grid={
        'n_neighbors': [5, 10, 15, 20, 40, 50, 60, 75, 100],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
-10.1363593582 {'n_neighbors': 50}


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:   14.1s finished


In [38]:
grid_search = model_selection.GridSearchCV(
    neighbors.RadiusNeighborsClassifier(),
    param_grid={
        'radius': [100.0],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
-13.5398862061 {'radius': 100.0}


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:   15.2s finished


### NN

In [47]:
grid_search = model_selection.GridSearchCV(
    neural_network.MLPClassifier(max_iter=10000),
    param_grid={
        'alpha': [0.001, 0.0001, 0.00001],
        'hidden_layer_sizes': [(100,), (200,), (300,)],
    },
    n_jobs=1,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  1.0min finished


-13.8551487802 {'alpha': 0.0001, 'hidden_layer_sizes': (200,)}


### AdaBoost

In [51]:
grid_search = model_selection.GridSearchCV(
    ensemble.AdaBoostClassifier(),
    param_grid={
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.9, 1.0, 1.1],
    },
    n_jobs=4,
    scoring=scoring,
    verbose=1,
).fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:   37.7s finished


-9.34234807959 {'learning_rate': 0.9, 'n_estimators': 50}


In [53]:
predict_save(grid_search.best_estimator_, '01-adaboost.txt')