Dataset: http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

Replace zeros = NaN in some columns by median or mean

Feature Selection:
Principal Component Analysis (PCA)
Recursive Feature Elimination (RFE)
Feature Importance (e.g. by Gradient Boosting)

In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, 
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
df = pd.read_csv('pima-indians-diabetes.data', header=None,
                 names=['preg', 'gluc', 'blood_p', 'skin', 'insu', 'bmi', 'pedi', 'age', 'class'])
df.head()

In [None]:
df.describe()

In [None]:
bins = range(10, 60, 6)
df['bmi_bin'] = pd.cut(df['bmi'], bins, labels=bins[:-1])
df.head()

In [None]:
grouped = df[['bmi_bin', 'class']].groupby(['bmi_bin'], as_index=False).mean()
grouped['class'] = grouped['class'].fillna(0.0)
df = df.drop('bmi_bin', axis=1)
grouped

In [None]:
plt.plot(bins[:-1], grouped['class'], 'ro')
plt.xlabel('bmi greater than')
plt.ylabel('diabetes rate')
plt.show()

It's no surprise that overweight and diabetes are positively correlated.

In [None]:
# Compute the correlation matrix
corr = df.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
X, y = df.drop('class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

~~~~
Best parameters set found on development set:

{'C': 1000, 'kernel': 'linear'}

Grid scores on development set:

0.687 (+/-0.047) for {'gamma': 0.001, 'C': 1, 'kernel': 'rbf'}
0.686 (+/-0.067) for {'gamma': 0.0001, 'C': 1, 'kernel': 'rbf'}
0.676 (+/-0.068) for {'gamma': 0.001, 'C': 10, 'kernel': 'rbf'}
0.692 (+/-0.062) for {'gamma': 0.0001, 'C': 10, 'kernel': 'rbf'}
0.634 (+/-0.070) for {'gamma': 0.001, 'C': 100, 'kernel': 'rbf'}
0.702 (+/-0.083) for {'gamma': 0.0001, 'C': 100, 'kernel': 'rbf'}
0.627 (+/-0.061) for {'gamma': 0.001, 'C': 1000, 'kernel': 'rbf'}
0.700 (+/-0.045) for {'gamma': 0.0001, 'C': 1000, 'kernel': 'rbf'}
0.717 (+/-0.067) for {'C': 1, 'kernel': 'linear'}
0.709 (+/-0.064) for {'C': 10, 'kernel': 'linear'}
0.713 (+/-0.060) for {'C': 100, 'kernel': 'linear'}
0.727 (+/-0.092) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.81      0.73      0.77        99
          1       0.58      0.69      0.63        55

avg / total       0.73      0.71      0.72       154
~~~~

Comparing different estimators in one grid search from http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/

I had to make some adaptions to port it to Python 3 (should probably communicate them back to the author)

In [None]:
class EstimatorSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series(dict(params.items() | d.items()))

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters) 
                     for k in self.keys
                     for gsc in self.grid_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
models1 = { 
    #'GaussianNB' : GaussianNB()#,
    #'KNeighborsClassifier' : KNeighborsClassifier(),
    #'ExtraTreesClassifier': ExtraTreesClassifier(),
    #'RandomForestClassifier': RandomForestClassifier(),
    #'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    #'SVC': SVC()
}

params1 = { 
    #'GaussianNB': {},
    #'KNeighborsClassifier': {'n_neighbors': [1, 2, 3, 4, 5, 10, 15] },
    #'ExtraTreesClassifier': { 'n_estimators': [16, 32, 64, 128] },
    #'RandomForestClassifier': { 'n_estimators': [16, 32, 64, 128] },
    #'AdaBoostClassifier':  { 'n_estimators': [16, 32, 64, 128] },
    'GradientBoostingClassifier': { 'n_estimators': list(range(46, 47, 1)), 'learning_rate': [1.0], 'max_depth': [3] },
    #'SVC': {'kernel': ['linear'], 'C': [1, 10, 100, 1000, 10000], 'gamma': [0.001]}
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_train, y_train, scoring='f1', n_jobs=-1)

helper1.score_summary(sort_by='min_score')

In [None]:
# Authors: Robert McGibbon, Joel Nothman

from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2

print(__doc__)

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
mean_scores = mean_scores.max(axis=0)
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])

plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()

In [38]:
from __future__ import print_function, division

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, 
                              AdaBoostClassifier, GradientBoostingClassifier)


pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('clf', LinearSVC())
])

C_OPTIONS = [1, 0.001]
N_ESTIMATORS_OPTIONS = [5, 10, 20]
N_FEATURES_OPTIONS = [2, 4]

param_grid = [
    {
        'reduce_dim' : [None],
        'clf' : [LinearSVC()],
        'clf__C': C_OPTIONS,
    },
    {
        'reduce_dim' : [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'clf' : [LinearSVC()],
        'clf__C': C_OPTIONS,
    },
    {
        'reduce_dim' : [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'clf' : [GradientBoostingClassifier()],
        'clf__n_estimators': N_ESTIMATORS_OPTIONS,
    },
    {
        'reduce_dim' : [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'clf' : [LinearSVC()],
        'clf__C': C_OPTIONS
    },
    {
        'reduce_dim' : [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'clf' : [GradientBoostingClassifier()],
        'clf__n_estimators': N_ESTIMATORS_OPTIONS
    },
]

grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)

results = dict()
results['mean'] = grid.cv_results_['mean_test_score']
results['std'] = grid.cv_results_['std_test_score']

for i, params in enumerate(grid.cv_results_['params']):
    for k, v in params.items():
        if k in ['clf', 'reduce_dim']:
            v = str(v)
            v = v[:v.find('(')]
        if not k in results:
            results[k] = [None] * len(results['mean'])
        results[k][i] = v
    
pd.DataFrame.from_dict(results).sort_values('mean', ascending=False)

Unnamed: 0,clf,clf__C,clf__n_estimators,mean,reduce_dim,reduce_dim__k,reduce_dim__n_components,std
1,LinearSVC,0.001,,0.929883,Non,,,0.013024
0,LinearSVC,1.0,,0.902615,Non,,,0.02799
3,LinearSVC,1.0,,0.72621,PCA,,4.0,0.015354
17,LinearSVC,1.0,,0.715081,PCA,,4.0,0.013982
21,LinearSVC,0.001,,0.668336,PCA,,4.0,0.009084
7,LinearSVC,0.001,,0.668336,PCA,,4.0,0.009084
5,LinearSVC,1.0,,0.658319,NMF,,4.0,0.009081
19,LinearSVC,1.0,,0.658319,NMF,,4.0,0.009081
23,LinearSVC,0.001,,0.549805,NMF,,4.0,0.023598
9,LinearSVC,0.001,,0.549805,NMF,,4.0,0.023598
