[Introduction to Machine Learning with Python and Scikit-Learn](http://kukuruku.co/hub/python/introduction-to-machine-learning-with-python-andscikit-learn)

In [1]:
import numpy as np
import urllib

In [3]:
## load data
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
raw_data = urllib.urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=',')
X = dataset[:, 0:7]
y = dataset[:, 8]

In [4]:
## data normalization
from sklearn import  preprocessing
normalized_X = preprocessing.normalize(X)
standardized_X = preprocessing.scale(X)

In [5]:
## feature selection
from sklearn import  metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)

[ 0.13093777  0.25575154  0.11844188  0.08568266  0.09105997  0.18077661
  0.13734957]


In [7]:
## recursive feature elemination algorithm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(X, y)
# summarize the selection of attributes
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


In [8]:
## logistic regression
model = LogisticRegression()
model.fit(X, y)
predicted = model.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768

[[447  53]
 [120 148]]


In [9]:
## naive bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
predicted = model.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

             precision    recall  f1-score   support

        0.0       0.80      0.86      0.83       500
        1.0       0.69      0.60      0.64       268

avg / total       0.76      0.77      0.76       768

[[429  71]
 [108 160]]


In [10]:
## k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
predicted = model.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
             precision    recall  f1-score   support

        0.0       0.82      0.90      0.86       500
        1.0       0.77      0.63      0.69       268

avg / total       0.80      0.80      0.80       768

[[448  52]
 [ 98 170]]


In [11]:
## decision trees: classification and regression trees (CART)
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
predicted = model.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768

[[500   0]
 [  0 268]]


In [12]:
## support vector machines
from sklearn.svm import SVC
model = SVC()
model.fit(X, y)
print(model)
predicted = model.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768

[[500   0]
 [  0 268]]


In [13]:
## optimize algorithm parameters: selection of the regularization parameter
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)
0.282118955686
1.0


In [15]:
## optimize algorithm parameters: randomly select a parameter from the given range
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV
param_grid = {'alpha': sp_rand()}
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, 
                             param_distributions=param_grid, 
                             n_iter=100)
rsearch.fit(X, y)
print(rsearch)
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107429b90>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
0.282117844453
0.959009203043


In [21]:
predicted = rsearch.predict(X)
print(metrics.classification_report(y, predicted))
print(metrics.confusion_matrix(y, predicted))

ValueError: Mix type of y not allowed, got types set(['binary', 'continuous'])

In [22]:
predicted

array([ 0.62174181, -0.00917087,  0.77448528, -0.010409  ,  0.81177823,
        0.22924422,  0.05655705,  0.61845581,  0.60921831, -0.01845282,
        0.28696287,  0.88561773,  0.68792214,  0.52907262,  0.56489235,
        0.43436414,  0.36813547,  0.28491162,  0.35341672,  0.27933079,
        0.42042116,  0.3368798 ,  0.95944756,  0.38809787,  0.6266047 ,
        0.46754439,  0.65463671,  0.01329612,  0.53009562,  0.33053046,
        0.3720682 ,  0.55953298,  0.02967293, -0.00644394,  0.45456468,
        0.20898945,  0.64252865,  0.4039501 ,  0.21729759,  0.46570167,
        0.69750195,  0.62888089,  0.14736654,  0.88265242,  0.60022478,
        0.97945418,  0.45784946, -0.02422951,  0.41436794,  0.04312367,
       -0.01702931,  0.10972975,  0.07480502,  0.72541294,  0.63338839,
       -0.11164323,  0.83275587,  0.34725555,  0.70364623,  0.23387697,
       -0.21044558,  0.51874275, -0.13599472,  0.3859496 ,  0.386895  ,
        0.16948256,  0.2201885 ,  0.39586989, -0.04952617,  0.38

In [20]:
y.shape

(768,)