In [2]:
import numpy as np
from sklearn import datasets

### Nearest neighbor

In [3]:
# Load the iris dataset and unpack its feature matrix and target labels
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

In [9]:
# Quick look at the data: the distinct target labels and the shape of the feature matrix.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(np.unique(iris_y))
print(iris_X.shape)

[0 1 2]
(150L, 4L)


In [7]:
# Randomly split iris into train and test data: shuffle the row indices
# (seeded so the split is reproducible) and hold out the last 10 for testing.
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]
iris_X_train, iris_y_train = iris_X[train_idx], iris_y[train_idx]
iris_X_test, iris_y_test = iris_X[test_idx], iris_y[test_idx]

In [8]:
# Show the shuffled row order; the last 10 entries are the held-out test rows
print(indices)

[114  62  33 107   7 100  40  86  76  71 134  51  73  54  63  37  78  90
  45  16 121  66  24   8 126  22  44  97  93  26 137  84  27 127 132  59
  18  83  61  92 112   2 141  43  10  60 116 144 119 108  69 135  56  80
 123 133 106 146  50 147  85  30 101  94  64  89  91 125  48  13 111  95
  20  15  52   3 149  98   6  68 109  96  12 102 120 104 128  46  11 110
 124  41 148   1 113 139  42   4 129  17  38   5  53 143 105   0  34  28
  55  75  35  23  74  31 118  57 131  65  32 138  14 122  19  29 130  49
 136  99  82  79 115 145  72  77  25  81 140 142  39  58  88  70  87  36
  21   9 103  67 117  47]


In [12]:
# Build a nearest-neighbor classifier with default settings and fit it
# on the iris training split
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(iris_X_train, iris_y_train)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
# Predicted labels for the 10 held-out iris samples
knn.predict(iris_X_test)

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [14]:
# True labels of the held-out iris samples, for comparison with the predictions
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

### Linear model: from regression to sparsity

In [96]:
# Load the diabetes dataset and hold out the last 20 samples as the test set
diabetes = datasets.load_diabetes()
diabetes_X, diabetes_y = diabetes.data, diabetes.target
diabetes_X_train, diabetes_X_test = diabetes.data[:-20], diabetes.data[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

In [26]:
# Shape of the diabetes training matrix.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(diabetes_X_train.shape)

(422L, 10L)


In [21]:
from sklearn import linear_model

# Fit a linear regression with default settings on the diabetes training split
regr = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
# Fitted coefficients of the linear model
print(regr.coef_)

 [  3.03499549e-01  -2.37639315e+02   5.10530605e+02   3.27736980e+02
  -8.14131709e+02   4.92814588e+02   1.02848452e+02   1.84606489e+02
   7.43519617e+02   7.60951722e+01]


In [27]:
# Mean squared error of the predictions on the held-out diabetes samples
residuals = regr.predict(diabetes_X_test) - diabetes_y_test
np.mean(residuals ** 2)

2004.5676026898223

In [30]:
# Coefficient of determination (R^2) on the test set: 1 is perfect prediction
# and 0 means that there is no linear relationship between X and y
regr.score(diabetes_X_test, diabetes_y_test)

0.58507530226905713

In [33]:
# Six regularization strengths, log-spaced from 1e-4 to 1e-1
alphas = np.logspace(-4, -1, 6)
print(alphas)

[ 0.0001      0.00039811  0.00158489  0.00630957  0.02511886  0.1       ]


In [34]:
# Ridge regression: fit one model per alpha and print its score on the test set.
# Note: this rebinds `regr`, replacing the previously fitted model.
for alpha in alphas:
    regr = linear_model.Ridge(alpha=alpha)
    regr.fit(diabetes_X_train, diabetes_y_train)
    print(regr.score(diabetes_X_test, diabetes_y_test))

0.585111068388
0.585207301544
0.58546775407
0.58555120365
0.583071708555
0.570589994373


In [38]:
# Lasso, which sets some coefficients to zero.
# A single estimator is reused: for each alpha, update the parameter in place,
# refit on the training split and record the score on the test split.
regr = linear_model.Lasso()
scores = [
    regr.set_params(alpha=alpha)
        .fit(diabetes_X_train, diabetes_y_train)
        .score(diabetes_X_test, diabetes_y_test)
    for alpha in alphas
]
print(scores)

[0.58511910691622271, 0.58524713649060511, 0.58571895391793549, 0.58730094854527404, 0.58876224183092618, 0.58284500296816799]


In [40]:
# Pick the alpha with the highest score and refit the Lasso with it, then
# show the coefficients.
# NOTE(review): alpha is chosen using test-set scores, so the reported test
# score is optimistic; a cross-validated search avoids this.
best_alpha = alphas[np.argmax(scores)]  # index of the first maximum, like list.index(max(...))
print(best_alpha)
regr.set_params(alpha=best_alpha)
regr.fit(diabetes_X_train, diabetes_y_train)
regr.coef_

0.0251188643151


array([   0.        , -212.43764548,  517.19478111,  313.77959962,
       -160.8303982 ,   -0.        , -187.19554705,   69.38229038,
        508.66011217,   71.84239008])

##### Classification

In [41]:
# Fit a logistic-regression classifier (C=1e5) on the iris training split
logistic = linear_model.LogisticRegression(C=1e5).fit(iris_X_train, iris_y_train)
logistic

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [42]:
# Logistic-regression predictions (first line) against the true labels (second line)
print(logistic.predict(iris_X_test))
print(iris_y_test)

[1 2 1 0 0 0 2 1 2 0]
[1 1 1 0 0 0 2 1 2 0]


In [59]:
# Exercise: Try classifying the digits dataset with nearest neighbors and a linear model.
# Leave out the last 10% and test prediction performance on these observations

digits = datasets.load_digits()
digits_X = digits.data
digits_y = digits.target

# Floor division keeps test_size an integer: `/` yields a float on Python 3,
# and a float cannot be used as a slice bound.
test_size = digits_y.size // 10

digits_X_train = digits_X[:-test_size]
digits_y_train = digits_y[:-test_size]
digits_X_test = digits_X[-test_size:]
digits_y_test = digits_y[-test_size:]

# Nearest-neighbor classifier: predictions (first) against true labels (second)
knn_digits = KNeighborsClassifier()
knn_digits.fit(digits_X_train, digits_y_train)
test_result = knn_digits.predict(digits_X_test)
print(test_result)
print(digits_y_test)

[2 8 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2
 2 5 7 3 5 8 4 5 0 8 9 7 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4
 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 2 7
 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 5
 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 1 4 9 0 8 9 8]
[2 8 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2
 2 5 7 9 5 4 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4
 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7
 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3
 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]


In [60]:
# Linear model for the digits exercise: logistic regression with default settings.
# Prints the predictions (first) against the true labels (second).
logistic_digits = linear_model.LogisticRegression()
logistic_digits.fit(digits_X_train, digits_y_train)
test_result = logistic_digits.predict(digits_X_test)
print(test_result)
print(digits_y_test)

[2 8 0 1 7 6 3 2 1 7 9 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2
 2 5 7 3 5 9 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 8 2 8 4
 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 8 8 7
 5 8 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 4 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 5
 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]
[2 8 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2
 2 5 7 9 5 4 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4
 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7
 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3
 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]


In [61]:
# SVMs: fit a support vector classifier with a linear kernel on the iris training split
from sklearn import svm

svc = svm.SVC(kernel='linear').fit(iris_X_train, iris_y_train)
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [62]:
# Linear-SVC predictions (first line) against the true labels (second line)
iris_svm_result = svc.predict(iris_X_test)
print(iris_svm_result)
print(iris_y_test)

[1 2 1 0 0 0 2 1 2 0]
[1 1 1 0 0 0 2 1 2 0]


In [70]:
# SVMs using a kernel: polynomial kernel of degree 3.
# Prints the predictions (first) against the true labels (second).
svc_poly = svm.SVC(kernel='poly', degree=3)
svc_poly.fit(iris_X_train, iris_y_train)
iris_svm_poly_result = svc_poly.predict(iris_X_test)
print(iris_svm_poly_result)
print(iris_y_test)

[1 2 1 0 0 0 2 1 2 0]
[1 1 1 0 0 0 2 1 2 0]


In [71]:
# Same comparison with an RBF kernel: predictions (first) against true labels (second)
svc_rbf = svm.SVC(kernel='rbf')
svc_rbf.fit(iris_X_train, iris_y_train)
iris_svm_rbf_result = svc_rbf.predict(iris_X_test)
print(iris_svm_rbf_result)
print(iris_y_test)

[1 2 1 0 0 0 2 1 2 0]
[1 1 1 0 0 0 2 1 2 0]


### Score, and cross-validated scores

In [74]:
# Fit a linear-kernel SVC on the digits training split and report its score
# on the held-out digits samples
svc_linear = svm.SVC(C=1, kernel='linear')
svc_linear.fit(digits_X_train, digits_y_train)
score = svc_linear.score(digits_X_test, digits_y_test)
print(score)

0.966480446927


In [77]:
# Test-set scores of the nearest-neighbor and logistic-regression digit classifiers
print(knn_digits.score(digits_X_test, digits_y_test))
print(logistic_digits.score(digits_X_test, digits_y_test))

0.960893854749
0.938547486034


In [78]:
# Scores on the training data itself (not the test set), to compare with
# the held-out scores of the same two classifiers
print(knn_digits.score(digits_X_train, digits_y_train))
print(logistic_digits.score(digits_X_train, digits_y_train))

0.990111248455
0.995673671199


In [83]:
# KFold cross-validation: show which positions land in the train and test
# parts of each of the 3 splits of a 6-element toy list
from sklearn.model_selection import KFold, cross_val_score
X = ['a', 'a',  'b',  'c', 'c', 'c']
k_fold = KFold(n_splits=3)
for train_indices, test_indices in k_fold.split(X):
    print('Train: %s | test %s' % (train_indices, test_indices))

Train: [2 3 4 5] | test [0 1]
Train: [0 1 4 5] | test [2 3]
Train: [0 1 2 3] | test [4 5]


In [81]:
# Manual cross-validation: for every split, refit svc_linear on the training
# rows and score it on the matching held-out rows
fold_scores = []
for train, test in k_fold.split(digits_X):
    svc_linear.fit(digits_X[train], digits_y[train])
    fold_scores.append(svc_linear.score(digits_X[test], digits_y[test]))
fold_scores

[0.93489148580968284, 0.95659432387312182, 0.93989983305509184]

In [84]:
# Cross-validate svc_linear on the digits data using the k_fold splitter,
# scoring each split with 'precision_macro'
cross_val_score(svc_linear,  digits_X, digits_y, cv=k_fold, scoring='precision_macro')

array([ 0.93969761,  0.95911415,  0.94041254])

### Grid-search and cross-validated estimators

In [87]:
from sklearn.model_selection import GridSearchCV

# Grid search over 10 log-spaced values of C (1e-6 .. 1e-1) for svc_linear,
# fitted on the first 1000 digits samples; n_jobs=-1 is passed through to GridSearchCV
Cs = np.logspace(-6, -1, 10)
clf = GridSearchCV(estimator=svc_linear, param_grid={'C': Cs}, n_jobs=-1)
clf.fit(digits_X[:1000], digits_y[:1000])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-06,   3.59381e-06,   1.29155e-05,   4.64159e-05,
         1.66810e-04,   5.99484e-04,   2.15443e-03,   7.74264e-03,
         2.78256e-02,   1.00000e-01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [89]:
# Best score found by the grid search and the C value of the best estimator
print(clf.best_score_)
print(clf.best_estimator_.C)

0.925
0.00774263682681


In [90]:
# Score the fitted grid search on the remaining samples (index 1000 onward),
# which were not passed to clf.fit
clf.score(digits_X[1000:], digits_y[1000:])

0.94353826850690092

In [101]:
# cross-validated estimators
lasso = linear_model.LassoCV()
lasso.fit(diabetes_X_train, diabetes_y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [102]:
# The alpha_ attribute of the fitted LassoCV (presumably the alpha it selected — see LassoCV docs)
lasso.alpha_

0.034628679642636641

In [103]:
# Score of the fitted LassoCV on the held-out diabetes samples
lasso.score(diabetes_X_test, diabetes_y_test)

0.58835979803792382