# Tutorial

## Loading Data Sets

In [None]:
# Load the digits dataset bundled with scikit-learn and show the shape of its image array.
from sklearn import datasets
digits = datasets.load_digits()
digits.images.shape

In [None]:
import matplotlib.pyplot as plt
%pylab inline

In [None]:
# Render the last image in the dataset with the reversed-grayscale colormap.
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r)

In [None]:
# Flatten each image into a single feature row (one row per image).
data = digits.images.reshape(len(digits.images), -1)

## Classifying Irises with k-NN

In [None]:
import numpy as np
from sklearn import datasets
# Load iris and keep the feature matrix and the label vector under short names.
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target
# Distinct class labels present in the target vector.
np.unique(iris_y)

In [None]:
# Peek at the first ten rows of the iris feature array.
iris.data[:10]

In [None]:
# Display the full array of iris class labels.
iris.target

In [None]:
# Pairwise scatter plots of the four iris features, coloured by class label.
# plt.subplots returns a 4x4 grid of axes, so each panel is addressed by
# (row, column); this avoids the flat subplot index, which is 1-based and
# therefore easy to get wrong by one.
fig, axes = plt.subplots(4, 4, figsize=(15, 10))
for i in range(4):
    for j in range(4):
        # Row i / column j plots feature i (x-axis) against feature j (y-axis).
        axes[i, j].scatter(iris.data[:, i], iris.data[:, j], c=iris.target)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Shuffle the sample indices with a fixed seed, then hold out the last ten
# shuffled samples as the test set; everything else is training data.
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
iris_X_train, iris_X_test = iris_X[indices[:-10]], iris_X[indices[-10:]]
iris_y_train, iris_y_test = iris_y[indices[:-10]], iris_y[indices[-10:]]
# Fit a nearest-neighbour classifier (default settings) on the training split.
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)

In [None]:
# Predicted labels for the held-out iris samples.
knn.predict(iris_X_test)

In [None]:
# True labels of the held-out samples, to compare against the predictions.
iris_y_test

## Linear Model for the Diabetes Data Set

In [None]:
# Load the diabetes data and hold out the final 20 samples for testing.
diabetes = datasets.load_diabetes()
diabetes_X_train, diabetes_X_test = diabetes.data[:-20], diabetes.data[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

In [None]:
# Shape of the full diabetes feature array.
diabetes.data.shape

In [None]:
from sklearn import linear_model
# Fit a linear regression on the diabetes training split.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

In [None]:
# Coefficients learned by the fitted linear model.
print(regr.coef_)

In [None]:
# Mean squared error of the predictions on the held-out samples.
np.mean(np.square(regr.predict(diabetes_X_test) - diabetes_y_test))

In [None]:
# R^2 (coefficient of determination) on the held-out samples:
# 1 is a perfect prediction, 0 is no better than always predicting
# the mean of y, and the value can be negative for a worse model.
regr.score(diabetes_X_test, diabetes_y_test)

In [None]:
# Two training points and two query points, each stored as a (2, 1) column.
X = np.array([[.5], [1.]])
y = [.5, 1]
test = np.array([[0], [2]])
regr = linear_model.LinearRegression()

In [None]:
# Refit `regr` on six noisy copies of the two training points and overlay the
# fitted lines, to show how much the fit moves with small input perturbations.
np.random.seed(0)  # fixed seed so the perturbations are reproducible
# Explicit figure/axes; avoids depending on a bare `matplotlib` name being in scope.
fig, ax = plt.subplots(figsize=(15, 10))
for _ in range(6):
    this_X = .1*np.random.normal(size=(2, 1)) + X
    regr.fit(this_X, y)
    ax.plot(test, regr.predict(test))
    ax.scatter(this_X, y, s=3)

In [None]:
# Same experiment with ridge regression (alpha=.1): refit on six noisy copies
# of the two training points and overlay the fitted lines.
regr = linear_model.Ridge(alpha=.1)
np.random.seed(0)  # same seed, so the perturbations match a seed-0 run of the plain fit
# Explicit figure/axes; avoids depending on a bare `matplotlib` name being in scope.
fig, ax = plt.subplots(figsize=(15, 10))
for _ in range(6):
    this_X = .1*np.random.normal(size=(2, 1)) + X
    regr.fit(this_X, y)
    ax.plot(test, regr.predict(test))
    ax.scatter(this_X, y, s=3)

## SVM on the Iris Data Set

In [None]:
from sklearn import svm
# Fit a support vector classifier with a linear kernel on the iris training split.
svc = svm.SVC(kernel='linear')
svc.fit(iris_X_train, iris_y_train)

In [None]:
# Load the iris dataset again and rebind `iris` for the cells below.
iris = datasets.load_iris()

In [None]:
# Rebind X to the iris feature array (it previously held the toy regression inputs).
X = iris.data
X.shape

In [None]:
# Rebind y to the iris class labels.
y = iris.target
y.shape

In [None]:
# Drop the rows labelled 0 and keep only the first two feature columns.
# The mask is built from the still-unfiltered `y`, so this cell has to run
# before `y` itself is filtered.
X = X[y != 0, 0:2]
X.shape

In [None]:
# Apply the same label-0 filter to y so it stays aligned with X.
y = y[y != 0]
y.shape

In [None]:
# Labels that remain after removing label 0.
y

In [None]:
# One colour per sample: black ('k') where the label is 2, white ('w') otherwise.
cs = ['k' if label == 2 else 'w' for label in y]

In [None]:
# Scatter of the two retained features, filled with the per-sample colours in `cs`.
# Explicit figure/axes; avoids depending on a bare `matplotlib` name being in scope.
fig, ax = plt.subplots(figsize=(6, 6))
# Black marker edges keep the white-filled points visible on a white background.
ax.scatter(X[:, 0], X[:, 1], c=cs, edgecolors='k')

In [None]:
# Fit a linear-kernel SVC on the filtered (X, y) prepared in the preceding cells.
svc = svm.SVC(kernel='linear')
svc.fit(X, y)

## Model Selection

In [None]:
# Train a linear SVC on every digit sample except the last 100, then score it on those 100.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target
svc = svm.SVC(C=1, kernel='linear')
svc.fit(X_digits[:-100], y_digits[:-100])
svc.score(X_digits[-100:], y_digits[-100:])

In [None]:
import numpy as np
# Hand-rolled 3-fold cross-validation: each fold takes one turn as the test
# set while the remaining folds are concatenated into the training set.
X_folds = np.array_split(X_digits, 3)
y_folds = np.array_split(y_digits, 3)
scores = []
for k in range(3):
    X_test, y_test = X_folds[k], y_folds[k]
    # Slicing around index k builds new lists, leaving the fold lists untouched.
    X_train = np.concatenate(X_folds[:k] + X_folds[k + 1:])
    y_train = np.concatenate(y_folds[:k] + y_folds[k + 1:])
    svc.fit(X_train, y_train)
    scores.append(svc.score(X_test, y_test))
print(scores)

In [None]:
from sklearn import cross_validation
k_fold = cross_validation.KFold(n=6, n_folds=3)
for train_indices, test_indices in k_fold:
    print('Train: %s | test: %s' % (train_indices, test_indices))

In [None]:
kfold = cross_validation.KFold(len(X_digits), n_folds=3)
[svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test]) for train, test in kfold]

In [None]:
cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)