# An introduction to machine learning with scikit-learn

In [63]:
import pickle

import numpy as np
from sklearn import datasets
from sklearn import random_projection
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# The copy of joblib vendored as sklearn.externals.joblib was deprecated in
# scikit-learn 0.21 and removed in 0.23. Prefer the standalone package and
# fall back to the vendored one only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## Loading an example dataset

In [2]:
# Load the iris dataset bundled with scikit-learn (a Bunch, per the type() cell below).
iris = datasets.load_iris()

In [3]:
# Load the digits dataset bundled with scikit-learn (a Bunch, per the type() cell below).
digits = datasets.load_digits()

In [7]:
# Inspect the container type returned by the two dataset loaders.
type(iris), type(digits)

(sklearn.utils.Bunch, sklearn.utils.Bunch)

In [8]:
# Feature array later passed to fit(); the output below shows a 2-D float array.
digits.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [9]:
# Label array later passed to fit() alongside digits.data.
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [10]:
# Class labels listed by the dataset (0 through 9 in the output below).
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# First sample in image form (an 8x8 array in the output below).
digits.images[0]

array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.],
       [ 0.,  0., 13., 15., 10., 15.,  5.,  0.],
       [ 0.,  3., 15.,  2.,  0., 11.,  8.,  0.],
       [ 0.,  4., 12.,  0.,  0.,  8.,  8.,  0.],
       [ 0.,  5.,  8.,  0.,  0.,  9.,  8.,  0.],
       [ 0.,  4., 11.,  0.,  1., 12.,  7.,  0.],
       [ 0.,  2., 14.,  5., 10., 12.,  0.,  0.],
       [ 0.,  0.,  6., 13., 10.,  0.,  0.,  0.]])

## Learning and predicting

In [13]:
# Support vector classifier with gamma and C set by hand
# (the repr below shows kernel='rbf').
clf = svm.SVC(gamma=0.001, C=100.)

In [14]:
# Train on every sample except the last one, which is held out for the
# prediction in the next cell.
clf.fit(digits.data[:-1], digits.target[:-1])

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Predict the held-out last sample; the [-1:] slice (rather than [-1])
# keeps the input 2-D.
clf.predict(digits.data[-1:])

array([8])

## Model persistence

In [18]:
# Fresh classifier for the persistence example; gamma='scale' is passed explicitly.
clf = svm.SVC(gamma='scale')

In [19]:
# Use the full iris data as features X and labels y.
X, y = iris.data, iris.target

In [20]:
# Fit on the whole dataset; nothing is held out in this example.
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Serialize the fitted estimator to an in-memory bytes object.
s = pickle.dumps(clf)

In [22]:
# Restore a copy of the estimator from the pickled bytes.
# NOTE: only unpickle data from a trusted source; loading a pickle can
# execute arbitrary code.
clf2 = pickle.loads(s)

In [23]:
# Check that the restored copy predicts; X[0:1] keeps the single sample 2-D.
clf2.predict(X[0:1])

array([0])

In [24]:
# True label of the first sample, for comparison with the prediction above.
y[0]

0

In [26]:
# Persist the fitted estimator to a file (relative path, so it lands in the
# current working directory); the output below lists the file written.
joblib.dump(clf, 'iris-svm-gamma=scale.joblib')

['iris-svm-gamma=scale.joblib']

In [27]:
# Load the estimator back from disk, rebinding clf to the reloaded object.
# NOTE: as with pickle, load only files that come from a trusted source.
clf = joblib.load('iris-svm-gamma=scale.joblib')

## Conventions

In [29]:
# Seeded generator so the random arrays drawn below are reproducible.
rng = np.random.RandomState(seed=0)

In [30]:
# 10 x 2000 array of random values drawn from the seeded generator.
X = rng.rand(10, 2000)

In [31]:
# Cast to float32 so the dtype handling of the transformer below can be observed.
X = np.array(X, dtype='float32')

In [34]:
# Confirm the cast took effect.
X.dtype

dtype('float32')

In [35]:
# Gaussian random projection with default settings.
# NOTE(review): no random_state is passed, so the projection is presumably
# not reproducible across runs; confirm if determinism matters.
transformer = random_projection.GaussianRandomProjection()

In [36]:
# Fit the projection on X and transform X in a single step.
X_new = transformer.fit_transform(X)

In [37]:
# dtype after the transform: the recorded output shows float64 even though
# the input was float32.
X_new.dtype

dtype('float64')

In [39]:
# New classifier used to show how the type of the labels affects predictions.
clf = svm.SVC(gamma='scale')

In [40]:
# First fit: labels are the targets as stored in iris.target.
clf.fit(iris.data, iris.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
# Predictions for the first three samples; integers in the recorded output.
list(clf.predict(iris.data[:3]))

[0, 0, 0]

In [42]:
# Refit with name labels: indexing target_names by the target array yields
# one name per sample.
clf.fit(iris.data, iris.target_names[iris.target])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
# After refitting on names, the predictions are strings (see output below).
list(clf.predict(iris.data[:3]))

['setosa', 'setosa', 'setosa']

## Refitting and updating parameters

In [44]:
# Re-seed so this section's random data does not depend on earlier cells.
rng = np.random.RandomState(seed=0)

In [45]:
# Random training data: 100 rows by 10 columns.
X = rng.rand(100, 10)

In [46]:
# 100 random binary labels: one trial each with success probability 0.5.
y = rng.binomial(1, 0.5, 100)

In [47]:
# Five more random rows, with the same 10 columns, to predict on.
X_test = rng.rand(5, 10)

In [48]:
# Start from an SVC built with default constructor arguments.
clf = svm.SVC()

In [49]:
# Change a hyper-parameter after construction; the call evaluates to the
# estimator itself (its repr is shown below).
clf.set_params(kernel='linear')

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [51]:
# Fit using the linear kernel set in the previous cell.
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [52]:
# Predictions from the linear-kernel fit.
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [53]:
# Switch to an RBF kernel with gamma='scale' and refit on the same data in
# one chained call.
clf.set_params(kernel='rbf', gamma='scale').fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [54]:
# Predictions from the refitted RBF-kernel model.
clf.predict(X_test)

array([1, 0, 1, 1, 0])

## Multiclass vs. multilabel fitting

In [56]:
# Five two-feature samples for the toy classification examples below.
X = [
    [1, 2],
    [2, 4],
    [4, 5],
    [3, 2],
    [3, 1],
]

In [57]:
# One integer class label per sample (three distinct classes).
y = [0, 0, 1, 1, 2]

In [58]:
# Wrap an SVC in a one-vs-rest meta-estimator; random_state is pinned on the
# inner SVC.
classif = OneVsRestClassifier(estimator=svm.SVC(gamma='scale', random_state=0))

In [59]:
# Fit on the 1-D integer labels and predict the training samples; the output
# below is a 1-D array of labels.
classif.fit(X, y).predict(X)

array([0, 0, 1, 1, 2])

In [61]:
# Rebind y to its binarized form (presumably one indicator column per class;
# the predictions below have three columns).
y = LabelBinarizer().fit_transform(y)

In [62]:
# Refit on the binarized y: predictions are now 2-D indicator rows. All-zero
# rows in the recorded output are samples assigned to none of the classes.
classif.fit(X, y).predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [64]:
# Multi-label targets: each sample carries its own list of labels, and the
# lists may differ in length.
y = [
    [0, 1],
    [0, 2],
    [1, 3],
    [0, 2, 3],
    [2, 4],
]

In [65]:
# Rebind y to a 2-D binary indicator array built from the label lists.
y = MultiLabelBinarizer().fit_transform(y)

In [66]:
# Fit and predict with the multi-label y; a predicted row can contain several
# 1s, one per assigned label (see output below).
classif.fit(X, y).predict(X)

array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])