# An introduction to machine learning with scikit-learn #

## Loading an example dataset ##

In [30]:
# Load the two example datasets used throughout this notebook.
from sklearn import datasets
iris, digits = datasets.load_iris(), datasets.load_digits()

In [31]:
# A dataset object exposes:
#   .data   -> array of shape (n_samples, n_features)
#   .target -> the target values
# Overview of the scikit-learn datasets:
# http://scikit-learn.org/stable/datasets/index.html#datasets
#
# Print the feature matrix, the first sample as an (8, 8) image, and the
# targets, with a blank line between each.
print(digits.data, digits.images[0], digits.target, sep='\n\n')

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]

[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]

[0 1 2 ..., 8 9 8]


## Learning and predicting ##
In the case of the digits dataset, the task is to predict, given an image, which digit it represents.

In scikit-learn, an estimator for classification is a Python object that implements the methods fit(X, y) and predict(T).

In [32]:
# Build a support vector classifier with explicit hyper-parameters.
from sklearn import svm
clf = svm.SVC(C=100, gamma=0.001)

# Fit on every sample except the last, which is held out for the next cell.
clf.fit(X=digits.data[:-1], y=digits.target[:-1])

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [33]:
# Predict the single held-out sample; reshape keeps it 2-D (1 row of features).
clf.predict(digits.data[-1].reshape(1, -1))

array([8])

## Model persistence ##

In [34]:
# Fit a default-parameter SVC on the full iris dataset; this is the model
# that the persistence examples serialise.
from sklearn import svm
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = svm.SVC()
clf.fit(X=X, y=y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
import pickle

# Serialise the fitted classifier to an in-memory bytes object, then rebuild it.
# (Only unpickle data from a source you trust.)
s = pickle.dumps(clf)
clf2 = pickle.loads(s)

# The restored model's prediction for the first sample, followed by its true label.
print(clf2.predict(X[:1]), y[0], sep='\n')

[0]
0


joblib is a replacement for pickle that is more efficient on big data, but it can only pickle to disk, not to a string; pickle can do both.

In [36]:
import os

# joblib is a standalone package. The `sklearn.externals.joblib` alias was
# deprecated in scikit-learn 0.21 and removed in 0.23, so it is only used as a
# fallback for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump() does not create missing directories; make sure the target
# folder exists so the cell works on a fresh checkout.
os.makedirs('./model-storage', exist_ok=True)

# Persist the fitted classifier to disk; the returned list of written
# filenames is displayed as the cell output.
joblib.dump(clf, './model-storage/filename.pkl')

['./model-storage/filename.pkl',
 './model-storage/filename.pkl_01.npy',
 './model-storage/filename.pkl_02.npy',
 './model-storage/filename.pkl_03.npy',
 './model-storage/filename.pkl_04.npy',
 './model-storage/filename.pkl_05.npy',
 './model-storage/filename.pkl_06.npy',
 './model-storage/filename.pkl_07.npy',
 './model-storage/filename.pkl_08.npy',
 './model-storage/filename.pkl_09.npy',
 './model-storage/filename.pkl_10.npy',
 './model-storage/filename.pkl_11.npy']

In [37]:
# Restore the classifier saved to disk and check it still predicts the first sample.
# (As with pickle, only load files from a source you trust.)
clf3 = joblib.load('./model-storage/filename.pkl')
print(clf3.predict(X[:1]))

[0]


## Conventions ##
scikit-learn estimators follow certain rules to make their behavior more predictable.

### Type casting ###
Unless otherwise specified, input will be cast to float64

In [38]:
import numpy as np
from sklearn import random_projection

# Seeded generator so the example data is the same on every run.
rng = np.random.RandomState(0)

# 10 x 2000 array of random values, converted to float32 on purpose so the
# dtype coming out of the transformer can be compared with the dtype going in.
X = rng.rand(10, 2000).astype('float32')
print(X.dtype)

# NOTE(review): the recorded output shows float64 after the projection; newer
# scikit-learn releases may preserve float32 here -- confirm on your version.
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
print(X_new.dtype)

float32
float64


Regression targets are cast to float64, classification targets are maintained.

Here, the first predict() returns an integer array, since iris.target (an integer array) was used in fit. The second predict() returns a string array, since iris.target_names was used for fitting.

In [39]:
from sklearn import datasets
from sklearn.svm import SVC
iris = datasets.load_iris()
clf = SVC()

# Fit on the integer class labels: predictions come back as integers.
clf.fit(iris.data, iris.target)

# .tolist() converts NumPy scalars to native Python values. With list(...),
# NumPy >= 2 would print e.g. [np.int64(0), ...] instead of [0, 0, 0].
print(clf.predict(iris.data[:3]).tolist())

# Re-fit the same estimator on the string class names (indexing target_names
# by the integer targets yields one name per sample): predictions are strings.
clf.fit(iris.data, iris.target_names[iris.target])

print(clf.predict(iris.data[:3]).tolist())

[0, 0, 0]
['setosa', 'setosa', 'setosa']


### Refitting and updating parameters (hyper-parameters) ###

In [40]:
import numpy as np
from sklearn.svm import SVC

# Seeded synthetic problem. The draw order (X, then y, then X_test) is kept
# as-is because all three come from the same random stream.
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.binomial(n=1, p=0.5, size=100)
X_test = rng.rand(5, 10)

# Create the estimator with defaults, then change a hyper-parameter after
# construction via set_params() before fitting.
clf = SVC()
clf.set_params(kernel='linear')
clf.fit(X, y)
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [41]:
# Switch the same estimator to an RBF kernel and fit it again on the same data.
clf.set_params(kernel='rbf')
clf.fit(X, y)
# Predict the same test points with the re-fitted model.
clf.predict(X_test)

array([0, 0, 0, 1, 0])