[Source][1]

***

[1]: http://scikit-learn.org/stable/tutorial/basic/tutorial.html

### Loading example data

In [1]:
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()
# Each loaded dataset exposes (among others) a '.data' and a '.target' attribute:
#   '.data'   gives access to the sample data the model learns from
#   '.target' gives access to the value to be learned for each sample; for
#             'digits' that is the digit each image represents (the desired outcome)

In [16]:
# Show the sample data; numpy abbreviates a large array with '...' when printing
print(digits.data)

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]


In [3]:
# The labels to be learned; as the last expression of the cell it is echoed as the output
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [4]:
# First entry of '.images': the image data for one sample, held as a single
# 2-D numpy array (8 rows x 8 columns in the output below), not nested lists
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

### Learning and predicting

In [5]:
# An estimator is an object that provides (among others) the methods:
#   fit(X, y)   -- learn from samples X and their targets y
#   predict(T)  -- predict targets for the samples T
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100)
# 'clf' (short for 'classifier') is now a basic, not-yet-fitted estimator;
# gamma and C are fixed by hand here rather than tuned

In [6]:
# The model still needs to be fitted: train it on the sample data and the
# matching targets, holding back the last entry of each so it stays unseen.
clf.fit(digits.data[:-1], digits.target[:-1]) # [:-1] selects everything except the last entry

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# The last entry was held back during training, so it can be used as unseen
# data for the fitted model to predict.
# The slice [-1:] (rather than the index [-1]) keeps the input two-dimensional:
# a batch that contains exactly one sample.
clf.predict(digits.data[-1:])

array([8])

This is the image whose digit the model attempted to predict:
![plot](http://scikit-learn.org/stable/_images/sphx_glr_plot_digits_last_image_001.png)

Using [`pickle`](https://docs.python.org/3/library/pickle.html), Python's built-in persistence module, a fitted model can be saved.

In this example, we call the [`dumps`](https://docs.python.org/3/library/pickle.html#pickle.dumps) function to serialise the fitted model to a byte string, and `loads` to restore it.

### Model persistence

In [24]:
import pickle  # standard-library serialisation (object <-> byte stream)

from sklearn import datasets, svm

# Train a default SVC on the complete iris dataset and show the fitted estimator.
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf = svm.SVC()
print("{!r}".format(clf.fit(X, y)))
print("\n")  # two blank lines separating the estimator repr from the results

# Round-trip the fitted classifier through an in-memory byte string.
s = pickle.dumps(clf)
clf2 = pickle.loads(s)

# The restored model's prediction for the first sample (an array), followed by
# that sample's true label for comparison.
print("{!r}".format(clf2.predict(X[:1])))
print("{!r}".format(y[0]))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


array([0])
0


In [34]:
# joblib can be used in place of pickle; it is the better choice for objects
# that carry large numpy arrays internally, as fitted estimators often do.
# 'sklearn.externals.joblib' was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone package first and only fall back to the
# bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE: like pickle, a joblib file can run arbitrary code when loaded --
# only load files that come from a trusted source.
joblib.dump(clf, 'filename.pkl') # writes a pickle file and returns the list of file names written

['filename.pkl']

***
## Conventions  
### Type Casting

In [36]:
import numpy as np
from sklearn import random_projection

# Seeded generator, so the random 10 x 2000 input matrix is reproducible;
# the values are converted to single precision before being transformed.
rng = np.random.RandomState(0)
X = rng.rand(10, 2000).astype(np.float32)
print(X.dtype, end="\n\n")  # dtype of the input, followed by a blank line

# Project the data and inspect the dtype of the result (the cell's output).
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.dtype

float32



dtype('float64')

In the code above, `X` starts out as `float32`, and `fit_transform(X)` returns a result that has been cast to `float64`.

Regression targets are also cast to `float64`; classification targets keep their original type.  
[Source][1]


[1]: http://scikit-learn.org/stable/tutorial/basic/tutorial.html#type-casting

In [None]:
# Companion to the type-casting note: classification targets are maintained,
# so the predictions should come back in the same type the model was fitted on.
from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
clf = SVC()

# First fit: integer class ids as targets -> integer predictions expected.
print(repr(clf.fit(iris.data, iris.target)), end="\n\n")
print(list(clf.predict(iris.data[:3])))
print("\n")  # two blank lines between the two experiments

# Second fit of the same estimator: class names (strings) as targets ->
# string predictions expected.
print(repr(clf.fit(iris.data, iris.target_names[iris.target])))
print(list(clf.predict(iris.data[:3])), end="\n\n")



