In [1]:
# - Finding an accurate machine learning model is not the end of the project.

# - The model needs to be saved to a file and loaded later in order to make predictions.

In [2]:
from pandas import read_csv

In [3]:
import numpy

In [4]:
import sys

In [5]:
def print_data(_data, n_rows=5, fmt='%5.3f'):
    """Write the leading rows of an array to standard output.

    Parameters
    ----------
    _data : array-like
        One- or two-dimensional numeric data that supports slicing.
    n_rows : int, optional
        Number of leading rows to print (default 5).
    fmt : str, optional
        Number format forwarded to ``numpy.savetxt`` (default ``'%5.3f'``).

    Returns
    -------
    None
        The text is written to ``sys.stdout``; nothing is returned.
    """
    # Slicing only the first axis gives the same rows as ``_data[:n_rows, :]``
    # for 2-D input, and additionally works for 1-D input (one value per line).
    numpy.savetxt(sys.stdout, _data[:n_rows], fmt=fmt)

In [6]:
# Remote CSV that read_csv fetches over HTTPS; running the notebook from a
# fresh kernel therefore needs network access to this host.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [7]:
# Nine column labels: eight input attributes followed by 'class', the column
# later used as the prediction target.
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [8]:
# Because names= supplies the column labels explicitly, read_csv treats the
# first line of the file as a data row rather than as a header.
_dataframe = read_csv(_uri, names=_col_names)

In [9]:
# Switch to the underlying NumPy array so columns can be sliced by position.
_array = _dataframe.values

In [10]:
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [11]:
# Feature matrix: columns 0-7, i.e. every column except the last one ('class').
_X = _array[:,0:8]

In [12]:
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [13]:
# Target column ('class', index 8). Slicing with 8: rather than indexing with 8
# keeps the result two-dimensional, with a single column.
_Y = _array[:,8:]

In [14]:
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [15]:
# Flatten the single-column 2-D target into a 1-D vector of labels.
_Y = numpy.ravel(_Y)

In [16]:
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(_X, _Y, test_size=0.33, random_state=7)

In [20]:
# Estimator created with the library defaults; no hyper-parameters are tuned here.
_model = LogisticRegression()

In [21]:
# Fit on the training split only, so the test split stays unseen for scoring.
_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# For a classifier, score() reports mean accuracy on the data it is given;
# here that is the held-out test split.
_score = _model.score(X_test, Y_test)

In [23]:
# Show the accuracy as a percentage with three decimal places.
format(_score, '.3%')

'75.591%'

In [24]:
# 17.1 Finalize Your Model with Pickle

In [25]:
from pickle import dump

In [26]:
# Output path for the pickled model, relative to the current working directory.
_filename = '_model.pickle'

In [27]:
# Serialize the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises part-way through.
with open(_filename, 'wb') as _model_file:
    dump(_model, _model_file)

In [28]:
# 17.2 Finalize Your Model with Joblib

In [29]:
# - The Joblib library is part of the SciPy ecosystem and provides utilities for pipelining Python jobs. 

# - It provides utilities for saving and loading Python objects that make use of NumPy data structures, 
# efficiently. 

# - This can be useful for some machine learning algorithms that require a lot of parameters or store 
# the entire dataset (e.g. k-Nearest Neighbors).

In [30]:
# NOTE: this rebinds the name `dump`, replacing pickle's `dump` imported earlier.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on older installations.
try:
    from joblib import dump
except ImportError:
    from sklearn.externals.joblib import dump

In [31]:
# Rebinds _filename: from here on it names the joblib file, not the pickle file.
_filename = '_model.joblib'

In [32]:
# Unlike the pickle call, this dump is given a filename rather than an open
# file object; its return value is the list of file names written.
dump(_model, _filename)

['_model.joblib']

In [33]:
# 17.3 Tips for Finalizing Your Model

In [34]:
# - Take note of the Python version and the versions of the other libraries used for serialization, as 
# deserialization would require them to be the same.

In [35]:
# - Manual Serialization: You might like to manually output the parameters of your learned model so that you 
# can use them directly in scikit-learn or another platform in the future. 

# - Often the techniques used internally by machine learning algorithms to make predictions are a lot simpler 
# than those used to learn the parameters and can be easy to implement in custom code that you have control over.