In [1]:
# - Machine learning models are parameterized so that their behavior can be tuned for a given problem. 

# - Models can have many parameters and finding the best combination of parameters can be treated as a search problem.

In [2]:
# 16.1 Machine Learning Algorithm Parameters

In [3]:
# - Algorithm tuning is a final step in the process of applied machine learning before finalizing your model. 

# - It is sometimes called hyperparameter optimization: the settings of the algorithm are referred to as
# hyperparameters, whereas the coefficients found by the machine learning algorithm itself are referred
# to as parameters.

# - Two approaches to tuning are covered in this chapter:
    # - Grid Search Parameter Tuning.
    # - Random Search Parameter Tuning.

In [4]:
from pandas import read_csv

In [5]:
import numpy

In [6]:
import sys

In [7]:
def print_data(_data, _n_rows=5):
    """Write the leading rows of an array to standard output.

    Every value is formatted with ``%5.3f``; values on a row are separated
    by a single space and each row ends with a newline.

    Parameters
    ----------
    _data : array-like
        One- or two-dimensional numeric data. A 1-D input is written one
        value per line.
    _n_rows : int, optional
        Number of leading rows to write. Defaults to 5.

    Returns
    -------
    None
        The result of ``numpy.savetxt``, which writes and returns nothing.
    """
    # Slice along the first axis only: for 2-D input this equals
    # ``_data[:_n_rows, :]``, and it also works for 1-D input, where a
    # two-axis index would raise IndexError.
    _head = _data[:_n_rows]
    return numpy.savetxt(sys.stdout, _head, '%5.3f')

In [8]:
# Remote location of the Pima Indians diabetes data file; it is downloaded by
# the read_csv cell below.
# NOTE(review): confirm this URL is still served before relying on Run All.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [9]:
# Column labels passed to read_csv through names=. The ninth label, 'class',
# corresponds to column index 8, which is later split off as the target _Y.
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [10]:
# Fetch and parse the file at _uri, labelling the columns with _col_names.
_dataframe = read_csv(_uri, names=_col_names)

In [11]:
# NumPy view of the frame's contents; columns keep the _col_names order.
_array = _dataframe.values

In [12]:
# Preview the first five rows of the full array (all nine columns).
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [13]:
# Input features: every row, the first eight columns (indices 0-7).
_X = _array[:, :8]

In [14]:
# Preview the first five rows of the feature matrix.
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [15]:
# Target: column index 8 onward. Using the slice 8: (rather than the integer 8)
# keeps the result two-dimensional, with a single column.
_Y = _array[:,8:]

In [16]:
# Preview the first five rows of the (still two-dimensional) target column.
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [17]:
# Flatten the single-column target into a one-dimensional vector.
_Y = _Y.ravel()

In [18]:
# First five values of the flattened target vector.
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [19]:
# 16.2 Grid Search Parameter Tuning

In [20]:
# - Grid search is an approach to parameter tuning that will methodically build and evaluate a model for 
# each combination of algorithm parameters specified in a grid.

In [21]:
from sklearn.linear_model import Ridge

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Candidate values for Ridge's alpha (regularization strength), listed from
# the strongest penalty (1) down to none (0).
_alphas = numpy.array([1,0.1,0.01,0.001,0.0001,0])

In [24]:
# Search space: estimator parameter name mapped to the values to try.
_param_grid = {'alpha': _alphas}

In [25]:
# Ridge estimator with default settings; it is handed to the grid search,
# whose param_grid supplies the alpha candidates.
_model = Ridge()

In [26]:
# Grid search over every value in _param_grid. cv and scoring are not given,
# so the library defaults apply (the repr printed by the fit cell shows
# cv=None and scoring=None).
_grid = GridSearchCV(estimator=_model, param_grid=_param_grid)

In [27]:
# Run the search on the full feature matrix and target vector. The repr below
# shows refit=True, so a best estimator is available afterwards.
_grid.fit(_X, _Y)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [28]:
# Best score found by the grid search, rendered as a percentage with three
# decimals.
# NOTE(review): with scoring=None this is presumably Ridge's default R^2 score
# rather than a classification accuracy — confirm before reporting it as such.
'{:.3%}'.format(_grid.best_score_)

'27.962%'

In [29]:
# alpha value carried by the best estimator selected by the grid search.
_grid.best_estimator_.alpha

1.0

In [30]:
# 16.3 Random Search Parameter Tuning

In [31]:
# - Random search is an approach to parameter tuning that will sample algorithm parameters from a 
# random distribution (e.g. uniform) for a fixed number of iterations. 

# - A model is constructed and evaluated for each combination of parameters chosen. 

In [32]:
from sklearn.model_selection import RandomizedSearchCV

In [33]:
from scipy.stats import uniform

In [34]:
# Sampling distribution for alpha: scipy's uniform() called with no arguments.
# NOTE(review): this presumably samples from [0, 1] (default loc=0, scale=1) — confirm.
# This rebinds _param_grid, which held the explicit alpha grid in section 16.2.
_param_grid = {'alpha':uniform()}

In [35]:
# New, unfitted Ridge estimator for the random search (rebinds _model).
_model = Ridge()

In [36]:
# Randomized search: n_iter=100 alpha values are sampled from the distribution
# in _param_grid; random_state=7 fixes the seed used for sampling.
_random_search = RandomizedSearchCV(estimator=_model, param_distributions=_param_grid, n_iter=100, random_state=7)

In [37]:
# Run the randomized search on the same features and target as the grid search.
_random_search.fit(_X, _Y)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1167425c0>},
          pre_dispatch='2*n_jobs', random_state=7, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [38]:
# Best score found by the randomized search, rendered as a percentage.
# NOTE(review): as with the grid search, scoring=None means this is presumably
# Ridge's default R^2 score, not an accuracy — confirm.
'{:.3%}'.format(_random_search.best_score_)

'27.962%'

In [39]:
# Sampled alpha of the best estimator, rounded to two decimals for display.
'{:.2f}'.format(_random_search.best_estimator_.alpha)

'0.98'