# Sklearn

## sklearn.model_selection: поиск по сетке параметров (ранее sklearn.grid_search)

Документация: https://scikit-learn.org/stable/modules/grid_search.html

In [2]:
from sklearn import model_selection, datasets, linear_model, metrics

import numpy as np
import pandas as pd

### Загрузка датасета и разбиение на обучающую и тестовую выборки

In [4]:
# Load the Iris dataset shipped with scikit-learn; the feature matrix is read
# from iris.data and the class labels from iris.target in the next cell.
iris = datasets.load_iris()

In [5]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# identical between runs so the scores below are reproducible.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [17]:
# Show the training features followed by the training labels, one object per line.
print(train_data, train_labels, sep='\n')

[[5.  2.  3.5 1. ]
 [6.5 3.  5.5 1.8]
 [6.7 3.3 5.7 2.5]
 [6.  2.2 5.  1.5]
 [6.7 2.5 5.8 1.8]
 [5.6 2.5 3.9 1.1]
 [7.7 3.  6.1 2.3]
 [6.3 3.3 4.7 1.6]
 [5.5 2.4 3.8 1.1]
 [6.3 2.7 4.9 1.8]
 [6.3 2.8 5.1 1.5]
 [4.9 2.5 4.5 1.7]
 [6.3 2.5 5.  1.9]
 [7.  3.2 4.7 1.4]
 [6.5 3.  5.2 2. ]
 [6.  3.4 4.5 1.6]
 [4.8 3.1 1.6 0.2]
 [5.8 2.7 5.1 1.9]
 [5.6 2.7 4.2 1.3]
 [5.6 2.9 3.6 1.3]
 [5.5 2.5 4.  1.3]
 [6.1 3.  4.6 1.4]
 [7.2 3.2 6.  1.8]
 [5.3 3.7 1.5 0.2]
 [4.3 3.  1.1 0.1]
 [6.4 2.7 5.3 1.9]
 [5.7 3.  4.2 1.2]
 [5.4 3.4 1.7 0.2]
 [5.7 4.4 1.5 0.4]
 [6.9 3.1 4.9 1.5]
 [4.6 3.1 1.5 0.2]
 [5.9 3.  5.1 1.8]
 [5.1 2.5 3.  1.1]
 [4.6 3.4 1.4 0.3]
 [6.2 2.2 4.5 1.5]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [4.8 3.  1.4 0.1]
 [7.1 3.  5.9 2.1]
 [6.9 3.2 5.7 2.3]
 [6.5 3.  5.8 2.2]
 [6.4 2.8 5.6 2.1]
 [5.1 3.8 1.6 0.2]
 [4.8 3.4 1.6 0.2]
 [6.5 3.2 5.1 2. ]
 [6.7 3.3 5.7 2.1]
 [4.5 2.3 1.3 0.3]
 [6.2 3.4 5.4 2.3]
 [4.9 3.  1.4 0.2]
 [5.7 2.5 5.  2. ]
 [6.9 3.1 5.4 2.1]
 [4.4 3.2 1.3 0.2]
 [5.  3.6 1.

### Задание модели

In [6]:
# Base estimator whose hyperparameters are tuned below: a linear classifier
# trained with stochastic gradient descent. random_state is fixed so repeated
# runs give the same model.
classifier = linear_model.SGDClassifier(random_state = 0)

### Генерация сетки

In [7]:
# List the names of the estimator's parameters; these are the keys that may be
# used in the search grid defined in the next cell.
classifier.get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [43]:
# Hyperparameter search space: 4 losses x 2 penalties x 5 iteration counts x
# 5 regularisation strengths = 200 candidate combinations.
# NOTE(review): newer scikit-learn releases rename 'log' -> 'log_loss' and
# 'squared_loss' -> 'squared_error'; the recorded outputs come from an older
# release, so the old names are kept here. Confirm against the installed version.
parameters_grid = dict(
    loss=['hinge', 'log', 'squared_hinge', 'squared_loss'],
    penalty=['l1', 'l2'],
    max_iter=range(5, 10),
    alpha=np.linspace(0.0001, 0.001, num=5),
)

In [44]:
# Stratified shuffle-split cross-validation: 10 random splits, each holding out
# 20% of the samples while preserving the class proportions.
# In sklearn.model_selection the first constructor argument is n_splits (an
# integer), not the label array as in the removed sklearn.cross_validation API;
# the labels are supplied later, when the splitter is used (cv.split(X, y) or
# via the `cv` argument of a search object).
cv = model_selection.StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

### Подбор параметров и оценка качества

#### Grid search

In [45]:
# Exhaustive search: every combination in parameters_grid is scored by
# accuracy using 10-fold cross-validation (cv=10).
grid_cv = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=10,
)

In [46]:
%%time
# Run the exhaustive search on the training split; %%time reports how long the
# cross-validated fits of all grid combinations take.
grid_cv.fit(train_data, train_labels)





















































Wall time: 4.87 s


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'max_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [47]:
# Estimator configured with the best-scoring parameter combination found by the search.
grid_cv.best_estimator_

SGDClassifier(alpha=0.001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=6,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
# Best mean cross-validated accuracy and the parameter combination that achieved it.
print(grid_cv.best_score_, grid_cv.best_params_, sep='\n')

0.9142857142857143
{'alpha': 0.001, 'loss': 'log', 'max_iter': 6, 'penalty': 'l1'}


In [65]:
# Full per-candidate results of the search (e.g. 'mean_fit_time'), as a dict of arrays.
# NOTE(review): the raw dict output is very long; wrapping it as
# pd.DataFrame(grid_cv.cv_results_) would give a more readable table.
grid_cv.cv_results_

{'mean_fit_time': array([0.00199704, 0.00112431, 0.00109608, 0.        , 0.001563  ,
        0.        , 0.        , 0.00264742, 0.00150304, 0.00160453,
        0.00150416, 0.00140383, 0.00180497, 0.00084052, 0.00156264,
        0.00238051, 0.00180733, 0.00149488, 0.00109956, 0.00312569,
        0.00156243, 0.        , 0.0018497 , 0.0017045 , 0.00140345,
        0.00197182, 0.00156269, 0.00156264, 0.        , 0.        ,
        0.00080235, 0.00130327, 0.00150743, 0.00175955, 0.00156322,
        0.0015635 , 0.00155954, 0.00217676, 0.00150368, 0.00160327,
        0.00130343, 0.00140369, 0.0013922 , 0.00119145, 0.00139773,
        0.00149229, 0.00150406, 0.00160644, 0.00150316, 0.00140331,
        0.0013983 , 0.00151434, 0.00030072, 0.00140123, 0.00149865,
        0.00132685, 0.00156281, 0.        , 0.        , 0.00322304,
        0.00170147, 0.00150089, 0.00138719, 0.00019484, 0.        ,
        0.        , 0.00173035, 0.00158944, 0.00140169, 0.00170462,
        0.00138862, 0.00129156,

#### Randomized grid search

In [75]:
# Randomized search over the same grid: only n_iter=40 randomly drawn
# combinations are evaluated instead of all of them; random_state fixes which
# combinations are sampled.
randomized_grid_cv = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=parameters_grid,
    n_iter=40,
    scoring='accuracy',
    cv=10,
    random_state=0,
)

In [76]:
%%time
# Run the randomized search on the training split; it fits only the 40 sampled
# combinations out of the 200 in the grid, so it is timed for comparison with
# the exhaustive search above.
randomized_grid_cv.fit(train_data, train_labels)











Wall time: 1.03 s




RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=40, n_jobs=None,
          param_distributions={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'max_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [77]:
# Best mean cross-validated accuracy found by the randomized search and its parameters.
print(randomized_grid_cv.best_score_, randomized_grid_cv.best_params_, sep='\n')

0.8476190476190476
{'penalty': 'l1', 'max_iter': 9, 'loss': 'log', 'alpha': 0.00055}
