# 모델 선택

### 완전 탐색을 사용해 최선의 모델 선택하기

In [1]:
import numpy as np 
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression()

penalty = ['l1','l2']

C = np.logspace(0,4,10)

hyperparameters = dict(C = C, penalty = penalty)

gridsearch = GridSearchCV(logistic, hyperparameters, cv = 5, verbose = 0)

best_model = gridsearch.fit(features,target)

Traceback (most recent call last):
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Traceback (most recent call last):
  File "C:\Us

In [2]:
print('가장 좋은 페널티:', best_model.best_estimator_.get_params()['penalty'])
print('가장 좋은 C값:', best_model.best_estimator_.get_params()['C'])

가장 좋은 페널티: l2
가장 좋은 C값: 7.742636826811269


In [3]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 랜덤 탐색을 사용해 최선의 모델 선택하기

In [4]:
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression()

penalty = ['l1','l2']

C = uniform(loc = 0, scale = 4)

hyperparameters = dict(C=C,penalty = penalty)

randomizedsearch = RandomizedSearchCV(logistic, hyperparameters, random_state = 1, n_iter = 100, cv=5, verbose = 0, n_jobs = -1)

best_model = randomizedsearch.fit(features, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
print('가장 좋은 페널티:',best_model.best_estimator_.get_params()['penalty'])
print('가장 좋은 C값:', best_model.best_estimator_.get_params()['C'])

가장 좋은 페널티: l2
가장 좋은 C값: 3.730229437354635


In [6]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 여러 학습 알고리즘에서 최선의 모델 선택하기

In [12]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

np.random.seed(0)

iris = datasets.load_iris()
features = iris.data
target = iris.target

pipe = Pipeline([('classifier',RandomForestClassifier())])

search_space = [{'classifier' : [LogisticRegression()],
                'classifier__penalty': ['l1','l2'],
                'classifier__C' : np.logspace(0,4,10)},
               {'classifier' : [RandomForestClassifier()],
               'classifier__n_estimators':[10,100,1000],
               'classifier__max_features' : [1,2,3]}]

gridsearch = GridSearchCV(pipe,search_space,cv=5,verbose = 0)

best_model = gridsearch.fit(features,target)

Traceback (most recent call last):
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alterna

In [10]:
best_model.best_estimator_.get_params()['classifier']

LogisticRegression(C=7.742636826811269)

In [13]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 전처리와 함께 최선의 모델 선택하기

In [18]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

iris = datasets.load_iris()
features = iris.data
target = iris.target

preprocess = FeatureUnion([('std',StandardScaler()),('pca',PCA())])

pipe = Pipeline([('preprocess',preprocess),('classifier',LogisticRegression())])

search_space = [{'preprocess__pca__n_components': [1,2,3],
                'classifier__penalty': ['l1','l2'],
                'classifier__C':np.logspace(0,4,10)}]

clf = GridSearchCV(pipe,search_space,cv=5,verbose = 0,n_jobs = -1)

best_model = clf.fit(features, target)

In [19]:
best_model.best_estimator_.get_params()['preprocess__pca__n_components']

2

In [20]:
clf.best_score_

0.9800000000000001

In [24]:
pipe = Pipeline([('std', StandardScaler()),
                ('pca', PCA()),
                ('classifier', LogisticRegression())],
               memory = 'cache')

search_space = [{'pca__n_components':[1,2,3],
                'classifier__penalty':['l1','l2'],
                'classifier__C': np.logspace(0,4,10)}]

clf = GridSearchCV(pipe,search_space,cv=5, verbose =1, n_jobs = -1)

best_model = clf.fit(features,target)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.6s finished


In [25]:
clf.best_score_

0.9733333333333334

In [27]:
clf.best_estimator_.get_params()['pca__n_components']

3

In [28]:
clf.best_estimator_.named_steps['pca'].transform(features[0:1])

array([[ 2.64026976,  5.2040413 , -2.48862071]])

### 병렬화로 모델 선택 속도 높이기

In [30]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression()

penalty = ['l1','l2']

C = np.logspace(0,4,1000)

hyperparameters = dict(C = C, penalty = penalty)

gridsearch = GridSearchCV(logistic, hyperparameters, cv =5, n_jobs = -1, verbose = 1)

best_model = gridsearch.fit(features,target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 5560 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 9960 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   34.3s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 알고리즘에 특화된 기법을 사용하여 모델 선택 수행 속도 높이기

In [32]:
from sklearn import linear_model, datasets

iris = datasets.load_iris()
features = iris.data
target = iris.target

logit = linear_model.LogisticRegressionCV(Cs=100)

logit.fit(features,target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegressionCV(Cs=100)

### 모델 선택 후 성능 평가하기

In [37]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score


iris = datasets.load_iris()
features = iris.data
target = iris.data

logistic = linear_model.LogisticRegression(solver = 'liblinear',multi_class = 'auto')

C = np.logspace(0,4,20)

hyperparameters = dict(C=C)

gridsearch = GridSearchCV(logistic, hyperparameters, cv = 5, n_jobs = -1, verbose = 1, iid = False)

cross_val_score(gridsearch, features,target,cv=3).sum()

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
Traceback (most recent call last):
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 765, in fit
    self.best_estimator_.fit(X, y, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1344, in fit
    accept_large_sparse=solver != 'liblinear')
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
Traceback (most recent call last):
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 765, in fit
    self.best_estimator_.fit(X, y, **fit_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1344, in fit
    accept_large_sparse=solver != 'liblinear')
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\jlee0\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 72, in

nan