# Chapter 12
## Model selection

### 12.1 Selecting Best Models Using Exhaustive Search

In [None]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

In [None]:
# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create logistic regression model.
# The candidate grid below includes the 'l1' penalty, which the default
# 'lbfgs' solver does not support (those fits would fail); the 'saga' solver
# accepts both 'l1' and 'l2'.
logistic = linear_model.LogisticRegression(solver='saga', max_iter=1000)

# Create range of candidate penalty hyperparameter values
penalty = ['l1', 'l2']

# Create range of candidate regularization hyperparameter values
# (10 values, log-spaced from 10**0 to 10**4)
C = np.logspace(0, 4, 10)

# Create dictionary hyperparameter candidates
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross-validation
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# Fit grid search
best_model = gridsearch.fit(features, target)

In [None]:
# Fetch the best estimator's hyperparameters once, then report the two tuned ones
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])

In [None]:
# Predict the target for the feature matrix using the fitted grid search object
best_model.predict(features)

### 12.2 Selecting Best Models Using Randomized Search

In [None]:
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create logistic regression.
# The candidates below include the 'l1' penalty, which the default 'lbfgs'
# solver does not support (those fits would fail); the 'saga' solver accepts
# both 'l1' and 'l2'. max_iter matches the value used by the other sections.
logistic = linear_model.LogisticRegression(solver='saga', max_iter=1000)

# Create range of candidate penalty hyperparameter values
penalty = ['l1', 'l2']

# Create distribution of candidate regularization hyperparameter values
# (uniform with loc=0 and scale=4)
C = uniform(loc=0, scale=4)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create randomized search: 1000 sampled candidates, 5-fold cross-validation
randomizedsearch = RandomizedSearchCV(
    logistic, hyperparameters, random_state=1, n_iter=1000, cv=5, verbose=0, n_jobs=-1
)

# Fit randomized search
best_model = randomizedsearch.fit(features, target)

In [None]:
# Fetch the best estimator's hyperparameters once, then report the two tuned ones
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])

### 12.3 Selecting Best Models from Multiple Learning Algorithms

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline

In [None]:
# Set random seed
np.random.seed(0)

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create a pipeline; the "classifier" step is a placeholder that the search
# space below replaces with each candidate learning algorithm
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create dictionary with candidate learning algorithms and their hyperparameters.
# The logistic regression candidate uses the 'saga' solver because the grid
# includes the 'l1' penalty, which the default 'lbfgs' solver does not support.
search_space = [{"classifier": [LogisticRegression(solver="saga", max_iter=1000)],
                 "classifier__penalty": ['l1', 'l2'],
                 "classifier__C": np.logspace(0, 4, 10)},
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_features": [1, 2, 3]}]

# Create grid search.
# GridSearchCV evaluates every combination in search_space and has no
# `n_iter` parameter (that belongs to RandomizedSearchCV), so it is not passed.
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fit grid search
best_model = gridsearch.fit(features, target)

### 12.4 Selecting Best Models When Preprocessing

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed NumPy's global random number generator
np.random.seed(0)

# Load the iris data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Preprocessing step: a union of standardized features and PCA components
preprocess = FeatureUnion(transformer_list=[
    ("std", StandardScaler()),
    ("pca", PCA()),
])

# Pipeline: preprocessing followed by a logistic regression classifier
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression()),
])

# Candidate values: number of PCA components, penalty and regularization strength
search_space = [
    dict(
        preprocess__pca__n_components=[1, 2, 3],
        classifier__penalty=["l2"],
        classifier__C=np.logspace(0, 4, 10),
    )
]

# Grid search over the pipeline with 5-fold cross-validation
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

# Fit the grid search
best_model = clf.fit(features, target)

In [None]:
import pandas as pd
# Show the grid search's cv_results_ as a DataFrame
pd.DataFrame(clf.cv_results_)

### 12.5 Speeding Up Model Selection with Parallelization

In [None]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

In [None]:
# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create logistic regression.
# The grid below includes the 'l1' penalty, which the default 'lbfgs' solver
# does not support (those fits would fail); the 'saga' solver accepts both
# 'l1' and 'l2'. max_iter matches the value used by the other sections.
logistic = linear_model.LogisticRegression(solver="saga", max_iter=1000)

# Create range of candidate penalty hyperparameter values
penalty = ["l1", "l2"]

# Create range of candidate regularization hyperparameter values
# (1000 values, log-spaced from 10**0 to 10**4)
C = np.logspace(0, 4, 1000)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search; n_jobs=-1 requests parallel execution of the fits
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=1)

# Fit grid search
best_model = gridsearch.fit(features, target)

### 12.6 Speeding Up Model Selection Using Algorithm-Specific Methods

In [None]:
from sklearn import linear_model, datasets

In [None]:
# Load the iris data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Cross-validated logistic regression, configured with Cs=100
logit = linear_model.LogisticRegressionCV(Cs=100)

# Train the model
logit.fit(X=features, y=target)

### 12.7 Evaluating Performance After Model Selection

In [None]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
# Load the iris data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Logistic regression model to tune
logistic = linear_model.LogisticRegression(max_iter=1000)

# 20 candidate C values, log-spaced from 10**0 to 10**4
C = np.logspace(0, 4, 20)
hyperparameters = {"C": C}

# Grid search with 5-fold cross-validation
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Nested cross-validation: cross_val_score is given the grid search object
# itself as the estimator, and the mean of the resulting scores is shown
cross_val_score(gridsearch, features, target).mean()

In [None]:
# Re-create the grid search with verbose=1 (and without n_jobs)
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1)

In [None]:
# Fit the grid search on the full feature matrix and target
best_model = gridsearch.fit(features, target)

In [None]:
# Score the grid search object itself with cross_val_score
scores = cross_val_score(gridsearch, features, target)

In [None]:
# Show the mean of the cross-validation scores
scores.mean()