# Model Selection and Hyperparameter Tuning using Grid Search CV

In [10]:
import pandas as pd

from sklearn.datasets import load_iris

In [12]:
# Load the bundled iris dataset and put the four measurements into a
# DataFrame, one column per feature name.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Translate every integer class code to its species name with a single
# vectorized lookup (index the target_names array with the whole target
# array) instead of storing the codes first and converting them row by row.
df['flower'] = iris.target_names[iris.target]

In [16]:
# Positional slice of rows 45-54, which straddles the boundary between the
# first two species in the table.
df.iloc[45:55]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
45,4.8,3.0,1.4,0.3,setosa
46,5.1,3.8,1.6,0.2,setosa
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


`Approach 1:` Use train_test_split and manually tune your parameters by trial and error

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The seed is fixed so the split --
# and therefore any score computed from it -- is identical on every
# Restart-and-Run-All; without it each run shuffles the data differently.
# (Re-run the notebook to refresh outputs recorded before the seed was set.)
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

In [22]:
from sklearn import svm

# Hand-picked hyperparameters: fit an RBF-kernel SVM on the training split
# (fit() returns the estimator itself), then report accuracy on the held-out
# test split.
model = svm.SVC(kernel='rbf', C=30, gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9333333333333333

`Approach 2:` Use K-Fold Cross Validation  

Manually try different models and parameters by passing each one to the `cross_val_score` function with 5-fold cross-validation.

In [23]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores for a linear-kernel SVM with C=10.
cross_val_score(
    svm.SVC(kernel='linear', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [24]:
# 5-fold cross-validation scores for an RBF-kernel SVM with C=10.
cross_val_score(
    svm.SVC(kernel='rbf', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [25]:
# 5-fold cross-validation scores for an RBF-kernel SVM with C=20.
cross_val_score(
    svm.SVC(kernel='rbf', C=20, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

The above approach is tiresome and very manual, since we write the same code line by line. Instead, we can use a for loop.

In [27]:
kernels = ['rbf', 'linear']
C = [1, 10, 20]

# Try every (kernel, C) pair: print a header for the pair, then the mean of
# its 5-fold cross-validation scores.
for kernel_name in kernels:
    for c_value in C:
        print(f'For kernel: {kernel_name} and C = {c_value}')
        candidate = svm.SVC(kernel=kernel_name, C=c_value, gamma='auto')
        fold_scores = cross_val_score(candidate, iris.data, iris.target, cv=5)
        print(fold_scores.mean())

For kernel: rbf and C = 1
0.9800000000000001
For kernel: rbf and C = 10
0.9800000000000001
For kernel: rbf and C = 20
0.9666666666666668
For kernel: linear and C = 1
0.9800000000000001
For kernel: linear and C = 10
0.9733333333333334
For kernel: linear and C = 20
0.9666666666666666


From the above results, we can say that the **rbf kernel with C = 1 or C = 10** and the **linear kernel with C = 1** perform best, each with a mean cross-validation accuracy of 98%.

`Approach 3:` Use GridSearchCV

In [30]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the search: every combination of C and kernel
# listed here is evaluated with 5-fold cross-validation.
svm_param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=svm_param_grid,
    cv=5,
    return_train_score=False,
)

In [31]:
# Run the grid search on the full iris dataset, then display the raw
# per-candidate results dictionary (timings, parameters, scores).
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00324922, 0.00360827, 0.0078558 , 0.00835042, 0.00191412,
        0.00429888]),
 'std_fit_time': array([0.00309904, 0.00298478, 0.00803054, 0.01360666, 0.00126673,
        0.00597702]),
 'mean_score_time': array([0.00099249, 0.00117626, 0.00121069, 0.00166874, 0.00186853,
        0.00143108]),
 'std_score_time': array([0.00023566, 0.0004716 , 0.00061114, 0.001087  , 0.00157156,
        0.00155531]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


The raw results above are not in an easily readable form, so we will create a DataFrame out of them.

In [32]:
# Tabulate the grid-search results: one row per parameter combination.
# NOTE: this rebinds `df`, which until now held the iris feature table built
# at the top of the notebook; from here on `df` refers to search results.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003249,0.003099,0.000992,0.000236,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.003608,0.002985,0.001176,0.000472,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.007856,0.008031,0.001211,0.000611,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.00835,0.013607,0.001669,0.001087,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001914,0.001267,0.001869,0.001572,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.004299,0.005977,0.001431,0.001555,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [33]:
# Narrow the results table to the columns needed to compare candidates:
# the two searched parameters, the mean CV score, and the resulting rank.
df.loc[:, ['param_C', 'param_kernel', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score,rank_test_score
0,1,rbf,0.98,1
1,1,linear,0.98,1
2,10,rbf,0.98,1
3,10,linear,0.973333,4
4,20,rbf,0.966667,5
5,20,linear,0.966667,6


In [34]:
# List every attribute of the fitted search object. The entries of interest
# are the fitted results inspected in the next cells: best_params_,
# best_score_ and best_estimator_.
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 '

In [35]:
# Parameter combination that ranked first in the grid search.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [36]:
# Mean cross-validated score achieved by that best parameter combination.
clf.best_score_

0.9800000000000001

In [37]:
# The SVC configured with the winning parameters. Per the scikit-learn docs,
# with the default refit setting this estimator has been refit on all the
# data passed to fit() -- TODO confirm for the installed version.
clf.best_estimator_

We can also use `RandomizedSearchCV`, which evaluates only a fixed number of randomly chosen parameter combinations instead of trying every one. This is useful when you have too many parameters to try and your training time is long, because it reduces the cost of computation.

In [46]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the same space as the grid search above, but only
# n_iter=2 of the 6 combinations are sampled and cross-validated.
# random_state pins which two candidates are drawn so the results table is
# the same on every run; without it a re-run may evaluate different ones.
# (Re-run the notebook to refresh outputs recorded before the seed was set.)
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'kernel': ['rbf', 'linear'],
        'C': [1, 10, 20],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)

In [47]:
# Cross-validate the sampled parameter combinations on the full iris dataset.
rs.fit(iris.data, iris.target)

In [48]:
# Tabulate the randomized-search results, keeping only the searched
# parameters, the mean CV score and the rank. Rebinds `df` once more.
summary_columns = ['param_C', 'param_kernel', 'mean_test_score', 'rank_test_score']
df = pd.DataFrame(rs.cv_results_)[summary_columns]

In [49]:
# Show the sampled candidates (n_iter=2) with their mean score and rank.
df

Unnamed: 0,param_C,param_kernel,mean_test_score,rank_test_score
0,10,linear,0.973333,2
1,1,linear,0.98,1


### **How about different models with different hyperparameters?**

In [51]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Registry of candidate models. Each entry pairs an unfitted estimator
# instance ('model') with the hyperparameter grid to search for it ('params').
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),  # an estimator instance, not the class
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        }
    },
    'random_forest': {
        # Seeded so the forest, and hence its cross-validation score, is
        # reproducible from run to run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logreg': {
        # multi_class is left at its default: 'auto' already is the default,
        # and passing the parameter explicitly raises a FutureWarning from
        # scikit-learn 1.5 onward, where it is deprecated and slated for removal.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [56]:
# Grid-search every registered model with 5-fold CV and keep one summary
# record per model. Note that `clf` is rebound on each iteration, so after
# this cell it refers to the search for the last model in model_params.
scores = []

for model_name, mp in model_params.items():
    estimator = mp['model']
    grid = mp['params']
    clf = GridSearchCV(estimator, grid, cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)

    best_result = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best_result)

In [57]:
# Raw list of per-model summaries: model name, best CV score, best params.
scores

[{'model': 'svm',
  'best_score': 0.9800000000000001,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': 0.9533333333333334,
  'best_params': {'n_estimators': 10}},
 {'model': 'logreg',
  'best_score': 0.9666666666666668,
  'best_params': {'C': 5}}]

In [58]:
# Turn the per-model summaries into a table with a fixed column order so
# the models can be compared side by side. Rebinds `df` again.
score_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(scores, columns=score_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logreg,0.966667,{'C': 5}


Based on the above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification.