<a href="https://colab.research.google.com/github/fahimabrar/Hyperparameter_Tuning/blob/main/Gridsearch_CV_and_Random_search_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
from sklearn import svm
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate as CV
from sklearn.model_selection import cross_val_score

In [77]:
iris = datasets.load_iris()
#iris

In [5]:
data = pd.DataFrame(iris.data, columns=iris.feature_names)
# we loaded iris dataset from sklearn datasets and converted them to a pandas dataFrame to see the features names
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
import numpy as np
np.shape(iris.target)

(150,)

In [78]:
X = iris.data
y = iris.target

In [25]:
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


(150, 4)
(150,)


In [26]:
clf1 = svm.SVC(C = 1.0, kernel ='rbf', gamma= 'auto')
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)

1.0

In [27]:
clf2 = svm.SVC(C = 5.0, kernel ='linear', gamma= 'scale')
clf2.fit(X_train, y_train)
clf2.score(X_test, y_test)

0.9666666666666667

# five fold Cross Validation

In [29]:
scores1 = cross_val_score(clf1, X_train, y_train, cv=5)
scores2 = cross_val_score(clf2, X_train, y_train, cv = 5)

In [30]:
print(scores1)
print()
print(scores2)

[0.95833333 0.95833333 1.         0.95833333 0.95833333]

[1.         0.95833333 1.         0.95833333 0.95833333]


In [32]:
from sklearn.model_selection import GridSearchCV

# we can reduce the tidious/repeatating task by using sklearn gridsearchCV api
# we can try different parameters with gridsearchCV and train the models to find the most accurate model

In [35]:
gridclf = GridSearchCV(svm.SVC(gamma='auto'), 
                       {'C': [1, 5, 10], 'kernel':['rbf', 'linear',]}, cv = 5)

In [36]:
gridclf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [37]:
gridclf.cv_results_

{'mean_fit_time': array([0.00067387, 0.00066566, 0.00053449, 0.00060892, 0.00051641,
        0.00048671]),
 'mean_score_time': array([0.00038295, 0.00043244, 0.00029688, 0.00035114, 0.00029535,
        0.00031042]),
 'mean_test_score': array([0.96666667, 0.98333333, 0.975     , 0.975     , 0.96666667,
        0.975     ]),
 'param_C': masked_array(data=[1, 1, 5, 5, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 5, 'kernel': 'rbf'},
  {'C': 5, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'}],
 'rank_test_score': array([5, 1, 2, 2, 5, 2], dtype=int32),
 'split0_test_score': array([0.95833333, 0.958333

## Lets convert the result into a pandas dataframe and select only the needed columns

In [38]:
pd.DataFrame(gridclf.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.966667
1,1,linear,0.983333
2,5,rbf,0.975
3,5,linear,0.975
4,10,rbf,0.966667
5,10,linear,0.975


In [40]:
gridclf.best_estimator_ 
# the best model (98% accuracy) is the one that has C = 1 and a linear kernel

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
gridclf.best_params_
# or we can only print the parameter that we defiend (not the default parameteres|)

{'C': 1, 'kernel': 'linear'}

## If the model is computatinally heavy, we can tune the parameter with the RandomizedSearchCV 
where from predefined set of hyperparameters, randomly picked parameters are tried and showed the results

we can define how many randmized parameters we wanna try

In [47]:
from sklearn.model_selection import RandomizedSearchCV
randclf = RandomizedSearchCV(svm.SVC(gamma='auto'), 
                       {'C': [1, 5, 10, 20], 'kernel':['rbf', 'linear',]}, cv = 5, n_iter = 3)

# here n_iter = 3 means we want the output for 3 randomized hyperparameters

In [48]:
randclf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='auto', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=3, n_jobs=None,
                   param_distributions={'C': [1, 5, 10, 20],
                                        'kernel': ['rbf', 'linear']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [49]:
pd.DataFrame(randclf.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.975
1,1,rbf,0.966667
2,5,linear,0.975


In [50]:
# we can see we defiend lots of types of hyperparameters in randomizedCV, but it randomly selected 3 sets of parameters and trained the model with them

# Lets practise this again for KNN

In [63]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)

In [64]:
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [65]:
neigh.score(X_test, y_test)

0.9666666666666667

In [66]:
gcf = GridSearchCV(KNeighborsClassifier(), {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, cv =3)

In [67]:
gcf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [68]:
pd.DataFrame(gcf.cv_results_)[["param_n_neighbors", "mean_test_score"]]

Unnamed: 0,param_n_neighbors,mean_test_score
0,1,0.958333
1,2,0.958333
2,3,0.958333
3,4,0.95
4,5,0.975
5,6,0.966667
6,7,0.983333
7,8,0.983333
8,9,0.975
9,10,0.983333


# some random practise for Logistic Regression

In [69]:
bc = datasets.load_breast_cancer()

In [70]:
bc.data.shape

(569, 30)

In [71]:
bc.target.shape

(569,)

In [72]:
bc.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [73]:
bc.target_names

array(['malignant', 'benign'], dtype='<U9')

In [74]:
from sklearn.linear_model import LogisticRegression

In [75]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [76]:
clf.score(X_test, y_test)

0.9666666666666667