# HYPERPARAMETER TUNING WITH `SCIKIT-LEARN`

# 1. GridSearchCV

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Load the data set
cancer = load_breast_cancer()

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)

# These are the hyperparameters that we will test.
# We'll try both 'l1' and 'l2' regularization.
# C is the inverse of regularization strength. Smaller C will result in stronger regularization.
parameters = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100, 1000]}

# The logistic regression model
# The 'liblinear' solver is compatible with both 'l1' and 'l1' penalties.
# Setting max_iter to 1000 ensures that the solver will converge for this particular data set.
lr = LogisticRegression(solver='liblinear', max_iter=1000)

# Create a GridSearchCV model
# This will train the model 'lr' with each possible combination of hyperparameters in 'parameters'
clf = GridSearchCV(lr,parameters)

# Fit the GridSearchCV model
clf.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
             param_grid={'C': [0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']})

In [8]:
# Print the hyperparameters that performed the best.
print(clf.best_estimator_)

LogisticRegression(C=1000, max_iter=1000, solver='liblinear')


In [9]:
print(clf.cv_results_['params'])

[{'C': 0.1, 'penalty': 'l1'}, {'C': 0.1, 'penalty': 'l2'}, {'C': 1, 'penalty': 'l1'}, {'C': 1, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l1'}, {'C': 10, 'penalty': 'l2'}, {'C': 100, 'penalty': 'l1'}, {'C': 100, 'penalty': 'l2'}, {'C': 1000, 'penalty': 'l1'}, {'C': 1000, 'penalty': 'l2'}]


In [10]:
# Print the score of the model with each combination of hyperparameters.
print(clf.cv_results_['mean_test_score'])

[0.93192886 0.93896033 0.94834473 0.94599179 0.9553762  0.95534884
 0.9553762  0.96005472 0.95772914 0.96010944]


In [11]:
# This Pandas DataFrame tabulates hyperparameter values and the associated scores
import pandas as pd
 
df = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Score'])] ,axis=1)
 
cv_table = df.pivot(index='C', columns='penalty')
 
print(cv_table)

            Score          
penalty        l1        l2
C                          
0.1      0.931929  0.938960
1.0      0.948345  0.945992
10.0     0.955376  0.955349
100.0    0.955376  0.960055
1000.0   0.957729  0.960109


In [12]:
#Compute and print the accuracy of the model on test data
acc = clf.score(X_test, y_test)
print(acc)

0.958041958041958


# 2. RandomizedSearchCV

In [13]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Load the data set
cancer = load_breast_cancer()

# Split the data into training and testing sets
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
# These are the hyperparameters that we will test.
# We'll try both 'l1' and 'l2' regularization.
# C is the inverse of regularization strength. Smaller C will result in stronger regularization.
distributions = {'penalty': ['l1', 'l2'], 'C': uniform(loc=0, scale=100)}

In [15]:
# The logistic regression model
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000)

# Create a RandomizedSearchCV model
clf = RandomizedSearchCV(lr, distributions, n_iter=8)

In [17]:
# Fit the RandomizedSearchCV model
clf.fit(X_train, y_train)

RandomizedSearchCV(estimator=LogisticRegression(max_iter=1000,
                                                solver='liblinear'),
                   n_iter=8,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B021B3B6D0>,
                                        'penalty': ['l1', 'l2']})

In [18]:
# Show which hyperparameters performed the best
print(clf.best_estimator_)

LogisticRegression(C=43.03655869605423, max_iter=1000, penalty='l1',
                   solver='liblinear')


In [19]:
# Print the accuracy of the model on validation data
print(clf.cv_results_['params'])
print(clf.cv_results_['mean_test_score'])

[{'C': 93.940849899094, 'penalty': 'l2'}, {'C': 64.3046686743434, 'penalty': 'l2'}, {'C': 10.485023129974635, 'penalty': 'l2'}, {'C': 78.46217201189137, 'penalty': 'l2'}, {'C': 43.03655869605423, 'penalty': 'l1'}, {'C': 39.6588441504642, 'penalty': 'l1'}, {'C': 54.70926946780115, 'penalty': 'l1'}, {'C': 75.78108256224327, 'penalty': 'l2'}]
[0.9553762  0.9553762  0.96005472 0.96008208 0.97181943 0.97181943
 0.96946648 0.95772914]


In [21]:
import pandas as pd
 
df = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Accuracy'])] ,axis=1)
 
print(df.sort_values('Accuracy', ascending = False))

           C penalty  Accuracy
4  43.036559      l1  0.971819
5  39.658844      l1  0.971819
6  54.709269      l1  0.969466
3  78.462172      l2  0.960082
2  10.485023      l2  0.960055
7  75.781083      l2  0.957729
0  93.940850      l2  0.955376
1  64.304669      l2  0.955376


In [22]:
# Compute and print the accuracy of the model on test data
acc = clf.score(X_test, y_test)
print(acc)

0.958041958041958
