# Grid Search CV

## Checking params for different models

In [3]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# get model params
print(model.get_params())

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}


In [4]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

print(model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [6]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
print(model.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


## Using GridSearchCV  to optimize hyperparameters with LinearRegression

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split

In [12]:
# load dataset
df = sns.load_dataset('titanic')

# fill missing values
df['age'] = df['age'].fillna(df['age'].mean())

X = df[['age']]
y = df['fare']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
# define parameter grid
param_grid = {'fit_intercept': [True, False]}

# create a model
model = LinearRegression()

# create a GridSearchCV object with
# the model, parameter grid, and scoring metric
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')

# fit the data to the grid search object 
grid_search.fit(X_train, y_train)

# print the best params
print("Best Params: \n", grid_search.best_params_)

Best Params: 
 {'fit_intercept': True}


## Use GridSearchCV with KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
df = sns.load_dataset('titanic')

# splitting the data
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare']]
y = df['survived']

# one hot encoding
X = pd.get_dummies(X, columns=['sex'])

# fill missing values
X['age'] = X['age'].fillna(X['age'].mean())

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# define parameter grid
# ------------------------------------------  # 
# define the parameter grid
param_grid = {'n_neighbors': np.arange(1, 30, 2), 'weights': ['uniform', 'distance']}

# define model
model = KNeighborsClassifier()

# create a GridSearchCV object with
# the model, parameter grid, and scoring metric
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# fit the data to the grid search object
grid_search.fit(X_train, y_train)

# print the best params
print("Best Params: \n", grid_search.best_params_)

# print the best score
print("Best Score: \n", grid_search.best_score_)

Best Params: 
 {'n_neighbors': 5, 'weights': 'distance'}
Best Score: 
 0.731773859942874


## Decision Tree Classifier with GridSearchCV

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [21]:
df = sns.load_dataset('titanic')

# splitting the data
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare']]
y = df['survived']

# one hot encoding
X = pd.get_dummies(X, columns=['sex'])

# fill missing values
X['age'] = X['age'].fillna(X['age'].mean())

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
model = DecisionTreeClassifier()
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [23]:
# define parameter grid
# ------------------------------------------  # 
# define the parameter grid
param_grid = {'max_depth': np.arange(3,5,7,None), 'min_samples_split': [2, 3, 4]} 

# initialize the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')


# fit the data to the grid search object
grid_search.fit(X_train, y_train)

# print the best params
print("Best Params: \n", grid_search.best_params_)

# print the best score
print("Best Score: \n", grid_search.best_score_)

Best Params: 
 {'max_depth': 3, 'min_samples_split': 2}
Best Score: 
 0.8103614695163991


## RandomForest Classifier with GridSearch CV

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = sns.load_dataset('titanic')

# splitting the data
X = df[['age', 'sex', 'pclass', 'sibsp', 'parch', 'fare']]
y = df['survived']

# one hot encoding
X = pd.get_dummies(X, columns=['sex'])

# fill missing values
X['age'] = X['age'].fillna(X['age'].mean())

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
model = RandomForestClassifier()
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [26]:
# define parameter grid
# ------------------------------------------  # 
# define the parameter grid
param_grid = {'n_estimators': [10, 50, 100], 
              'max_depth': [3, 5, 7, None], 
              'min_samples_split': [2, 3, 4],
              'max_features': ['auto', 'sqrt', 'log2']} 

# initialize the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')


# fit the data to the grid search object
grid_search.fit(X_train, y_train)

# print the best params
print("Best Params: \n", grid_search.best_params_)

# print the best score
print("Best Score: \n", grid_search.best_score_)

Best Params: 
 {'max_depth': 7, 'max_features': 'log2', 'min_samples_split': 3, 'n_estimators': 50}
Best Score: 
 0.8342361863488623


180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/idev/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/idev/.local/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/home/idev/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/idev/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    rai