In [8]:
import time
from sklearn.datasets import load_wine, make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# What is Grid Search

- Grid search is a hyperparameter optimization technique used to improve the performance of machine learning models. Hyperparameters are settings we pass to the model before training, much like arguments to a function; they are not learned from the data. Examples include the number of trees in a Random Forest or the learning rate of a model trained with gradient descent.

- The grid search method performs an exhaustive search over a specified parameter grid, evaluating each possible combination of parameters and selecting the set that yields the best model performance according to a chosen metric.



# Why Tune Hyperparameters

Hyperparameters significantly influence a model's performance. For example, a very high learning rate may cause a neural network to diverge during training, while a very low rate may lead to very slow convergence, or even keep the model from learning anything useful in the time available.

The goals of hyperparameter tuning include:

- Improving model performance
- Reducing overfitting and underfitting

Grid search provides a systematic way to find the best-performing combination among the hyperparameter values you choose to test.


# Grid Search Process

Grid search works by taking a set of hyperparameters and generating all possible combinations within a specified range. Each combination is evaluated using cross-validation or a separate validation set, and the model's performance is recorded. Finally, the best-performing hyperparameter combination is selected for the final model.

### Steps in Grid Search:

1. **Define the parameter grid**: Create a dictionary where keys are the hyperparameters, and values are lists of values to be tested.
2. **Train models for all combinations**: Train and validate models on all combinations of hyperparameters.
3. **Evaluate model performance**: Use a scoring metric such as accuracy, precision, recall, or a custom metric to determine the best combination.
4. **Select the best parameters**: Identify the combination with the best performance.

We'll be using scikit-learn to make our lives easier.



# Grid Search Using Scikit-learn

In [9]:
%%time

data = load_wine()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 2, None],
    'min_samples_split': [2, 5, 10, 12]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

CPU times: user 150 ms, sys: 108 ms, total: 258 ms
Wall time: 2.15 s


In [10]:
# Read the winning hyperparameter combination and its cross-validation score
# off the fitted search object from the previous cell.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")


Best Parameters: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.9758


# Speedy Gonzales Way: Parallelizing Grid Search with `n_jobs`

In [6]:
# Baseline for the timing comparison: run the grid search with n_jobs=1.
# NOTE: this cell rebinds X, y, the train/test split and param_grid that the
# wine example above also defined.
X, y = make_classification(n_samples=100000, n_features=50, n_informative=30, n_classes=2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 2 x 2 x 2 = 8 candidate settings, each evaluated with cv=3.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

rf = RandomForestClassifier(random_state=42)
grid_search_1 = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=1)

# Use perf_counter() rather than time.time() for the interval: it is a
# monotonic, high-resolution clock, so a system clock adjustment during this
# multi-minute fit cannot distort the measured duration.
start_time = time.perf_counter()
grid_search_1.fit(X_train, y_train)
time_n_jobs_1 = time.perf_counter() - start_time

print(f"Grid Search Time with n_jobs=1: {time_n_jobs_1} seconds")

Grid Search Time with n_jobs=1: 473.82622480392456 seconds
Grid Search Time with n_jobs=-1: 110.24896097183228 seconds
Best parameters with n_jobs=1: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters with n_jobs=-1: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# Same search as the n_jobs=1 run, reusing rf, param_grid and the training
# split from the previous cell, but with n_jobs=-1 so scikit-learn can use all
# available processors.
grid_search_all = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1)

# Use perf_counter() rather than time.time() for the interval: it is a
# monotonic, high-resolution clock, so a system clock adjustment during the
# fit cannot distort the measured duration.
start_time = time.perf_counter()
grid_search_all.fit(X_train, y_train)
time_n_jobs_all = time.perf_counter() - start_time

print(f"Grid Search Time with n_jobs=-1: {time_n_jobs_all} seconds")

# Parallelism should only change the run time, not which combination wins.
print("Best parameters with n_jobs=1:", grid_search_1.best_params_)
print("Best parameters with n_jobs=-1:", grid_search_all.best_params_)