In [1]:
# ==========================================================
# Cell 1: What is Hyperparameter Tuning?
# ==========================================================
# In Machine Learning, a "hyperparameter" is a setting we choose
# BEFORE training; it is NOT learned from the training data.
#
# Example:
#   - For KNeighborsClassifier -> 'n_neighbors' is a hyperparameter
#   - For DecisionTreeClassifier -> 'max_depth' is a hyperparameter
#
# Tuning = finding the best values for these hyperparameters
# so that the model performs better.
#
# If we choose poor hyperparameter values, the model may underfit or overfit.
# Tuning is therefore important for getting a model that generalizes well.

In [2]:
# ==========================================================
# Cell 2: Types of Hyperparameter Tuning
# ==========================================================
# There are several approaches:
#
# 1. Manual Search:
#    - We test different values manually.
#    - Example: Try n_neighbors = 3, 5, 7 and see results.
#
# 2. Grid Search:
#    - We define a grid (set) of possible hyperparameter values.
#    - The system tries ALL combinations.
#    - Very exhaustive but can be slow if grid is large.
#
# 3. Random Search:
#    - Instead of trying all combinations, we randomly pick some.
#    - Faster than grid search, especially when there are many parameters.
#
# 4. Bayesian Optimization / Advanced Methods:
#    - Uses probability/statistics to decide the next best set of parameters.
#    - More efficient for very complex models.
#
# In sklearn, we mainly use GridSearchCV and RandomizedSearchCV.

In [3]:
# ==========================================================
# Cell 3: Example Dataset + Model Setup
# ==========================================================
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Load dataset (feature matrix X, binary target y)
X, y = load_breast_cancer(return_X_y=True)

# Split FIRST, so the held-out test rows cannot influence any preprocessing.
# (The row assignment depends only on the sample count and random_state, so
# this is the same train/test partition as splitting after scaling.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features for KNN (important since it is a distance-based model).
# The scaler learns mean/std from the TRAINING rows only; fitting it on the
# full dataset would leak test-set statistics into training and make the
# final test score optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset scaled with the training statistics; kept so that any code
# that still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

# NOTE: for fully leak-free cross-validation, the scaler should be re-fitted
# inside each CV fold, e.g. by wrapping scaler + model in a sklearn Pipeline.

# Base model (before tuning)
knn = KNeighborsClassifier()

In [4]:
# ==========================================================
# Cell 4: Grid Search CV
# ==========================================================
from sklearn.model_selection import GridSearchCV

# Search space: five odd values of k crossed with both neighbour-weighting
# schemes, i.e. 5 x 2 = 10 candidate configurations.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
)

# Exhaustive search over every combination in param_grid.
#   knn        -> the estimator to tune
#   param_grid -> the hyperparameter values to try
#   scoring    -> metric used to rank candidates (if omitted, the estimator's
#                 own .score() is used, which is accuracy for classifiers)
#   cv         -> number of cross-validation folds per candidate
#   n_jobs     -> CPU cores to use (-1 means all available)
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Fit every candidate on the training split
grid_search.fit(X_train, y_train)

# Display the winning combination
grid_search.best_params_

{'n_neighbors': 7, 'weights': 'distance'}

In [5]:
# ==========================================================
# Cell 5: Evaluate Best Model
# ==========================================================
from sklearn.metrics import accuracy_score

# Notes:
# - GridSearchCV already cross-validated every hyperparameter combination
#   on the training split.
# - Here we take the model built with the BEST combination and score it on
#   the held-out test split.

# Model configured with the best parameters found by the search
best_knn = grid_search.best_estimator_

# Predictions for the held-out test rows
y_pred = best_knn.predict(X_test)

# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9473684210526315

In [6]:
# ==========================================================
# Cell 6: Randomized Search CV
# ==========================================================
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate values to sample from: k = 1 .. 29 combined with both weighting
# schemes (29 x 2 = 58 possible combinations in total).
param_dist = {
    'n_neighbors': np.arange(1, 30),
    'weights': ['uniform', 'distance'],
}

# Randomized search: evaluates only a sample of the combinations above.
#   n_iter       -> how many random combinations to evaluate
#   random_state -> fixes the sampling so the run is reproducible
#   cv / scoring / n_jobs -> same meaning as in GridSearchCV
random_search = RandomizedSearchCV(
    knn,
    param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Evaluate the sampled candidates on the training split
random_search.fit(X_train, y_train)

# Display the best combination among those sampled
random_search.best_params_

{'weights': 'distance', 'n_neighbors': np.int64(7)}

In [7]:
# ==========================================================
# Cell 7: Grid Search vs Randomized Search Summary
# ==========================================================
# Differences:
#
# GridSearchCV:
#   - Tests ALL possible combinations from param_grid.
#   - Best if search space is small.
#   - Guaranteed to find the best combination WITHIN the grid, but computationally expensive.
#
# RandomizedSearchCV:
#   - Tests only a RANDOM subset of combinations.
#   - Good if search space is large.
#   - Faster, but may miss the absolute best parameter combination.
#
# Both expose these attributes after .fit():
#   - best_params_  -> best hyperparameters
#   - best_estimator_ -> model with best parameters
#   - best_score_ -> best cross-validation score