In [10]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

# Load the diabetes regression dataset; as_frame=True returns pandas objects.
data, target = load_diabetes(return_X_y=True, as_frame=True)

# Hold out a test set; the fixed seed makes the split reproducible.
(data_train, data_test, target_train, target_test) = train_test_split(
    data,
    target,
    random_state=42,
)

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
# Scale the features ahead of the k-nearest-neighbours regressor. The pipeline
# steps are addressed later as "standardscaler" and "kneighborsregressor".
scaler = StandardScaler()
model = make_pipeline(
    scaler,
    KNeighborsRegressor(),
)

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of cross-validation folds; 5 is also RandomizedSearchCV's default, so
# passing it explicitly below does not change how the search is run.
n_cv_folds = 5

# During the search each pipeline is fitted on only (n_cv_folds - 1) of the
# folds, and k-NN cannot look up more neighbours than it has fitted samples.
# Bound the candidates by the smallest training fold (the largest validation
# fold holds ceil(n / n_cv_folds) samples) instead of len(data_train) - 1,
# which would let error_score="raise" abort the search on small datasets.
largest_validation_fold = -(-len(data_train) // n_cv_folds)  # ceiling division
max_neighbors = min(len(data_train) - largest_validation_fold, 200)

# Candidate values; keys address pipeline steps as "<step name>__<parameter>".
parameter_grid = {
    "kneighborsregressor__n_neighbors": np.arange(1, max_neighbors + 1),
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
}

random_param_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameter_grid,
    scoring="neg_mean_absolute_error",  # scores are maximised, hence negated MAE
    n_iter=20,
    cv=n_cv_folds,
    n_jobs=2,
    verbose=1,
    random_state=1,
    error_score="raise",  # fail loudly instead of recording NaN for a broken candidate
)

random_param_search.fit(data_train, target_train)
optimal_parameters = random_param_search.best_params_
optimal_parameters

Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'standardscaler__with_std': True,
 'standardscaler__with_mean': True,
 'kneighborsregressor__n_neighbors': np.int64(22)}

In [24]:
import pandas as pd
# Tabulate the per-candidate cross-validation results of the fitted search.
# The search object fitted earlier in this notebook is `random_param_search`.
CVresults = pd.DataFrame(random_param_search.cv_results_)

In [27]:
# "neg_mean_absolute_error" scores are <= 0; take their magnitude to report the
# mean absolute error. abs() (rather than unary minus) keeps this cell
# idempotent: re-running it does not flip the sign back to negative.
CVresults["mean_test_score"] = CVresults["mean_test_score"].abs()

In [28]:
# Short, readable labels for the cv_results_ columns we want to keep.
new_column_names = {
    "param_kneighborsregressor__n_neighbors": "neighbor_count",
    "param_standardscaler__with_mean": "center_data",
    "param_standardscaler__with_std": "scale_data",
    "mean_test_score": "avg_score",
}

# Relabel, keep only the relabelled columns, then order rows by ascending score.
CVresults = (
    CVresults
    .rename(columns=new_column_names)
    .loc[:, [*new_column_names.values()]]
    .sort_values(by="avg_score")
)

In [29]:
# Cast the True/False flags and the neighbour count to plain 64-bit integers
# in a single astype call.
bool_columns = ["center_data", "scale_data"]
CVresults = CVresults.astype(
    {column: np.int64 for column in [*bool_columns, "neighbor_count"]}
)

CVresults

Unnamed: 0,neighbor_count,center_data,scale_data,avg_score
8,22,1,1,49.084511
11,28,0,0,49.892715
2,44,0,0,50.767185
18,57,1,1,51.580136
4,61,0,1,51.822152
10,64,0,0,51.931944
0,3,1,1,52.246887
12,73,1,0,52.487124
13,88,1,0,53.237693
19,93,0,0,53.57544
