In [1]:
import pandas as pd

# Load the raw penguins table from the datasets folder.
penguins = pd.read_csv("../datasets/penguins.csv")

# Measurements used as features, and the column that holds the label.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Restrict the table to the columns of interest, then discard every row in
# which any of them is missing, so features and labels stay aligned.
columns_of_interest = [*columns, target_name]
penguins_non_missing = penguins.loc[:, columns_of_interest].dropna()

# Split the cleaned table into the feature matrix and the label vector.
data, target = penguins_non_missing[columns], penguins_non_missing[target_name]

In [2]:
# Preview the first five rows of the feature table.
data.head(n=5)

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [4]:
# Peek at ten randomly drawn labels. No random_state is passed, so the rows
# drawn (and therefore the displayed output) change from run to run.
target.sample(n=10)

9      Adelie Penguin (Pygoscelis adeliae)
162      Gentoo penguin (Pygoscelis papua)
18     Adelie Penguin (Pygoscelis adeliae)
263      Gentoo penguin (Pygoscelis papua)
97     Adelie Penguin (Pygoscelis adeliae)
117    Adelie Penguin (Pygoscelis adeliae)
4      Adelie Penguin (Pygoscelis adeliae)
125    Adelie Penguin (Pygoscelis adeliae)
45     Adelie Penguin (Pygoscelis adeliae)
11     Adelie Penguin (Pygoscelis adeliae)
Name: Species, dtype: object

In [6]:
# Relative frequency of each species; missing labels, if any, would be
# reported as their own entry instead of being silently dropped.
target.value_counts(normalize=True, dropna=False)

Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: Species, dtype: float64

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Two-step pipeline: standardize the features, then classify with a
# 5-nearest-neighbours model. The step names ("preprocessor", "classifier")
# are the prefixes used to address hyperparameters later on.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [10]:
from sklearn.model_selection import cross_validate

# Evaluate the pipeline with 10-fold cross-validation, scored by balanced
# accuracy; n_jobs=-1 runs the folds in parallel on all available cores.
cv_results = cross_validate(
    model, data, target, scoring="balanced_accuracy", cv=10, n_jobs=-1
)

# One balanced-accuracy value per fold.
acc = cv_results["test_score"]


In [11]:
# Mean and standard deviation of the per-fold balanced accuracy.
acc.mean(), acc.std()

(0.9521978021978021, 0.0399020975957868)

In [12]:
# List the name of every parameter exposed by the pipeline; nested step
# parameters appear as "<step name>__<parameter>".
for param_name in model.get_params().keys():
    print(param_name)

memory
steps
verbose
preprocessor
classifier
preprocessor__copy
preprocessor__with_mean
preprocessor__with_std
classifier__algorithm
classifier__leaf_size
classifier__metric
classifier__metric_params
classifier__n_jobs
classifier__n_neighbors
classifier__p
classifier__weights


In [18]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: three neighbourhood sizes, each tried with and without
# standardization (setting the "preprocessor" step to None skips it).
param_grid = dict(
    classifier__n_neighbors=[5, 51, 101],
    preprocessor=[None, StandardScaler()],
)

# Exhaustive search over the grid, each candidate scored by balanced accuracy
# under 10-fold cross-validation.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring="balanced_accuracy",
)

search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler()]},
             scoring='balanced_accuracy')

In [19]:
# Raw dictionary collected by the grid search: per-candidate fit/score
# timings, the parameter values tried, and the cross-validated scores.
search.cv_results_

{'mean_fit_time': array([0.00302246, 0.00431759, 0.00251477, 0.00423265, 0.00236115,
        0.00374026]),
 'std_fit_time': array([1.44368754e-03, 3.19840742e-04, 5.98357536e-05, 1.24925511e-04,
        3.21910064e-04, 1.61060664e-04]),
 'mean_score_time': array([0.00390399, 0.00412235, 0.00395036, 0.00418155, 0.00374331,
        0.00393999]),
 'std_score_time': array([4.38759286e-04, 7.28899534e-04, 9.31935500e-05, 1.83863272e-04,
        3.70122391e-04, 2.43516289e-04]),
 'param_classifier__n_neighbors': masked_array(data=[5, 5, 51, 51, 101, 101],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_preprocessor': masked_array(data=[None, StandardScaler(), None, StandardScaler(), None,
                    StandardScaler()],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__n_neighbors': 5, 'preprocessor': None},
  {'classifie

In [20]:
# Turn the search results into a table ranked from best to worst mean score.
results = pd.DataFrame(search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Keep only the searched parameters and the summary of the test scores.
param_columns = [name for name in results.columns if name.startswith("param_")]
results = results[[*param_columns, "mean_test_score", "std_test_score"]]

In [21]:
# Candidates ranked from best to worst mean cross-validated score.
results

Unnamed: 0,param_classifier__n_neighbors,param_preprocessor,mean_test_score,std_test_score
1,5,StandardScaler(),0.952198,0.039902
3,51,StandardScaler(),0.94188,0.038905
5,101,StandardScaler(),0.876642,0.041618
0,5,,0.739838,0.086685
4,101,,0.613857,0.031472
2,51,,0.605182,0.03648


In [30]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the pipeline's "preprocessor" step.
all_preprocessors = [
    None,  # no preprocessing: the classifier sees the raw features
    StandardScaler(),  # center each feature and scale it to unit variance
    MinMaxScaler(),  # rescale each feature to a fixed range
    QuantileTransformer(n_quantiles=100),  # quantile-based mapping using 100 quantiles
    PowerTransformer(method="box-cox"),  # Box-Cox transform; needs strictly positive inputs
]

In [33]:
# Same neighbourhood sizes as before, now crossed with every candidate
# preprocessor listed in `all_preprocessors`.
param_grid = dict(
    classifier__n_neighbors=[5, 51, 101],
    preprocessor=all_preprocessors,
)

# Exhaustive search over the grid, each candidate scored by balanced accuracy
# under 10-fold cross-validation.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring="balanced_accuracy",
)

search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]},
             scoring='balanced_accuracy')

In [34]:
# Turn the search results into a table ranked from best to worst mean score.
results = pd.DataFrame(search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Keep only the searched parameters and the summary of the test scores.
param_columns = [name for name in results.columns if name.startswith("param_")]
results = results[[*param_columns, "mean_test_score", "std_test_score"]]

results

Unnamed: 0,param_classifier__n_neighbors,param_preprocessor,mean_test_score,std_test_score
1,5,StandardScaler(),0.952198,0.039902
2,5,MinMaxScaler(),0.947778,0.034268
3,5,QuantileTransformer(n_quantiles=100),0.947094,0.033797
4,5,PowerTransformer(method='box-cox'),0.94696,0.047387
6,51,StandardScaler(),0.94188,0.038905
8,51,QuantileTransformer(n_quantiles=100),0.927277,0.043759
9,51,PowerTransformer(method='box-cox'),0.922833,0.047883
7,51,MinMaxScaler(),0.920293,0.045516
11,101,StandardScaler(),0.876642,0.041618
12,101,MinMaxScaler(),0.862357,0.046244


In [35]:
# Nested cross-validation: the whole grid search is treated as the estimator
# and refit inside each of the 10 outer folds, so the reported score also
# accounts for the hyperparameter selection. return_estimator=True keeps the
# search fitted on each outer fold for later inspection.
cv_results = cross_validate(
    search,
    data,
    target,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    return_estimator=True,
)

# Average outer-fold balanced accuracy, rounded for display.
acc = cv_results["test_score"]
acc.mean().round(3)

0.943

In [32]:
from pprint import pprint

# Best hyperparameter combination found by the fitted `search` object.
# NOTE(review): this cell's execution count is out of order with respect to
# the surrounding cells -- re-run from a fresh kernel to confirm the output.
pprint(search.best_params_)

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}


In [36]:
# Tabulate the nested cross-validation output: one row per outer fold with
# its timings, the fitted search and the test score.
res = pd.DataFrame(data=cv_results)
res

Unnamed: 0,fit_time,score_time,estimator,test_score
0,1.399506,0.004113,"GridSearchCV(cv=10,\n estimator=Pi...",0.952381
1,1.354188,0.003689,"GridSearchCV(cv=10,\n estimator=Pi...",0.92674
2,1.382682,0.004234,"GridSearchCV(cv=10,\n estimator=Pi...",1.0
3,1.312368,0.003946,"GridSearchCV(cv=10,\n estimator=Pi...",0.918803
4,1.425392,0.004264,"GridSearchCV(cv=10,\n estimator=Pi...",0.88254
5,1.362918,0.003434,"GridSearchCV(cv=10,\n estimator=Pi...",1.0
6,1.518476,0.003573,"GridSearchCV(cv=10,\n estimator=Pi...",0.955556
7,1.461393,0.003278,"GridSearchCV(cv=10,\n estimator=Pi...",0.930159
8,1.285781,0.003158,"GridSearchCV(cv=10,\n estimator=Pi...",0.907937
9,1.37671,0.004043,"GridSearchCV(cv=10,\n estimator=Pi...",0.952381


In [37]:
# Show which hyperparameters the grid search selected on each outer fold;
# differences between folds indicate how stable the selection is.
for estimator in cv_results["estimator"]:
    print(estimator.best_params_)

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
