# Data Preprocessing and Machine Learning with Scikit-Learn

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit

In [2]:
PATH = '../data/iris.csv'

In [3]:
!ls '../data'  
!wc -l {PATH} 
!du -h {PATH}

iris.csv  rent.csv
150 ../data/iris.csv
8,0K	../data/iris.csv


In [4]:
!head -n 5 {PATH}

Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa


In [5]:
!tail -n 5 {PATH}

146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica
150,5.9,3.0,5.1,1.8,Iris-virginica

## A. Loading Tabular Datasets from Text Files

In [6]:
data_frame = pd.read_csv(filepath_or_buffer = PATH)
data_frame.head()

Unnamed: 0,Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
data_frame.info() # data frame information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               150 non-null    int64  
 1   SepalLength[cm]  150 non-null    float64
 2   SepalWidth[cm]   150 non-null    float64
 3   PetalLength[cm]  150 non-null    float64
 4   PetalWidth[cm]   150 non-null    float64
 5   Species          150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [8]:
memory_series = data_frame.memory_usage(deep = True) / 1024 # show memory usage in KB

display(memory_series)
print(f'Total memory used: {memory_series.sum():.2f} KB')

Index               0.125000
Id                  1.171875
SepalLength[cm]     1.171875
SepalWidth[cm]      1.171875
PetalLength[cm]     1.171875
PetalWidth[cm]      1.171875
Species            10.302734
dtype: float64

Total memory used: 16.29 KB


In [9]:
print(f'The data_frame data type is: {type(data_frame)}')
print(f'The data_frame has {data_frame.shape[0]} rows and {data_frame.shape[1]} columns')
print(f'The data_frame contains {data_frame.size} values (rows x columns)')
print(f'The data_frame index is: {data_frame.index}')
print(f'The data_frame columns are: {data_frame.columns.values}')

The data_frame data type is: <class 'pandas.core.frame.DataFrame'>
The data_frame has 150 rows and 6 columns
The data_frame contains 900 values (rows x columns)
The data_frame index is: RangeIndex(start=0, stop=150, step=1)
The data_frame columns are: ['Id' 'SepalLength[cm]' 'SepalWidth[cm]' 'PetalLength[cm]'
 'PetalWidth[cm]' 'Species']


In [10]:
class_map = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
}

data_frame['Classes'] = data_frame['Species'].map(class_map) # apply a dictionary mapping on a column
display(data_frame.head(), data_frame.tail(), np.unique(data_frame['Classes']))

Unnamed: 0,Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species,Classes
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


Unnamed: 0,Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species,Classes
145,146,6.7,3.0,5.2,2.3,Iris-virginica,2
146,147,6.3,2.5,5.0,1.9,Iris-virginica,2
147,148,6.5,3.0,5.2,2.0,Iris-virginica,2
148,149,6.2,3.4,5.4,2.3,Iris-virginica,2
149,150,5.9,3.0,5.1,1.8,Iris-virginica,2


array([0, 1, 2])

In [11]:
series = data_frame['Species']

display(series.head(n = 3), (series.index, series.dtype, series.shape, np.unique(series.values)), series.tail(n = 3))

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
Name: Species, dtype: object

(RangeIndex(start=0, stop=150, step=1),
 dtype('O'),
 (150,),
 array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object))

147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, dtype: object

In [12]:
data_frame.loc[[2, 1, 0], ['PetalLength[cm]', 'PetalWidth[cm]', 'SepalLength[cm]', 'SepalWidth[cm]', 'Species']]

Unnamed: 0,PetalLength[cm],PetalWidth[cm],SepalLength[cm],SepalWidth[cm],Species
2,1.3,0.2,4.7,3.2,Iris-setosa
1,1.4,0.2,4.9,3.0,Iris-setosa
0,1.4,0.2,5.1,3.5,Iris-setosa


In [13]:
data_frame.iloc[[2, 1, 0], [3, 4, 1, 2, 5]]

Unnamed: 0,PetalLength[cm],PetalWidth[cm],SepalLength[cm],SepalWidth[cm],Species
2,1.3,0.2,4.7,3.2,Iris-setosa
1,1.4,0.2,4.9,3.0,Iris-setosa
0,1.4,0.2,5.1,3.5,Iris-setosa


In [14]:
data_frame[['PetalLength[cm]', 'PetalWidth[cm]', 'Species']].head()

Unnamed: 0,PetalLength[cm],PetalWidth[cm],Species
0,1.4,0.2,Iris-setosa
1,1.4,0.2,Iris-setosa
2,1.3,0.2,Iris-setosa
3,1.5,0.2,Iris-setosa
4,1.4,0.2,Iris-setosa


In [15]:
data_frame[:5]

Unnamed: 0,Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species,Classes
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [16]:
data_frame = data_frame.drop('Id', axis = 1) # delete `Id` column
data_frame.head()

Unnamed: 0,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species,Classes
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


## B. Splitting a Dataset into Train, Validation, and Test Subsets

In [17]:
indices = np.arange(data_frame.shape[0])
rng = np.random.RandomState(123)

permuted_indices = rng.permutation(indices)
permuted_indices

array([ 72, 112, 132,  88,  37, 138,  87,  42,   8,  90, 141,  33,  59,
       116, 135, 104,  36,  13,  63,  45,  28, 133,  24, 127,  46,  20,
        31, 121, 117,   4, 130, 119,  29,   0,  62,  93, 131,   5,  16,
        82,  60,  35, 143, 145, 142, 114, 136,  53,  19,  38, 110,  23,
         9,  86,  91,  89,  79, 101,  65, 115,  41, 124,  95,  21,  11,
       103,  74, 122, 118,  44,  51,  81, 149,  12, 129,  56,  50,  25,
       128, 146,  43,   1,  71,  54, 100,  14,   6,  80,  26,  70, 139,
        30, 108,  15,  18,  77,  22,  10,  58, 107,  75,  64,  69,   3,
        40,  76, 134,  34,  27,  94,  85,  97, 102,  52,  92,  99, 105,
         7,  48,  61, 120, 137, 125, 147,  39,  84,   2,  67,  55,  49,
        68, 140,  78, 144, 111,  32,  73,  47, 148, 113,  96,  57, 123,
       106,  83,  17,  98,  66, 126, 109])

In [18]:
train_size, validation_size = int(.65*data_frame.shape[0]), int(.15*data_frame.shape[0])
test_size = int(data_frame.shape[0] - (train_size + validation_size))

print(train_size, validation_size, test_size)

97 22 31


In [19]:
train_indices      = permuted_indices[:train_size]
validation_indices = permuted_indices[train_size:train_size + validation_size]
test_indices       = permuted_indices[train_size + validation_size:]

In [20]:
X, y = data_frame.drop(['Species', 'Classes'], axis = 1).values, data_frame['Classes'].values

print(f'Features: {X.shape}') 
print(f'Classes: {y.shape}')

Features: (150, 4)
Classes: (150,)


In [21]:
X_train, X_valid, X_test = X[train_indices], X[validation_indices], X[test_indices]
y_train, y_valid, y_test = y[train_indices], y[validation_indices], y[test_indices]

print('Training set size: ', X_train.shape, ' -> Class proportions:', np.bincount(y_train))
print('Validation set size:', X_valid.shape, ' -> Class proportions:', np.bincount(y_valid))
print('Test set size:', X_test.shape, ' -> Class proportions:', np.bincount(y_test))

Training set size:  (97, 4)  -> Class proportions: [37 27 33]
Validation set size: (22, 4)  -> Class proportions: [ 7 11  4]
Test set size: (31, 4)  -> Class proportions: [ 6 12 13]


### B.1. Stratification

Previously, we wrote our own code to shuffle and split a data set into training, validation, and test subsets, which had one considerable downside.
If we are working with small datasets and split it randomly into subsets, it will affect the class distribution in the samples - this is problematic since machine learning algorithms/models assume that training, validation, and test samples have been drawn from the same distributions to produce reliable models and estimates of the generalization performance.

![stratification](https://github.com/rasbt/stat479-machine-learning-fs19/raw/6d1f0c5f20cac88860b3e1bb4c318e3ee56783ac/05_preprocessing-and-sklearn/code/images/iris-subsampling.png)

The method of ensuring that the class label proportions are the same in each subset after splitting, we use an approach that is usually referred to as **stratification**.
Stratification is supported in `scikit-learn`'s `train_test_split` method if we pass the class label array to the `stratify` parameter as shown below.

In [22]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = .15, shuffle = True, random_state = 123, stratify = y)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size = .15, shuffle = True, random_state = 123, stratify = y_temp)

print('Training set size: ', X_train.shape, ' -> Class proportions:', np.bincount(y_train))
print('Validation set size:', X_valid.shape, ' -> Class proportions:', np.bincount(y_valid))
print('Test set size:', X_test.shape, ' -> Class proportions:', np.bincount(y_test))

Training set size:  (107, 4)  -> Class proportions: [36 35 36]
Validation set size: (20, 4)  -> Class proportions: [7 7 6]
Test set size: (23, 4)  -> Class proportions: [7 8 8]


## C. Data Scaling

Whether or not to scale features depends on the problem at hand and requires your judgement.
However, there are several algorithms (especially gradient-descent, etc.), which work much better (are more robust, numerically stable, and converge faster) if the data is centered and has a smaller range.
There are many different ways for scaling features; here, we only cover to of the most common "normalization" schemes: *min-max* scaling and *z-score* standardization.

### C.1. Normalization - Min-Max Scaling

Min-max scaling squashes the features into a `[0, 1]` range, which can be achieved via the following equation for a single input:

$$ x^{[i]}_{norm} = \frac{x^{[i]}_{norm} - x_{min}}{x_{max} - x_{min}} $$

In [23]:
x = np.arange(6).astype(np.float16)

display(f'Unnormalized vector: {x}')
display(f'Normalized vector: {(x - x.min()) / (x.max() - x.min())}')

'Unnormalized vector: [0. 1. 2. 3. 4. 5.]'

'Normalized vector: [0.  0.2 0.4 0.6 0.8 1. ]'

### C.2 Standardization

After standardizing a feature, it will have the properties of a standard normal distribution, that is, unit variance and zero mean $\mathcal{N}(\mu = 0, \sigma^2 = 1)$; however, this does not transform a feature from not following a normal distribution to a normal distributed one.
The formula for standardizing a feature is shown below, for a single data point $x^{[i]}$:

$$ x^{[i]}_{standard} = \frac{x^{[i]} - \mu_x}{\sigma_x} $$

In [24]:
x = np.arange(6).astype(np.float16)

display(f'Unnormalized vector: {x}')
display(f'Standardized vector: {(x - x.mean()) / (x.std())}')

'Unnormalized vector: [0. 1. 2. 3. 4. 5.]'

'Standardized vector: [-1.464  -0.8784 -0.2927  0.2927  0.8784  1.464 ]'

A concept that is very important though is how we use the estimated normalization parameters (e.g., mean and standard deviation in z-score standardization).
In particular, it is important that we re-use the parameters estimated from the training set to transfrom validation and test sets - re-estimating the parameters is a common "beginner-mistake".

In [25]:
X_train_example, y_train_example = np.array([10, 20, 30]), np.array([0, 1, 0])
X_valid_example, y_valid_example = np.array([3, 12, 27]), np.array([0, 1, 0])

mu, sigma = X_train_example.mean(), X_train_example.std()
minimum, maximum = X_train_example.min(), X_train_example.max()

X_valid_example_scaled = (X_valid_example - minimum) / (maximum - minimum)
X_valid_example_standardized = (X_valid_example - mu) / sigma # WRONG !!! X_valid = (X_valid - X_valid.mean()) / X_valid.std()

print(f'Scaled: {X_valid_example_scaled}, Standardized: {X_valid_example_standardized}')

Scaled: [-0.35  0.1   0.85], Standardized: [-2.08206628 -0.9797959   0.85732141]


## D. Scikit-Learn Transformer API

In [26]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train_example.reshape(-1, 1))
X_valid_example_scaled = min_max_scaler.transform(X_valid_example.reshape(-1, 1)).reshape(1, -1)[0]

print(f'Scaled: {X_valid_example_scaled}')

Scaled: [-0.35  0.1   0.85]


In [27]:
standardizer = StandardScaler()
standardizer.fit(X_train_example.reshape(-1, 1))
X_valid_example_standardized = standardizer.transform(X_valid_example.reshape(-1, 1)).reshape(1, -1)[0]

print(f'Standardized: {X_valid_example_standardized}')

Standardized: [-2.08206628 -0.9797959   0.85732141]


### D.1 Categorical Data

When we preprocess a dataset as input to a machine learning algorithm, we have to be careful how we treat categorical variables.
There are two broad categories of categorical variables: **nominal** (no order implied) and **ordinal** (order implied).

In [28]:
data_frame_1 = pd.DataFrame({'Color': ['green', 'red', 'blue'], 'Size': ['M', 'L', 'XXL'], 'Class': ['Class 1', 'Class 2', 'Class 2']})
data_frame_1.head()

Unnamed: 0,Color,Size,Class
0,green,M,Class 1
1,red,L,Class 2
2,blue,XXL,Class 2


 - In the example above, `Size` would be an example of an ordinal variable; i.e., if the letters refer to T-shirt sizes, it would make sense to come up with an ordering like `M < L < XXL`.
 
 - Hence, we can assign increasing values to a ordinal values; however, the range and difference between categories depends on our domain knowledge and judgement.

In [29]:
size_mapper = {
    'M': 2,
    'L': 3,
    'XXL': 5
}

data_frame_1['Size'] = data_frame_1['Size'].map(size_mapper)
data_frame_1.head()

Unnamed: 0,Color,Size,Class
0,green,2,Class 1
1,red,3,Class 2
2,blue,5,Class 2


 - Machine learning algorithms do not assume an ordering in the case of class labels.
 
 - Here, we can use the `LabelEncoder` from `scikit-learn` to convert class labels to integers as an alternative to using the `map` method.

In [30]:
label_encoder = LabelEncoder()
label_encoder.fit(data_frame_1['Class'])

data_frame_1['ClassLabels'] = label_encoder.transform(data_frame_1['Class'])
data_frame_1.head()

Unnamed: 0,Color,Size,Class,ClassLabels
0,green,2,Class 1,0
1,red,3,Class 2,1
2,blue,5,Class 2,1


- Representing nominal variables properly is a bit more tricky.

- We use "one-hot" encoding - we binarize a nominal variable, as shown below for the color variable (again, we do this because some ordering like `orange < red < blue` would not make sense in many applications).

In [31]:
data_frame_1 = pd.get_dummies(data_frame_1, columns = ['Color'])
data_frame_1.head()

Unnamed: 0,Size,Class,ClassLabels,Color_blue,Color_green,Color_red
0,2,Class 1,0,0,1,0
1,3,Class 2,1,0,0,1
2,5,Class 2,1,1,0,0


 - Note that executing the code above produced `3` new variables for `Color_*` each of which takes on binary values.

### D.2 Missing Data

There are many different ways for dealing with missing data.
The simplest approaches are removing entire columns or rows.
Another simple approach is to impute missing values via the feature means, medians, mode, etc.
There is no rule or best practice, and the choice of the approprite missing data imputation method depends on your judgement and domain knowledge.

In [32]:
data_frame_2 = pd.DataFrame({'A': [1., 5., 10.], 'B': [2., 6., 11.], 'C': [3., np.nan, 12.], 'D': [4., 8., np.nan]})
data_frame_2.head()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [33]:
display(data_frame_2.isnull(), data_frame_2.isnull().sum())

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


A    0
B    0
C    1
D    1
dtype: int64

In [34]:
display(data_frame_2.dropna(axis = 0), data_frame_2.dropna(axis = 1)) # drop rows, columns where there are missing values respectively

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [35]:
imputer_mean   = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer_median = SimpleImputer(missing_values = np.nan, strategy = 'median')

imputer_mean.fit(data_frame_2['C'].values.reshape(-1, 1))
imputer_median.fit(data_frame_2['D'].values.reshape(-1, 1))

data_frame_2['C'] = imputer_mean.transform(data_frame_2['C'].values.reshape(-1, 1))
data_frame_2['D'] = imputer_median.transform(data_frame_2['D'].values.reshape(-1, 1))

data_frame_2.head()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


## E. Feature Transformation, Extraction, and Selection

Scikit-learn pipelines are an extremely convenient and powerful concept.
Pipelines basically let us define a series of perprocessing steps together with fitting an estimator.
Pipelines will automatically take care of pitfalls like estimating feature scaling parameters from the training set and applying those to scale new data.

In [36]:
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

pipeline

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=3))])

In [37]:
pipeline.fit(X = X_test, y = y_test)

print(f'Predictions: {pipeline.predict(X = X_valid)}'), 
print(f'Score (accuracy): {pipeline.score(X = X_test, y = y_test)*100:.2f}%')

Predictions: [0 2 0 1 0 2 1 2 0 2 2 1 2 1 0 2 1 1 0 0]
Score (accuracy): 95.65%


### E.1 Intro Model Selection - Pipelines and Grid Search

In machine learning practice, we often need to experiment with an machine learning algorithm's hyperparameters to find a good setting.
The process of tuning hyperparameters and comparing and selecting the resulting models is also called *model selection*.
Here, we are introducing the simplest way of performing model selection: using the *holdout method.*
In the holdout method, we split a dataset into 3 subsets: a training, a validation, and a test datatset.
To avoid biasing the estimate of the generalization performance, we only want to use the test dataset once, which is why we use the validation dataset for hyperparameter tuning (model selection).
Here, the validation dataset serves as an estimate of the generalization performance, too, but it becomes more biased than the final estimate on the test data because of its repeated re-use during model selection (think of "multiple hypothesis testing").

![tuning-model](https://github.com/rasbt/stat479-machine-learning-fs19/raw/6d1f0c5f20cac88860b3e1bb4c318e3ee56783ac/05_preprocessing-and-sklearn/code/images/holdout-tuning.png)

In [38]:
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
pipeline

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier())])

In [39]:
params = {
    'kneighborsclassifier__n_neighbors': [1, 3, 5],
    'kneighborsclassifier__p': [1, 2]
}

ps = PredefinedSplit(np.concatenate((np.full(shape = (X_train.shape[0],), fill_value = -1), np.zeros(shape = (X_valid.shape[0],)))))

grid = GridSearchCV(estimator = pipeline, param_grid = params, cv = ps)
grid.fit(X = np.vstack((X_train, X_valid)), y = np.hstack((y_train, y_valid)))

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__n_neighbors': [1, 3, 5],
                         'kneighborsclassifier__p': [1, 2]})

In [40]:
grid.cv_results_

{'mean_fit_time': array([0.00440311, 0.00735044, 0.00571036, 0.02639818, 0.00419641,
        0.00139952]),
 'std_fit_time': array([0., 0., 0., 0., 0., 0.]),
 'mean_score_time': array([0.00548553, 0.00662446, 0.00645447, 0.00226831, 0.00444078,
        0.00358653]),
 'std_score_time': array([0., 0., 0., 0., 0., 0.]),
 'param_kneighborsclassifier__n_neighbors': masked_array(data=[1, 1, 3, 3, 5, 5],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kneighborsclassifier__p': masked_array(data=[1, 2, 1, 2, 1, 2],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'kneighborsclassifier__n_neighbors': 1,
   'kneighborsclassifier__p': 1},
  {'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__p': 2},
  {'kneighborsclassifier__n_neighbors': 3, 'kneighborsclassifier__p': 1},
  {'kneighborsclassifier__n_neighbors': 3, 'kneighborsclassifie

In [41]:
print(f'Best score: {grid.best_score_}')
print(f'Best parameters: {grid.best_params_}')

Best score: 0.9
Best parameters: {'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__p': 1}


In [42]:
classifier = grid.best_estimator_
classifier.fit(X_train, y_train)
print(f'Test accuracy: {(classifier.score(X_test, y_test)*100):.2f}%')

Test accuracy: 91.30%
