In [49]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics

import pandas as pd
import numpy as np

# Cross validation demo in Iris dataset

## Prepare Data

In [50]:
# read in the iris data
iris = load_iris()

# Create DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['Class'] = iris.target
df.iloc[::50, :]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
50,7.0,3.2,4.7,1.4,1
100,6.3,3.3,6.0,2.5,2


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Class              150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [52]:
X = iris.data
y = iris.target

## Standard/Simple way to Evaluate/validate the model:

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

1.0


## KFold Example

In [54]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], str(data[1])))

Iteration                   Training set observations                   Testing set observations
    1     [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [0 1 2 3 4]       
    2     [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [5 6 7 8 9]       
    3     [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19 20 21 22 23 24]     [10 11 12 13 14]     
    4     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 20 21 22 23 24]     [15 16 17 18 19]     
    5     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]     [20 21 22 23 24]     


In [55]:
kf = KFold(n_splits=5, shuffle=True).split(range(25))

# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], str(data[1])))

Iteration                   Training set observations                   Testing set observations
    1     [ 0  1  2  3  4  7  8 10 11 12 13 14 16 17 19 20 21 22 23 24]     [ 5  6  9 15 18]     
    2     [ 0  1  2  4  5  6  9 10 11 12 13 14 15 17 18 19 20 21 22 23]     [ 3  7  8 16 24]     
    3     [ 3  4  5  6  7  8  9 10 11 13 14 15 16 18 19 20 21 22 23 24]     [ 0  1  2 12 17]     
    4     [ 0  1  2  3  4  5  6  7  8  9 12 15 16 17 18 19 20 21 22 24]     [10 11 13 14 23]     
    5     [ 0  1  2  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 23 24]     [ 4 19 20 21 22]     


## Cross validation on Iris

In [56]:
from sklearn.model_selection import cross_val_score

In [57]:
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

[1.         0.93333333 1.         1.         0.86666667 0.93333333
 0.93333333 1.         1.         1.        ]


In [58]:
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=1)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

[1.         0.93333333 1.         0.93333333 0.86666667 1.
 0.86666667 1.         1.         1.        ]


## Select the best tuning parameters (aka "hyperparameters") for KNN on the iris dataset by hand

In [59]:
# search for an optimal value of K for KNN
k_range = list(range(5, 16))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

max_score = max(k_scores)
optimal_k = k_range[k_scores.index(max_score)]

print('All scores:', sorted(k_scores))
print('Max score:', max_score)
print(f'Optimal K: {optimal_k}')


All scores: [0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9666666666666668, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9733333333333334, 0.9800000000000001]
Max score: 0.9800000000000001
Optimal K: 13


## Select the best tuning parameters with GridSearch

In [60]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

### HW: make it work

In [63]:
estimators = [
    {
        'estimator': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': list(range(5, 16))
        }
    },
    {
        'estimator': SVC(),
        'param_grid':{
            'kernel':[ 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' ]
        }

    }
]

for est_details in estimators:
    # Perform GridSearchCV
    grid_search = GridSearchCV(
        estimator=est_details['estimator'],
        param_grid={
            'n_neighbors': list(range(5, 16))
        },
        cv=10, scoring='accuracy'
    )
    grid_search.fit(X_train, y_train)

    # Best parameters and score
    print(f'Best parameters for KNeighborsClassifier: {grid_search.best_params_}')
    print(f'Best F1 score for KNeighborsClassifier: {grid_search.best_score_}')

    # use the best estimator to predict on the test set
    best_clf = grid_search.best_estimator_

    # Evaluate on the test set (assuming X_test is already split and prepared)
    y_pred = best_clf.predict(X_test)

    # Print evaluation metrics
    print(f'Accuracy for KNeighborsClassifier: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))
    print('-' * 50)

Best parameters for KNeighborsClassifier: {'n_neighbors': 11}
Best F1 score for KNeighborsClassifier: 0.9545454545454545
Accuracy for KNeighborsClassifier: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        12

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

--------------------------------------------------


ValueError: Invalid parameter 'n_neighbors' for estimator SVC(). Valid parameters are: ['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'].