In [2]:
import sys
from packaging import version
import sklearn


In [3]:
from pathlib import Path

# Directory where figures are saved, relative to the current working directory.
IMAGES_PATH = Path() / "images" / "classification"
# Create it (including missing parents) up front; no error if it already exists.
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    fig_id        -- file stem; the file is named "<fig_id>.<fig_extension>".
    tight_layout  -- when true, call plt.tight_layout() before saving.
    fig_extension -- file extension, also passed to savefig as the format.
    resolution    -- passed to savefig as dpi.
    """
    if tight_layout:
        plt.tight_layout()
    filename = f"{fig_id}.{fig_extension}"
    plt.savefig(IMAGES_PATH / filename, format=fig_extension, dpi=resolution)

# MNIST

In [4]:
from sklearn.datasets import fetch_openml

# Download (or load from the local cache) the MNIST dataset from OpenML.
# as_frame=False asks for array data instead of pandas objects.
mnist = fetch_openml('mnist_784', as_frame=False)

In [None]:
# List the fields available on the returned dataset object.
mnist.keys()

In [None]:
# Pull the feature matrix and the labels out of the dataset object.
X, y = mnist.data, mnist.target
# Last expression of the cell: display the feature array.
X

In [None]:
# Peek at the first label to see its value and type.
y[0]

In [7]:
# The first 60,000 rows form the training set; everything after is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

### KNeighborsClassifier
`class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)`



In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Baseline: default hyperparameters, trained on the full training set,
# evaluated by mean accuracy on the held-out test set.
knn_clf = KNeighborsClassifier(n_jobs=-1).fit(X_train, y_train)
baseline_accuracy = knn_clf.score(X_test, y_test)
baseline_accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: both weighting schemes crossed with k = 3..6.
param_grid = [
    {
        "weights": ["uniform", "distance"],
        "n_neighbors": list(range(3, 7)),
    }
]

# 5-fold grid search, run on the first 10,000 training samples only.
knn_clf = KNeighborsClassifier(n_jobs=-1)
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train[:10_000], y_train[:10_000])

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of that best combination.
grid_search.best_score_

In [None]:
# The search was fitted on the first 10,000 samples only, so retrain the winning
# estimator on the complete training set before scoring it on the test set.
grid_search.best_estimator_.fit(X_train, y_train)
tuned_accuracy = grid_search.score(X_test, y_test)
tuned_accuracy

In [12]:
# From here on, knn_clf refers to the tuned estimator picked by the grid search.
knn_clf=grid_search.best_estimator_

# Measuring Accuracy Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the tuned classifier on the training set.
cross_val_score(
    knn_clf,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Hand-rolled version of cross-validated accuracy: for each stratified fold,
# train a fresh copy of the classifier on the remaining folds and print the
# fraction of correct predictions on the held-out fold.
skfolds = StratifiedKFold(n_splits=3)  # add shuffle=True if the dataset is not
                                       # already shuffled
for train_index, test_index in skfolds.split(X_train, y_train):
    clone_clf = clone(knn_clf)  # unfitted copy with the same hyperparameters
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Count matches with the array's own vectorized sum; the Python builtin
    # sum() would iterate over the boolean array one element at a time.
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))

## Confusion Matrix 

In [58]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: every training sample is predicted by a model
# that did not see it during fitting.
y_train_pred = cross_val_predict(
    knn_clf, X_train, y_train, cv=3, n_jobs=-1
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the out-of-fold predictions:
# rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_train, y_train_pred)
cm

In [None]:
# Reference point: when predictions equal the labels, every off-diagonal
# entry of the confusion matrix is zero.
y_train_perfect_predictions = y_train  # pretend we reached perfection
confusion_matrix(y_train, y_train_perfect_predictions)

## Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score

# average=None returns one precision value per class instead of a single aggregate.
precision_score(y_train, y_train_pred,average=None) 

In [None]:
# extra code – this cell also computes the precision, TP / (FP + TP), for the
# class at index 1. The confusion matrix here is multiclass, so the false
# positives are every off-diagonal entry of column 1 (not only cm[0, 1]);
# the whole column sums to TP + FP.
cm[1, 1] / cm[:, 1].sum()

In [None]:
# average=None returns one recall value per class instead of a single aggregate.
recall_score(y_train, y_train_pred,average=None)  

In [None]:
from sklearn.metrics import f1_score

# average=None returns one F1 value per class instead of a single aggregate.
f1_score(y_train, y_train_pred,average=None)

In [None]:
# extra code – this cell also computes the f1 score for the class at index 1.
# F1 = TP / (TP + (FN + FP) / 2) = 2 * TP / (row_sum + column_sum), because
# row 1 sums to TP + FN and column 1 sums to TP + FP. Using the full row and
# column is required for a multiclass confusion matrix; cm[1, 0] and cm[0, 1]
# alone only cover confusions with class 0.
2 * cm[1, 1] / (cm[1, :].sum() + cm[:, 1].sum())

In [82]:
# Out-of-fold class probabilities instead of hard predictions.
y_scores = cross_val_predict(
    knn_clf,
    X_train,
    y_train,
    cv=3,
    method="predict_proba",
)

In [None]:
# Sanity check on the dimensions of the probability array.
y_scores.shape

In [None]:
# For comparison with y_scores.shape: the dimensions of the label array.
y_train.shape


# Error Analysis

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features (after casting to float64) and re-measure the
# 3-fold cross-validated accuracy on the scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype("float64"))
cross_val_score(
    knn_clf,
    X_train_scaled,
    y_train,
    cv=3,
    scoring="accuracy",
)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Recompute the out-of-fold predictions, this time on the scaled features,
# and plot the raw-count confusion matrix. Note that this rebinds y_train_pred.
y_train_pred = cross_val_predict(knn_clf, X_train_scaled, y_train, cv=3)
plt.rc('font', size=9)  # extra code – make the text smaller
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)
plt.show()

In [None]:
# Same matrix normalized over the true labels (each row sums to 100%),
# with cells formatted as whole percentages.
plt.rc('font', size=10)  # extra code
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred,
                                        normalize="true", values_format=".0%")
plt.show()

In [None]:
# Boolean weights: True for misclassified samples, False for correct ones.
# Correct predictions therefore contribute nothing and only the errors remain.
sample_weight = (y_train_pred != y_train)
plt.rc('font', size=10)  # extra code
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred,
                                        sample_weight=sample_weight,
                                        normalize="true", values_format=".0%")
plt.show()

In [None]:
# extra code – this cell generates and saves Figure 3–9
# Side by side: raw counts (left) and row-normalized percentages (right).
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
plt.rc('font', size=9)  # smaller text for the left panel
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, ax=axs[0])
axs[0].set_title("Confusion matrix")
plt.rc('font', size=10)  # slightly larger text for the right panel
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, ax=axs[1],
                                        normalize="true", values_format=".0%")
axs[1].set_title("CM normalized by row")
save_fig("confusion_matrix_plot_1")
plt.show()

In [None]:
# extra code – this cell generates and saves Figure 3–10
# Errors only (weighted by sample_weight): normalized over true labels on the
# left and over predicted labels on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
plt.rc('font', size=10)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, ax=axs[0],
                                        sample_weight=sample_weight,
                                        normalize="true", values_format=".0%")
axs[0].set_title("Errors normalized by row")
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, ax=axs[1],
                                        sample_weight=sample_weight,
                                        normalize="pred", values_format=".0%")
axs[1].set_title("Errors normalized by column")
save_fig("confusion_matrix_plot_2")
plt.show()
# Set the global font size to 14 for the plots that follow.
plt.rc('font', size=14)  # make fonts great again

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Precision and recall as a function of the decision threshold for one digit.
# The curve is computed here from the class probabilities in y_scores
# (one column per class, columns in sorted label order) and the matching
# labels in y_train, so the cell runs top to bottom on a fresh kernel instead
# of indexing precision/recall/thresholds variables that do not exist yet.
class_idx = 5                                # column of y_scores to analyze
class_label = np.unique(y_train)[class_idx]  # label that column corresponds to
threshold = 0.5                              # probability threshold to highlight

precisions, recalls, pr_thresholds = precision_recall_curve(
    y_train == class_label, y_scores[:, class_idx])

plt.figure(figsize=(8, 4))  # extra code – it's not needed, just formatting
# precision_recall_curve returns one more precision/recall value than
# thresholds, hence the [:-1].
plt.plot(pr_thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
plt.plot(pr_thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
plt.vlines(threshold, 0, 1.0, "k", "dotted", label="threshold")

# extra code – this section just beautifies and saves the figure
idx = (pr_thresholds >= threshold).argmax()  # first index ≥ threshold
plt.plot(pr_thresholds[idx], precisions[idx], "bo")
plt.plot(pr_thresholds[idx], recalls[idx], "go")
plt.axis([0, 1, 0, 1])  # scores are probabilities, so the x-range is [0, 1]
plt.grid()
plt.xlabel("Threshold")
plt.legend(loc="center right")
save_fig("precision_recall_vs_threshold_plot")

plt.show()

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve

# Step 4: Wrap KNN in OneVsRestClassifier for multiclass handling
ovr = OneVsRestClassifier(KNeighborsClassifier(n_neighbors= 4, weights= 'distance'))
ovr.fit(X_train, y_train)

# Step 5: Predict probabilities for each class (column i belongs to ovr.classes_[i])
y_scores = ovr.predict_proba(X_test)

# Step 6: Compute Precision-Recall for each class
for i, class_label in enumerate(ovr.classes_):
    # Build the binary target from the actual label value, not the column
    # index: comparing the labels with the integer i never matches when the
    # labels are stored as strings, which leaves the curve with no positives.
    precision, recall, thresholds = precision_recall_curve(y_test == class_label, y_scores[:, i])
    plt.figure(figsize=(10, 6))
    # One more precision/recall value than thresholds is returned, hence [:-1].
    plt.plot(thresholds, precision[:-1], label=f"Precision (Class {class_label})", color='b', linewidth=2)
    plt.plot(thresholds, recall[:-1], label=f"Recall (Class {class_label})", color='g', linewidth=2)
    plt.xlabel("Threshold")
    plt.ylabel("Precision / Recall")
    plt.title(f"Precision-Recall vs Threshold for Class {class_label}")
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()



In [None]:
# Check the distribution of the test labels: map each distinct label in y_test
# to the number of times it occurs. The printed keys also reveal the label type.
unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))

In [None]:
# Step 7: Plot Precision vs. Recall vs. Threshold
# precision, recall and thresholds are whatever the per-class loop left behind,
# i.e. the values from its final iteration.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, precision[:-1], label="Precision", color='b', linewidth=2)
ax.plot(thresholds, recall[:-1], label="Recall", color='g', linewidth=2)
ax.set_xlabel("Threshold")
ax.set_ylabel("Precision / Recall")
ax.set_title("Precision-Recall vs Threshold")
ax.legend(loc="best")
ax.grid(True)
plt.show()