In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [7]:
# KNN is distance-based, so every feature should live on a comparable range: standardize them.
# CAUTION: when cross-validating, put the scaler inside a Pipeline (shown later).
#          Scaling the complete dataset BEFORE cross-validation lets statistics of the
#          held-out fold leak into the training folds.

X, y = load_breast_cancer(return_X_y=True)

# Demo only: the scaler below is fitted on ALL rows. That is fine for a quick look, but it is
# easy to forget later that these values already "saw" every sample. Best practice: a Pipeline,
# so that scaling is fitted *inside* each training fold only.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Sanity check of the array shapes (cheap, and catches many mistakes early)
print("X_scaled shape:", X_scaled.shape)   # (n_samples, n_features)
print("y shape:", y.shape)                 # (n_samples,)

X_scaled shape: (569, 30)
y shape: (569,)


In [8]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, GroupKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline

# Create the estimator (model)
clf = KNeighborsClassifier()

# Bundle scaler + model so the scaler is re-fitted on the TRAINING part of every fold only.
# Cross-validating on the pre-scaled X_scaled would leak information: X_scaled was standardized
# with the mean/std of ALL rows, including the rows each fold later uses for validation.
knn_pipeline = make_pipeline(StandardScaler(), clf)

# Basic usage (scikit-learn 1.4+; everything after y is keyword-only):
#   cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None,
#                   n_jobs=None, verbose=0, params=None, pre_dispatch='2*n_jobs',
#                   error_score=nan)
#
# KEY PARAMETERS explained:
# - estimator : any sklearn estimator implementing fit/predict (e.g., KNeighborsClassifier(),
#               or a Pipeline such as knn_pipeline above).
# - X, y      : data and labels. X shape (n_samples, n_features), y shape (n_samples,).
# - groups    : optional array-like (n_samples,) used for GroupKFold (samples in same group
#               will not be split between train/test).
# - scoring   : None (default) uses estimator.score (for classifiers => accuracy). You can pass:
#               - a string (e.g. 'accuracy', 'f1_macro', 'roc_auc', 'neg_mean_squared_error')
#               - a scorer callable from sklearn.metrics (use make_scorer)
#               NOTE: for "loss" metrics sklearn uses negative values ('neg_mean_squared_error')
#                     because every sklearn scorer follows a "higher is better" convention
#                     (model-selection tools such as GridSearchCV pick the largest score).
# - cv        : controls how data is split into folds. Can be:
#               - an integer (e.g., cv=5) => default split (StratifiedKFold for classification)
#               - an explicit cross-validator object (e.g., StratifiedKFold(...), KFold(...))
#               - an iterable of (train_index, test_index) pairs
# - n_jobs    : number of jobs for parallelism. n_jobs=None (no parallel). n_jobs=-1 uses all cores.
# - verbose   : printing verbosity (0 = quiet)
# - params    : dict of extra parameters passed on to the estimator's fit (this argument was
#               called fit_params before scikit-learn 1.4; the old name was removed in 1.6)
# - pre_dispatch : controls how many jobs are created/queued for parallel execution
# - error_score : value to assign if a fail occurs during fitting/predicting (default is np.nan)
#
# DEFAULT CV FOR CLASSIFICATION:
# - If you pass cv=int (e.g. 5) and your problem is classification, sklearn uses StratifiedKFold
#   by default. That means each fold preserves the class ratio (good when classes are imbalanced).
#
# EXAMPLES of cv objects:
# - StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  -> preserves class distribution
# - KFold(n_splits=5, shuffle=True, random_state=42)            -> for regression or balanced classes
# - GroupKFold(n_splits=5)                                      -> use when samples have groups
# - TimeSeriesSplit(n_splits=5)                                 -> for time-ordered data
#
# IMPORTANT: cross_val_score clones the estimator for each fold. The estimator instance you pass
#            is not re-used across folds (sklearn does clone(estimator) internally), so clf and
#            knn_pipeline are still unfitted after the call.
#
# SIMPLE CALL (classification example, 5-fold):
# Note the raw X (not X_scaled): the pipeline does the scaling itself, fold by fold.
scores = cross_val_score(knn_pipeline, X, y, cv=5, n_jobs=-1)  # default scoring = accuracy for classifiers
# This returns an array with 5 scores (one per fold).
#
# Print raw fold scores + common summary stats
print("Cross-validation scores (per-fold):", scores)
print("Mean score:", scores.mean())
print("Std (score across folds):", scores.std())

# -------------------------
# Examples of other options
# -------------------------
# 1) Use StratifiedKFold explicitly (good to set shuffle & random_state)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_val_score(knn_pipeline, X, y, cv=skf)

# 2) Evaluate a different metric (e.g., F1 macro)
# scores_f1 = cross_val_score(knn_pipeline, X, y, cv=5, scoring='f1_macro')

# 3) Use cross_validate if you want train scores, fit times, and multiple metrics
# from sklearn.model_selection import cross_validate
# results = cross_validate(knn_pipeline, X, y, cv=5, scoring=['accuracy','f1_macro'], return_train_score=True)

# 4) What to AVOID: cross-validating a bare model on data that was scaled up front
# scores_leaky = cross_val_score(clf, X_scaled, y, cv=5)  # scaler has already seen every validation fold
#    Prefer a Pipeline (as above) whenever you need scaling or other feature transforms:
# pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
# scores_pipe = cross_val_score(pipe, X, y, cv=5)  # This scales inside each fold correctly

Cross-validation scores (per-fold): [0.96491228 0.95614035 0.98245614 0.95614035 0.96460177]
Mean score: 0.9648501785437045
Std (score across folds): 0.009609970350036127


In [10]:
import numpy as np

# Average of the per-fold scores via NumPy's function form; `scores` is the array returned by
# the earlier cross_val_score call, and this gives the same number as scores.mean().
np.mean(scores)

np.float64(0.9648501785437045)

In [11]:
"""
=====================================
CROSS-VALIDATION EXPLAINED (BEGINNER)
=====================================

Quick summary:
-------------
cross_val_score(clf, X, y, cv=5) runs the model 5 times on different train/validation splits
and returns 5 scores (one per split). Use the mean ± std of those scores to estimate model performance.

What is Cross-Validation?
-------------------------
- Think of your dataset as a deck of cards.
- Instead of doing one train/test split, CV splits the deck into `cv` roughly equal piles (folds).
- For cv=5: 5 folds → each round train on 4 folds, test on the remaining 1.
- Repeat until each fold has been used once for testing.
- Output: 5 scores → average them.

Step-by-step for cv=5:
----------------------
- Fold1, Fold2, Fold3, Fold4, Fold5
- Iteration 1: train=[2,3,4,5], validate=[1]
- Iteration 2: train=[1,3,4,5], validate=[2]
- Iteration 3: train=[1,2,4,5], validate=[3]
- Iteration 4: train=[1,2,3,5], validate=[4]
- Iteration 5: train=[1,2,3,4], validate=[5]

Why useful?
-----------
- A single train/test split can be lucky or unlucky.
- CV averages over multiple splits → more reliable estimate of performance.

------------------------------------------------------------
PARAMETERS of cross_val_score (explained in beginner English)
------------------------------------------------------------
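cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None,
                n_jobs=None, verbose=0, params=None, pre_dispatch='2*n_jobs',
                error_score=np.nan)
(Everything after `y` must be passed by keyword. `params` is the scikit-learn 1.4+ name of the
 older `fit_params` argument, which was removed in 1.6.)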

- estimator: the model (e.g., KNeighborsClassifier()).
- X, y: features (X: n_samples × n_features), labels (y: n_samples,).
- groups: for GroupKFold (ensures samples from the same group stay together).
- scoring: how to measure performance.
    * Default = estimator.score (accuracy for classifiers).
    * Can use: 'accuracy', 'f1_macro', 'roc_auc', 'neg_mean_squared_error', etc.
    * Note: for loss metrics sklearn uses NEGATIVE values (so higher is better).
- cv: how to split the data.
    * int (e.g., 5): StratifiedKFold for classification, KFold for regression.
    * Or a cross-validator object (StratifiedKFold, KFold, GroupKFold, TimeSeriesSplit).
- n_jobs: parallelism. n_jobs=-1 uses all CPU cores.
- error_score: value to assign if model fails (default NaN).

Default CV rules:
-----------------
- Classification + cv=int → StratifiedKFold (preserves class ratio).
- Regression + cv=int → KFold.

Special CV types:
-----------------
- StratifiedKFold: keeps class ratios balanced.
- KFold: plain split into k consecutive blocks by default; pass shuffle=True for a random split (recommended unless order matters).
- GroupKFold: split by groups (e.g., patients, cities).
- TimeSeriesSplit: for ordered (temporal) data.

Important:
----------
- cross_val_score clones the estimator for each fold → your clf object is NOT reused and stays unfitted (call clf.fit(...) yourself if you need a trained model afterwards).
- Always use Pipelines if preprocessing (e.g., scaling) is needed → avoids data leakage.

------------------------------------------------------------
Interpreting results:
------------------------------------------------------------
scores = cross_val_score(...)
Example output: [0.95, 0.93, 0.94, 0.92, 0.96]
- Mean = 0.94 (central estimate)
- Std ≈ 0.014 (small spread → scores are consistent across folds, stable model)

If Std is high → model is unstable across folds.

------------------------------------------------------------
BEST PRACTICES:
------------------------------------------------------------
- Use Pipeline for scaling/feature selection to avoid leakage.
- Use StratifiedKFold for imbalanced classification.
- Shuffle in KFold unless it's time series.
- For time series → TimeSeriesSplit.
- Common cv values: 5 or 10.

------------------------------------------------------------
"""

# ========================
# Practical code examples
# ========================

from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np

# Example dataset
from sklearn.datasets import load_breast_cancer
# Reload the raw (unscaled) data so this cell does not depend on earlier ones
X, y = load_breast_cancer(return_X_y=True)

# Scaler + KNN bundled into one estimator: scaling happens inside CV, which avoids leakage
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Plain 5-fold run; with scoring left unset, a classifier is scored by accuracy
scores = cross_val_score(pipe, X, y, cv=5, n_jobs=-1)
print("Per-fold accuracy:", scores)
print("Mean accuracy:", np.mean(scores))
print("Std deviation:", np.std(scores))

# ==================================================
# KFold on a 10-element toy array: list which indices
# end up in the train part and which in the test part
# ==================================================
X_small = np.arange(10)
kf = KFold(n_splits=5, shuffle=False)

print("\nKFold splits (train -> test indices):")
for i, (train_idx, test_idx) in enumerate(kf.split(X_small), start=1):
    print(f"Fold {i}: train={train_idx.tolist()}  test={test_idx.tolist()}")

# ==================================================
# Shuffled, seeded StratifiedKFold passed as the cv object
# ==================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_strat = cross_val_score(pipe, X, y, cv=skf, n_jobs=-1)
print("\nStratifiedKFold accuracy scores:", scores_strat)

# ==================================================
# Same 5-fold setup, scored with macro-averaged F1
# ==================================================
scores_f1 = cross_val_score(pipe, X, y, scoring='f1_macro', cv=5, n_jobs=-1)
print("\nF1-macro scores:", scores_f1)

Per-fold accuracy: [0.96491228 0.95614035 0.98245614 0.95614035 0.96460177]
Mean accuracy: 0.9648501785437045
Std deviation: 0.009609970350036127

KFold splits (train -> test indices):
Fold 1: train=[2, 3, 4, 5, 6, 7, 8, 9]  test=[0, 1]
Fold 2: train=[0, 1, 4, 5, 6, 7, 8, 9]  test=[2, 3]
Fold 3: train=[0, 1, 2, 3, 6, 7, 8, 9]  test=[4, 5]
Fold 4: train=[0, 1, 2, 3, 4, 5, 8, 9]  test=[6, 7]
Fold 5: train=[0, 1, 2, 3, 4, 5, 6, 7]  test=[8, 9]

StratifiedKFold accuracy scores: [0.98245614 0.94736842 0.93859649 0.98245614 0.96460177]

F1-macro scores: [0.96265968 0.95263814 0.98095556 0.95157591 0.96245847]
