In [1]:
# ==========================================================
# Cell 1: What is a Pipeline?
# ==========================================================
# A Pipeline in sklearn is a way to chain multiple steps together
# (like preprocessing + model) into a single object.
#
# Why is this useful?
# - Cleaner code: instead of calling scaler.fit_transform(), then model.fit(), etc.,
#   we put them into one pipeline.
# - Consistency: ensures that the exact same preprocessing happens during training
#   and prediction/testing.
# - Cross-validation ready: avoids "data leakage" by making sure preprocessing
#   is fit only on training folds, not on test folds.
#
# Structure:
#   Pipeline([
#       ('step_name1', transformer1),
#       ('step_name2', transformer2),
#       ('model', estimator)
#   ])
#
# Notes:
# - Each step except the last must be a transformer (something with fit/transform).
# - The last step can be any estimator: usually a classifier/regressor, but it only
#   needs to implement fit (it may even be another transformer).

In [8]:
# ==========================================================
# Cell 2: Example Dataset Setup
# ==========================================================
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Fetch the breast-cancer data directly as a (features, labels) pair
# instead of the default Bunch container.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% of the samples for final testing; the fixed random_state
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# ==========================================================
# Cell 3: Basic Pipeline Example
# ==========================================================
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Assemble a two-stage pipeline. Each entry is a (name, estimator) pair:
#   'scaler' -> standardizes the features (KNN is distance-based, so
#               feature scale matters)
#   'knn'    -> the classifier itself, with default settings
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ]
)

# fit() trains every step on the training split and returns the fitted
# pipeline, so predict() can be chained straight onto it. At prediction
# time the already-fitted scaler transforms X_test before KNN sees it,
# so the test data never has to be scaled by hand.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [10]:
# ==========================================================
# Cell 4: Pipeline with Cross-Validation
# ==========================================================
from sklearn.model_selection import cross_val_score

# The pipeline is itself an estimator, so cross_val_score accepts it as-is.
# For each of the 5 folds a fresh copy of the pipeline is fit on the
# training portion only, which means the scaler never sees the held-out
# fold -> no data leakage from preprocessing.
# Note: this call cross-validates on the full dataset (X, y), not just
# the training split.
scores = cross_val_score(estimator=pipe, X=X, y=y, cv=5)

# One accuracy value per fold.
scores

array([0.96491228, 0.95614035, 0.98245614, 0.95614035, 0.96460177])

In [11]:
# ==========================================================
# Cell 5: Pipeline with Hyperparameter Tuning
# ==========================================================
from sklearn.model_selection import GridSearchCV

# Search space for the pipeline.
# Parameters of a pipeline step are addressed as "<step name>__<parameter>",
# e.g. 'knn__n_neighbors' reaches n_neighbors of the step named 'knn'.
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
}

# Exhaustively try all 4 x 2 = 8 combinations with 5-fold cross-validation
# on the training split, ranking candidates by accuracy. fit() returns the
# fitted search object, so it can be assigned directly.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Preprocessing, model and tuning all travel together in a single object;
# show the winning parameter combination.
grid.best_params_

{'knn__n_neighbors': 5, 'knn__weights': 'uniform'}

In [12]:
# ==========================================================
# Cell 6: Evaluate Best Pipeline
# ==========================================================
from sklearn.metrics import accuracy_score

# best_estimator_ is still a full Pipeline (scaler + KNN) configured with
# the winning hyperparameters, so it can be used exactly like `pipe`.
best_pipe = grid.best_estimator_

# Score it on the held-out test split: predict, then compare the
# predictions against the true labels.
y_pred_best = best_pipe.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)

# Fraction of test samples classified correctly.
accuracy

0.9473684210526315

In [7]:
# ==========================================================
# Cell 7: Summary of Pipelines
# ==========================================================
# ✅ Key Takeaways:
# - Pipelines chain multiple steps (scaling, encoding, model, etc.).
# - Each step has a name + transformer/estimator.
# - Final step is usually a model (classifier/regressor); every earlier step must be a transformer.
#
# ✅ Benefits:
# - Prevents data leakage
# - Cleaner and more reproducible code
# - Easy integration with cross-validation and GridSearchCV
# - Automatically applies preprocessing at both training and prediction time
#
# ✅ Usage:
# - Use when your model requires preprocessing.
# - Combine with hyperparameter tuning for professional workflows.