In [1]:
# 📦 Import Required Libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np


In [3]:
# 🧪 Load a Sample Dataset
# We use Breast Cancer dataset from sklearn for binary classification

# Fetch the dataset object, then pull out the feature matrix and the labels
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing (80% train); the fixed seed keeps
# the split identical across reruns
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# 🔧 Define a Pipeline

# The pipeline includes:
# 1. Imputation: Filling in missing values using SimpleImputer
# 2. Scaling: Standardizing features with StandardScaler
# 3. Modeling: Training a RandomForestClassifier

# Chain preprocessing and the classifier into a single estimator.
# The step names ('imputer', 'scaler', 'clf') are the prefixes used to address
# a step's parameters from outside, e.g. 'clf__n_estimators'.
pipeline = Pipeline(steps=[
    # Step 1: Handle missing values (mean imputation, the SimpleImputer default)
    ('imputer', SimpleImputer(strategy='mean')),
    # Step 2: Feature scaling to zero mean / unit variance
    ('scaler', StandardScaler()),
    # Step 3: ML model; seeded with the same value as the train/test split so
    # a fresh Restart & Run All rebuilds the same forest and the same scores
    ('clf', RandomForestClassifier(random_state=42)),
])
