In [1]:
# 📦 Import Required Libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np


In [3]:
# 🧪 Load a Sample Dataset
# We use Breast Cancer dataset from sklearn for binary classification

data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# 🔧 Define a Pipeline

# The pipeline includes:
# 1. Imputation: Filling in missing values using SimpleImputer
# 2. Scaling: Standardizing features with StandardScaler
# 3. Modeling: Training a RandomForestClassifier

pipeline = Pipeline([
    ('imputer', SimpleImputer()),             # Step 1: Handle missing values
    ('scaler', StandardScaler()),             # Step 2: Feature scaling
    ('clf', RandomForestClassifier())         # Step 3: ML model
])


In [7]:
# 🔍 Define the Hyperparameter Grid for GridSearchCV

# Each key is formatted as <step_name>__<parameter_name>
# This allows us to tune hyperparameters of both preprocessing and model steps

param_grid = {
    'imputer__strategy': ['mean', 'median'],  # Try mean and median imputation
    'clf__n_estimators': [50, 100],           # Number of trees in the forest
    'clf__max_depth': [None, 10, 20]          # Maximum depth of the trees
}


In [9]:
# 🧠 Set Up and Run GridSearchCV

# cv=5: Use 5-fold cross-validation
# n_jobs=-1: Use all available CPU cores
# verbose=1: Print progress messages

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [11]:
# 📈 Evaluate the Best Model on the Test Set

print("✅ Best Parameters Found:")
print(grid_search.best_params_)

# Make predictions on the test data
y_pred = grid_search.predict(X_test)

# Display classification performance
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))


✅ Best Parameters Found:
{'clf__max_depth': 20, 'clf__n_estimators': 100, 'imputer__strategy': 'mean'}

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



# 📘 Understanding Each Component (Markdown for documentation cells)

"""
### 🔹 What does `StandardScaler()` do?
It standardizes each feature by removing the mean and scaling to unit variance:

    z = (x - mean) / std

This is important for models sensitive to feature scales (e.g., Logistic Regression, SVM),
though less crucial for tree-based models like RandomForest.

---

### 🔹 What does `param_grid` do?
It specifies hyperparameters for GridSearchCV to try out:

- `imputer__strategy`: Try using 'mean' and 'median' to fill missing values.
- `clf__n_estimators`: Try 50 and 100 trees in the random forest.
- `clf__max_depth`: Try different tree depths: None (full tree), 10, and 20.

GridSearchCV will test all combinations.

---

### 🔹 What does `GridSearchCV(cv=5, n_jobs=-1, verbose=1)` mean?

- `cv=5`: 5-fold cross-validation (split data into 5 parts, rotate training/validation).
- `n_jobs=-1`: Use all available CPU cores to parallelize training.
- `verbose=1`: Output progress during the search.

It selects the best model based on average validation performance across all folds.
"""


In [14]:
!git add .


The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory
The file will have its original line endings in your working directory


In [16]:
!git commit -m "Added notebook on standardization, Gridsearch and pipeline"

[main 27e5d70] Added notebook on standardization, Gridsearch and pipeline
 17 files changed, 1271 insertions(+)
 create mode 100644 .ipynb_checkpoints/Processing-data-checkpoint.ipynb
 create mode 100644 .ipynb_checkpoints/Standardization-scalling-checkpoint.ipynb
 create mode 100644 Processing-data.ipynb
 create mode 100644 Standardization-scalling.ipynb
 create mode 100644 supervised-learning/Classification-problems/.ipynb_checkpoints/Classiffication-metrics-checkpoint.ipynb
 create mode 100644 supervised-learning/Classification-problems/.ipynb_checkpoints/Logistics-regression-checkpoint.ipynb
 create mode 100644 supervised-learning/Classification-problems/Classiffication-metrics.ipynb
 create mode 100644 supervised-learning/Classification-problems/Logistics-regression.ipynb
 create mode 100644 supervised-learning/Regression-problem/.ipynb_checkpoints/K-fold-LinearRegression-checkpoint.ipynb
 create mode 100644 supervised-learning/Regression-problem/K-fold-LinearRegression.ipynb
 cre

In [18]:
!git push origin main

To https://github.com/endiesworld/ML_projects.git
   a9634af..27e5d70  main -> main


In [38]:
np.arange(0.01, 1.0, 20)

array([0.01])

In [24]:
np.arange(1, 50)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])