<a href="https://colab.research.google.com/github/deshm084/Automated-Machine-Learning-Pipeline/blob/main/Automated-Machine-Learning-Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Project: Automated ML Pipeline
Author: Sanskruti
Description: A scikit-learn pipeline that imputes missing values, standardizes features,
             selects the most informative features, and chooses the best model and
             hyperparameters with GridSearchCV.
"""

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# --- 1. Load Data ---
# Breast Cancer dataset: a binary classification problem bundled with scikit-learn.
data = load_breast_cancer()

# Wrap the raw arrays in pandas objects so the feature names travel with the data.
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 2. Build the Pipeline Chassis ---
# Generic skeleton: the step names below are the prefixes used by the search
# space ('selector__k', 'classifier__C', ...), and the 'classifier' step itself
# is swapped out during the grid search.
pipeline_steps = [
    # Step 1: fill missing values with the column mean
    ('imputer', SimpleImputer(strategy='mean')),
    # Step 2: standardize each feature (zero mean, unit variance)
    ('scaler', StandardScaler()),
    # Step 3: keep the top-scoring features according to the ANOVA F-test
    ('selector', SelectKBest(f_classif)),
    # Step 4: the model (placeholder, replaced by the search space)
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --- 3. Define the Search Space (The "Auto" Part) ---
# A list of parameter grids. Each dictionary is an independent "path":
# GridSearchCV expands every dictionary into the full cross-product of its
# values, and the 'classifier' entry replaces the pipeline's placeholder model.
# Grid size: 3 * 3 = 9 (Logistic Regression) + 2 * 2 * 3 = 12 (Random Forest) = 21 candidates.

search_space = [
    # Path 1: Logistic Regression
    {
        'selector__k': [5, 10, 'all'],           # Try using 5, 10, or all features
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 1.0, 10.0]        # Inverse regularization strength
    },
    # Path 2: Random Forest (Note: RF doesn't strictly need scaling, but the pipeline handles it)
    {
        'selector__k': [5, 10],
        # Seed the forest (same seed as the train/test split) so that repeated
        # runs of the search produce the same scores and the same winner.
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100],   # Number of trees
        'classifier__max_depth': [None, 10, 20]  # Maximum tree depth (None = unlimited)
    }
]

# --- 4. The Engine (GridSearchCV) ---
# Cross-validate every candidate from the search space with 5 folds.
# verbose=1 prints a progress summary; n_jobs=-1 runs the fits on all processors.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=search_space,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
print("Starting AutoML Search... (Competing models against each other)")

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# --- 5. The Winner ---
# Pull the winning configuration out of the fitted search before reporting it.
winning_pipeline = grid_search.best_estimator_
winning_model = winning_pipeline.named_steps['classifier']
winning_params = grid_search.best_params_
winning_cv_score = grid_search.best_score_  # mean cross-validated score of the best candidate

print("\n--- Best Pipeline Found ---")
print(f"Best Model: {winning_model}")
print(f"Best Parameters: {winning_params}")
print(f"Best Cross-Validation Score: {winning_cv_score:.2%}")

# --- 6. Final Evaluation ---
# Score the refitted best pipeline on the held-out test set, which was never
# seen during the search.
print("\n--- Test Set Evaluation ---")
final_pipeline = grid_search.best_estimator_
y_pred = final_pipeline.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)


Starting AutoML Search... (Competing models against each other)
Fitting 5 folds for each of 21 candidates, totalling 105 fits

--- Best Pipeline Found ---
Best Model: LogisticRegression()
Best Parameters: {'classifier': LogisticRegression(), 'classifier__C': 1.0, 'selector__k': 'all'}
Best Cross-Validation Score: 97.58%

--- Test Set Evaluation ---
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

