In [36]:
# Import required libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc

In [33]:
import mlflow
import mlflow.sklearn

In [2]:
# Add the parent directory ('../') to sys.path so that the `src` package
# (and its `model_utils` module) can be imported from this notebook.
# NOTE(review): '../' is relative, so this only works when the kernel's working
# directory is the notebook folder; re-running the cell appends a duplicate entry.
# NOTE(review): evaluate_model / save_model / load_model are not used in the
# cells visible here — confirm they are still needed.
sys.path.append('../')
from src.model_utils import evaluate_model, save_model, load_model

In [41]:
# Directory layout, expressed relative to the notebook's working directory.
DATA_DIR = '../Data'
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
MODEL_DIR = '../models'

# Create the model output directory up front; a pre-existing directory is fine.
os.makedirs(MODEL_DIR, exist_ok=True)

In [42]:
# Load the prepared train/test splits plus the preprocessing artifacts.
train_data_path = os.path.join(PROCESSED_DATA_DIR, 'train_data.npz')
test_data_path = os.path.join(PROCESSED_DATA_DIR, 'test_data.npz')

# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open. Using it as a context manager closes the handle as soon as the
# arrays have been read into memory.
with np.load(train_data_path) as train_data:
    X_train = train_data['X']
    y_train = train_data['y']

with np.load(test_data_path) as test_data:
    X_test = test_data['X']
    y_test = test_data['y']

# Load feature names for reference.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts that were produced by this project.
feature_names_path = os.path.join(PROCESSED_DATA_DIR, 'feature_names.joblib')
feature_names = joblib.load(feature_names_path)

# Load the scaler that was fitted during data preparation.
scaler_path = os.path.join(PROCESSED_DATA_DIR, 'scaler.joblib')
scaler = joblib.load(scaler_path)

# Fail fast on inconsistent inputs instead of getting a confusing error
# (or silently wrong metrics) later in training/evaluation.
assert X_train.shape[0] == y_train.shape[0], "train X/y row counts differ"
assert X_test.shape[0] == y_test.shape[0], "test X/y row counts differ"
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

print(f"Loaded training data: {X_train.shape[0]} samples with {X_train.shape[1]} features")
print(f"Loaded testing data: {X_test.shape[0]} samples with {X_test.shape[1]} features")
print(f"Feature names: {feature_names}")

Loaded training data: 120 samples with 6 features
Loaded testing data: 30 samples with 6 features
Feature names: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'SepalArea', 'PetalArea']


In [43]:
# Point MLflow at the file-based tracking store under ../models and select the
# experiment used by the baseline runs. The final expression is left bare so
# the returned Experiment is shown as the cell output.
tracking_uri = "file:../models/mlruns"
experiment_name = "Iris_Model_Tracking"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:d:/MLOPS/Iris-Classification/notebooks/../models/mlruns/484549340631699792', creation_time=1753519665597, experiment_id='484549340631699792', last_update_time=1753519665597, lifecycle_stage='active', name='Iris_Model_Tracking', tags={}>

In [44]:
# Cell 2: Train a baseline Logistic Regression, evaluate it on the test split,
# and record parameters, metrics and the fitted model in MLflow.
with mlflow.start_run(run_name="LogisticRegression"):
    # Keep the hyperparameters in one dict so that what is logged is exactly
    # what the estimator was built with (no duplicated literals to drift).
    lr_params = {"max_iter": 200, "random_state": 45}
    lr = LogisticRegression(**lr_params)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Macro averaging: every class contributes equally to the multi-class score.
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    mlflow.log_params(lr_params)
    # Metric keys are lower-case, matching the hyper-parameter tuning runs in
    # this notebook, so the runs share the same metric columns in the MLflow UI.
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Logging with registered_model_name also creates a new registry version.
    mlflow.sklearn.log_model(lr, "model", registered_model_name="LogisticRegression")

    print(f"Logistic Regression - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")




Logistic Regression - Accuracy: 1.0000, F1 Score: 1.0000


Registered model 'LogisticRegression' already exists. Creating a new version of this model...
Created version '2' of model 'LogisticRegression'.


In [45]:
# Hyper-parameter search for Logistic Regression, tracked in its own experiment.
# The tracking URI and experiment are set once, before the run starts.
mlflow.set_tracking_uri("file:../models/mlruns")  # Log to models folder
mlflow.set_experiment("LogReg_Hyperparam_Tuning")

param_grid = {
    'C': [0.01, 0.1, 1, 10],               # Inverse of regularization strength
    'penalty': ['l1', 'l2'],              # Type of regularization
    'solver': ['liblinear']              # 'liblinear' supports both l1 and l2
}

with mlflow.start_run(run_name="LogReg_GridSearch_Metrics"):
    grid = GridSearchCV(
        # liblinear shuffles the data internally, so a fixed random_state is
        # needed for the search to give the same result on every run.
        LogisticRegression(max_iter=500, random_state=45),
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train, y_train)

    # best_estimator_ is refit on the full training split with the best params.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # Use average='macro' for multi-class metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Log the winning parameters, the cross-validation score that selected
    # them, and the held-out test metrics.
    mlflow.log_params(grid.best_params_)
    mlflow.log_metric("cv_best_accuracy", grid.best_score_)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # Log the model itself
    mlflow.sklearn.log_model(best_model, "best_model")

    print("Best Params:", grid.best_params_)
    print(f"Test Accuracy: {acc:.3f} | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")




Best Params: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Test Accuracy: 1.000 | Precision: 1.000 | Recall: 1.000 | F1: 1.000


In [20]:
# Cell 3: Baseline Random Forest, evaluated on the test split and tracked in MLflow.
rf_params = {"n_estimators": 100, "max_depth": 4}

with mlflow.start_run(run_name="RandomForest"):
    rf = RandomForestClassifier(random_state=42, **rf_params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")

    # Record the tuned hyperparameters, then the evaluation metrics.
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in (("accuracy", acc), ("f1_score", f1)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator and add it to the model registry.
    mlflow.sklearn.log_model(rf, "model", registered_model_name="RandomForest")

    print(f"Random Forest - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")




Random Forest - Accuracy: 1.0000, F1 Score: 1.0000


Registered model 'RandomForest' already exists. Creating a new version of this model...
Created version '2' of model 'RandomForest'.


In [46]:
# Track the Random Forest hyper-parameter search in its own experiment,
# stored under the models folder (path is relative to notebooks/).
mlflow.set_tracking_uri("file:../models/mlruns")
mlflow.set_experiment("RandomForest_Hyperparam_Tuning")

# Search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 4],
)

with mlflow.start_run(run_name="RandomForest_GridSearch_Metrics"):
    # Exhaustive 5-fold cross-validated search, scored by accuracy.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # Multi-class metrics are macro-averaged.
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1 = (
        scorer(y_test, y_pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )

    # Record the winning parameter combination and the test metrics.
    mlflow.log_params(grid.best_params_)
    test_metrics = (
        ("accuracy", acc),
        ("precision", prec),
        ("recall", rec),
        ("f1_score", f1),
    )
    for metric_name, metric_value in test_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Persist the refit best estimator as a run artifact.
    mlflow.sklearn.log_model(best_model, "best_model")

    print("✅ Best Params:", grid.best_params_)
    print(f"📊 Accuracy: {acc:.3f} | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")




✅ Best Params: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
📊 Accuracy: 1.000 | Precision: 1.000 | Recall: 1.000 | F1: 1.000


In [47]:
# Set tracking URI and experiment
mlflow.set_tracking_uri("file:../models/mlruns")
mlflow.set_experiment("Iris_Comparison")


best_model = None
best_f1 = 0
best_run_id = None
best_name = ""

# ---------- Logistic Regression ----------
with mlflow.start_run(run_name="LogisticRegression") as run:
    model = LogisticRegression(max_iter=500, solver='liblinear', C=1.0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    f1 = f1_score(y_test, preds, average='macro')
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(model, "model")

    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_run_id = run.info.run_id
        best_name = "LogisticRegression"

# ---------- Random Forest ----------
with mlflow.start_run(run_name="RandomForest") as run:
    model = RandomForestClassifier(n_estimators=50, max_depth=3,min_samples_split=2)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    f1 = f1_score(y_test, preds, average='macro')
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(model, "model")

    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_run_id = run.info.run_id
        best_name = "RandomForest"




In [48]:
# Register the best model from the comparison, addressed by its run_id.
# Without this guard a missing winner would produce the URI "runs:/None/model"
# and fail with a much less obvious error inside MLflow.
if best_run_id is None:
    raise RuntimeError(
        "No best run was recorded; run the model comparison cell before registering."
    )

model_uri = f"runs:/{best_run_id}/model"
registered_version = mlflow.register_model(model_uri, "IrisBestModel")
# Bare expression so the resulting ModelVersion is shown as the cell output.
registered_version


Registered model 'IrisBestModel' already exists. Creating a new version of this model...


Created version '2' of model 'IrisBestModel'.


<ModelVersion: aliases=[], creation_timestamp=1753531172068, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1753531172068, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='f1_score', model_id='m-c24f54082aec4688b1a70e3982650b90', run_id='0f4b514109b04c949b12ae4dd83c8e08', step=0, timestamp=1753531154920, value=1.0>], model_id='m-c24f54082aec4688b1a70e3982650b90', name='IrisBestModel', params={}, run_id='0f4b514109b04c949b12ae4dd83c8e08', run_link=None, source='models:/m-c24f54082aec4688b1a70e3982650b90', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [49]:
import mlflow.pyfunc

# Load the most recently registered version of the best model. A hard-coded
# version number breaks on a fresh tracking store (only version 1 exists) and
# silently loads a stale model once the registration cell has been re-run.
# A specific version ("models:/IrisBestModel/<n>") or a stage name can be used
# instead when a pinned model is required.
model = mlflow.pyfunc.load_model("models:/IrisBestModel/latest")

In [51]:
# Bare expression: display the loaded pyfunc model's repr as the cell output.
model

mlflow.pyfunc.loaded_model:
  artifact_path: file:d:/MLOPS/Iris-Classification/notebooks/../models/mlruns/709871268375005215/models/m-c24f54082aec4688b1a70e3982650b90/artifacts
  flavor: mlflow.sklearn
  run_id: 0f4b514109b04c949b12ae4dd83c8e08