# MLflow Complete Tutorial

This notebook provides a comprehensive walkthrough of MLflow features.

## Prerequisites

1. Start MLflow server in a terminal:
   ```bash
   mlflow server --host 0.0.0.0 --port 5000
   ```

2. Access MLflow UI at http://localhost:5000

## Setup and Imports

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Configure MLflow: point the client at the tracking server from the
# Prerequisites section. That server must already be running, otherwise the
# logging calls in later cells cannot reach it.
mlflow.set_tracking_uri("http://localhost:5000")

# Echo the effective URI so a misconfiguration is visible right away.
print("✅ Setup complete!")
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

## 1. Load and Explore Data

In [None]:
# Load the iris dataset and expose it as a feature frame (X) and a label
# array (y); `iris` itself is kept because later cells read its target names.
iris = load_iris()
y = iris.target
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Size and a preview of the feature table.
print("Dataset shape:", X.shape)
print("\nFirst few rows:")
display(X.head())

# Number of samples per class label, ordered by label.
class_counts = pd.Series(y).value_counts().sort_index()
print("\nTarget distribution:")
print(class_counts)

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed, so re-running the cell yields the same partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the size of each partition.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

## 2. Basic MLflow Tracking

In [None]:
# Select the experiment that the runs in this notebook are recorded under.
mlflow.set_experiment("notebook-tutorial")

# Train a simple model with MLflow tracking. Binding the run (`as run`) lets
# us report its id without going through mlflow.active_run().
with mlflow.start_run(run_name="Logistic Regression - Baseline") as run:
    # Log parameters
    params = {
        "model": "LogisticRegression",
        "max_iter": 200,
        "solver": "lbfgs"
    }
    mlflow.log_params(params)

    # Train model. The estimator is built from the logged dict so the values
    # recorded in MLflow cannot drift from the ones actually used.
    model = LogisticRegression(
        max_iter=params["max_iter"],
        solver=params["solver"],
    )
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)

    # Log the fitted model under the artifact path "model"
    mlflow.sklearn.log_model(model, "model")

    print("✅ Run completed!")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Run ID: {run.info.run_id}")

## 3. Compare Multiple Models

In [None]:
# Define models to compare; each key doubles as the MLflow run name and as
# the value logged for the "model_type" parameter.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', random_state=42)
}

# One {'Model', 'Accuracy'} record per candidate, collected for the summary table.
results = []

# One MLflow run per candidate: fit on the training split, score on the test
# split, then log the accuracy and the fitted model.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Log model type
        mlflow.log_param("model_type", model_name)

        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        results.append({
            'Model': model_name,
            'Accuracy': accuracy
        })

        print(f"✅ {model_name}: {accuracy:.4f}")

# Display results, best accuracy first
results_df = pd.DataFrame(results)
display(results_df.sort_values('Accuracy', ascending=False))

## 4. Logging Artifacts (Plots, Files)

In [None]:
# Train a random forest and attach two plots and a text report to the run.
# Figures and text are handed to MLflow directly (log_figure / log_text), so
# no scratch files are left behind in the notebook's working directory.
with mlflow.start_run(run_name="Model with Artifacts"):
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Log accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Create and log confusion matrix (explicit fig/ax instead of the
    # implicit pyplot "current figure")
    cm = confusion_matrix(y_test, y_pred)
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')
    mlflow.log_figure(cm_fig, 'confusion_matrix.png')
    plt.show()
    plt.close(cm_fig)  # release the figure once it has been logged and shown

    # Create and log feature importance, most important feature first
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    fi_fig, fi_ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature', ax=fi_ax)
    fi_ax.set_title('Feature Importance')
    fi_fig.tight_layout()
    mlflow.log_figure(fi_fig, 'feature_importance.png')
    plt.show()
    plt.close(fi_fig)

    # Log classification report as a text artifact
    report = classification_report(y_test, y_pred)
    mlflow.log_text(report, 'classification_report.txt')

    print("✅ Artifacts logged successfully!")

## 5. Autologging (Easy Mode!)

In [None]:
# Enable autologging
mlflow.sklearn.autolog()

# try/finally guarantees autologging is switched off again even if training
# fails, so a failed run of this cell cannot silently change what later cells log.
try:
    with mlflow.start_run(run_name="Autolog Demo"):
        # Just train - MLflow logs everything automatically!
        # The seed is fixed (as in the other cells) so re-runs are comparable.
        model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        print("✅ Model trained with autologging!")
        print(f"Test score: {score:.4f}")
        print("Check MLflow UI to see all automatically logged information!")
finally:
    # Disable autologging for future runs
    mlflow.sklearn.autolog(disable=True)

## 6. Model Registry

In [None]:
# Train a production-ready model and register it in the Model Registry
with mlflow.start_run(run_name="Production Model") as run:
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Describe the expected input columns and the prediction output so the
    # registered model carries its schema with it.
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))

    # Log the model and register it under a fixed registry name in one call
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="notebook-iris-model",
        signature=signature,
    )

    print("✅ Model registered: notebook-iris-model")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Run ID: {run.info.run_id}")

## 7. Loading Models from Registry

In [None]:
# Load model from registry (the "latest" alias selects the newest version)
model_name = "notebook-iris-model"
model_uri = f"models:/{model_name}/latest"

# Only the load is guarded: a failure here most likely means the model has not
# been registered yet. Prediction runs in the `else` branch so that a genuine
# prediction error is not misreported as a missing registration.
try:
    loaded_model = mlflow.pyfunc.load_model(model_uri)
except Exception as e:
    print(f"Note: {e}")
    print("Make sure you've registered the model first!")
else:
    # Make predictions with loaded model
    sample_data = X_test.iloc[:5]
    predictions = loaded_model.predict(sample_data)

    print("✅ Model loaded from registry successfully!")
    print("\nSample predictions:")
    for i, pred in enumerate(predictions):
        # Map the numeric class label back to its species name
        print(f"Sample {i+1}: {iris.target_names[int(pred)]}")

## Summary

In this notebook, you learned:

1. ✅ **Basic MLflow Tracking** - Log parameters, metrics, and models
2. ✅ **Model Comparison** - Compare multiple models systematically
3. ✅ **Logging Artifacts** - Save plots, reports, and files
4. ✅ **Autologging** - Automatic tracking with minimal code
5. ✅ **Model Registry** - Manage and version models
6. ✅ **Loading Models** - Retrieve and use registered models

## Next Steps

1. Explore the MLflow UI at http://localhost:5000
2. Try the Python demo scripts in this directory
3. Read the [MLflow README](README.md) for more examples
4. Apply MLflow to your own projects!

---
**Happy experimenting! 🚀**