# Production RAG Monitoring with MLflow

This notebook shows how to set up production monitoring for your RAG system's hallucination guardrails using MLflow.

**Real-world use case:**
- Track guardrail performance over time
- Detect when your coverage starts degrading
- Compare different guardrail configurations
- Get alerts when coverage drops below threshold

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/debu-sinha/conformaldrift/blob/main/examples/02_production_monitoring_mlflow.ipynb)

In [None]:
!pip install conformal-drift mlflow langchain langchain-openai sentence-transformers -q

In [None]:
import os
import mlflow
import numpy as np
from datetime import datetime

# Placeholder credential for the template. setdefault only fills the variable
# when it is missing, so running this cell never overwrites a real key that is
# already exported in the environment with the dummy value.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-here')

## Set Up MLflow Experiment

Create a dedicated experiment for tracking your RAG guardrails.

In [None]:
# Make "rag-hallucination-guardrails" the active experiment for every run
# started later in this notebook.
mlflow.set_experiment("rag-hallucination-guardrails")

# Show where runs are being recorded and how to inspect them.
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")
print("Experiment: rag-hallucination-guardrails")
print("\nRun 'mlflow ui' to view the dashboard")

## Define Your RAG System Components

This is a template - replace with your actual RAG system.

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Your embedding model for computing hallucination scores.
# Created once at module level; compute_hallucination_score reads this global
# on every call instead of constructing a new model each time.
scorer = SentenceTransformer('all-MiniLM-L6-v2')

def compute_hallucination_score(response: str, context: str) -> float:
    """
    Compute hallucination score for a RAG response.
    Replace this with your actual scoring logic.

    The score is ``1 - cosine_similarity(response, context)`` on the
    embeddings produced by the module-level ``scorer``. Cosine similarity
    lies in [-1, 1], so the score lies in [0, 2]; lower means the response
    is closer to its context.

    Args:
        response: Generated answer text.
        context: Retrieved context the answer should be grounded in.

    Returns:
        The score as a built-in float. Returns 1.0 without calling the
        embedding model when ``context`` is empty or None.
    """
    # Nothing to compare against: skip the model call and return a fixed score.
    if not context:
        return 1.0

    response_emb = scorer.encode([response])
    context_emb = scorer.encode([context])
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    similarity = cosine_similarity(response_emb, context_emb)[0][0]

    # Cast the NumPy scalar to a plain float so the return value matches the
    # annotation and is safe to serialize or log anywhere.
    return float(1 - similarity)

print("Scoring function ready")

## Scenario 1: Initial Calibration Run

Log your initial calibration as a baseline run in MLflow.

In [None]:
from conformal_drift import ConformalDriftAuditor

# Simulate calibration data (replace with your real calibration data)
np.random.seed(42)
calibration_responses = [
    ("Python uses indentation for code blocks.", "Python syntax uses whitespace indentation."),
    ("Lists are created with square brackets.", "Create lists using [] syntax."),
    ("Dictionaries store key-value pairs.", "Dict maps keys to values."),
    # ... add your real calibration data here
]

# Target miscoverage rate. Defined once so the logged "alpha" parameter and the
# quantile used for the threshold can never drift apart.
alpha = 0.1

# Compute calibration scores
calibration_scores = [
    compute_hallucination_score(response, context)
    for response, context in calibration_responses
]

# For demo, generate more scores.
# NOTE: this replaces the scores computed above with 100 synthetic Beta(2, 5)
# draws; delete this line once calibration_responses holds real data.
calibration_scores = np.random.beta(2, 5, 100)  # Replace with real scores!

with mlflow.start_run(run_name="initial_calibration"):
    # Log calibration parameters
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("n_calibration_samples", len(calibration_scores))
    mlflow.log_param("embedding_model", "all-MiniLM-L6-v2")
    mlflow.log_param("calibration_date", datetime.now().isoformat())

    # Compute threshold: the (1 - alpha) quantile of the calibration scores
    # (the 90th percentile for alpha = 0.1).
    threshold = np.quantile(calibration_scores, 1 - alpha)
    mlflow.log_metric("threshold", threshold)
    mlflow.log_metric("calibration_mean_score", np.mean(calibration_scores))
    mlflow.log_metric("calibration_std_score", np.std(calibration_scores))

    # Save calibration scores as artifact (written to the working directory
    # first, then uploaded to the run).
    np.save("calibration_scores.npy", calibration_scores)
    mlflow.log_artifact("calibration_scores.npy")

    # Tag as baseline
    mlflow.set_tag("type", "calibration")
    mlflow.set_tag("status", "baseline")

    print("Logged calibration run")
    print(f"Threshold: {threshold:.4f}")
    print(f"Run ID: {mlflow.active_run().info.run_id}")

## Scenario 2: Daily/Weekly Audit Runs

Run this periodically to track how your guardrail performs on production traffic.

In [None]:
def _coverage_status(coverage):
    """Return the (status tag, bar color) pair for a coverage value.

    Coverage of at least 0.85 is HEALTHY, at least 0.75 is WARNING, and
    anything else is CRITICAL. A NaN coverage fails both comparisons and is
    therefore reported as CRITICAL.
    """
    if coverage >= 0.85:
        return "HEALTHY", "green"
    if coverage >= 0.75:
        return "WARNING", "orange"
    return "CRITICAL", "red"


def run_production_audit(production_data, calibration_scores, audit_name="production_audit", alpha=0.1):
    """
    Run a production audit and log to MLflow.

    Scores every production sample with compute_hallucination_score, compares
    the scores against the (1 - alpha) quantile of the calibration scores, and
    logs metrics, a status tag and a two-panel report image to a new MLflow run.

    Args:
        production_data: Iterable of (response, context, is_grounded) tuples.
            is_grounded is coerced to bool, so 0/1 labels are accepted.
        calibration_scores: Original calibration scores
        audit_name: Name for the MLflow run
        alpha: Target miscoverage rate; the threshold is the (1 - alpha)
            quantile of calibration_scores and the coverage target is
            1 - alpha. Defaults to 0.1 (90th percentile, 90% target).

    Returns:
        Fraction of grounded samples whose score is at or below the
        threshold, or NaN when the batch contains no grounded samples.

    Raises:
        ValueError: If production_data is empty. Raised before any MLflow
            run is started so no empty run is recorded.
    """
    production_data = list(production_data)
    if not production_data:
        raise ValueError("production_data is empty; nothing to audit")

    with mlflow.start_run(run_name=audit_name):
        # Log metadata
        mlflow.log_param("audit_date", datetime.now().isoformat())
        mlflow.log_param("n_samples", len(production_data))
        mlflow.log_param("alpha", alpha)
        mlflow.set_tag("type", "audit")

        # Compute scores on production data
        production_scores = np.array(
            [compute_hallucination_score(response, context) for response, context, _ in production_data],
            dtype=float,
        )
        # Force a boolean mask: with integer 0/1 labels NumPy would otherwise
        # treat them as positions (fancy indexing) and `~` as bitwise NOT.
        labels = np.array([is_grounded for _, _, is_grounded in production_data], dtype=bool)

        # Get threshold from calibration
        target_coverage = 1 - alpha
        threshold = np.quantile(calibration_scores, target_coverage)

        # Compute coverage on grounded responses
        grounded_scores = production_scores[labels]
        coverage = np.mean(grounded_scores <= threshold) if grounded_scores.size else np.nan

        # Precision of the guardrail: share of flagged responses that are
        # actually not grounded.
        flagged = production_scores > threshold
        precision = np.mean(~labels[flagged]) if flagged.any() else np.nan

        # Difference in mean score between production and calibration
        score_shift = np.mean(production_scores) - np.mean(calibration_scores)

        # Log metrics
        mlflow.log_metric("coverage", coverage)
        mlflow.log_metric("coverage_gap", target_coverage - coverage)
        # When nothing was flagged precision is undefined; it is logged as 0.
        mlflow.log_metric("precision", 0 if np.isnan(precision) else precision)
        mlflow.log_metric("flagged_rate", np.mean(flagged))
        mlflow.log_metric("mean_score", np.mean(production_scores))
        mlflow.log_metric("score_shift", score_shift)

        # Set status based on coverage
        status, bar_color = _coverage_status(coverage)
        mlflow.set_tag("status", status)

        # Create visualization
        import matplotlib.pyplot as plt

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        try:
            # Score distribution comparison
            axes[0].hist(calibration_scores, bins=20, alpha=0.5, label='Calibration', density=True)
            axes[0].hist(production_scores, bins=20, alpha=0.5, label='Production', density=True)
            axes[0].axvline(threshold, color='r', linestyle='--', label=f'Threshold ({threshold:.3f})')
            axes[0].set_xlabel('Hallucination Score')
            axes[0].set_ylabel('Density')
            axes[0].set_title('Score Distribution Shift')
            axes[0].legend()

            # Coverage gauge
            axes[1].bar(['Coverage'], [coverage], color=bar_color)
            axes[1].axhline(target_coverage, color='r', linestyle='--', label=f'Target ({target_coverage:.0%})')
            axes[1].set_ylim(0, 1)
            axes[1].set_ylabel('Coverage')
            axes[1].set_title(f'Coverage: {coverage:.1%}')
            axes[1].legend()

            plt.tight_layout()
            plt.savefig('audit_report.png', dpi=150)
            mlflow.log_artifact('audit_report.png')
        finally:
            # Release the figure even if plotting or the upload fails.
            plt.close(fig)

        print(f"Audit complete: Coverage = {coverage:.1%}")
        return coverage

print("Audit function defined")

In [None]:
# Simulate production data with some shift
# In practice, replace this with your actual production data

def simulate_production_data(shift_level=0.0, n_samples=50, seed=None):
    """Simulate labelled production traffic for the audit demo.

    Each sample is a ``(response, context, is_grounded)`` tuple with
    placeholder text ``"response_<i>"`` / ``"context_<i>"`` and a label that
    is True with probability 0.8.

    NOTE(review): ``shift_level`` is accepted for interface compatibility but
    does not influence the returned data. The tuples carry only text and a
    label, and the audit recomputes scores from that text, so a simulated
    score shift cannot be passed through this return shape. To demonstrate
    drift, feed the audit real responses/contexts instead.

    Args:
        shift_level: Intended amount of distribution shift (currently unused,
            see note above).
        n_samples: Number of samples to generate.
        seed: Optional seed for reproducible labels. None draws fresh
            entropy on every call.

    Returns:
        List of ``(response, context, is_grounded)`` tuples.
    """
    # A private generator avoids reseeding NumPy's global RNG, which would
    # otherwise discard any seed set elsewhere in the notebook.
    rng = np.random.default_rng(seed)

    return [
        (f"response_{i}", f"context_{i}", bool(rng.random() > 0.2))  # 80% grounded
        for i in range(n_samples)
    ]

# Run audit for "Week 1" - no shift
# Generates a fresh simulated batch and logs it as its own MLflow run named
# "week_1_audit"; the call returns the measured coverage.
prod_data_week1 = simulate_production_data(shift_level=0.0)
run_production_audit(prod_data_week1, calibration_scores, "week_1_audit")

In [None]:
# Run audit for "Week 2" - mild shift
# NOTE(review): simulate_production_data returns only placeholder text plus
# labels, and run_production_audit recomputes scores from that text, so
# shift_level does not change the scores that are audited here.
prod_data_week2 = simulate_production_data(shift_level=0.3)
run_production_audit(prod_data_week2, calibration_scores, "week_2_audit")

In [None]:
# Run audit for "Week 3" - significant shift (e.g., new product launch)
# NOTE(review): as with the other weeks, shift_level is not reflected in the
# returned tuples, so any coverage difference between weeks comes from the
# random labels rather than from the requested shift.
prod_data_week3 = simulate_production_data(shift_level=0.6)
run_production_audit(prod_data_week3, calibration_scores, "week_3_audit")

## Scenario 3: Compare Guardrail Configurations

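A/B test different embedding models or thresholds. The example below keeps the embedding model fixed, varies only `alpha` (which sets the threshold), and logs simulated coverage numbers — replace the simulated evaluation with a real one before drawing conclusions.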

In [None]:
# Guardrail settings to compare: one embedding model at three alpha levels.
configurations = [
    dict(name="minilm-v2", model="all-MiniLM-L6-v2", alpha=0.1),
    dict(name="minilm-conservative", model="all-MiniLM-L6-v2", alpha=0.05),  # More conservative
    dict(name="minilm-relaxed", model="all-MiniLM-L6-v2", alpha=0.2),  # More relaxed
]

# One parent run groups the comparison; each configuration is a nested child.
with mlflow.start_run(run_name="config_comparison"):
    mlflow.set_tag("type", "comparison")

    for config in configurations:
        with mlflow.start_run(run_name=config["name"], nested=True):
            mlflow.log_param("embedding_model", config["model"])
            mlflow.log_param("alpha", config["alpha"])

            # Threshold for this config: the (1 - alpha) percentile of the
            # calibration scores.
            threshold = np.percentile(calibration_scores, (1 - config["alpha"]) * 100)
            mlflow.log_metric("threshold", threshold)

            # Placeholder evaluation (replace with real evaluation): nominal
            # values perturbed by uniform noise. Coverage is drawn before the
            # false-positive rate.
            coverage = 1 - config["alpha"] - np.random.uniform(0, 0.1)
            false_positive_rate = config["alpha"] + np.random.uniform(0, 0.05)

            mlflow.log_metric("coverage", coverage)
            mlflow.log_metric("false_positive_rate", false_positive_rate)

            print(f"{config['name']}: Coverage={coverage:.1%}, FPR={false_positive_rate:.1%}")

## Query MLflow for Insights

Find audit runs whose coverage dropped below the 85% health threshold (any audit run not tagged `HEALTHY`).

In [None]:
# Audit runs whose status tag is anything other than HEALTHY, worst coverage first
warning_runs = mlflow.search_runs(
    experiment_names=["rag-hallucination-guardrails"],
    filter_string="tags.type = 'audit' AND tags.status != 'HEALTHY'",
    order_by=["metrics.coverage ASC"],
)

if len(warning_runs) == 0:
    print("No coverage issues detected!")
else:
    print("Runs with coverage issues:")
    # Restrict the report to the columns that matter for triage.
    print(warning_runs[['run_id', 'tags.status', 'metrics.coverage', 'metrics.score_shift']].to_string())

In [None]:
# Track coverage over time
# All audit runs in chronological order
all_audits = mlflow.search_runs(
    experiment_names=["rag-hallucination-guardrails"],
    filter_string="tags.type = 'audit'",
    order_by=["start_time ASC"]
)

if len(all_audits) > 0:
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(10, 5))

    ax.plot(range(len(all_audits)), all_audits['metrics.coverage'], 'b-o', linewidth=2)
    ax.axhline(0.9, color='r', linestyle='--', label='Target')
    ax.axhline(0.85, color='orange', linestyle='--', label='Warning')

    ax.set_xlabel('Audit Run')
    ax.set_ylabel('Coverage')
    ax.set_title('Guardrail Coverage Over Time')
    ax.set_xticks(range(len(all_audits)))
    # Label each tick with the last two '_'-separated parts of the run name
    # (e.g. "week_1_audit" -> "1_audit"). Slicing instead of indexing [-2]
    # keeps names without an underscore from raising IndexError, and str()
    # tolerates a missing run name.
    ax.set_xticklabels(
        ['_'.join(str(r).split('_')[-2:]) for r in all_audits['tags.mlflow.runName']],
        rotation=45,
    )
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('coverage_trend.png', dpi=150)
    plt.show()

## Production Integration Template

Here's how to integrate this into your production system:

In [None]:
# Production monitoring class
class RAGGuardrailMonitor:
    """Buffer guardrail predictions and periodically audit them in MLflow.

    The threshold is the (1 - alpha) percentile of the calibration scores.
    Predictions are collected with log_prediction and evaluated in batches
    by run_audit, which logs coverage to a new MLflow run and clears the
    buffers.
    """

    # Minimum number of buffered predictions before run_audit logs anything.
    MIN_AUDIT_SAMPLES = 10
    # Coverage below this value tags the audit run with alert=COVERAGE_DROP.
    ALERT_COVERAGE = 0.85

    def __init__(self, calibration_scores, alpha=0.1, experiment_name="rag-guardrails"):
        """Store the calibration data, derive the threshold and select the experiment.

        Args:
            calibration_scores: Hallucination scores from the calibration set.
            alpha: Target miscoverage rate used to pick the threshold percentile.
            experiment_name: MLflow experiment that audit runs are logged to.
        """
        self.calibration_scores = calibration_scores
        self.threshold = np.percentile(calibration_scores, (1 - alpha) * 100)
        self.alpha = alpha
        self.experiment_name = experiment_name
        self.scores_buffer = []
        self.labels_buffer = []

        mlflow.set_experiment(experiment_name)

    def log_prediction(self, hallucination_score: float, is_grounded: bool):
        """Log a single prediction for batch analysis."""
        self.scores_buffer.append(hallucination_score)
        self.labels_buffer.append(is_grounded)

    def _grounded_coverage(self):
        """Return the share of buffered grounded samples scoring at or below the threshold.

        Returns NaN when the buffer holds no grounded samples.
        """
        scores = np.array(self.scores_buffer, dtype=float)
        # Force a boolean mask so 0/1 labels select samples instead of being
        # interpreted as integer positions.
        labels = np.array(self.labels_buffer, dtype=bool)

        grounded_scores = scores[labels]
        if grounded_scores.size == 0:
            return float("nan")
        return float(np.mean(grounded_scores <= self.threshold))

    def run_audit(self, run_name=None):
        """Run audit on buffered predictions.

        Args:
            run_name: MLflow run name; defaults to "audit_<YYYYmmdd_HHMM>".

        Returns:
            The measured coverage (NaN if no buffered sample is grounded), or
            None when fewer than MIN_AUDIT_SAMPLES predictions are buffered,
            in which case nothing is logged and the buffers are kept.
        """
        if len(self.scores_buffer) < self.MIN_AUDIT_SAMPLES:
            print("Not enough samples for audit")
            return None

        if run_name is None:
            run_name = f"audit_{datetime.now().strftime('%Y%m%d_%H%M')}"

        with mlflow.start_run(run_name=run_name):
            # Same tag the notebook's audit queries filter on, so these runs
            # show up in them when logged to the same experiment.
            mlflow.set_tag("type", "audit")

            coverage = self._grounded_coverage()

            mlflow.log_metric("coverage", coverage)
            mlflow.log_metric("n_samples", len(self.scores_buffer))
            mlflow.log_metric("mean_score", float(np.mean(self.scores_buffer)))

            # A NaN coverage compares False here, so no alert is raised when
            # the batch contains no grounded samples.
            if coverage < self.ALERT_COVERAGE:
                mlflow.set_tag("alert", "COVERAGE_DROP")

            # Clear buffer only after logging succeeded, so a failed audit
            # does not lose the collected predictions.
            self.scores_buffer = []
            self.labels_buffer = []

            return coverage

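# Usage:
# monitor = RAGGuardrailMonitor(calibration_scores)
# 
# # In your RAG pipeline:
# score = compute_hallucination_score(response, context)
# is_hallucination = score > monitor.threshold  # the guardrail's decision
# 
# # is_grounded must be an independent ground-truth label (e.g. from human
# # review), not the guardrail's own decision: passing `not is_hallucination`
# # would make every "grounded" score fall at or below the threshold, so the
# # audited coverage would always be 100%.
# monitor.log_prediction(score, is_grounded=ground_truth_is_grounded)
# 
# # Run audit daily/weekly:
# coverage = monitor.run_audit()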

print("RAGGuardrailMonitor class defined")

## View Results

Run this in your terminal to view the MLflow dashboard:

```bash
mlflow ui
```

Then open http://localhost:5000