# In [None]:
# MLflow Example Notebook

# Install necessary packages if not already installed
# Uncomment the line below if needed
# !pip install mlflow scikit-learn pandas numpy matplotlib

# Import libraries
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Point MLflow at the tracking server.
# Replace the placeholder with your DagsHub repository's tracking URI,
# which has the form: https://dagshub.com/<username>/<repo-name>.mlflow
tracking_uri = "https://dagshub.com/your-username/your-repo-name.mlflow"
mlflow.set_tracking_uri(tracking_uri)

# Choose the experiment by name
experiment_name = "example-experiment"
mlflow.set_experiment(experiment_name)

# Load scikit-learn's bundled diabetes dataset as features and target
from sklearn.datasets import load_diabetes
db = load_diabetes()
X, y = db.data, db.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the model
n_estimators, max_depth, random_state = 100, 6, 42

# Train a random forest inside a single MLflow run, recording its
# hyperparameters, test-set metrics, a diagnostic plot and the fitted model.
with mlflow.start_run() as run:
    # One dict feeds both the logger and the estimator, so the logged
    # hyperparameters cannot drift from the ones actually used.
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }

    # Log all parameters in one batched call (one request instead of three
    # against a remote tracking server). Done before fitting so they are
    # recorded even if training fails.
    mlflow.log_params(params)

    # Train model
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = rf.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log all metrics in one batched call
    mlflow.log_metrics({"mse": mse, "mae": mae, "r2": r2})

    # Create a scatter plot of actual vs predicted values.
    # The dashed red diagonal marks perfect predictions; its end points are
    # computed once with NumPy reductions rather than the Python builtins,
    # which iterate the array element by element.
    y_lo, y_hi = np.min(y_test), np.max(y_test)
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs Predicted Values')

    # Save figure locally, then attach the file to the run as an artifact
    plt.savefig("prediction_plot.png")
    mlflow.log_artifact("prediction_plot.png")

    # Log model
    mlflow.sklearn.log_model(rf, "random_forest_model")

    # Print results
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R2 Score: {r2:.2f}")

    # Get run ID for reference; taken from the run object bound by the
    # `with` statement instead of a separate global active-run lookup.
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")

print("Experiment completed and logged to MLflow!")