# Example Model Training

A quick example of training a Random Forest classifier on synthetic data for a model monitoring demo. Run this notebook, then follow the steps in the project README.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import mlflow
import mlflow.sklearn
import os
from pathlib import Path

In [None]:
# Suffix the experiment name with the value of DOMINO_STARTING_USERNAME
# (falling back to "default_user") so the experiment name is unique per user.
starting_domino_user = os.environ.get("DOMINO_STARTING_USERNAME", "default_user")
experiment_name = "model_monitoring_example_" + starting_domino_user
mlflow.set_experiment(experiment_name=experiment_name)

# Synthetic feature matrix: fixed seed, standard-normal draws.
n_samples = 1000
n_features = 10
np.random.seed(42)
X = np.random.randn(n_samples, n_features)

# Columns are called 1_feature, 2_feature, ..., 10_feature.
feature_names = [f"{idx}_feature" for idx in range(1, n_features + 1)]

# Four-class target: a weighted sum of the first three features is cut at
# its own 25th/50th/75th percentiles, giving bin indices 0-3. Tying the
# target to the features gives the classifier something to learn.
signal = X[:, 0] + X[:, 1] * 0.5 + X[:, 2] * 0.3
quartile_edges = np.percentile(signal, [25, 50, 75])
y = np.digitize(signal, bins=quartile_edges)
class_names = [f"class_{idx}" for idx in range(1, 5)]

# One row per sample: the feature columns plus a string "target" label.
df = pd.DataFrame(X, columns=feature_names).assign(
    target=[class_names[bin_idx] for bin_idx in y]
)

print(
    f"MLflow experiment: {experiment_name}",
    f"Dataset shape: {df.shape}",
    "Class distribution:",
    df["target"].value_counts(),
    sep="\n",
)

In [None]:
# Hold out 30% of the rows as a test set, stratified on the target column,
# with a fixed random_state so the split is reproducible.
target = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_names],
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

# Report the (rows, columns) shape of each feature partition.
for partition_label, partition in (("Training set", X_train), ("Test set", X_test)):
    print(f"{partition_label}: {partition.shape}")

In [None]:
# Fit the classifier inside an MLflow run so that its parameters, test
# accuracy and model artifact are all recorded on the same run.
forest_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run() as run:
    # The hyperparameters are logged from the same dict that configures the
    # estimator, followed by the dataset dimensions.
    logged_params = {
        **forest_params,
        "n_features": len(feature_names),
        "n_classes": len(class_names),
    }
    for param_name, param_value in logged_params.items():
        mlflow.log_param(param_name, param_value)

    # Train on the training partition.
    model = RandomForestClassifier(**forest_params)
    model.fit(X_train, y_train)

    # Score on the held-out partition and record the accuracy.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Log the fitted model, passing the first five training rows as the
    # input example.
    input_example = X_train.head(5)
    mlflow.sklearn.log_model(model, "random_forest_model", input_example=input_example)

    # Keep the run id; the export cell at the end of the notebook prints it.
    run_id = run.info.run_id

    print(
        f"Accuracy: {accuracy:.3f}",
        "\nClassification Report:",
        classification_report(y_test, y_pred),
        f"\nMLflow run logged to experiment: {experiment_name}",
        f"Run ID: {run_id}",
        sep="\n",
    )

In [None]:
# Persist the fitted model in the current working directory and write the
# train/test partitions (features plus a "target" column) as CSV files under
# /mnt/artifacts. Every output path is defined once here and reused for both
# writing and the printed summary, so the summary always names the files that
# were actually written.
model_path = 'notebook-model.pkl'
artifacts_dir = Path('/mnt/artifacts')
training_data_path = artifacts_dir / 'training_data.csv'
test_data_path = artifacts_dir / 'test_data.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the artifacts folder exists first (no-op when it is already there).
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Save model
joblib.dump(model, model_path)

# Save training data for monitoring baseline
train_df = X_train.copy()
train_df['target'] = y_train
train_df.to_csv(training_data_path, index=False)

# Save test data for predictions
test_df = X_test.copy()
test_df['target'] = y_test
test_df.to_csv(test_data_path, index=False)

print("✅ Saved:")
print(f"- {model_path}")
print(f"- {training_data_path}")
print(f"- {test_data_path}")

In [None]:
# Sanity-check inference on the first held-out row: show the predicted label
# and the per-class probabilities keyed by the model's class labels.
sample = X_test.head(1)
pred_class = model.predict(sample)[0]
pred_proba = model.predict_proba(sample)[0]

proba_by_class = {label: proba for label, proba in zip(model.classes_, pred_proba)}
print(f"Sample prediction: {pred_class}")
print(f"Class probabilities: {proba_by_class}")

In [None]:
# Show the MLflow run id and a ready-to-paste command for the optional export
# script; the --folder value is a placeholder for the reader to replace.
export_command = (
    f"python 2_optional_export_model.py --run-id {run_id} "
    "--folder /folder/to/save/model"
)
print(f"✅ MLflow Run ID: {run_id}")
print("\nTo export this model for deployment, run:")
print(export_command)