In [1]:
import pandas as pd
import json
import os
from dataclasses import dataclass, field
from typing import Dict, Any, List, Optional
from datetime import datetime
import uuid

In [2]:
@dataclass
class DataContext():
    """
    Holds the working DataFrame and its feature-engineered counterpart.

    Both slots start out as ``None`` until ``set_data`` is called.
    """
    # Data as handed to set_data (stored by reference, not copied).
    _data: Optional[pd.DataFrame] = None
    # Feature-engineered data; starts as a deep copy of _data.
    _fe_data: Optional[pd.DataFrame] = None

    def set_data(self, new_data: Optional[pd.DataFrame]):
        """
        Set or update the data and reset the feature-engineered data.

        The feature-engineered slot is replaced by a deep copy of
        ``new_data``, so later feature engineering does not write through
        to the stored data. Passing ``None`` clears both slots, returning
        the context to its initial state.
        """
        self._data = new_data
        # None has no .copy(); clearing both slots mirrors the defaults.
        self._fe_data = None if new_data is None else new_data.copy(deep=True)

    def get_data(self) -> Optional[pd.DataFrame]:
        """
        Return the stored data, or ``None`` if nothing has been set.
        """
        return self._data

    def set_fe_data(self, new_fe_data: Optional[pd.DataFrame]):
        """
        Set or update the feature-engineered data (stored by reference).
        """
        self._fe_data = new_fe_data

    def get_fe_data(self) -> Optional[pd.DataFrame]:
        """
        Return the feature-engineered data, or ``None`` if nothing has been set.
        """
        return self._fe_data

In [3]:
# --- 2. Experiment Configuration Dataclasses (Inputs) ---

@dataclass
class ModelConfig:
    """
    Input configuration naming the model to train and its settings.
    """
    # Name of the model, e.g. "LinearRegression" or "RandomForest".
    model_name: str
    # Hyperparameter mapping, e.g. {'alpha': 0.1}. The factory gives every
    # instance its own empty dict when none is supplied.
    hyperparameters: Dict[str, Any] = field(default_factory=dict)

In [4]:
@dataclass
class TrainingConfig:
    """
    Input configuration describing how a training run is carried out.
    """
    # Name of the column to predict, e.g. "price".
    target_column: str
    # Test split size; defaults to 0.2.
    test_size: float = 0.2
    # Random seed; defaults to 42.
    random_state: int = 42
    # NOTE: the metrics to compute are not configured here; for now they are
    # hardcoded in the server tool.

In [5]:
# --- 3. Experiment Record and Registry (Outputs/Persistence) ---

@dataclass
class Metrics:
    """
    Metric values produced by one experiment run.

    All fields are required and are accepted positionally in the order
    declared below.
    """
    # Scores on the training split.
    train_r2: float
    train_rmse: float
    # Scores on the test split.
    test_r2: float
    test_rmse: float
    # Test RMSE as a percentage; presumably relative to the target's mean and
    # range respectively -- confirm against the code that fills these in.
    test_rmse_pct_mean: float
    test_rmse_pct_range: float
    # How long the fit took, in seconds.
    train_time_sec: float
    # NOTE(review): meaning not visible here; presumably the selected
    # hyperparameter value -- confirm with the producer of this record.
    best_param: float

In [6]:
@dataclass
class ExperimentRecord:
    """
    The complete record of a single experiment run: inputs, outputs, metadata.

    NOTE(review): intended to be treated as immutable, but the dataclass is
    not declared frozen, so attribute assignment is not actually blocked.
    """
    # --- Identification ---
    run_id: str
    timestamp: str

    # --- Input configurations ---
    model_config: ModelConfig
    training_config: TrainingConfig

    # --- Outputs and metadata ---
    metrics: Metrics
    # Path to the serialized model file.
    artifact_path: str
    # The exact columns used for training.
    features_used: List[str]

In [7]:
class ExperimentRegistry:
    """
    Handles the persistence and retrieval of experiment metadata.
    This replaces the in-memory ModelContext.

    Records live as a JSON list of dictionaries in ``REGISTRY_FILE``. The file
    is re-read on every call; nothing is cached on the instance.
    """
    REGISTRY_DIR = "artifacts"
    REGISTRY_FILE = os.path.join(REGISTRY_DIR, "registry.json")

    def __init__(self):
        """Initializes the registry and ensures the artifact directory exists."""
        os.makedirs(self.REGISTRY_DIR, exist_ok=True)
        # Initialize registry file (as an empty list) if it doesn't exist
        if not os.path.exists(self.REGISTRY_FILE):
            self._save_registry([])

    def _load_registry(self) -> List[Dict[str, Any]]:
        """
        Loads all experiment records from the JSON file.

        Returns an empty list when the file is missing or is not valid JSON
        (a warning is printed in the latter case).
        """
        try:
            with open(self.REGISTRY_FILE, 'r', encoding="utf-8") as f:
                return json.load(f)
        except json.JSONDecodeError:
            print("Warning: Registry file is corrupted or empty. Returning empty list.")
            return []
        except FileNotFoundError:
            return []

    def _save_registry(self, records: List[Dict[str, Any]]):
        """
        Saves the list of records back to the JSON file.

        Raises:
            TypeError: if a record holds a value ``json`` cannot serialize.
                The existing registry file is left untouched in that case.
        """
        # Serialize BEFORE touching the file: opening with 'w' truncates, so a
        # serialization error mid-dump would otherwise wipe the whole registry.
        payload = json.dumps(records, indent=4)
        tmp_path = self.REGISTRY_FILE + ".tmp"
        with open(tmp_path, 'w', encoding="utf-8") as f:
            f.write(payload)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, self.REGISTRY_FILE)

    def append_record(self, record: "ExperimentRecord") -> str:
        """
        Appends a new record to the registry file.
        Returns the run_id.

        The record passed in is not modified: the dictionaries written to disk
        are copies of the record's attributes and of its nested
        ``model_config`` / ``training_config`` / ``metrics`` attributes.
        """
        records = self._load_registry()

        # Copy instead of aliasing record.__dict__; assigning into the alias
        # would replace the record's own config/metrics objects with dicts.
        record_dict = dict(vars(record))
        for nested_key in ('model_config', 'training_config', 'metrics'):
            record_dict[nested_key] = dict(vars(record_dict[nested_key]))

        records.append(record_dict)
        self._save_registry(records)

        return record.run_id

    def list_runs(self) -> List[Dict[str, Any]]:
        """Returns the full list of runs."""
        return self._load_registry()

    def get_run_summary(self, run_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieves a specific run by ID and returns a flat summary dict.

        Returns ``None`` when no stored record has the given ``run_id``.
        Records lacking a ``run_id`` key are skipped rather than raising.
        """
        for record in self._load_registry():
            if record.get('run_id') != run_id:
                continue
            model_config = record['model_config']
            metrics = record['metrics']
            return {
                'run_id': record['run_id'],
                'timestamp': record['timestamp'],
                'model': model_config['model_name'],
                'train_r2': metrics['train_r2'],
                'train_rmse': metrics['train_rmse'],
                'test_r2': metrics['test_r2'],
                'test_rmse': metrics['test_rmse'],
                'test_rmse_pct_mean': metrics['test_rmse_pct_mean'],
                'test_rmse_pct_range': metrics['test_rmse_pct_range'],
                # Stored as 'train_time_sec' but exposed under a longer key.
                'training_time_sec': metrics['train_time_sec'],
                'artifact_path': record['artifact_path'],
                'best_param': metrics['best_param'],
            }
        return None

In [8]:
class ProductionRegistry:
    """
    Handles persistence of the single designated 'Production' model run_id.

    This class manages a dedicated JSON file to store the ID of the model
    currently marked as 'Production' or 'Best'. The file holds one JSON
    object of the form ``{"production_run_id": <run_id or null>}``.
    """

    # Use the same base directory for all registry files
    REGISTRY_DIR = "artifacts"
    REGISTRY_FILE = os.path.join(REGISTRY_DIR, "production_registry.json")

    def __init__(self):
        """Initializes the registry and ensures the artifact directory and file exist."""
        os.makedirs(self.REGISTRY_DIR, exist_ok=True)
        # Initialize file (with no production model) if it doesn't exist
        if not os.path.exists(self.REGISTRY_FILE):
            self._save_production_id(None)

    def _load_production_id(self) -> Optional[str]:
        """
        Loads the current production run_id from the JSON file.

        Returns ``None`` when the file is missing, is not valid JSON, does not
        contain a JSON object, or has no ``production_run_id`` entry.
        """
        try:
            with open(self.REGISTRY_FILE, 'r', encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            # If the file is missing or corrupted, return None
            return None
        # Valid JSON that is not an object (e.g. [] or null) has no .get();
        # treat it as malformed instead of raising AttributeError.
        if not isinstance(data, dict):
            return None
        return data.get("production_run_id")

    def _save_production_id(self, run_id: Optional[str]):
        """Saves the designated production run_id to the JSON file."""
        # Note: We save it inside a dictionary structure for robustness.
        # Serialize first so a failure cannot leave a truncated file behind.
        payload = json.dumps({"production_run_id": run_id}, indent=4)
        tmp_path = self.REGISTRY_FILE + ".tmp"
        with open(tmp_path, 'w', encoding="utf-8") as f:
            f.write(payload)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, self.REGISTRY_FILE)

    def set_production_model(self, run_id: Optional[str]) -> Optional[str]:
        """
        Sets a new run_id as the production model.

        A falsy ``run_id`` (``None`` or an empty string) clears the production
        status. Returns the value that was stored: the run_id, or ``None``
        when the status was cleared.
        """
        stored_id = run_id if run_id else None
        self._save_production_id(stored_id)
        return stored_id

    def get_production_model(self) -> Optional[str]:
        """Retrieves the current production run_id, or ``None`` if unset."""
        return self._load_production_id()