In [None]:
#!/usr/bin/env python3
"""
Model Training Script for RHCP Chatbot ML Pipeline.

Main training script that coordinates data loading, model training, and evaluation.
"""

import json
import os
import sys
from datetime import datetime
from pathlib import Path

# Locate the project root so that the `scripts.*` imports below resolve and
# relative config paths work, both when this code runs as a script and when it
# is pasted into a notebook cell.
try:
    # Script execution: __file__ is defined; the project root is taken to be
    # two directory levels above the directory containing this file.
    script_dir = Path(__file__).parent
    project_root = script_dir.parent.parent
except NameError:
    # Notebook execution: __file__ is undefined, so infer the root from the
    # current working directory instead.
    current_dir = Path.cwd()
    if current_dir.name == "notebooks":
        # Running from a directory named "notebooks": use its parent.
        project_root = current_dir.parent
    elif (current_dir / "scripts").exists():
        # Running from a directory that already contains "scripts".
        project_root = current_dir
    else:
        # Walk upward until a directory containing "scripts" is found.
        # The loop stops before testing the filesystem root itself (the one
        # directory that equals its own parent).
        project_root = current_dir
        while project_root != project_root.parent:
            if (project_root / "scripts").exists():
                break
            project_root = project_root.parent
        else:
            # while/else: reached only when the loop ended without `break`,
            # i.e. no ancestor below the filesystem root had "scripts".
            raise ValueError("Could not find project root with 'scripts' directory")

# Make `scripts.*` importable.
# NOTE(review): append() adds a duplicate entry each time this cell is re-run.
sys.path.append(str(project_root))

# Change working directory to project root so relative paths such as
# config/training_config.yaml resolve from there.
os.chdir(project_root)

# Diagnostics: echo the resolved locations and whether the training config
# file is visible from the new working directory.
print(f"Project root: {project_root}")
print(f"Python path updated to include: {project_root}")
print(f"Working directory changed to: {os.getcwd()}")
print(f"Config directory exists: {(Path('config') / 'training_config.yaml').exists()}")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from scripts.data.enhance_data import DataEnhancer
from scripts.data.load_data import DataLoader
from scripts.evaluation.evaluate_model import ModelEvaluator
from scripts.utils.config_manager import ConfigManager
from scripts.utils.logger_setup import setup_training_logger
from scripts.utils.model_utils import ModelUtils


class ModelTrainer:
    """Coordinate the end-to-end training pipeline.

    The trainer wires together the project's configuration manager, data
    loader, data enhancer, evaluator and the ``ModelUtils`` helpers, and runs
    them in order: load -> (optionally) enhance -> split -> train -> evaluate
    -> save -> report.
    """

    def __init__(self):
        """Load configuration, set up logging and build pipeline components."""
        self.config_manager = ConfigManager()
        self.training_config = self.config_manager.get_training_config()
        self.data_config = self.config_manager.get_data_config()

        # Set up logging
        self.logger = setup_training_logger(self.training_config)

        # Initialize components
        self.data_loader = DataLoader(self.config_manager)
        self.data_enhancer = DataEnhancer(self.config_manager)
        self.evaluator = ModelEvaluator(self.config_manager)

        self.logger.info("Model trainer initialized")

    def run_training_pipeline(self):
        """Run the complete training pipeline.

        Returns:
            A tuple ``(trained_pipeline, test_results, cv_results)`` as
            produced by the corresponding ``ModelUtils`` helpers.

        Raises:
            Exception: any error raised by a pipeline step is logged and then
                re-raised unchanged.
        """
        self.logger.info("=== STARTING TRAINING PIPELINE ===")

        try:
            # Step 1: Load and validate data
            self.logger.info("Step 1: Loading training data...")
            df = self.data_loader.load_training_data()

            # Step 2: Enhance data if configured
            enhancement_config = self.training_config.get("enhancement", {})
            if enhancement_config.get("enable_minority_class_enhancement", False):
                self.logger.info("Step 2: Enhancing minority classes...")
                df = self.data_enhancer.enhance_minority_classes(df)
            else:
                self.logger.info(
                    "Step 2: Skipping data enhancement (disabled in config)"
                )

            # Step 3: Split data
            self.logger.info("Step 3: Splitting data into train/test sets...")
            X_train, X_test, y_train, y_test = self._split_data(df)

            # Step 4: Create and train model
            self.logger.info("Step 4: Creating and training model...")
            pipeline = ModelUtils.create_pipeline(self.training_config)

            # Set random seed for reproducibility
            self._set_random_seed()

            # Train the model
            trained_pipeline = ModelUtils.train_model(
                pipeline, X_train, y_train, self.training_config, self.logger
            )

            # Step 5: Evaluate model
            self.logger.info("Step 5: Evaluating model performance...")

            # Test set evaluation
            test_results = ModelUtils.evaluate_model(
                trained_pipeline, X_test, y_test, self.training_config, self.logger
            )

            # Cross-validation runs over the full (possibly enhanced) dataset,
            # not just the training split.
            cv_results = ModelUtils.cross_validate_model(
                trained_pipeline,
                df["text"],
                df["intent"],
                self.training_config,
                self.logger,
            )

            # Test on specific cases
            test_cases = self.training_config.get("evaluation", {}).get(
                "test_cases", []
            )
            prediction_results = ModelUtils.test_model_predictions(
                trained_pipeline, test_cases, self.logger
            )

            # Step 6: Save model and results
            self.logger.info("Step 6: Saving model and results...")
            self._save_training_artifacts(
                trained_pipeline, test_results, cv_results, prediction_results, df
            )

            # Step 7: Generate comprehensive evaluation
            self.logger.info("Step 7: Generating evaluation report...")
            self.evaluator.generate_evaluation_report(
                trained_pipeline, X_test, y_test, test_results, cv_results
            )

            self.logger.info("=== TRAINING PIPELINE COMPLETED SUCCESSFULLY ===")
            return trained_pipeline, test_results, cv_results

        except Exception as e:
            # Log at the pipeline boundary, then let the caller decide.
            self.logger.error(f"Training pipeline failed: {e}")
            raise

    def _split_data(self, df: pd.DataFrame):
        """Split ``df`` into training and test sets.

        Args:
            df: DataFrame with a ``"text"`` column (features) and an
                ``"intent"`` column (labels).

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``.
            Test size, random state and stratification are read from the
            ``"training"`` section of the training config.
        """
        training_config = self.training_config["training"]

        X = df["text"]
        y = df["intent"]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=training_config["test_size"],
            random_state=training_config["random_state"],
            stratify=y if training_config["stratify"] else None,
        )

        self.logger.info(
            f"Data split: {len(X_train)} train, {len(X_test)} test samples"
        )
        return X_train, X_test, y_train, y_test

    def _set_random_seed(self):
        """Seed NumPy's and the stdlib's global RNGs for reproducibility.

        Controlled by the optional ``"reproducibility"`` config section:
        seeding happens unless ``set_global_seed`` is false, using ``seed``
        (default 42).
        """
        reproducibility_config = self.training_config.get("reproducibility", {})

        if reproducibility_config.get("set_global_seed", True):
            seed = reproducibility_config.get("seed", 42)
            np.random.seed(seed)

            # Set other random seeds if needed
            import random

            random.seed(seed)

            self.logger.info(f"Random seed set to: {seed}")

    def _build_training_info(self, df, test_results):
        """Build the sample-count/version summary stored in model metadata.

        Args:
            df: the dataset that was split into train and test sets; only
                ``len(df)`` is used.
            test_results: mapping with an ``"n_test_samples"`` entry
                (presumably the size of the held-out test split).

        Returns:
            Dict with total, training and test sample counts, a timestamp
            version string and whether minority-class enhancement is enabled.
        """
        total_samples = len(df)
        test_samples = test_results["n_test_samples"]
        return {
            "total_samples": total_samples,
            # The split partitions df, so the training size is exactly the
            # remainder. (A fixed "test * 4" estimate is wrong whenever the
            # split is not exactly 80/20, e.g. 1041 samples -> 832/209.)
            "training_samples": total_samples - test_samples,
            "test_samples": test_samples,
            "version": datetime.now().strftime("%Y%m%d_%H%M%S"),
            "enhancement_applied": self.training_config.get("enhancement", {}).get(
                "enable_minority_class_enhancement", False
            ),
        }

    def _save_training_artifacts(
        self, pipeline, test_results, cv_results, prediction_results, df
    ):
        """Persist the trained model, its metadata and a JSON results file.

        Args:
            pipeline: the trained pipeline to save.
            test_results: test-set evaluation results; must contain
                ``"n_test_samples"``.
            cv_results: cross-validation results, stored in both the model
                metadata and the results file.
            prediction_results: outcomes for the configured test cases.
            df: the dataset used for training/evaluation (for summary stats).

        Output locations come from ``data_config["paths"]`` and file names
        from the ``"output"`` section of the training config. A data backup
        is created unless ``create_backup`` is set to false.
        """
        output_config = self.training_config["output"]
        models_path = self.data_config["paths"]["models"]
        results_path = self.data_config["paths"]["results"]

        # Create paths
        Path(models_path).mkdir(parents=True, exist_ok=True)
        Path(results_path).mkdir(parents=True, exist_ok=True)

        # Model file path
        model_path = Path(models_path) / output_config["model_filename"]

        # Create comprehensive metadata
        training_info = self._build_training_info(df, test_results)

        metadata = ModelUtils.create_model_metadata(
            pipeline, test_results, self.training_config, training_info
        )

        # Add cross-validation results to metadata
        metadata["cross_validation"] = cv_results
        metadata["test_predictions"] = prediction_results

        # Save model with metadata
        ModelUtils.save_model(
            pipeline, str(model_path), metadata, self.training_config, self.logger
        )

        # Save detailed results
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "model_path": str(model_path),
            "test_evaluation": test_results,
            "cross_validation": cv_results,
            "test_predictions": prediction_results,
            "data_summary": {
                "total_samples": len(df),
                "unique_intents": df["intent"].nunique(),
                "class_distribution": df["intent"].value_counts().to_dict(),
            },
            "configuration": self.training_config,
        }

        results_file = Path(results_path) / output_config["results_filename"]
        with open(results_file, "w", encoding="utf-8") as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        self.logger.info("Training artifacts saved:")
        self.logger.info(f"  Model: {model_path}")
        self.logger.info(f"  Results: {results_file}")

        # Create backup if configured
        if output_config.get("create_backup", True):
            backup_dir = self.data_loader.create_data_backup()
            self.logger.info(f"  Backup: {backup_dir}")


def main():
    """Entry point: train the model and print a short performance summary.

    Any exception raised while training or while printing the summary is
    reported on stdout and the process exits with status 1.
    """
    print("RHCP Chatbot Model Training Pipeline")
    print("=" * 50)

    try:
        # Build the trainer and execute every pipeline step; the fitted
        # pipeline itself is not needed for the summary below.
        _, test_metrics, cv_metrics = ModelTrainer().run_training_pipeline()

        # Held-out test-set metrics
        print("\nTRAINING COMPLETED SUCCESSFULLY!")
        print("\nPERFORMANCE SUMMARY:")
        print(f"  Test Accuracy: {test_metrics['accuracy']:.4f}")
        print(f"  Test Macro F1: {test_metrics['macro_f1']:.4f}")

        # Cross-validation metrics, reported as mean ± std
        cv_accuracy = cv_metrics["accuracy"]
        print(f"  CV Accuracy: {cv_accuracy['mean']:.4f} ± {cv_accuracy['std']:.4f}")
        cv_macro_f1 = cv_metrics["macro_f1"]
        print(f"  CV Macro F1: {cv_macro_f1['mean']:.4f} ± {cv_macro_f1['std']:.4f}")

        print("\nModel is ready for deployment!")

    except Exception as exc:
        # Top-level boundary: report the failure and signal it to the shell.
        print(f"\nTRAINING FAILED: {exc}")
        sys.exit(1)


# Run the training pipeline only when this code is executed directly (as a
# script, or as a notebook cell where __name__ is "__main__"), not on import.
if __name__ == "__main__":
    main()

Project root: /home/gilberto/Documents/rhcp-chatbot
Python path updated to include: /home/gilberto/Documents/rhcp-chatbot
Working directory changed to: /home/gilberto/Documents/rhcp-chatbot
Config directory exists: True
RHCP Chatbot Model Training Pipeline
2025-07-28 21:34:29,108 - training - INFO - Model trainer initialized
2025-07-28 21:34:29,110 - training - INFO - === STARTING TRAINING PIPELINE ===
2025-07-28 21:34:29,111 - training - INFO - Step 1: Loading training data...
2025-07-28 21:34:29,114 - data_processing - INFO - Starting data loading process...
2025-07-28 21:34:29,115 - data_processing - INFO - Loading data from 2 files: ['data/processed/base-corpus.json', 'data/processed/rhcp-corpus.json']
2025-07-28 21:34:29,120 - data_processing - INFO - Loaded 1041 samples with 70 unique intents
2025-07-28 21:34:29,124 - data_processing - INFO - Class balance analysis:
2025-07-28 21:34:29,125 - data_processing - INFO -   Most common class: agent.chatbot (72 samples)
2025-07-28 21:34



2025-07-28 21:34:30,479 - training - INFO - Model training completed successfully
2025-07-28 21:34:30,484 - training - INFO - Step 5: Evaluating model performance...
2025-07-28 21:34:30,485 - training - INFO - Starting model evaluation...
2025-07-28 21:34:30,618 - training - INFO - Evaluation completed - Accuracy: 0.6387, Macro F1: 0.6250
2025-07-28 21:34:30,622 - training - INFO - Starting 5-fold cross-validation...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Traceback (most recent call last):
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 152, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 400, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packag

2025-07-28 21:34:32,665 - training - INFO - Cross-validation completed
2025-07-28 21:34:32,667 - training - INFO - accuracy: 0.6328 ± 0.0147
2025-07-28 21:34:32,668 - training - INFO - macro_f1: nan ± nan
2025-07-28 21:34:32,669 - training - INFO - Testing model on 8 test cases...
2025-07-28 21:34:32,679 - training - INFO - 'are you a bot' -> 'agent.chatbot' (confidence: 0.074)
2025-07-28 21:34:32,681 - training - INFO - 'bye for now' -> 'greetings.bye' (confidence: 0.071)
2025-07-28 21:34:32,682 - training - INFO - 'Hello' -> 'agent.there' (confidence: 0.123)
2025-07-28 21:34:32,684 - training - INFO - 'Who are the members of the band?' -> 'band.members' (confidence: 0.060)
2025-07-28 21:34:32,685 - training - INFO - 'Tell me about quantum physics' -> 'intent.outofscope' (confidence: 0.042)
2025-07-28 21:34:32,687 - training - INFO - 'when was RHCP formed' -> 'band.history' (confidence: 0.097)
2025-07-28 21:34:32,688 - training - INFO - 'list their albums' -> 'album.info' (confidence:

Traceback (most recent call last):
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 152, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 400, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/home/gilberto/Documents/rhcp-chatbot/venv/lib/python3.12/site-packages/sklearn/utils/_response.py", line 207, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['agent.acquaintance' 'agent.age' 'agent.annoying' 'agent.bad'
 'agent.beautiful' 'agent.beclever' 'agent.birthday' 'agent.boring'
 'agent.boss' 'agent.bu

In [None]:
# Reload modules to get the fixed tokenizer
import importlib
import sys

# Reload the model utilities module, then rebind the class taken from it.
# importlib.reload() re-executes the module in place, but a name that was
# bound earlier with `from scripts.utils.model_utils import ModelUtils` keeps
# pointing at the OLD class object. Without the rebind below, ModelTrainer
# would silently keep using the pre-fix ModelUtils.
if "scripts.utils.model_utils" in sys.modules:
    _model_utils = importlib.reload(sys.modules["scripts.utils.model_utils"])
    ModelUtils = _model_utils.ModelUtils

print("Modules reloaded with fixed tokenizer!")

In [None]:
# Quick test of the fixed model training and saving
print("Testing fixed tokenizer and model saving...")

# Force reload of all relevant modules
import importlib
import sys  # imported here so the cell does not depend on an earlier cell

modules_to_reload = [
    "scripts.utils.model_utils",
    "scripts.utils.config_manager",
    "scripts.data.load_data",
    "scripts.data.enhance_data",
    "scripts.evaluation.evaluate_model",
]

# Class that this notebook imported from each module via `from ... import ...`.
reloaded_class_names = {
    "scripts.utils.model_utils": "ModelUtils",
    "scripts.utils.config_manager": "ConfigManager",
    "scripts.data.load_data": "DataLoader",
    "scripts.data.enhance_data": "DataEnhancer",
    "scripts.evaluation.evaluate_model": "ModelEvaluator",
}

for module_name in modules_to_reload:
    if module_name in sys.modules:
        print(f"Reloading {module_name}...")
        reloaded_module = importlib.reload(sys.modules[module_name])

        # Re-import everything fresh: reload() does not update names bound
        # with `from ... import ...`, so rebind the notebook-level name to the
        # class from the reloaded module. Stale class objects are also a
        # common source of pickling errors when the model is saved.
        class_name = reloaded_class_names[module_name]
        globals()[class_name] = getattr(reloaded_module, class_name)

print("All modules reloaded!")

try:
    # Create a completely fresh trainer instance
    print("Creating fresh trainer instance...")
    trainer = ModelTrainer()

    # Run the full pipeline again
    print("Running training pipeline...")
    pipeline, test_results, cv_results = trainer.run_training_pipeline()

    print("\nSUCCESS! Training completed without errors!")
    print(f"Test Accuracy: {test_results['accuracy']:.4f}")
    print(f"Test Macro F1: {test_results['macro_f1']:.4f}")
    print(
        f"CV Accuracy: {cv_results['accuracy']['mean']:.4f} ± {cv_results['accuracy']['std']:.4f}"
    )
    print("Model saved successfully!")

except Exception as e:
    # Diagnostic cell: report the error with its traceback instead of
    # aborting the notebook run.
    print(f"Error: {e}")
    print(
        "\nIf still getting pickle error, please restart the kernel and run all cells fresh!"
    )
    import traceback

    traceback.print_exc()

In [None]:
# Alternative: Direct test of fixed tokenizer and model saving.
# This cell bypasses ModelTrainer and exercises load -> split -> fit ->
# save -> load -> predict step by step, printing progress after each stage.
print("=== ALTERNATIVE APPROACH: Testing Fixed Tokenizer Directly ===")

try:
    from pathlib import Path

    import joblib
    from sklearn.model_selection import train_test_split

    from scripts.data.load_data import DataLoader
    from scripts.utils.config_manager import ConfigManager
    from scripts.utils.model_utils import ModelUtils

    # Load configuration and data
    config_manager = ConfigManager()
    training_config = config_manager.get_training_config()
    data_config = config_manager.get_data_config()  # not used further in this cell

    # Load data
    data_loader = DataLoader(config_manager)
    df = data_loader.load_training_data()
    print(f"Loaded {len(df)} samples")

    # Split data.
    # NOTE: test_size, random_state and stratification are hard-coded here
    # rather than read from training_config.
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"], df["intent"], test_size=0.2, random_state=42, stratify=df["intent"]
    )
    print(f"Data split: {len(X_train)} train, {len(X_test)} test")

    # Create pipeline with fixed tokenizer
    pipeline = ModelUtils.create_pipeline(training_config)
    print("Pipeline created with fixed tokenizer")

    # Train model directly with fit() (ModelUtils.train_model is not used here)
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out split
    accuracy = pipeline.score(X_test, y_test)
    print(f"Model trained - Accuracy: {accuracy:.4f}")

    # Test saving (this is where the pickle error occurred before).
    # The file is written to a fixed path under data/models and is not
    # removed afterwards.
    print("Testing model saving...")
    models_path = Path("data/models")
    models_path.mkdir(parents=True, exist_ok=True)
    model_file = models_path / "test_fixed_model.joblib"

    joblib.dump(pipeline, model_file)
    print(f"Model saved successfully to: {model_file}")

    # Test loading: round-trip the file just written and re-score it so the
    # two accuracy figures can be compared.
    loaded_pipeline = joblib.load(model_file)
    test_accuracy = loaded_pipeline.score(X_test, y_test)
    print(f"Model loaded and tested - Accuracy: {test_accuracy:.4f}")

    # Test predictions on a few hand-written utterances
    test_cases = ["are you a bot", "Hello", "Who are the members?"]
    predictions = loaded_pipeline.predict(test_cases)
    print("Test predictions:")
    for text, pred in zip(test_cases, predictions, strict=False):
        print(f"   '{text}' -> '{pred}'")

    print("\nSUCCESS! Fixed tokenizer works perfectly!")
    print("Model can be trained, saved, loaded, and used for predictions!")

except Exception as e:
    # Diagnostic cell: print the error and full traceback rather than
    # stopping the notebook.
    print(f"Error in direct test: {e}")
    import traceback

    traceback.print_exc()

=== ALTERNATIVE APPROACH: Testing Fixed Tokenizer Directly ===
2025-07-28 21:13:36,233 - data_processing - INFO - Starting data loading process...
2025-07-28 21:13:36,234 - data_processing - INFO - Loading data from 2 files: ['data/processed/base-corpus.json', 'data/processed/rhcp-corpus.json']
2025-07-28 21:13:36,239 - data_processing - INFO - Loaded 1041 samples with 70 unique intents
2025-07-28 21:13:36,243 - data_processing - INFO - Class balance analysis:
2025-07-28 21:13:36,245 - data_processing - INFO -   Most common class: agent.chatbot (72 samples)
2025-07-28 21:13:36,247 - data_processing - INFO -   Least common class: greetings.nicetomeetyou (5 samples)
2025-07-28 21:13:36,249 - data_processing - INFO -   Imbalance ratio: 14.40:1
✅ Loaded 1041 samples
✅ Data split: 832 train, 209 test
✅ Pipeline created with fixed tokenizer
🔄 Training model...




✅ Model trained - Accuracy: 0.5885
🔄 Testing model saving...
✅ Model saved successfully to: data/models/test_fixed_model.joblib
✅ Model loaded and tested - Accuracy: 0.5885
✅ Test predictions:
   'are you a bot' -> 'agent.chatbot'
   'Hello' -> 'agent.there'
   'Who are the members?' -> 'band.members'

🎉 SUCCESS! Fixed tokenizer works perfectly!
✅ Model can be trained, saved, loaded, and used for predictions!


