<a href="https://colab.research.google.com/github/fmousinho/LaxAI/blob/main/notebooks/end_to_end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
%cd /content
!rm -rf LaxAI
!git clone https://github.com/fmousinho/LaxAI

/content
Cloning into 'LaxAI'...
remote: Enumerating objects: 1055, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 1055 (delta 11), reused 0 (delta 0), pack-reused 1018 (from 2)[K
Receiving objects: 100% (1055/1055), 636.01 MiB | 28.59 MiB/s, done.
Resolving deltas: 100% (655/655), done.


In [None]:
%cd /content/LaxAI
!pip install -r requirements_train.txt

In [4]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import argparse
import json
import logging
import os
import sys
from typing import Any, Dict, Optional

# --- Path Setup ---
# Put the cloned repository root at the front of sys.path so the `core.*`
# imports below can be resolved as absolute imports. The membership check
# keeps re-runs of this cell from inserting duplicate entries.
project_root = '/content/LaxAI'
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from core.config.all_config import detection_config, training_config
from core.config import logging_config
from core.common.google_storage import get_storage
from core.train.dataprep_pipeline import DataPrepPipeline
from core.train.train_pipeline import TrainPipeline

# --- Configure Logging ---
# No handlers or formatters are set up in this cell; it only creates the
# logger used by the cells below.
# NOTE(review): `core.config.logging_config` (imported above) presumably
# configures logging on import -- TODO confirm. The earlier hint to uncomment
# `from config import logging_config` was stale: that module is already
# imported above under its `core.config` path.
logger = logging.getLogger(__name__)

20:11:46,412 | INFO    | [utils] NumExpr defaulting to 2 threads.


In [6]:
def _collect_train_paths(blob_names):
    """Return the sorted, de-duplicated ``.../datasets/<dir>/train/`` prefixes found in ``blob_names``."""
    prefixes = set()
    for name in blob_names:
        datasets_index = name.find('/datasets/')
        if datasets_index == -1:
            continue
        # Only a '/train/' segment that appears after '/datasets/' counts.
        train_index = name.find('/train/', datasets_index)
        if train_index == -1:
            continue
        # Keep everything up to and including '/train/'.
        prefixes.add(name[:train_index + len('/train/')])
    # Sorting makes the training order reproducible between runs.
    return sorted(prefixes)


def _is_valid_train_path(train_path):
    """Return True when ``train_path`` looks like ``.../datasets/frameN/train/``."""
    return '/datasets/frame' in train_path and train_path.rstrip('/').endswith('/train')


def train(tenant_id: str, frames_per_video: int, verbose: bool, save_intermediate: bool, training_kwargs: Optional[Dict[str, Any]] = None):
    """
    Run the training pipeline once for every training dataset found in storage.

    Blob names are listed through ``get_storage(tenant_id)``; every distinct
    ``.../datasets/frameN/train/`` prefix among them is passed to
    ``TrainPipeline.run`` in sorted order. Any exception raised along the way
    is logged with its traceback and swallowed, so this function does not
    raise for workflow errors.

    Args:
        tenant_id: The tenant ID for GCS operations.
        frames_per_video: Number of frames to extract per video in the data prep
            pipeline. Not used by this function body (no data prep step runs here).
        verbose: Enable verbose logging for pipelines.
        save_intermediate: Save intermediate pipeline results to GCS.
        training_kwargs: Optional extra training settings. Accepted for interface
            compatibility but not currently forwarded to the pipeline.

    Returns:
        None.
    """
    logger.info(f"--- Starting End-to-End Workflow for Tenant: {tenant_id} ---")

    try:
        train_pipeline = TrainPipeline(
            tenant_id=tenant_id,
            verbose=verbose,
            save_intermediate=save_intermediate
            )

        # 1. List every blob name visible to this tenant.
        storage_client = get_storage(tenant_id)
        # Materialize the listing so len() and slicing work even if the
        # storage client hands back an iterator.
        all_blobs = list(storage_client.list_blobs())
        logger.info(f"Total blobs found: {len(all_blobs)}")

        # Debug: show a few sample blobs that contain /train/
        train_blobs = [blob for blob in all_blobs if '/train/' in blob]
        for blob in train_blobs[:5]:
            logger.info(f"Sample blob: {blob}")

        # 2. Reduce the blob names to unique dataset directories.
        train_paths = _collect_train_paths(train_blobs)
        logger.info(f"Found {len(train_paths)} directories containing training data.")

        # 3. Drop malformed paths up front so the progress counter below
        #    reflects the number of rounds that will actually run.
        valid_paths = []
        for train_path in train_paths:
            if _is_valid_train_path(train_path):
                valid_paths.append(train_path)
            else:
                logger.warning(f"Skipping invalid train path structure: {train_path}")

        # 4. Train on each dataset in turn.
        total_folders = len(valid_paths)
        for round_number, train_path in enumerate(valid_paths, start=1):
            logger.info("**********************************************************************")
            logger.info(f"  Training round {round_number}/{total_folders}")
            logger.info(f"Running train pipeline for dataset: {train_path}")
            logger.info("**********************************************************************")
            train_results = train_pipeline.run(dataset_path=train_path)

            if train_results.get("status") == "completed":
                logger.info(f"Successfully completed training for dataset: {train_path}")
            else:
                logger.error(f"Training pipeline failed for dataset: {train_path}")
                # default=str keeps the report from crashing on error objects
                # that are not JSON-serializable.
                logger.error(f"Details: {json.dumps(train_results.get('errors'), indent=2, default=str)}")

        logger.info("--- End-to-End Workflow Finished ---")

    except Exception as e:
        # logger.exception records the traceback; serializing e.args with
        # json.dumps could itself raise and mask the original error.
        logger.exception(f"Error occurred during workflow: {e}")

'/content/LaxAI'