# Locust Pipeline Beginner Workflow Notebook

This notebook is a step-by-step version of the current pipeline in this repository.

It covers:
1. Optional file organization and cleanup utilities.
2. Video tracking and annotated output generation (`process_all_videos.py` logic).
3. Trial analysis and plotting (`locust_analysis.py` logic).
4. Optional dropped-frame export (`export_dropped_frames.py`).

Run cells from top to bottom.


## How This Notebook Is Organized

- Every stage is separated into its own section.
- Important settings are in one configuration cell.
- Potentially destructive operations have explicit safety toggles.
- You can run a single stage, or run all stages sequentially.


In [None]:
# Step 0: Project location setup
# This cell makes the repository root both the working directory and an
# importable location, so later cells can import the pipeline scripts by name.

from pathlib import Path
import os
import sys

# File used to recognise the repository root.
repository_marker_filename = "process_all_videos.py"

current_working_directory = Path.cwd().resolve()
project_root_directory = current_working_directory

# If notebook was opened from another folder, use the expected repository path.
if not (project_root_directory / repository_marker_filename).exists():
    fallback_project_root = Path("/home/ramanlab/Documents/cole/VSCode/RamanLab-Locust-Behavior")
    if fallback_project_root.exists():
        project_root_directory = fallback_project_root
    else:
        # Keep going with the current directory (unchanged behaviour), but say so:
        # otherwise the first visible symptom is an import error several cells later.
        print(
            f"Warning: {repository_marker_filename} was not found in {current_working_directory} "
            f"and the fallback path {fallback_project_root} does not exist. "
            "Start the notebook from the repository root, or edit the fallback path in this cell."
        )

os.chdir(project_root_directory)

# Later cells import repository scripts by bare module name. Put the root on
# sys.path explicitly instead of relying on the kernel's default search path.
project_root_search_entry = str(project_root_directory)
if project_root_search_entry not in sys.path:
    sys.path.insert(0, project_root_search_entry)

print(f"Project root: {project_root_directory}")
print(f"Python executable: {sys.executable}")


## Optional: Install Missing Python Packages

If imports fail later, set `RUN_PACKAGE_INSTALL = True` in the cell below and run it once. Otherwise, leave it disabled.


In [None]:
# Step 0b: Optional dependency install
# Set RUN_PACKAGE_INSTALL to True only if you are missing packages.

# Imported in this cell because the shared-imports cell comes later; without
# this, enabling the install in a top-to-bottom run raises NameError.
import subprocess
import sys

RUN_PACKAGE_INSTALL = False

if RUN_PACKAGE_INSTALL:
    # Install packages into the current Jupyter/Python environment by invoking
    # pip through the interpreter that is running this notebook.
    package_install_command = [
        sys.executable,
        "-m",
        "pip",
        "install",
        "-U",
        "numpy",
        "pandas",
        "matplotlib",
        "opencv-python",
        "torch",
        "ultralytics",
    ]
    print("Running:", " ".join(package_install_command))
    # check=True turns a failed install into an exception instead of a silent no-op.
    subprocess.run(package_install_command, check=True)
else:
    print("Package install skipped (RUN_PACKAGE_INSTALL=False).")


In [None]:
# Step 0c: Shared imports and helper utilities

import shlex
import subprocess
import importlib
from typing import Iterable

import numpy as np
import pandas as pd
from IPython.display import display


def run_cli_command(command_parts: Iterable[str], raise_on_error: bool = True):
    """
    Run a command from the project root and print its captured output.

    The command is echoed first in shell-quoted form. Output is captured, so it
    is printed only after the process has exited; nothing is shown while a
    long-running command is still working.

    Args:
        command_parts: Program followed by its arguments. Every item is
            converted with str(), so Path objects and numbers are accepted.
        raise_on_error: When True, a non-zero exit code raises RuntimeError.

    Returns:
        The subprocess.CompletedProcess, with stdout and stderr as text.

    Raises:
        RuntimeError: If the exit code is non-zero and raise_on_error is True.
    """
    command_parts = [str(part) for part in command_parts]
    printable_command = shlex.join(command_parts)
    print(f"Running command:\n{printable_command}\n")

    completed_process = subprocess.run(
        command_parts,
        cwd=project_root_directory,
        text=True,
        capture_output=True,
    )

    # Skip the section header entirely when a stream is empty or whitespace-only.
    if completed_process.stdout.strip():
        print("STDOUT:\n" + completed_process.stdout)
    if completed_process.stderr.strip():
        print("STDERR:\n" + completed_process.stderr)

    if raise_on_error and completed_process.returncode != 0:
        # Name the command so the failure is identifiable from the traceback alone.
        raise RuntimeError(
            f"Command failed with code {completed_process.returncode}: {printable_command}"
        )

    return completed_process


## Step 1: Pipeline Configuration

Edit values in the next cell before running the pipeline.


In [None]:
# Step 1a: Main paths and stage toggles

# --- Directories (all derived from the project root) ---
all_videos_directory = project_root_directory.joinpath("Locust", "all_vids")
yolo_model_weights_path = project_root_directory.joinpath("model", "best.pt")
analysis_output_directory = project_root_directory.joinpath("Locust", "analysis_outputs")
dropped_frame_export_directory = analysis_output_directory.joinpath("dropped_frames")

# --- Which stages run ---
run_optional_file_organization_stage = False
run_video_tracking_stage = True
run_analysis_stage = True
run_dropped_frame_export_stage = False

# --- Video processing ---
# Fall back to CPU when no CUDA device is available (checked in Step 3b).
allow_cpu_if_no_gpu = True
# Passed to process_video as its `overwrite` argument in Step 3c.
overwrite_existing_tracking_outputs = False

# --- Restricting the processing stage to specific date directories ---
# An empty list [] means: process every detected date directory.
# To restrict, list the directories explicitly, for example:
# date_directories_to_process = [
#     all_videos_directory / "Off_1s_LOOL" / "12.02.2025",
# ]
date_directories_to_process = []

# --- Analysis script settings (these match locust_analysis.py defaults) ---
# Each value is forwarded as a command-line option in Step 4a.
analysis_fps = 30.0
analysis_odor_on_seconds = 10.0
analysis_odor_off_seconds = 14.0
analysis_threshold_k = 4.0
analysis_min_samples_over_threshold = 20
analysis_min_floor_pixels = 9.5  # set to None to omit --min-floor-px
analysis_skip_plot_generation = False
analysis_skip_combined_csv_outputs = False

# --- Dropped-frame export ---
# Forwarded as --min-px in Step 5a.
dropped_frame_minimum_pixels = 9.5

print("Configuration loaded.")
for setting_name, setting_value in (
    ("all_videos_directory", all_videos_directory),
    ("yolo_model_weights_path", yolo_model_weights_path),
    ("analysis_output_directory", analysis_output_directory),
):
    print(f"{setting_name}: {setting_value}")


In [None]:
# Step 1b: Optional file-organization utility toggles
# Each step toggle below selects one helper script that Step 2a can run.

# Individual helper steps; all are off by default.
run_rename_videos_step = False
run_move_videos_to_date_step = False
run_fix_duplicate_names_step = False
run_cleanup_keep_dates_step = False
run_delete_annotated_videos_step = False

# Master safety switch shared by every helper step:
#   False -> preview only (dry run); recommended for the first pass
#   True  -> real file changes are made
file_organization_execute_changes = False

for status_line in (
    "File-organization toggles loaded.",
    f"file_organization_execute_changes: {file_organization_execute_changes}",
):
    print(status_line)


In [None]:
# Step 1c: Validate important paths before running long jobs

# Label -> location for everything that must already exist on disk.
required_paths = dict(
    project_root_directory=project_root_directory,
    all_videos_directory=all_videos_directory,
    yolo_model_weights_path=yolo_model_weights_path,
)

# Collect every missing entry first so that all of them are reported together.
missing_paths = [
    (path_label, path_location)
    for path_label, path_location in required_paths.items()
    if not Path(path_location).exists()
]

if missing_paths:
    print("Missing required paths:")
    for path_label, path_location in missing_paths:
        print(f"- {path_label}: {path_location}")
    raise FileNotFoundError("Fix missing paths in Step 1 before continuing.")

print("All required paths exist.")


## Step 2: Optional File Organization / Cleanup Stage

This section reuses these scripts directly:
- `rename_videos.py`
- `move_videos_to_date.py`
- `fix_duplicate_names.py`
- `cleanup_keep_dates.py`
- `delete_palps.py`

Run in dry-run mode first (keep `file_organization_execute_changes = False` in Step 1b).


In [None]:
# Step 2a: Run selected file-organization utilities

if not run_optional_file_organization_stage:
    print("Optional file organization stage skipped (run_optional_file_organization_stage=False).")
else:
    print("Running optional file organization stage...")

    # Import all five helper scripts, in a fixed order, before configuring any.
    (
        rename_videos_module,
        move_videos_module,
        fix_duplicate_names_module,
        cleanup_keep_dates_module,
        delete_palps_module,
    ) = map(
        importlib.import_module,
        (
            "rename_videos",
            "move_videos_to_date",
            "fix_duplicate_names",
            "cleanup_keep_dates",
            "delete_palps",
        ),
    )

    # Point every helper's module-level BASE_DIR at the notebook's video
    # directory, so the notebook config is used rather than each script's own.
    for helper_module in (
        rename_videos_module,
        move_videos_module,
        fix_duplicate_names_module,
        cleanup_keep_dates_module,
        delete_palps_module,
    ):
        helper_module.BASE_DIR = all_videos_directory

    dry_run_mode = not file_organization_execute_changes
    print(f"Dry-run mode: {dry_run_mode}")

    # One row per helper: (toggle, banner, module, entry-point name).
    # The entry point is looked up only when its step actually runs.
    organization_steps = (
        (run_rename_videos_step, "\n--- Running rename_videos ---", rename_videos_module, "rename_videos"),
        (run_move_videos_to_date_step, "\n--- Running move_videos_to_date ---", move_videos_module, "move_files"),
        (run_fix_duplicate_names_step, "\n--- Running fix_duplicate_names ---", fix_duplicate_names_module, "fix_duplicates"),
        (run_cleanup_keep_dates_step, "\n--- Running cleanup_keep_dates ---", cleanup_keep_dates_module, "process"),
        (run_delete_annotated_videos_step, "\n--- Running delete_palps ---", delete_palps_module, "run"),
    )

    for step_enabled, step_banner, step_module, entry_point_name in organization_steps:
        if not step_enabled:
            continue
        print(step_banner)
        getattr(step_module, entry_point_name)(dry_run=dry_run_mode)

    print("\nOptional file organization stage finished.")


## Step 3: Video Processing Stage (Tracking + Annotated Video + CSV)

This stage reuses `process_all_videos.py` logic directly.


In [None]:
# Step 3a: Import processing module and prepare list of videos to process

import process_all_videos as processing_pipeline

# Files whose stem ends with this suffix are never queued
# (presumably annotated outputs of an earlier run - confirm against process_all_videos.py).
annotated_video_stem_suffix = "_palps_annotated_30fps"

# Choose date directories from config.
if date_directories_to_process:
    selected_date_directories = []
    missing_date_directories = []
    for configured_directory in map(Path, date_directories_to_process):
        # is_dir() rather than exists(): a path to a regular file would pass an
        # existence check and then crash in iterdir() below.
        if configured_directory.is_dir():
            selected_date_directories.append(configured_directory)
        else:
            missing_date_directories.append(configured_directory)
            print(
                "Warning: date directory does not exist or is not a directory "
                f"and will be skipped: {configured_directory}"
            )
else:
    selected_date_directories = processing_pipeline.find_date_dirs(all_videos_directory)

# Build concrete list of video files in selected date directories.
# Entries are sorted per directory so the processing order is reproducible.
video_file_paths_to_process = []
for date_directory in selected_date_directories:
    for candidate_file in sorted(date_directory.iterdir()):
        if not candidate_file.is_file():
            continue
        if candidate_file.suffix.lower() not in processing_pipeline.VIDEO_EXTENSIONS:
            continue
        if candidate_file.stem.endswith(annotated_video_stem_suffix):
            continue
        video_file_paths_to_process.append(candidate_file)

print(f"Selected date directories: {len(selected_date_directories)}")
print(f"Video files queued: {len(video_file_paths_to_process)}")

# Show only the first few entries so a large queue does not flood the output.
preview_limit = 10
for preview_path in video_file_paths_to_process[:preview_limit]:
    print(f"- {preview_path}")

if len(video_file_paths_to_process) > preview_limit:
    print("... (list truncated)")


In [None]:
# Step 3b: Load YOLO model and choose compute device

import torch
from ultralytics import YOLO

# Prefer the GPU; use the CPU only when the configuration allows it.
cuda_is_available = torch.cuda.is_available()
if cuda_is_available:
    compute_device_name = "cuda"
    print("CUDA is available. Using GPU.")
elif allow_cpu_if_no_gpu:
    compute_device_name = "cpu"
    print("CUDA not available. Using CPU because allow_cpu_if_no_gpu=True.")
else:
    raise RuntimeError("CUDA is not available and allow_cpu_if_no_gpu=False.")

# Load the weights, then move the model onto the chosen device.
weights_location = str(yolo_model_weights_path)
yolo_model = YOLO(weights_location)
yolo_model.to(compute_device_name)

print(f"YOLO model loaded on device: {compute_device_name}")


In [None]:
# Step 3c: Run tracking pipeline for each queued video

if not run_video_tracking_stage:
    print("Video tracking stage skipped (run_video_tracking_stage=False).")
elif not video_file_paths_to_process:
    print("No videos queued. Nothing to process.")
else:
    queued_video_count = len(video_file_paths_to_process)
    print(f"Starting tracking stage for {queued_video_count} videos...")

    separator_line = "=" * 80
    # Positions are counted from 1 so the progress label reads "[1/N]".
    for video_index, video_file_path in enumerate(video_file_paths_to_process, start=1):
        print(separator_line)
        print(f"[{video_index}/{queued_video_count}] Processing: {video_file_path.name}")
        processing_pipeline.process_video(
            video_path=video_file_path,
            model=yolo_model,
            overwrite=overwrite_existing_tracking_outputs,
        )

    print("Tracking stage finished.")


## Step 4: Analysis Stage (Distance %, Threshold, Reactions, Plots)

This stage runs `locust_analysis.py` with your configured parameters.


In [None]:
# Step 4a: Build and run locust_analysis.py command

locust_analysis_script_path = project_root_directory.joinpath("locust_analysis.py")

# Options that are always passed, as (flag, value) pairs in command-line order.
always_passed_analysis_options = (
    ("--all-vids-dir", all_videos_directory),
    ("--output-dir", analysis_output_directory),
    ("--fps", analysis_fps),
    ("--odor-on-s", analysis_odor_on_seconds),
    ("--odor-off-s", analysis_odor_off_seconds),
    ("--threshold-k", analysis_threshold_k),
    ("--min-samples-over", analysis_min_samples_over_threshold),
)

# The script runs under the same interpreter as this notebook.
analysis_command = [sys.executable, str(locust_analysis_script_path)]
for option_flag, option_value in always_passed_analysis_options:
    analysis_command.extend((option_flag, str(option_value)))

# Optional arguments: appended only when configured.
if analysis_min_floor_pixels is not None:
    analysis_command.extend(("--min-floor-px", str(analysis_min_floor_pixels)))
if analysis_skip_plot_generation:
    analysis_command.append("--skip-plots")
if analysis_skip_combined_csv_outputs:
    analysis_command.append("--skip-combined")

if not run_analysis_stage:
    print("Analysis stage skipped (run_analysis_stage=False).")
else:
    run_cli_command(analysis_command, raise_on_error=True)


## Step 5: Optional Dropped-Frame Export Stage

This stage runs `export_dropped_frames.py` to save frames where distance is invalid or below threshold.


In [None]:
# Step 5a: Build and run export_dropped_frames.py command

export_dropped_frames_script_path = project_root_directory.joinpath("export_dropped_frames.py")

# The command is assembled even when the stage is disabled, so it can still be
# inspected; it is executed only when the toggle is on.
export_frames_command = [sys.executable, str(export_dropped_frames_script_path)]
export_frames_command += ["--all-vids-dir", str(all_videos_directory)]
export_frames_command += ["--output-dir", str(dropped_frame_export_directory)]
export_frames_command += ["--min-px", str(dropped_frame_minimum_pixels)]

if not run_dropped_frame_export_stage:
    print("Dropped-frame export stage skipped (run_dropped_frame_export_stage=False).")
else:
    run_cli_command(export_frames_command, raise_on_error=True)


## Step 6: Quick Output Inspection

Use these cells to confirm the pipeline created the expected files.


In [None]:
# Step 6a: Show important output files

# Entries checked inside the analysis output directory.
analysis_output_entry_names = (
    "locust_combined_long.csv",
    "locust_combined_trials.csv",
    "traces",
    "reaction_matrix",
    "dataset_means",
)

important_output_paths = [
    analysis_output_directory / entry_name for entry_name in analysis_output_entry_names
]
important_output_paths.append(dropped_frame_export_directory)

# Report presence only; nothing is opened or modified here.
for output_path in important_output_paths:
    print(f"{output_path} -> exists={output_path.exists()}")


In [None]:
# Step 6b: Preview first rows of combined trials CSV (if available)

combined_trials_csv_path = analysis_output_directory.joinpath("locust_combined_trials.csv")

if not combined_trials_csv_path.exists():
    # A missing file is reported rather than raised, so the cell can always be run.
    print(f"File not found: {combined_trials_csv_path}")
else:
    combined_trials_dataframe = pd.read_csv(combined_trials_csv_path)
    print(f"Rows: {len(combined_trials_dataframe)}")
    preview_rows = combined_trials_dataframe.head(10)
    display(preview_rows)


## Optional Convenience Cell: Run Main Stages Sequentially

Set stage toggles in Step 1 first. The cell below does not execute any stage itself; it prints the recommended order in which to run the stage cells above.


In [None]:
# Reminder of the intended stage order.
# Nothing is executed here: the cell only prints the recommended sequence.

recommended_run_order = (
    "1) Step 2 (optional file organization)",
    "2) Step 3 (video processing)",
    "3) Step 4 (analysis)",
    "4) Step 5 (optional dropped-frame export)",
    "5) Step 6 (output inspection)",
)

print("Recommended sequential run order:")
for reminder_line in recommended_run_order:
    print(reminder_line)
