# Setup

In [None]:
import os

# Current working directory at the time this cell runs; the notebook and
# data paths in the pipeline definition are built relative to it.
dir_path = os.getcwd()

# Set CUDA_LAUNCH_BLOCKING=1 to catch CUDA errors immediately
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
! pip install papermill

In [None]:
import papermill as pm
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union, Tuple


# One notebook entry: the notebook path ('notebook'), the parameters passed
# to it ('parameters') and the optional boolean 'is_skip_on_failure' flag.
NotebookStep = Dict[str, Union[str, Dict, bool]]
# One pipeline step: the 'parallel' flag plus the list of notebooks it runs.
StepInfo = Dict[str, Union[bool, List[NotebookStep]]]
# The whole pipeline: step name -> step description.
StepsDict = Dict[str, StepInfo]

In [None]:
def execute_notebook(notebook: str, parameters: Dict, is_skip_on_failure: bool) -> Tuple[str, bool]:
    """
    Execute a notebook with papermill and report whether it succeeded.

    The executed copy is written next to the input, with "_output" inserted
    before the file extension (e.g. "a/nb.ipynb" -> "a/nb_output.ipynb").
    The notebook is executed with its own folder as the working directory.

    NOTE(review): the working directory is a process-wide setting; confirm how
    papermill applies `cwd` before running notebooks that live in different
    folders in parallel.

    Parameters:
    - notebook: Path to the notebook to execute.
    - parameters: Parameters for the notebook.
    - is_skip_on_failure: Whether the notebook is allowed to fail without stopping the pipeline.

    Returns:
    - notebook: Path to the notebook, exactly as it was passed in.
    - success: True if the notebook executed successfully, False if it failed
      and is_skip_on_failure is set.

    Raises:
    - Exception: the original error is re-raised when the notebook fails and
      is_skip_on_failure is False.
    """
    # A bare file name has no directory part; None tells papermill to keep the
    # current working directory instead of trying to switch to "".
    notebook_dir = os.path.dirname(notebook) or None

    # Touch only the extension: str.replace would also rewrite ".ipynb" inside
    # directory names such as ".ipynb_checkpoints".
    root, ext = os.path.splitext(notebook)
    output_notebook = f"{root}_output{ext}"

    try:
        # Let papermill handle the working directory so the input and output
        # paths are not re-resolved against the notebook's folder.
        pm.execute_notebook(
            input_path=notebook,
            output_path=output_notebook,
            parameters=parameters,
            cwd=notebook_dir,
        )
    except Exception as e:
        print(f"Error executing {notebook}: {e}")
        if is_skip_on_failure:
            print(f"Skipping {notebook} due to failure.")
            return notebook, False
        raise

    print(f"Finished executing {notebook}. Output saved to {output_notebook}")
    return notebook, True


def run_notebooks_in_parallel(notebooks: List[Dict]) -> None:
    """
    Run all notebooks of one step concurrently, one worker thread per notebook.

    Parameters:
    - notebooks: Notebook entries. Each entry needs the 'notebook' and
      'parameters' keys and may set 'is_skip_on_failure' (defaults to False).

    Raises:
    - Whatever execute_notebook raises for an entry; the exception is
      re-raised here by future.result().
    """
    # ThreadPoolExecutor rejects max_workers=0, so an empty step is a no-op.
    if not notebooks:
        return

    with ThreadPoolExecutor(max_workers=len(notebooks)) as executor:
        # Remember which entry each future belongs to, so a result is always
        # attributed to the entry that produced it, even when the same
        # notebook path is listed more than once.
        future_to_entry = {
            executor.submit(
                execute_notebook,
                nb['notebook'],
                nb['parameters'],
                nb.get('is_skip_on_failure', False),
            ): nb
            for nb in notebooks
        }
        for future in as_completed(future_to_entry):
            notebook, success = future.result()
            entry = future_to_entry[future]
            if not success and not entry.get('is_skip_on_failure', False):
                print(f"Aborting due to failure in {notebook}")
                # Leaving the with-block still waits for notebooks that are
                # already running before control returns to the caller.
                return


def run_notebooks_sequentially(notebooks: List[Dict]) -> None:
    """
    Run the notebooks of one step one after another, in list order.

    Each entry supplies 'notebook' and 'parameters', plus an optional
    'is_skip_on_failure' flag (False when absent). The loop stops at the first
    notebook that reports failure without being allowed to fail.
    """
    for entry in notebooks:
        path = entry['notebook']
        params = entry['parameters']
        may_fail = entry.get('is_skip_on_failure', False)

        executed_path, succeeded = execute_notebook(path, params, may_fail)
        if succeeded or may_fail:
            continue

        print(f"Aborting due to failure in {executed_path}")
        return


def run_notebooks(steps: Dict[str, Dict[str, Union[bool, List[Dict]]]]) -> None:
    """
    Walk through the pipeline steps in dictionary order and run each one.

    A step whose 'parallel' flag is true has its notebooks executed
    concurrently; otherwise they are executed one after another.

    Parameters:
    - steps: Mapping of step name to a dict holding the 'parallel' flag and
      the 'notebooks' list for that step.
    """
    for step_name, step_info in steps.items():
        # Read both keys up front so a malformed step fails before anything runs.
        parallel_flag, members = step_info['parallel'], step_info['notebooks']
        print(
            f'----------------Step {step_name} starting----------------')
        runner = run_notebooks_in_parallel if parallel_flag else run_notebooks_sequentially
        runner(members)

In [None]:
# Pipeline definition consumed by run_notebooks: step name -> step description.
# Each step has a 'parallel' flag and a 'notebooks' list; every notebook entry
# gives the notebook path ('notebook') and the parameters passed to it
# ('parameters'). Steps and notebooks that are commented out are disabled;
# only the uncommented entries are executed.
pipeline = {
    # 'Extract scenes & audios': {
    #     'parallel': True,
    #     'notebooks': [
    #         {
    #             'notebook': f'{dir_path}/transnet/transnetv2.ipynb', 
    #             'parameters': {
    #                 'videos_dir': f'{dir_path}/transnet/AIC_Video',
    #                 'save_dir': f'{dir_path}/transnet/SceneJSON',
    #             },
    #         },
    #         {
    #             'notebook': f'{dir_path}/audio/audio_extraction.ipynb',
    #             'parameters': {
    #                 'videos_dir': f'{dir_path}/transnet/AIC_Video',
    #                 'save_dir': f'{dir_path}/audio/Audio',
    #             },
    #         },
    #     ],
    # },
    # 'Cut frames & transcribe audio': {
    #     'parallel': True,
    #     'notebooks': [
    #         {
    #             'notebook': f'{dir_path}/transnet/cutframe.ipynb',
    #             'parameters': {
    #                 'videos_dir': f'{dir_path}/transnet/AIC_Video',
    #                 'scene_json_dirs': f'{dir_path}/transnet/SceneJSON',
    #                 'save_dir_all': f'{dir_path}/transnet/Keyframes',
    #                 'num_frames_per_segment': 5,
    #             },
    #         },
    #         {
    #             'notebook': f'{dir_path}/audio/audio_detection.ipynb',
    #             'parameters': {
    #                 'audios_dir': f'{dir_path}/audio/Audio',
    #                 'save_dir': f'{dir_path}/audio/audio_detection',
    #             },
    #         },
    #     ],
    # },
    'Extract metadata': {
        'parallel': True,
        'notebooks': [
            # {
            #     'notebook': f'{dir_path}/metadata/easyocr.ipynb',
            #     'parameters': {
            #         'bs': 16,
            #         'keyframes_dir': f'{dir_path}/transnet/Keyframes',
            #         'save_dir': f'{dir_path}/metadata/ocr',
            #     },
            # },
            {
                'notebook': f'{dir_path}/metadata/object_extraction.ipynb',
                'parameters': {
                    'keyframes_dir': f'{dir_path}/transnet/Keyframes',
                    'save_dir': f'{dir_path}/metadata/object_extraction',
                },
            },
        ],
    }
}

In [None]:
# Run every enabled step of the pipeline, in the order the steps are defined.
run_notebooks(pipeline)