# Setup

In [1]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored there
# can be reached through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the kernel's working directory to the project's notebooks folder on the mounted Drive.
%cd /content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks

/content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks


In [3]:
import os

# Base directory used to build the pipeline paths further down; it is whatever
# the working directory happens to be when this cell runs.
dir_path = os.getcwd()

# Set CUDA_LAUNCH_BLOCKING=1 to catch CUDA errors immediately
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Sanity check: show the base directory the pipeline paths are derived from.
print(dir_path)

/content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks


In [4]:
!pip install --upgrade papermill

Collecting papermill
  Downloading papermill-2.6.0-py3-none-any.whl.metadata (13 kB)
Collecting ansicolors (from papermill)
  Downloading ansicolors-1.1.8-py2.py3-none-any.whl.metadata (9.0 kB)
Downloading papermill-2.6.0-py3-none-any.whl (38 kB)
Downloading ansicolors-1.1.8-py2.py3-none-any.whl (13 kB)
Installing collected packages: ansicolors, papermill
Successfully installed ansicolors-1.1.8 papermill-2.6.0


In [5]:
import papermill as pm
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union, Tuple


# Type aliases mirroring the nested pipeline configuration dictionaries.
# One notebook entry: string keys mapped to either a string or a nested dict.
NotebookStep = Dict[str, Union[str, Dict]]  # Notebook type
# One step: string keys mapped to either a bool or a list of notebook entries.
StepInfo = Dict[str, Union[bool, List[NotebookStep]]]
# Whole pipeline: step name -> step description.
StepsDict = Dict[str, StepInfo]

In [6]:
def set_kernel(notebook_path: str, kernel_name: str) -> None:
    """
    Rewrite a notebook file so that its kernelspec points at the given kernel.

    The notebook JSON is loaded, ``metadata.kernelspec.name`` and
    ``metadata.kernelspec.display_name`` are both set to ``kernel_name``
    (missing ``metadata`` / ``kernelspec`` objects are created on the fly),
    and the file is written back in place with a 4-space indent.

    Args:
    notebook_path (str): Path to the notebook file.
    kernel_name (str): Name of the kernel to set.
    """
    with open(notebook_path, 'r', encoding='utf-8') as src:
        nb_json = json.load(src)

    # Create the nested containers only when they are absent.
    kernelspec = nb_json.setdefault('metadata', {}).setdefault('kernelspec', {})
    for field in ('name', 'display_name'):
        kernelspec[field] = kernel_name

    with open(notebook_path, 'w', encoding='utf-8') as dst:
        json.dump(nb_json, dst, ensure_ascii=False, indent=4)

    print(f"Kernel set to '{kernel_name}' for notebook: {notebook_path}")


def get_current_kernel() -> str:
    """
    Get the kernel name of the current notebook.

    Scans this process's command line (``/proc/self/cmdline``) for a
    ``--KernelManager.kernel_name=<name>`` argument.

    Returns:
    The value of that argument, or ``'python3'`` when the argument is absent
    or empty, or when the command line cannot be read (for example on a
    platform without ``/proc``).
    """
    flag = '--KernelManager.kernel_name='
    try:
        with open('/proc/self/cmdline', 'r') as f:
            cmdline = f.read()
    except OSError:
        # No readable /proc entry: use the same default as "flag not found".
        return 'python3'

    # Arguments in /proc/<pid>/cmdline are separated by NUL bytes.
    for part in cmdline.split('\x00'):
        if flag in part:
            # Take everything after the flag so a name containing '=' is kept whole.
            name = part.split(flag, 1)[1]
            if name:
                return name
    return 'python3'


def execute_notebook(notebook: str, parameters: Dict, is_skip_on_failure: bool) -> Tuple[str, bool]:
    """
    Execute a notebook with papermill from inside the notebook's own directory.

    The target notebook's kernelspec is first rewritten to the kernel reported
    by ``get_current_kernel()``; the executed copy is saved next to the input
    as ``<name>_output<ext>``. The working directory is restored afterwards,
    whether or not execution succeeded.

    Note: ``os.chdir`` changes the working directory of the whole process, so
    concurrent calls from several threads (as ``run_notebooks_in_parallel``
    makes) can observe each other's directory changes.

    Parameters:
    - notebook: Path to the notebook to execute (absolute or relative to the
      current working directory).
    - parameters: Parameters for the notebook.
    - is_skip_on_failure: Check if notebook allowed to fail without stopping the pipeline.

    Returns:
    - notebook: Path to the notebook, exactly as it was passed in.
    - success: True if the notebook executed successfully, False if there was
      an error and ``is_skip_on_failure`` is set.

    Raises:
    - Whatever exception the execution raised, when ``is_skip_on_failure`` is False.
    """
    original_dir = os.getcwd()

    # Resolve the path before changing directory: a relative path would point
    # somewhere else after os.chdir, and a bare filename has an empty dirname
    # that os.chdir rejects.
    notebook_path = os.path.abspath(notebook)
    notebook_dir = os.path.dirname(notebook_path)

    # Only touch the final extension; a plain str.replace would also rewrite
    # any ".ipynb" that occurs inside a directory name.
    root, ext = os.path.splitext(notebook_path)
    output_notebook = f"{root}_output{ext}"

    try:
        os.chdir(notebook_dir)

        # Set the kernel of the target notebook to be the same as the current notebook
        current_kernel = get_current_kernel()
        set_kernel(notebook_path, current_kernel)

        pm.execute_notebook(
            input_path=notebook_path,
            output_path=output_notebook,
            parameters=parameters
        )
        print(
            f"Finished executing {notebook}. Output saved to {output_notebook}")
        return notebook, True
    except Exception as e:
        print(f"Error executing {notebook}: {e}")
        if is_skip_on_failure:
            print(f"Skipping {notebook} due to failure.")
            return notebook, False
        raise
    finally:
        os.chdir(original_dir)


def run_notebooks_in_parallel(notebooks: List[Dict]) -> None:
    """
    Run notebooks in parallel, one worker thread per notebook entry.

    Parameters:
    - notebooks: Entries with a 'notebook' path, a 'parameters' dict and an
      optional 'is_skip_on_failure' flag (defaults to False).

    Results are consumed as they complete. An exception raised by a notebook
    that is not allowed to fail propagates out of this function. Leaving the
    executor context still waits for notebooks that are already running.
    """
    if not notebooks:
        # ThreadPoolExecutor rejects max_workers=0, so an empty step is a no-op.
        return

    with ThreadPoolExecutor(max_workers=len(notebooks)) as executor:
        # Remember which entry each future belongs to instead of searching by
        # path afterwards, which picks the wrong entry when a path is listed twice.
        future_to_nb = {
            executor.submit(
                execute_notebook,
                nb['notebook'],
                nb['parameters'],
                nb.get('is_skip_on_failure', False),
            ): nb
            for nb in notebooks
        }
        for future in as_completed(future_to_nb):
            notebook, success = future.result()
            if not success and not future_to_nb[future].get('is_skip_on_failure', False):
                print(f"Aborting due to failure in {notebook}")
                return


def run_notebooks_sequentially(notebooks: List[Dict]) -> None:
    """
    Run notebooks one after another, in list order.

    Each entry supplies a 'notebook' path, a 'parameters' dict and an optional
    'is_skip_on_failure' flag (False when omitted). Processing stops at the
    first notebook that reports failure without being allowed to fail.
    """
    for entry in notebooks:
        path = entry['notebook']
        params = entry['parameters']
        may_fail = entry.get('is_skip_on_failure', False)

        path, succeeded = execute_notebook(path, params, may_fail)
        if succeeded or may_fail:
            continue

        print(f"Aborting due to failure in {path}")
        return


def run_notebooks(steps: Dict[str, Dict[str, Union[bool, List[Dict]]]]) -> None:
    """
    Walk through the pipeline steps in dictionary order and run each one.

    A step whose 'parallel' flag is truthy has its notebooks run in parallel;
    otherwise they are run sequentially. A banner line is printed before each step.

    Parameters:
    - steps: Dictionary mapping a step name to a dict with the keys
      'parallel' (flag) and 'notebooks' (list of notebook entries).
    """
    for step_name, step_info in steps.items():
        is_parallel, notebooks = step_info['parallel'], step_info['notebooks']
        print(
            f'----------------Step {step_name} starting----------------')
        # Pick the runner once, then hand it the step's notebooks.
        runner = run_notebooks_in_parallel if is_parallel else run_notebooks_sequentially
        runner(notebooks)

## Data extraction pipeline

In [4]:
import torch
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
# Root folder of the data-extraction notebooks, and the folder passed to them
# as 'videos_dir'; both live under the base directory stored in dir_path.
data_extraction_path = f'{dir_path}/data_extraction'
dataset_path = f'{data_extraction_path}/dataset/AIC_Video'

In [None]:
# Pipeline description consumed by run_notebooks: step name -> {'parallel', 'notebooks'}.
# Steps run in the order written here; inside a step, 'parallel': True means the
# listed notebooks are executed concurrently. Each notebook entry carries the
# notebook path and the parameters injected into it by papermill.
data_extraction_pipeline = {
    # Step 1: both notebooks take the raw videos directory as input.
    'Extract data': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/transnet/transnetv2.ipynb',
                'parameters': {
                    'videos_dir': dataset_path,
                    'save_dir': f'{data_extraction_path}/transnet/SceneJSON',
                },
            },
            {
                'notebook': f'{data_extraction_path}/audio/audio_extraction.ipynb',
                'parameters': {
                    'videos_dir': dataset_path,
                    'save_dir': f'{data_extraction_path}/audio/Audio',
                },
            },
        ],
    },
    # Step 2: parameters point at the 'save_dir' folders of step 1
    # (SceneJSON for cutframe, Audio for audio_detection).
    'Cut frames & transcribe audio': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/transnet/cutframe.ipynb',
                'parameters': {
                    'videos_dir': dataset_path,
                    'scene_json_dirs': f'{data_extraction_path}/transnet/SceneJSON',
                    'save_dir_all': f'{data_extraction_path}/transnet/Keyframes',
                    'num_frames_per_segment': 5,
                },
            },
            {
                'notebook': f'{data_extraction_path}/audio/audio_detection.ipynb',
                'parameters': {
                    'audios_dir': f'{data_extraction_path}/audio/Audio',
                    'save_dir': f'{data_extraction_path}/audio/audio_detection',
                },
            },
        ],
    },
    # Step 3: every notebook receives the Keyframes folder that cutframe was
    # given as 'save_dir_all'. 'bs' is presumably a batch size -- confirm in the
    # target notebooks.
    'Extract metadata': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/clip/clipv2.ipynb',
                'parameters': {
                    'bs': 4,
                    'keyframes_dir': f'{data_extraction_path}/transnet/Keyframes',
                    'save_dir': f'{data_extraction_path}/clip/CLIPv2_features',
                },
            },
            {
                'notebook': f'{data_extraction_path}/metadata/easyocr.ipynb',
                'parameters': {
                    'bs': 16,
                    'keyframes_dir': f'{data_extraction_path}/transnet/Keyframes',
                    'save_dir': f'{data_extraction_path}/metadata/ocr',
                },
            },
            {
                'notebook': f'{data_extraction_path}/metadata/object_extraction.ipynb',
                'parameters': {
                    'keyframes_dir': f'{data_extraction_path}/transnet/Keyframes',
                    'save_dir': f'{data_extraction_path}/metadata/object_extraction',
                },
            },
            {
                'notebook': f'{data_extraction_path}/metadata/color.ipynb',
                'parameters': {
                    'keyframes_dir': f'{data_extraction_path}/transnet/Keyframes',
                    'save_dir': f'{data_extraction_path}/metadata/color',
                },
            },
        ],
    }
}

In [None]:
# Execute every step of the data-extraction pipeline, in the order defined above.
run_notebooks(data_extraction_pipeline)

## Indexing pipeline

In [None]:
# Folder holding the indexing notebooks, under the base directory stored in dir_path.
indexing_path = f'{dir_path}/indexing'

In [None]:
# Single-step pipeline: runs create_faiss_bin.ipynb on the CLIPv2 feature folder
# that the data-extraction pipeline passes to clipv2.ipynb as 'save_dir'.
# 'feature_shape' is presumably the feature vector dimensionality -- confirm in
# the target notebook.
indexing_pipeline = {
    'Indexing': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{indexing_path}/create_faiss_bin.ipynb',
                'parameters': {
                    'feature_shape': 512,
                    'features_dir': f'{data_extraction_path}/clip/CLIPv2_features',
                    'cpu_bin_name': 'faiss_clipv2_cosine_cpu.bin',
                    'gpu_bin_name': 'faiss_clipv2_cosine_gpu.bin',
                },
            },
        ],
    },
}

In [None]:
# Execute the indexing pipeline defined above.
run_notebooks(indexing_pipeline)

----------------Step Indexing starting----------------
Kernel set to 'python3' for notebook: /content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks/indexing/create_faiss_bin.ipynb


Executing:   0%|          | 0/9 [00:00<?, ?cell/s]

ERROR:papermill:unhandled iopub msg: colab_request
ERROR:papermill:unhandled iopub msg: colab_request
ERROR:papermill:unhandled iopub msg: colab_request


Finished executing /content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks/indexing/create_faiss_bin.ipynb. Output saved to /content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks/indexing/create_faiss_bin_output.ipynb


## Query