# Setup

In [None]:
import os

# Set CUDA_LAUNCH_BLOCKING=1 so CUDA errors are caught immediately instead of
# at some later, unrelated call.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

# Root for every path built further down: the working directory at the moment
# this cell is executed.
dir_path = os.getcwd()

In [None]:
! pip install papermill

Collecting papermill
  Downloading papermill-2.6.0-py3-none-any.whl (38 kB)
Collecting entrypoints
  Downloading entrypoints-0.4-py3-none-any.whl (5.3 kB)
Collecting ansicolors
  Downloading ansicolors-1.1.8-py2.py3-none-any.whl (13 kB)
Collecting nbformat>=5.2.0
  Downloading nbformat-5.10.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nbclient>=0.2.0
  Downloading nbclient-0.10.0-py3-none-any.whl (25 kB)
Collecting tenacity>=5.0.2
  Downloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Collecting jsonschema>=2.6
  Downloading jsonschema-4.23.0-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.5/88.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastjsonschema>=2.15
  Downloading fastjsonschema-2.20.0-py3-none-any.whl (23 kB)
Collecting referencing>=0.28.4
  Downloading referencing-0.35.1-py3-none-any.whl (26 kB)


In [None]:
import papermill as pm
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union, Tuple


# One notebook entry inside a step: string keys mapping to either a string
# (e.g. the notebook path) or a dict (e.g. the parameters passed to it).
NotebookStep = Dict[str, Union[str, Dict]]
# One pipeline step: string keys mapping to either a bool (e.g. 'parallel')
# or a list of notebook entries (e.g. 'notebooks').
StepInfo = Dict[str, Union[bool, List[NotebookStep]]]
# A whole pipeline: step name -> step description.
StepsDict = Dict[str, StepInfo]

In [None]:
def set_kernel(notebook_path: str, kernel_name: str) -> None:
    """
    Point a notebook file at the given kernel, rewriting the file in place.

    The notebook JSON is loaded, ``metadata.kernelspec.name`` and
    ``metadata.kernelspec.display_name`` are both set to ``kernel_name``
    (creating ``metadata`` / ``kernelspec`` when absent), and the document is
    written back with a 4-space indent and non-ASCII characters kept as-is.

    Args:
    notebook_path (str): Path to the notebook file.
    kernel_name (str): Name of the kernel to set.
    """
    with open(notebook_path, 'r', encoding='utf-8') as src:
        document = json.load(src)

    # Create the nested containers on demand, then fill in both name fields.
    kernelspec = document.setdefault('metadata', {}).setdefault('kernelspec', {})
    for field in ('name', 'display_name'):
        kernelspec[field] = kernel_name

    with open(notebook_path, 'w', encoding='utf-8') as dst:
        json.dump(document, dst, ensure_ascii=False, indent=4)

    print(f"Kernel set to '{kernel_name}' for notebook: {notebook_path}")


def get_current_kernel() -> str:
    """
    Get the kernel name of the current notebook.

    Looks for a ``--KernelManager.kernel_name=<name>`` argument on this
    process's command line (read from ``/proc/self/cmdline``) and returns
    ``<name>``.

    Returns:
    - The kernel name found on the command line, or 'python3' when the flag
      is absent or the command line cannot be read (e.g. on a platform
      without ``/proc``).
    """
    flag = '--KernelManager.kernel_name='

    try:
        with open('/proc/self/cmdline', 'r') as f:
            cmdline = f.read()
    except OSError:
        # No readable /proc entry: behave exactly as if the flag were absent.
        return 'python3'

    # Arguments in /proc/<pid>/cmdline are NUL-separated.
    for part in cmdline.split('\x00'):
        if flag in part:
            # Keep everything after the flag so a name containing '=' survives.
            return part.partition(flag)[2]
    return 'python3'


def execute_notebook(notebook: str, parameters: Dict, is_skip_on_failure: bool) -> Tuple[str, bool]:
    """
    Execute a notebook with papermill from inside the notebook's own directory.

    The target notebook's kernelspec is first rewritten to match the current
    kernel, then the notebook is executed and the result is written next to it
    as ``<name>_output<ext>``. The working directory is restored afterwards,
    whether or not execution succeeded.

    Parameters:
    - notebook: Path to the notebook to execute. May be absolute, relative, or
      a bare filename; it is resolved against the working directory at call time.
    - parameters: Parameters for the notebook.
    - is_skip_on_failure: Whether the notebook is allowed to fail without
      stopping the pipeline.

    Returns:
    - notebook: Path to the notebook, exactly as it was passed in.
    - success: True if the notebook executed successfully, False if it failed
      and is_skip_on_failure is set.

    Raises:
    - Whatever exception the execution raised, when is_skip_on_failure is False.
    """
    original_dir = os.getcwd()
    # Resolve to an absolute path BEFORE changing directory: a bare filename has
    # an empty dirname (os.chdir('') raises), and a relative path would no longer
    # point at the same file once the working directory has moved.
    notebook_path = os.path.abspath(notebook)
    notebook_dir = os.path.dirname(notebook_path)

    try:
        # NOTE: os.chdir affects the whole process, so notebooks run from several
        # threads at once share (and may overwrite) each other's working directory.
        os.chdir(notebook_dir)

        # Set the kernel of the target notebook to be the same as the current notebook
        current_kernel = get_current_kernel()
        set_kernel(notebook_path, current_kernel)

        # Only touch the file extension; str.replace would also rewrite any
        # '.ipynb' occurring in a directory name (e.g. '.ipynb_checkpoints').
        root, ext = os.path.splitext(notebook_path)
        output_notebook = f"{root}_output{ext}"
        pm.execute_notebook(
            input_path=notebook_path,
            output_path=output_notebook,
            parameters=parameters
        )
        print(
            f"Finished executing {notebook}. Output saved to {output_notebook}")
        return notebook, True
    except Exception as e:
        print(f"Error executing {notebook}: {e}")
        if is_skip_on_failure:
            print(f"Skipping {notebook} due to failure.")
            return notebook, False
        else:
            raise
    finally:
        os.chdir(original_dir)


def run_notebooks_in_parallel(notebooks: List[Dict]) -> None:
    """
    Run all notebooks of one step concurrently, one worker thread per notebook.

    Parameters:
    - notebooks: Notebook entries; each needs 'notebook' and 'parameters' and
      may carry 'is_skip_on_failure' (defaults to False). An empty list is a
      no-op.

    Raises:
    - Any exception raised by execute_notebook for an entry; it surfaces here
      through ``future.result()``.
    """
    if not notebooks:
        # ThreadPoolExecutor rejects max_workers=0, so an empty step does nothing.
        return

    with ThreadPoolExecutor(max_workers=len(notebooks)) as executor:
        # Map each future to the entry that produced it, so the entry is found
        # directly instead of being searched for by path (which is ambiguous
        # when the same notebook is listed twice).
        entry_by_future = {
            executor.submit(
                execute_notebook,
                nb['notebook'],
                nb['parameters'],
                nb.get('is_skip_on_failure', False),
            ): nb
            for nb in notebooks
        }
        for future in as_completed(entry_by_future):
            notebook, success = future.result()
            entry = entry_by_future[future]
            if not success and not entry.get('is_skip_on_failure', False):
                print(f"Aborting due to failure in {notebook}")
                return


def run_notebooks_sequentially(notebooks: List[Dict]) -> None:
    """
    Run the notebooks of one step one after another, in list order.

    Each entry needs 'notebook' and 'parameters'; 'is_skip_on_failure' is
    optional and defaults to False. The loop stops at the first notebook that
    reports failure without being marked as skippable.
    """
    for entry in notebooks:
        path = entry['notebook']
        params = entry['parameters']
        may_fail = entry.get('is_skip_on_failure', False)

        executed_path, succeeded = execute_notebook(path, params, may_fail)
        if succeeded or may_fail:
            continue

        print(f"Aborting due to failure in {executed_path}")
        return


def run_notebooks(steps: Dict[str, Dict[str, Union[bool, List[Dict]]]]) -> None:
    """
    Walk through the pipeline steps in order and run each step's notebooks.

    A banner naming the step is printed before it starts. Steps whose
    'parallel' flag is truthy run their notebooks concurrently; all other
    steps run them one after another.

    Parameters:
    - steps: Mapping of step name to a dict holding 'parallel' (bool) and
      'notebooks' (list of notebook entries).
    """
    for step_name, step_info in steps.items():
        # Read both required keys up front, before anything is printed or run.
        use_threads, step_notebooks = step_info['parallel'], step_info['notebooks']

        print(
            f'----------------Step {step_name} starting----------------')

        runner = run_notebooks_in_parallel if use_threads else run_notebooks_sequentially
        runner(step_notebooks)

## Data extraction pipeline

In [None]:
# Folder, under the notebook root, that holds the data-extraction notebooks
# and everything they write.
data_extraction_path = f'{dir_path}/data_extraction'
# Video directory passed as 'videos_dir' to the extraction notebooks.
dataset_path = f'{data_extraction_path}/dataset/AIC_Video'

In [None]:
# Directories referenced by more than one step, named once so the hand-off
# between steps (one step's save dir is a later step's input dir) is visible.
_transnet_dir = f'{data_extraction_path}/transnet'
_audio_dir = f'{data_extraction_path}/audio'
_metadata_dir = f'{data_extraction_path}/metadata'
_scene_json_dir = f'{_transnet_dir}/SceneJSON'
_keyframes_dir = f'{_transnet_dir}/Keyframes'
_extracted_audio_dir = f'{_audio_dir}/Audio'

data_extraction_pipeline = {
    # Step 1: both notebooks read the raw videos.
    'Extract data': dict(
        parallel=True,
        notebooks=[
            dict(
                notebook=f'{_transnet_dir}/transnetv2.ipynb',
                parameters=dict(
                    videos_dir=dataset_path,
                    save_dir=_scene_json_dir,
                ),
            ),
            dict(
                notebook=f'{_audio_dir}/audio_extraction.ipynb',
                parameters=dict(
                    videos_dir=dataset_path,
                    save_dir=_extracted_audio_dir,
                ),
            ),
        ],
    ),
    # Step 2: consumes the SceneJSON and Audio directories written by step 1.
    'Cut frames & transcribe audio': dict(
        parallel=True,
        notebooks=[
            dict(
                notebook=f'{_transnet_dir}/cutframe.ipynb',
                parameters=dict(
                    videos_dir=dataset_path,
                    scene_json_dirs=_scene_json_dir,
                    save_dir_all=_keyframes_dir,
                    metadata_dir_all=f'{_transnet_dir}/Keyframes_Metadata',
                    extract_metadata_only=False,
                    num_frames_per_segment=5,
                ),
            ),
            dict(
                notebook=f'{_audio_dir}/audio_detection.ipynb',
                parameters=dict(
                    audios_dir=_extracted_audio_dir,
                    save_dir=f'{_audio_dir}/audio_detection',
                ),
            ),
        ],
    ),
    # Step 3: every notebook reads the Keyframes directory written by step 2.
    'Extract metadata': dict(
        parallel=True,
        notebooks=[
            dict(
                notebook=f'{data_extraction_path}/clip/clipv2.ipynb',
                parameters=dict(
                    bs=4,
                    keyframes_dir=_keyframes_dir,
                    save_dir=f'{data_extraction_path}/clip/CLIPv2_features',
                ),
            ),
            dict(
                notebook=f'{_metadata_dir}/easyocr.ipynb',
                parameters=dict(
                    bs=16,
                    keyframes_dir=_keyframes_dir,
                    save_dir=f'{_metadata_dir}/ocr',
                ),
            ),
            dict(
                notebook=f'{_metadata_dir}/object_extraction.ipynb',
                parameters=dict(
                    keyframes_dir=_keyframes_dir,
                    save_dir=f'{_metadata_dir}/object_extraction',
                ),
            ),
        ],
    ),
}

In [None]:
# Run the three data-extraction steps in the order they are declared in
# data_extraction_pipeline; later steps read what earlier steps saved.
run_notebooks(data_extraction_pipeline)

----------------Step Extract data starting----------------
Kernel set to 'python3' for notebook: /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/transnetv2.ipynb
Kernel set to 'python3' for notebook: /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/audio/audio_extraction.ipynb


  from .autonotebook import tqdm as notebook_tqdm
Executing:  36%|███▋      | 4/11 [00:01<00:01,  3.85cell/s]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enabl

## Indexing pipeline

In [None]:
# Folder, under the notebook root, that holds the indexing notebooks.
indexing_path = f'{dir_path}/indexing'

In [None]:
indexing_pipeline = {
    # Step 1: reads the keyframe metadata and object-extraction output
    # directories under data_extraction_path.
    'Preparation data': dict(
        parallel=True,
        notebooks=[
            dict(
                # NOTE(review): this is a bare filename, unlike every other
                # entry, which is anchored to a directory. Presumably it should
                # live under indexing_path - confirm where the notebook is.
                notebook='data_preparation.ipynb',
                parameters=dict(
                    combined_keyframes_metadata_filename='keyframes_metadata.json',
                    keyframes_metadata_dir=f'{data_extraction_path}/transnet/Keyframes_Metadata',
                    combined_object_extraction_filename='object_extraction_metadata.json',
                    object_extraction_dir=f'{data_extraction_path}/metadata/object_extraction/object_detection',
                ),
            ),
        ],
    ),
    # Step 2: reads the CLIPv2 feature directory under data_extraction_path.
    'Indexing': dict(
        parallel=True,
        notebooks=[
            dict(
                notebook=f'{indexing_path}/create_faiss_bin.ipynb',
                parameters=dict(
                    feature_shape=1024,
                    features_dir=f'{data_extraction_path}/clip/CLIPv2_features',
                    cpu_bin_name='faiss_clipv2_cosine_cpu.bin',
                    gpu_bin_name='faiss_clipv2_cosine_gpu.bin',
                ),
            ),
        ],
    ),
}

In [None]:
# Run the two indexing steps in declaration order: 'Preparation data', then 'Indexing'.
run_notebooks(indexing_pipeline)

  from .autonotebook import tqdm as notebook_tqdm


----------------Step Indexing starting----------------
Kernel set to 'python3' for notebook: /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/indexing/create_faiss_bin.ipynb


Executing:   0%|          | 0/9 [00:00<?, ?cell/s]

Executing: 100%|██████████| 9/9 [00:04<00:00,  1.94cell/s]

Finished executing /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/indexing/create_faiss_bin.ipynb. Output saved to /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/indexing/create_faiss_bin_output.ipynb



