# Setup

In [None]:
import os

# Working directory of the kernel; the pipeline paths further down are built from it.
dir_path = os.getcwd()

# CUDA_LAUNCH_BLOCKING=1 makes CUDA errors surface immediately
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

In [None]:
! pip install papermill



In [None]:
import papermill as pm
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union, Tuple


# One notebook entry of a step: 'notebook' (path, str), 'parameters' (dict handed
# to papermill) and the optional 'is_skip_on_failure' flag (bool).
NotebookStep = Dict[str, Union[str, bool, Dict]]
# One pipeline step: 'parallel' (bool) and 'notebooks' (list of notebook entries).
StepInfo = Dict[str, Union[bool, List[NotebookStep]]]
# A whole pipeline: step name -> step description.
StepsDict = Dict[str, StepInfo]

In [None]:
def set_kernel(notebook_path: str, kernel_name: str) -> None:
    """Set the kernel for a notebook by rewriting its metadata in place.

    Parameters:
    - notebook_path: Path to the .ipynb file; the file is read as JSON and overwritten.
    - kernel_name: Value stored as both the kernelspec 'name' and 'display_name'.

    Any other kernelspec fields already present in the file are left untouched.
    """
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    # A notebook may come without a 'metadata' or 'kernelspec' section;
    # create the missing levels instead of failing with a KeyError.
    kernelspec = notebook.setdefault('metadata', {}).setdefault('kernelspec', {})
    kernelspec['name'] = kernel_name
    kernelspec['display_name'] = kernel_name

    with open(notebook_path, 'w', encoding='utf-8') as f:
        json.dump(notebook, f, ensure_ascii=False, indent=4)


def get_current_kernel() -> str:
    """Get the kernel name of the current notebook.

    The name is taken from a '--KernelManager.kernel_name=<name>' argument on
    this process's command line, read from /proc/self/cmdline.

    Returns:
    - The kernel name found on the command line, or 'python3' when the argument
      is absent or the command line cannot be read.
    """
    try:
        with open('/proc/self/cmdline', 'r') as f:
            cmdline = f.read()
    except OSError:
        # The /proc file is not available on every platform; use the default kernel.
        return 'python3'

    flag = '--KernelManager.kernel_name='
    # Arguments in /proc/self/cmdline are separated by NUL bytes.
    for part in cmdline.split('\x00'):
        if flag in part:
            # Keep everything after the flag, even if the value itself contains '='.
            return part.partition(flag)[2]
    return 'python3'


def execute_notebook(notebook: str, parameters: Dict, is_skip_on_failure: bool) -> Tuple[str, bool]:
    """
    Execute a notebook with papermill and handle errors if the notebook is allowed to fail.

    The target notebook's kernelspec is first rewritten (in place) to the kernel of the
    current notebook. The executed copy is written next to the input as
    '<name>_output<ext>', and the notebook runs with its own directory as working directory.

    Parameters:
    - notebook: Path to the notebook to execute.
    - parameters: Parameters for the notebook.
    - is_skip_on_failure: Check if notebook allowed to fail without stopping the pipeline.

    Returns:
    - notebook: Path to the notebook.
    - success: True if the notebook executed successfully, False if there was an error.

    Raises:
    - Whatever exception the execution raised, when is_skip_on_failure is False.
    """
    # The working directory is handed to papermill instead of calling os.chdir():
    # chdir is process-wide, and this function is also run from several worker
    # threads at once, which would then fight over the current directory.
    # abspath also covers a bare file name, whose dirname would be ''.
    notebook_dir = os.path.dirname(os.path.abspath(notebook))

    # Only the final extension is touched, so an '.ipynb' elsewhere in the path
    # (e.g. a '.ipynb_checkpoints' folder) is left alone and the output path can
    # never be identical to the input path.
    root, ext = os.path.splitext(notebook)
    output_notebook = f"{root}_output{ext}"

    try:
        # Set the kernel of the target notebook to be the same as the current notebook
        current_kernel = get_current_kernel()
        set_kernel(notebook, current_kernel)

        pm.execute_notebook(
            input_path=notebook,
            output_path=output_notebook,
            parameters=parameters,
            cwd=notebook_dir,
        )
        print(
            f"Finished executing {notebook}. Output saved to {output_notebook}")
        return notebook, True
    except Exception as e:
        print(f"Error executing {notebook}: {e}")
        if is_skip_on_failure:
            print(f"Skipping {notebook} due to failure.")
            return notebook, False
        raise


def run_notebooks_in_parallel(notebooks: List[Dict]) -> None:
    """Run notebooks in parallel, one worker thread per notebook.

    Parameters:
    - notebooks: Entries with the keys 'notebook' and 'parameters' and the optional
      key 'is_skip_on_failure' (defaults to False). An empty list is a no-op.

    Raises:
    - Any exception raised by execute_notebook for one of the entries.
    """
    if not notebooks:
        # ThreadPoolExecutor rejects max_workers=0, so there is nothing to start.
        return

    with ThreadPoolExecutor(max_workers=len(notebooks)) as executor:
        # Remember which entry each future belongs to, so the entry does not have
        # to be searched by path afterwards (which breaks for duplicated paths).
        future_to_entry = {
            executor.submit(
                execute_notebook, nb['notebook'], nb['parameters'], nb.get('is_skip_on_failure', False)): nb
            for nb in notebooks
        }
        for future in as_completed(future_to_entry):
            # result() re-raises an exception that escaped execute_notebook.
            notebook, success = future.result()
            cur_notebook = future_to_entry[future]
            if not success and not cur_notebook.get('is_skip_on_failure', False):
                print(f"Aborting due to failure in {notebook}")
                # Leaving the with-block still waits for the notebooks already running.
                return


def run_notebooks_sequentially(notebooks: List[Dict]) -> None:
    """Run notebooks one after another, in list order.

    Each entry provides 'notebook', 'parameters' and optionally
    'is_skip_on_failure' (False when absent). The loop stops at the first
    notebook that reports failure without being allowed to fail.
    """
    for entry in notebooks:
        path = entry['notebook']
        params = entry['parameters']
        may_fail = entry.get('is_skip_on_failure', False)

        executed, ok = execute_notebook(path, params, may_fail)
        if ok or may_fail:
            continue

        print(f"Aborting due to failure in {executed}")
        return


def run_notebooks(steps: Dict[str, Dict[str, Union[bool, List[Dict]]]]) -> None:
    """
    Walk through the steps in dictionary order and run the notebooks of each one.

    A step whose 'parallel' value is truthy has its notebooks run in parallel;
    otherwise they are run one after another.

    Parameters:
    - steps: Maps a step name to a dict with the keys 'parallel' and 'notebooks'.
    """
    for name, info in steps.items():
        run_concurrently, step_notebooks = info['parallel'], info['notebooks']
        print(
            f'----------------Step {name} starting----------------')
        if not run_concurrently:
            run_notebooks_sequentially(step_notebooks)
            continue
        run_notebooks_in_parallel(step_notebooks)

## Data extraction pipeline

In [None]:
# Folder holding the data-extraction notebooks, and the video dataset inside it.
data_extraction_path = '{}/data_extraction'.format(dir_path)
dataset_path = '{}/dataset/AIC_Video'.format(data_extraction_path)

In [None]:
# Directories written by one step and read by a later one.
scene_json_dir = f'{data_extraction_path}/transnet/SceneJSON'
audio_dir = f'{data_extraction_path}/audio/Audio'
keyframes_dir = f'{data_extraction_path}/transnet/Keyframes'

# Steps run in the order listed; the notebooks inside each step run in parallel.
data_extraction_pipeline = {
    # Videos -> scene JSON files and audio files.
    'Extract data': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/transnet/transnetv2.ipynb',
                'parameters': {'videos_dir': dataset_path, 'save_dir': scene_json_dir},
            },
            {
                'notebook': f'{data_extraction_path}/audio/audio_extraction.ipynb',
                'parameters': {'videos_dir': dataset_path, 'save_dir': audio_dir},
            },
        ],
    },
    # Scene JSON -> keyframes; audio files -> audio detection output.
    'Cut frames & transcribe audio': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/transnet/cutframe.ipynb',
                'parameters': {
                    'videos_dir': dataset_path,
                    'scene_json_dirs': scene_json_dir,
                    'save_dir_all': keyframes_dir,
                    'num_frames_per_segment': 5,
                },
            },
            {
                'notebook': f'{data_extraction_path}/audio/audio_detection.ipynb',
                'parameters': {
                    'audios_dir': audio_dir,
                    'save_dir': f'{data_extraction_path}/audio/audio_detection',
                },
            },
        ],
    },
    # Keyframes -> CLIP features, OCR output and object-extraction output.
    'Extract metadata': {
        'parallel': True,
        'notebooks': [
            {
                'notebook': f'{data_extraction_path}/clip/clipv2.ipynb',
                'parameters': {
                    'bs': 4,
                    'keyframes_dir': keyframes_dir,
                    'save_dir': f'{data_extraction_path}/clip/CLIPv2_features',
                },
            },
            {
                'notebook': f'{data_extraction_path}/metadata/easyocr.ipynb',
                'parameters': {
                    'bs': 16,
                    'keyframes_dir': keyframes_dir,
                    'save_dir': f'{data_extraction_path}/metadata/ocr',
                },
            },
            {
                'notebook': f'{data_extraction_path}/metadata/object_extraction.ipynb',
                'parameters': {
                    'keyframes_dir': keyframes_dir,
                    'save_dir': f'{data_extraction_path}/metadata/object_extraction',
                },
            },
        ],
    },
}

In [None]:
# Run the data-extraction steps in the order they are declared in the dict.
run_notebooks(data_extraction_pipeline)

----------------Step Extract data starting----------------


  from .autonotebook import tqdm as notebook_tqdm
Executing:   9%|▉         | 1/11 [00:01<00:11,  1.15s/cell]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enabl

Finished executing /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/audio/audio_extraction.ipynb. Output saved to /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/audio/audio_extraction_output.ipynb


Executing: 100%|██████████| 11/11 [25:45<00:00, 140.47s/cell]


Finished executing /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/transnetv2.ipynb. Output saved to /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/transnetv2_output.ipynb
----------------Step Cut frames & transcribe audio starting----------------


Executing:   9%|▉         | 1/11 [00:01<00:10,  1.02s/cell]2024-07-30 11:06:47.451431: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-30 11:06:47.545783: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Executing: 100%|██████████| 9/9 [12:35<00:00, 83.99s/cell] ]


Error executing /home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/cutframe.ipynb: 
---------------------------------------------------------------------------
Exception encountered at "In [7]":
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[7], line 2
      1 all_video_paths = parse_video_info(videos_dir)
----> 2 process_videos(all_video_paths, scene_json_dirs, save_dir_all, num_frames_per_segment)

Cell In[6], line 68, in process_videos(all_video_paths, scene_json_dirs, save_dir_all, num_frames_per_segment)
     65 video_scene_path = f'{scene_json_dirs}/{key}/{video_id}.json'
     67 save_dir_video = f'{save_dir}/{video_id}'
---> 68 sample_frames(video_path, video_scene_path,
     69               save_dir_video, num_frames_per_segment)

Cell In[6], line 30, in sample_frames(video_path, scene_json_path, save_dir, num_frames_per_segm

KeyboardInterrupt: 

## Indexing pipeline

In [None]:
# Folder holding the indexing notebooks.
indexing_path = '{}/indexing'.format(dir_path)

In [None]:
# Single step with one notebook; features_dir is the directory the clipv2
# notebook of the data-extraction pipeline receives as its save_dir.
indexing_pipeline = dict(
    Indexing=dict(
        parallel=True,
        notebooks=[
            dict(
                notebook=f'{indexing_path}/create_faiss_bin.ipynb',
                parameters=dict(
                    feature_shape=512,
                    features_dir=f'{data_extraction_path}/clip/CLIPv2_features',
                    bin_name='faiss_clipv2_cosine.bin',
                ),
            ),
        ],
    ),
)

In [None]:
# Run the indexing step. Its features_dir is the CLIPv2_features folder under
# data_extraction_path, so that path must already be defined in this session.
run_notebooks(indexing_pipeline)