## Format and Upload CSV Files (if using Colab)

The CSV file must have the following two columns in order for processing to work.

'youtube_id' - 11-character code that is unique to each video on YouTube

'start_seconds' - a floating point number representing the elapsed seconds at the start of the clip

## Specify the Input CSV File Path


In [None]:
csv_path = ""

## Install and Import Necessary Python Packages

In [1]:
!pip install katna
!pip install yt_dlp

Collecting katna
  Downloading katna-0.9.2-py3-none-any.whl.metadata (12 kB)
Collecting ffmpy (from katna)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Downloading katna-0.9.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ffmpy-0.4.0-py3-none-any.whl (5.8 kB)
Installing collected packages: ffmpy, katna
Successfully installed ffmpy-0.4.0 katna-0.9.2
Collecting yt_dlp
  Downloading yt_dlp-2024.12.13-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2024.12.13-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt_dlp
Successfully installed yt_dlp-2024.12.13


In [2]:
from yt_dlp import YoutubeDL
from bs4 import BeautifulSoup
from Katna.video import Video
from Katna.writer import KeyFrameDiskWriter
import pandas as pd
import json
import subprocess
import re
import os

## Utils

In [3]:
def get_timestamps(df):
  """
  Gets the starting timestamp of each Youtube video in a datafram

  Args:
      df (pandas Dataframe): Dataframe containing at 'youtube_id' and
      'start_seconds' columns

  Returns:
      dict: Maps Youtube IDs to their starting timestamp
  """
  return {row['youtube_id']: row['start_seconds'] for i, row in df.iterrows()}

def clean_data_labels(data_path, label_json_path, output_path = None):
  """
  Replace Audioset labels with human-readable form

  Args:
      data_path (str): Path to csv containing audioset data
      label_json_path (str): Path to json file containing map from audioset
      labels to human readable labels
      output_path (str): Default is None. If specified, the cleaned CSV will be
      written to the specified output path

  Returns:
      str: Extracted YouTube ID
  """
  def cleaning(label):
    label = label.replace('"', '').replace(' ', '')
    return ';'.join([id_label_map[i] for i in label.split(';')])

  df = pd.read_csv(data_path)

  with open(label_json_path, 'r') as file:
      data = json.load(file)

  id_label_map = {dictionary['id'] : dictionary['name'] for dictionary in data}

  df['positive_labels'] = df[' positive_labels'].apply(cleaning)

  if output_path:
    df.to_csv(output_path)
  return df

def mkdir(name):
    """
    Creates a directory using mkdir command line command

    Args:
        name (str): Name of directory to be created

    Returns:
        None
    """
    subprocess.run(['mkdir', name])

def extract_youtube_ids(filename):
    """
    Extracts YouTube video ID from a filename

    Args:
        filenames (str): Filenames containing YouTube ID.

    Returns:
        str: Extracted YouTube ID
    """
    pattern = r'\[([-_\w]{11})\]'  # Matches square brackets with 11-character YouTube IDs

    match = re.search(pattern, filename)
    if match:
      return (match.group(1))
    return ''

## Code for Audio and Keyframe Extraction

In [4]:
YOUTUBE_URL_PREFIX = "https://www.youtube.com/watch?v="

def download_video(video_url,i):
    """
    Downloads a youtube video to ith download folder using YoutubeDL

    Args:
        video_url (str): URL to video to download
        i (int): Batch number

    """
    opts = {'paths': {'home': f'downloads{i}'}}
    with YoutubeDL(opts) as yt:
        yt.download(video_url)

In [5]:
def download_column(yt_ids, i, stop=None):
    """
    Downloads a batch of YouTube videos based on their IDs.

    Args:
        yt_ids (list): List of YouTube video IDs to download.
        i (int): Batch number for organizing downloads.
        stop (int, optional): Number of videos to download. If None, downloads all videos.

    Returns:
        int: The number of videos that failed to download.
    """

    # If stop is None, set it to the length of yt_ids
    stop = len(yt_ids) if stop is None else stop

    # Initialize a counter for failed downloads
    sum = 0

    # Iterate through the specified number of items
    for yt_id in yt_ids[:stop]:
        try:
            # Attempt to download the video
            download_video(YOUTUBE_URL_PREFIX + yt_id, i)
        except Exception as e:
            # Increment the failure counter on exception and continue
            sum += 1
            continue

    # Return the total number of failed downloads
    return sum

In [6]:
def get_audio(id):
    """
    Extracts audio from video files in the specified download folder and saves them as .wav files.

    Args:
        id (int): Batch number corresponding to the download folder.
    """

    # Iterate through all files in the downloads folder for the given batch ID
    for filename in os.listdir(f'downloads{id}'):
        # Split the filename into the base name (root) and extension
        root, extension = os.path.splitext(filename)

        # Define the FFmpeg command to extract audio from the video file
        command = [
            'ffmpeg',
            '-i', f'downloads{id}/{filename}',  # Input file
            '-q:a', '0',  # Audio quality: highest quality (0)
            '-map', 'a',  # Map only the audio streams
            f'audio/{root}.wav'  # Output file path
        ]

        # Run the FFmpeg command
        subprocess.run(command)


In [11]:
def get_clips(yt_ids, id):
    """
    Trims video files to create clips based on predefined timestamps and saves them to the same folder.

    Args:
        yt_ids (list): List of YouTube video IDs corresponding to the videos.
        id (int): Batch number corresponding to the download folder.
    """

    # Get a list of all filenames in the downloads folder for the given batch ID
    filenames = [file for file in os.listdir(f'downloads{id}')]

    # Map each filename to its YouTube ID using a helper function
    title_to_id = {filename: extract_youtube_ids(filename) for filename in filenames}

    # Process each file in the folder
    for filename in filenames:
        # Split the filename into the base name (root) and extension
        root, extension = os.path.splitext(filename)

        # Get the YouTube ID corresponding to the filename
        video_id = title_to_id[filename]

        # Construct the FFmpeg command to trim the video based on predefined timestamps
        command = [
            'ffmpeg',
            '-i', f'downloads{id}/{filename}',  # Input file path
            '-ss', str(timestamps[video_id]),  # Start time of the clip
            '-to', str(timestamps[video_id] + 10),  # End time of the clip (10 seconds duration)
            '-c', 'copy',  # Copy codec for faster processing
            f'downloads{id}/{root}_cut{extension}'  # Output file path
        ]

        # Execute the FFmpeg command
        subprocess.run(command)

        # Remove the original video file after the trimmed clip is created
        os.remove(f"downloads{id}/{filename}")


In [10]:
def get_frames(id):
    """
    Extracts keyframes from video files in the specified download folder and saves them to a frame directory.

    Args:
        id (int): Batch number corresponding to the download folder.
    """

    # Initialize the Video object for processing keyframes
    vd = Video()

    # Number of keyframes to extract from each video
    no_of_frames_to_returned = 1

    # Initialize the KeyFrameDiskWriter to save keyframes at the desired location
    diskwriter = KeyFrameDiskWriter(location=f"frames")

    # Iterate through all files in the downloads folder for the given batch ID
    for filename in os.listdir(f'downloads{id}'):
        # Construct the full path to the video file
        video_file_path = f"downloads{id}/{filename}"

        # Extract keyframes and process the data using the diskwriter
        try:
            vd.extract_video_keyframes(
                no_of_frames=no_of_frames_to_returned,
                file_path=video_file_path,
                writer=diskwriter
            )
        except Exception as e:
            # Skip processing this file if an exception occurs
            continue


In [13]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np

mkdir('audio')
df = pd.read_csv(csv_path)
timestamps = get_timestamps(df)
n_chunks = len(df)/100  # More chunks than workers
n_workers = os.cpu_count() + 4 # Limited number of workers

# Split the DataFrame into chunks
chunks = np.array_split(df, n_chunks)

def process_chunk(chunk, id):
    yt_ids = chunk['youtube_id']
    download_column(yt_ids, id)
    get_clips(yt_ids, id)
    get_audio(id)
    get_frames(id)
    command = ['rm', '-r', f'downloads{id}']
    subprocess.run(command)

# Use ThreadPoolExecutor with limited workers
with ThreadPoolExecutor(max_workers=n_workers) as executor:
    futures = [executor.submit(process_chunk, chunk, i) for i, chunk in enumerate(chunks)]
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error processing chunk: {e}")


  return bound(*args, **kwds)


[youtube] Extracting URL: https://www.youtube.com/watch?v=--4gqARaEJE
[youtube] --4gqARaEJE: Downloading webpage
[youtube] --4gqARaEJE: Downloading ios player API JSON
[youtube] --4gqARaEJE: Downloading mweb player API JSON
[youtube] --4gqARaEJE: Downloading player 5b77d519
[youtube] --4gqARaEJE: Downloading m3u8 information
[info] --4gqARaEJE: Downloading 1 format(s): 136+251
[download] Destination: downloads0/Miniature, Standard, Teacup Dachshund, Puppies, For, Sale, In, New Jersey, NJ, PA, DE, MD,CT [--4gqARaEJE].f136.mp4
[download] 100% of    7.92MiB in 00:00:00 at 17.20MiB/s  
[download] Destination: downloads0/Miniature, Standard, Teacup Dachshund, Puppies, For, Sale, In, New Jersey, NJ, PA, DE, MD,CT [--4gqARaEJE].f251.webm
[download] 100% of  774.71KiB in 00:00:00 at 8.61MiB/s   
[Merger] Merging formats into "downloads0/Miniature, Standard, Teacup Dachshund, Puppies, For, Sale, In, New Jersey, NJ, PA, DE, MD,CT [--4gqARaEJE].mkv"
Deleting original file downloads0/Miniature, St

ERROR: [youtube] -1pPw9zZopA: Video unavailable


Completed processing for :  downloads0/Come Mettere La Matita Nera [-0nqfRcnAYE]_cut.mp4
Completed processing for :  downloads0/VELHO DA TOSSE.wmv [--U7joUcTCo]_cut.mp4
Completed processing for :  downloads0/ONE LAST SPECIAL GIFT!! [-0RWZT-miFs]_cut.mp4
Completed processing for :  downloads0/Hawk Attack Kill Eat Swoop Mockingbird Frog wild [-116CjQ3MAg]_cut.mp4
Completed processing for :  downloads0/Risate a catena (Skype Laughter Chain) [-0BIyqJj9ZU]_cut.mp4
Completed processing for :  downloads0/Vehicle Alignments [--BfvyPmVMo]_cut.mp4
Completed processing for :  downloads0/Doberman in bathtub [-1EXhfqLLwQ]_cut.mp4
Completed processing for :  downloads0/Smash Bros Lawl Moveset-CD-I Link [-1PZQg5Gi8A]_cut.mp4
