# Add ImageNet labels to a YouTube video

Install requirements in a Python environment:

```shell
# pip install tensorflow  # if not CUDA
pip install tensorflow-gpu  # if CUDA
pip install ffmpeg-python keras Pillow
pip install pytube  # with this fix https://github.com/nficano/pytube/issues/333#issuecomment-436668766
```

Import modules:

In [None]:
import tempfile
from pathlib import Path

from IPython.display import YouTubeVideo

import ffmpeg
import numpy as np
from pytube import YouTube
from PIL import Image, ImageDraw, ImageFont
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.applications.vgg16 import VGG16

Define functions:

In [None]:
# printf-style file name template handed to ffmpeg for frame images:
# an 8-digit, zero-padded number followed by the .png extension.
FRAMES_PATTERN = '%08d.png'

def download_youtube_video(video_id, output_path):
    """Download a YouTube video as MP4 unless it is already on disk.

    Among the progressive MP4 streams, the first one after ordering by
    resolution in descending order is downloaded and then moved to
    ``output_path``.

    Args:
        video_id: YouTube video identifier (the ``v=`` value of the URL).
        output_path: ``pathlib.Path`` where the video file should end up.

    Returns:
        None. Returns immediately, without any network access, when
        ``output_path`` already exists as a file.

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    if output_path.is_file():
        return
    print(f'Downloading video {video_id}...')
    # Build the URL from the ``video_id`` argument. The previous code read
    # the notebook-level ``VIDEO_ID`` global, so the argument was ignored.
    yt = YouTube(f'http://youtube.com/watch?v={video_id}')
    stream = (
        yt
        .streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        # ``first()`` yields None for an empty query; fail with a clear
        # message instead of an AttributeError on ``None.download``.
        raise ValueError(
            f'No progressive MP4 stream found for video {video_id}')
    # ``download()`` reports where it wrote the file; move it into place.
    video_path = Path(stream.download())
    video_path.rename(output_path)
    print('Video downloaded to', output_path)


def video_to_frames(video_path, frames_dir, pattern=FRAMES_PATTERN):
    """Split a video into individual frame images using ffmpeg.

    Args:
        video_path: Path of the video file to read.
        frames_dir: ``pathlib.Path`` of the directory that receives the
            frames. If it already exists the function does nothing.
        pattern: File name template for the frames, relative to
            ``frames_dir``.
    """
    if not frames_dir.is_dir():
        print('Splitting video into frames...')
        frames_dir.mkdir(exist_ok=True)
        destination = str(frames_dir / pattern)
        source = ffmpeg.input(str(video_path))
        source.output(destination).run()

    
def frames_to_video(frames_dir, video_path, pattern=FRAMES_PATTERN, fps=25):
    """Encode a directory of numbered frame images into a video with ffmpeg.

    Args:
        frames_dir: ``pathlib.Path`` of the directory containing the frames.
        video_path: ``pathlib.Path`` of the video to create. If that file
            already exists the function does nothing.
        pattern: File name template of the frames inside ``frames_dir``.
        fps: Frame rate passed to ffmpeg for the image sequence input.
    """
    if not video_path.is_file():
        print('Joining frames into video...')
        source_pattern = str(frames_dir / pattern)
        stream = ffmpeg.input(source_pattern, framerate=fps)
        stream = stream.output(str(video_path))
        stream.run()

    
def get_frames_paths(frames_dir, pattern=FRAMES_PATTERN):
    """Return the frame image paths found in ``frames_dir``, sorted.

    Args:
        frames_dir: ``pathlib.Path`` of the directory holding the frames.
        pattern: Frame file name template. Only its file extension is
            used, to decide which files in the directory count as frames.

    Returns:
        A sorted list of ``pathlib.Path`` objects.
    """
    # Honour ``pattern`` (it used to be ignored) by matching its extension.
    # A template without an extension falls back to the previously
    # hard-coded PNG glob.
    extension = Path(pattern).suffix or '.png'
    return sorted(frames_dir.glob(f'*{extension}'))
    
    
def frames_to_labels(frames_dir, labels_path):
    """Classify every frame with VGG16 and cache the labels in a text file.

    Each frame is resized to 224x224, run through the model, and the class
    names produced by ``decode_predictions`` (default settings) are written
    to ``labels_path``, one comma-separated line per frame.

    Args:
        frames_dir: ``pathlib.Path`` of the directory containing the frames.
        labels_path: ``pathlib.Path`` of the labels file. When it already
            exists nothing is recomputed and its contents are returned.

    Returns:
        A list with one entry per frame, each a list of label strings.
    """
    if labels_path.is_file():
        # Return the cached labels so both code paths yield the same type;
        # this branch used to return None.
        return [
            line.split(',')
            for line in labels_path.read_text().splitlines()
        ]
    print('Computing labels from frames...')
    print('Loading VGG16 model...')
    model = VGG16()
    frames_paths = get_frames_paths(frames_dir)
    num_frames = len(frames_paths)
    # One row of 1000 class scores (the VGG16 output size) per frame.
    features = np.empty((num_frames, 1000))
    for i, fp in enumerate(frames_paths):
        if i % 100 == 0:
            print(f'Extracting features from frame {i+1}/{num_frames}...')
        image = img_to_array(load_img(fp, target_size=(224, 224)))
        # The model expects a batch, so add a leading axis of size one.
        batch = preprocess_input(np.expand_dims(image, axis=0))
        features[i] = model.predict(batch)[0]
    decoded = decode_predictions(features)
    # Keep only the class name from each decoded prediction tuple.
    labels = [[label for (_, label, _) in frame] for frame in decoded]
    labels_path.write_text('\n'.join(','.join(words) for words in labels))
    return labels


def _load_annotation_font(fontsize):
    """Return the first loadable known TrueType font, else Pillow's default."""
    candidates = (
        '/Library/Fonts/Arial.ttf',
        '/usr/share/fonts/truetype/ubuntu/UbuntuMono-R.ttf',
    )
    for font_path in candidates:
        try:
            return ImageFont.truetype(font_path, fontsize)
        except OSError:
            # Font file not available on this machine; try the next one.
            continue
    # Neither font is installed: degrade to the built-in font rather than
    # aborting the whole annotation run.
    return ImageFont.load_default()


def annotate_frame(frame_path, labels, annotated_path):
    """Draw labels onto a frame image and save the result.

    Args:
        frame_path: Path of the image to annotate.
        labels: Iterable of strings; drawn one per line starting at
            pixel position (10, 10).
        annotated_path: Path where the annotated image is written.
    """
    text = '\n'.join(labels)
    font = _load_annotation_font(30)
    # The context manager releases the underlying file even if drawing or
    # saving fails.
    with Image.open(frame_path) as image:
        draw = ImageDraw.Draw(image)
        draw.text((10, 10), text, font=font)
        image.save(annotated_path)


def annotate_frames(frames_dir, labels_path, annotated_dir):
    """Write an annotated copy of every frame into ``annotated_dir``.

    Args:
        frames_dir: ``pathlib.Path`` of the directory with the input frames.
        labels_path: ``pathlib.Path`` of the labels file; one line of
            comma-separated labels per frame.
        annotated_dir: ``pathlib.Path`` of the output directory. If it
            already exists the function does nothing.
    """
    if annotated_dir.is_dir():
        return
    annotated_dir.mkdir(exist_ok=True)
    label_lines = labels_path.read_text().splitlines()
    sources = get_frames_paths(frames_dir)
    total = len(sources)
    # zip stops at the shorter sequence, so surplus frames or surplus label
    # lines are silently left out.
    for index, (source, label_line) in enumerate(zip(sources, label_lines)):
        if index % 100 == 0:
            print(f'Annotating frame {index+1}/{total}...')
        # Underscores in label names are shown as spaces.
        words = [word.replace('_', ' ') for word in label_line.split(',')]
        annotate_frame(source, words, annotated_dir / source.name)

        
def annotate_youtube_video(video_id, output_path=None):
    """Download a YouTube video and produce a copy annotated with labels.

    All intermediate files live in a directory named after ``video_id``.
    Every stage returns early when its output already exists, so calling
    this again only performs the missing steps.

    Args:
        video_id: YouTube video identifier.
        output_path: Destination of the annotated video. Defaults to
            ``<video_id>/annotated.mp4`` when omitted.
    """
    # Define paths
    video_dir = Path(video_id)
    downloaded_path = video_dir / 'original.mp4'
    frames_dir = video_dir / 'original_frames'
    labels_path = video_dir / 'labels.csv'
    annotated_dir = video_dir / 'annotated_frames'
    detectron_dir = video_dir / 'detectron_frames'

    video_dir.mkdir(exist_ok=True)
    # Honour the caller's destination; the argument used to be overwritten
    # unconditionally. Without one, keep the historical location.
    if output_path is None:
        output_path = video_dir / 'annotated.mp4'
    else:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    # Run the pipeline
    download_youtube_video(video_id, downloaded_path)
    video_to_frames(downloaded_path, frames_dir)
    frames_to_labels(frames_dir, labels_path)
    # Prefer the frames in ``detectron_frames`` when that directory exists.
    to_annotate_dir = detectron_dir if detectron_dir.is_dir() else frames_dir
    annotate_frames(to_annotate_dir, labels_path, annotated_dir)
    frames_to_video(annotated_dir, output_path)

Display the video:

In [None]:
# Identifier of the video to process (the ``v=`` value of its YouTube URL).
VIDEO_ID = '9lY_yUZf6ts'
# Show the embedded YouTube player for this video in the notebook output.
youtube_video = YouTubeVideo(VIDEO_ID)
display(youtube_video)

Download and annotate video:

In [None]:
# Path handed to annotate_youtube_video as its output_path argument.
processed_video_path = Path('processed.mp4')
# Runs the pipeline: download, split into frames, label, annotate, re-encode.
annotate_youtube_video(VIDEO_ID, processed_video_path)

Run Mask R-CNN on original frames:
```shell
python /home/fernando/git/detectron/tools/infer_simple.py \
    --cfg /home/fernando/git/detectron/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml \
    --output-dir /home/fernando/git/marina-ffmpeg/9lY_yUZf6ts/detectron_frames \
    --wts https://s3-us-west-2.amazonaws.com/detectron/35861858/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml.02_32_51.SgT4y1cO/output/train/coco_2014_train:coco_2014_valminusminival/generalized_rcnn/model_final.pkl \
    --image-ext png \
    --output-ext png \
    /home/fernando/git/marina-ffmpeg/9lY_yUZf6ts/original_frames
```

In [None]:
# Re-run the pipeline: the frames in detectron_frames are annotated instead of
# the original ones when that directory exists.
# NOTE(review): stages whose output already exists return early, so the
# annotated frames directory and the output video from the first run
# presumably need to be removed first for this call to regenerate anything.
annotate_youtube_video(VIDEO_ID, processed_video_path)