In [1]:
# !pip install -q git+https://github.com/huggingface/transformers.git pytube ffmpeg-python tqdm

## Realt-Time Detectron 

Sample (unoptimized) code for running the RT-DETR model on a video stream from Youtube.

In [2]:
import torch
import requests
from PIL import Image
import matplotlib.pyplot as plt
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor


image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")

2024-07-05 14:21:08.666308: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 14:21:08.666363: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 14:21:08.667121: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-05 14:21:08.672995: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from pytube import YouTube 
from tqdm import tqdm
import os 

os.makedirs('temp', exist_ok=True)
# yt = YouTube('https://www.youtube.com/watch?v=b7WD-SpNX_I')
# video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(output_path='temp')

In [4]:
import ffmpeg
from PIL import Image 
import os
import tempfile
import time 

os.makedirs('temp/videos', exist_ok=True)
from multiprocessing import Pool, cpu_count

def extract_frame(args):
    video_path, frame_number, output_path = args
    (ffmpeg.input(video_path)
           .filter('select', f'gte(n,{frame_number})')
           .output(output_path, vframes=1)
           .global_args('-loglevel', 'error')
           .run(overwrite_output=True))
    return Image.open(output_path).copy()

def extract_frames(video_path, frame_rate=1):
    start_time = time.time()
    probe = ffmpeg.probe(video_path)
    video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
    duration = float(video_info['duration'])
    total_frames = int(video_info['nb_frames'])
    original_fps = total_frames / duration
    frame_rate = min(frame_rate, original_fps)

    frame_interval = int(original_fps / frame_rate)
    n_frames = int(duration * frame_rate)
    print("N Frames: ", n_frames, " Frame Interval: ", frame_interval, "Duration: ", duration, " Original FPS: ", original_fps, " Frame Rate: ", frame_rate)

    with tempfile.TemporaryDirectory() as temp_dir:
        frame_args = [
            (video_path, i * frame_interval, os.path.join(temp_dir, f'frame_{i:04d}.png'))
            for i in range(n_frames)
        ]

        # Use multiprocessing to extract frames in parallel
        with Pool(processes=cpu_count()) as pool:
            frames = pool.map(extract_frame, frame_args)
    print(f"Elapsed Time: {time.time() - start_time}")
    return frames

COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]
 
def plot_results(pil_img, scores, labels, boxes, output_path, fps, score_threshold=0.6):
    img_aspect_ratio = (pil_img.width / pil_img.height)
    plt.figure(figsize=(16, int(16 // img_aspect_ratio)))
    plt.imshow(pil_img)
    ax = plt.gca()
    colors = COLORS * 100
    for score, label, (xmin, ymin, xmax, ymax),c  in zip(scores.tolist(), labels.tolist(), boxes.tolist(), colors):
        if score > score_threshold:
            ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                        fill=False, color=c, linewidth=3))
                
            text = f'{model.config.id2label[label]}: {score:0.2f}'
            
            # Change text position and alignment
            ax.text(xmax, ymin, text, fontsize=15,
                    bbox=dict(facecolor='yellow', alpha=0.5),
                    horizontalalignment='right', verticalalignment='top')
    # plot fps in a box lower left  
    ax.text(0.01, 0.97, f'FPS: {fps:0.2f}', color='red', fontsize=15, transform=ax.transAxes, ha='left')

    plt.axis('off')
    plt.tight_layout()
    plt.savefig(output_path,   pad_inches=0, dpi=100) 
    plt.close()

def process_image_frame(image, output_dir='temp/frames', index=0,scrore_threshold=0.5):
    start_time = time.time()
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=scrore_threshold)
    elapsed_time = time.time() - start_time
    # print(f"Elapsed Time: {elapsed_time}")
    fps = int(1/elapsed_time)

    output_path = os.path.join(output_dir, f'frame_{index:04d}.png')
    for result in results:
        plot_results(image, result["scores"], result["labels"], result["boxes"], output_path, fps)
    


# url = 'https://i.pinimg.com/originals/39/6f/b9/396fb90286728d6573405c60941043e1.jpg'
# image = Image.open(requests.get(url, stream=True).raw)
# process_image_frame(image)

In [5]:
import glob 

def images_to_video(directory, output_file='output.mp4', loglevel='error', fps=1):
    if not os.path.isdir(directory):
        raise ValueError(f"The provided directory '{directory}' does not exist.")

    png_files = sorted(glob.glob(os.path.join(directory, '*.png')))
    if not png_files:
        raise ValueError(f"No .png files found in the directory '{directory}'.")

    input_stream = ffmpeg.input('pipe:', format='image2pipe', vcodec='png', r=fps)
    output = ffmpeg.output(input_stream, output_file, vcodec='libx264', pix_fmt='yuv420p').global_args('-loglevel', loglevel, '-y')
    process = output.run_async(pipe_stdin=True)

    for file in png_files:
        with open(file, 'rb') as f:
            process.stdin.write(f.read())

    process.stdin.close()
    process.wait()
    print(f"Video file created: {output_file}")

In [6]:
from tqdm import tqdm
import shutil
def process_youtube_video(url, output_video_path='output.mp4', sample_rate=10, score_threshold=0.5):
    yt = YouTube(url)
    video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(output_path='temp/videos')
    frames = extract_frames(video_path, sample_rate)

    output_dir = 'temp/videos/output'
    try:
        shutil.rmtree(output_dir) 
    except Exception as e:
        print(f"Error: {str(e)}")
    os.makedirs(output_dir, exist_ok=True) 
    
    n_sub_frames = int(len(frames) / 2)
    frames = frames[:n_sub_frames]
    print("Processing and annotating frames...")
    for i in tqdm(range(len(frames))):
        process_image_frame(frames[i], output_dir=output_dir, index=i,scrore_threshold=score_threshold)
    images_to_video(output_dir, output_file=output_video_path, fps=sample_rate)


In [7]:
# output_dir = 'temp/videos/output'
# video_url = 'https://www.youtube.com/watch?v=b7WD-SpNX_I'
# output_video_path = 'temp/dogs.mp4' 
# process_youtube_video(video_url, output_video_path, sample_rate=20, score_threshold=0.5)

In [8]:
output_dir = 'temp/videos/football'
video_url = 'https://www.youtube.com/watch?v=3AtV36o0bOE'
output_video_path = 'temp/footbal.mp4' 
process_youtube_video(video_url, output_video_path, sample_rate=20, score_threshold=0.5)

N Frames:  9666  Frame Interval:  1 Duration:  483.333333  Original FPS:  30.000000020689658  Frame Rate:  20
Elapsed Time: 1502.5599193572998
Processing and annotating frames...


100%|██████████| 4833/4833 [1:14:42<00:00,  1.08it/s]


Video file created: temp/footbal.mp4


## Extra Credit

The code above could be significantly optimized in a few ways:

- Batch predictions. Currently only a single frame is processed at a time, but the model can handle prediction in batches depending on your GPU memory, for potential speed ups.
- Parrallel prediction: Frames can be split into batches with independent model instances running on different GPUs, for further speed ups.