In [1]:
# !pip install -q git+https://github.com/huggingface/transformers.git pytube ffmpeg-python

In [14]:
import torch
import requests
from PIL import Image
import matplotlib.pyplot as plt
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor


# Single checkpoint identifier shared by the image processor and the detector,
# so both are always loaded from the same pretrained weights/config.
MODEL_CHECKPOINT = "PekingU/rtdetr_r50vd_coco_o365"

# Processor used to turn images into model inputs and to post-process outputs.
image_processor = RTDetrImageProcessor.from_pretrained(MODEL_CHECKPOINT)
# RT-DETR object detection model loaded from the same checkpoint.
model = RTDetrForObjectDetection.from_pretrained(MODEL_CHECKPOINT)

2024-07-04 18:12:27.703410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 18:12:27.703459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 18:12:27.704221: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-04 18:12:27.710061: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from pytube import YouTube 
import os 

# Directory that receives the downloaded video file.
os.makedirs('temp', exist_ok=True)

yt = YouTube('http://youtube.com/watch?v=2lAe1cqCOXo')

# Keep only progressive MP4 streams, order them by resolution (descending)
# and download the first one of that ordering into `temp`.
mp4_streams = yt.streams.filter(progressive=True, file_extension='mp4')
best_stream = mp4_streams.order_by('resolution').desc().first()
video_path = best_stream.download(output_path='temp')

In [218]:
import ffmpeg
from PIL import Image 
import os
import tempfile
import time 

# Make sure the directory that later holds the annotated frames
# ('temp/videos/output') and the encoded video exists.
os.makedirs('temp/videos', exist_ok=True)
def extract_frames(video_path, n_frames, sample_rate=10):
    """Extract up to ``n_frames`` frames from a video as PIL images.

    Every ``sample_rate``-th source frame is taken, starting with the first
    frame, i.e. frame ``i`` is grabbed at ``i * sample_rate / fps`` seconds.
    Timestamps at or beyond the end of the video are dropped, so fewer than
    ``n_frames`` images are returned when the video is too short.

    :param video_path: Path of the video file handed to ffmpeg.
    :param n_frames: Maximum number of frames to extract.
    :param sample_rate: Step, in source frames, between two extracted frames.
    :return: List of PIL images in chronological order.
    :raises ValueError: If ``n_frames`` is negative, ``sample_rate`` is not
        positive, the file has no video stream, or its frame rate is invalid.
    """
    if n_frames < 0:
        raise ValueError(f"n_frames must be non-negative, got {n_frames}.")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}.")

    # Read duration / frame count / frame rate from the first video stream.
    probe = ffmpeg.probe(video_path)
    video_stream = next(
        (stream for stream in probe['streams'] if stream['codec_type'] == 'video'),
        None,
    )
    if video_stream is None:
        raise ValueError(f"No video stream found in '{video_path}'.")
    duration = float(video_stream['duration'])
    total_frames = int(video_stream['nb_frames'])
    print("Duration: ", duration, "Total Frames: ", total_frames)

    # r_frame_rate is a fraction such as "30000/1001".
    num, den = map(int, video_stream['r_frame_rate'].split('/'))
    if num <= 0 or den <= 0:
        raise ValueError(f"Invalid frame rate '{video_stream['r_frame_rate']}'.")
    fps = num / den
    print("Frame Rate: ", fps)

    # Timestamps (seconds) of every sample_rate-th frame, limited to the
    # video duration so ffmpeg is never asked to seek past the end.
    frame_times = []
    for i in range(n_frames):
        t = i * sample_rate / fps
        if t >= duration:
            break
        frame_times.append(t)
    print("Frame Times: ", frame_times)

    frames = []
    # The PNGs only live in a temporary directory; the images are copied into
    # memory before the directory is removed.
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f'Created temporary directory {temp_dir}')
        for i, t in enumerate(frame_times):
            output_path = os.path.join(temp_dir, f'frame_{i:04d}.png')
            (
                ffmpeg
                .input(video_path, ss=t)
                .output(output_path, vframes=1)
                .global_args('-loglevel', 'error')
                .run(overwrite_output=True)
            )
            # copy() detaches the pixel data from the file handle, which is
            # closed (and the file deleted) once the context managers exit.
            with Image.open(output_path) as img:
                frames.append(img.copy())
    return frames

# RGB colour triples used for the bounding-box outlines drawn by plot_results.
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]
 
def plot_results(pil_img, scores, labels, boxes, output_path, fps, score_threshold=0.6):
    """Draw detections on an image and save the annotated figure to disk.

    :param pil_img: Image to annotate; its ``width`` / ``height`` determine
        the aspect ratio of the figure.
    :param scores: Per-detection confidence scores (object with ``tolist()``).
    :param labels: Per-detection class ids (object with ``tolist()``); each id
        is looked up in the global ``model.config.id2label``.
    :param boxes: Per-detection ``(xmin, ymin, xmax, ymax)`` boxes (object
        with ``tolist()``), drawn in the coordinate system of the image.
    :param output_path: Destination passed to ``savefig``.
    :param fps: Number rendered in the ``FPS`` overlay.
    :param score_threshold: Only detections scoring strictly above this value
        are drawn.
    """
    img_aspect_ratio = pil_img.width / pil_img.height
    # Clamp to 1 so extremely wide images (ratio > 16) do not produce a
    # zero-height figure, which matplotlib rejects.
    fig_height = max(1, int(16 // img_aspect_ratio))
    fig, ax = plt.subplots(figsize=(16, fig_height))
    try:
        ax.imshow(pil_img)
        detections = zip(scores.tolist(), labels.tolist(), boxes.tolist())
        for idx, (score, label, (xmin, ymin, xmax, ymax)) in enumerate(detections):
            if score > score_threshold:
                # Cycle through the palette by detection index so any number
                # of detections can be drawn.
                c = COLORS[idx % len(COLORS)]
                ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                           fill=False, color=c, linewidth=3))

                text = f'{model.config.id2label[label]}: {score:0.2f}'

                # Label sits inside the box, anchored at its top-right corner.
                ax.text(xmax, ymin, text, fontsize=15,
                        bbox=dict(facecolor='yellow', alpha=0.5),
                        horizontalalignment='right', verticalalignment='top')

        # FPS overlay near the top-left corner (axes-relative coordinates).
        ax.text(0.01, 0.97, f'FPS: {fps:0.2f}', color='red', fontsize=15,
                transform=ax.transAxes, ha='left')

        ax.axis('off')
        fig.tight_layout()
        fig.savefig(output_path, pad_inches=0, dpi=100)
    finally:
        # Always release the figure; this function runs once per frame and
        # unclosed figures would accumulate in memory.
        plt.close(fig)

def process_image_frame(image, output_dir='temp/frames', index=0,scrore_threshold=0.5):
    """Run object detection on one image and save an annotated PNG.

    The image is preprocessed with the global ``image_processor``, passed
    through the global ``model`` without gradient tracking, and the
    post-processed detections are rendered by ``plot_results`` into
    ``<output_dir>/frame_<index:04d>.png``.

    :param image: PIL image to analyse (``image.size`` is ``(width, height)``).
    :param output_dir: Directory for the annotated frame; created if missing.
    :param index: Frame number used in the output file name.
    :param scrore_threshold: Minimum detection score, applied both during
        post-processing and when drawing. The misspelled name is kept because
        callers pass it by keyword.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # target_sizes expects (height, width), hence the reversed PIL size.
    results = image_processor.post_process_object_detection(
        outputs,
        target_sizes=torch.tensor([image.size[::-1]]),
        threshold=scrore_threshold,
    )
    elapsed_time = time.perf_counter() - start_time
    # Keep the fractional part (it is displayed with two decimals) and avoid
    # dividing by zero on a coarse clock.
    fps = 1.0 / elapsed_time if elapsed_time > 0 else float('inf')

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'frame_{index:04d}.png')
    for result in results:
        # Forward the threshold so the plot does not silently fall back to
        # its own stricter default and hide requested detections.
        plot_results(image, result["scores"], result["labels"], result["boxes"],
                     output_path, fps, score_threshold=scrore_threshold)
    


# url = 'https://i.pinimg.com/originals/39/6f/b9/396fb90286728d6573405c60941043e1.jpg'
# image = Image.open(requests.get(url, stream=True).raw)
# process_image_frame(image)

In [219]:
# frames = extract_frames(video_path, n_frames=40)

In [220]:
from tqdm import tqdm

def process_video(video_path, n_frames, output_dir='temp/frames', sample_rate=10):
    """Extract frames from a video and run detection on each of them.

    :param video_path: Path of the video, forwarded to ``extract_frames``.
    :param n_frames: Frame count, forwarded to ``extract_frames``.
    :param output_dir: Directory that receives the annotated frames; created
        if it does not exist.
    :param sample_rate: Forwarded unchanged to ``extract_frames``.
    :return: The list of extracted (un-annotated) frames.
    """
    extracted = extract_frames(video_path, n_frames, sample_rate)
    os.makedirs(output_dir, exist_ok=True)
    # Detection uses a fixed score threshold of 0.3 for every frame.
    for frame_index, frame in enumerate(tqdm(extracted)):
        process_image_frame(frame, output_dir=output_dir, index=frame_index, scrore_threshold=0.3)
    return extracted
output_dir = 'temp/videos/output'
os.makedirs(output_dir, exist_ok=True)

# `frames` used to come only from a commented-out cell, so this cell failed
# on a fresh kernel. Extract them here so Restart & Run All works.
frames = extract_frames(video_path, n_frames=40)

# Annotate only the first 10 frames as a quick preview.
for i, frame in enumerate(tqdm(frames[:10])):
    process_image_frame(frame, output_dir=output_dir, index=i, scrore_threshold=0.3)

100%|██████████| 10/10 [00:08<00:00,  1.19it/s]


In [216]:
import glob
def images_to_video(directory, output_file='output.mp4', loglevel='error', fps=1):
    """
    Create a video from sorted PNG images in a directory.

    The PNG files are streamed, in lexicographic path order, to an ffmpeg
    subprocess through its standard input and encoded with libx264.

    :param directory: Path to the directory containing PNG images.
    :param output_file: Path to the output video file (default is 'output.mp4').
    :param loglevel: FFmpeg logging level (default is 'error').
    :param fps: Frames per second (default is 1).
    :raises ValueError: If the directory does not exist or holds no PNG files.
    :raises RuntimeError: If ffmpeg exits with a non-zero status.
    """

    # Ensure the provided directory exists
    if not os.path.isdir(directory):
        raise ValueError(f"The provided directory '{directory}' does not exist.")

    # Collect the PNG files in a deterministic (sorted) order
    png_files = sorted(glob.glob(os.path.join(directory, '*.png')))

    if not png_files:
        raise ValueError(f"No .png files found in the directory '{directory}'.")

    # Images arrive on stdin as a stream of PNGs at the requested frame rate
    input_stream = ffmpeg.input('pipe:', format='image2pipe', vcodec='png', r=fps)

    # Set up the ffmpeg output with reduced verbosity; '-y' overwrites output_file
    output = ffmpeg.output(
        input_stream, output_file, vcodec='libx264', pix_fmt='yuv420p'
    ).global_args('-loglevel', loglevel, '-y')

    # Run the ffmpeg command
    process = output.run_async(pipe_stdin=True)

    try:
        # Write each image to the process standard input
        for png_path in png_files:
            with open(png_path, 'rb') as png:
                process.stdin.write(png.read())
    except BrokenPipeError:
        # ffmpeg exited early; the exit-status check below reports the failure.
        pass
    finally:
        # Always close stdin (signals end of stream) and reap the process,
        # even when reading or writing an image failed.
        try:
            process.stdin.close()
        except BrokenPipeError:
            pass
        return_code = process.wait()

    if return_code != 0:
        raise RuntimeError(
            f"ffmpeg exited with status {return_code} while creating '{output_file}'."
        )

    print(f"Video file created: {output_file}")

In [217]:
# Encode the annotated frames from `output_dir` into an MP4 at 0.8 frames per second.
images_to_video(
    directory=output_dir,
    output_file='temp/videos/output.mp4',
    fps=0.8,
)

Video file created: temp/videos/output.mp4
