In [3]:
import cv2
import mediapipe as mp
import ffmpegcv
from Logger import Logger
#import face_recognition #https://github.com/ageitgey/face_recognition

import moviepy.editor as mpe

In [4]:
from ultralytics import YOLO
import os
# Export OPENCV_GPU_DEVICE_ID=0 into this process's environment.
# NOTE(review): presumably meant to pin GPU work to device 0, but nothing in this
# notebook reads the variable explicitly — confirm which library consumes it.
os.environ['OPENCV_GPU_DEVICE_ID'] = '0'

# With:
- a buffer of recent bounding boxes
- detection once every 2 frames
- detecting the person and drawing a block over them
- more modern models


In [19]:
from collections import deque
import cv2
import ffmpegcv
import time
from ultralytics import YOLO  # Assuming YOLO model is from ultralytics

def _mask_upper_region(img, bbox, factor, upper_fraction=0.3):
    """Draw a filled black rectangle over the top part of ``bbox`` on ``img`` (in place).

    ``bbox`` is indexed as (x1, y1, x2, y2). The rectangle starts at (x1, y1) and
    covers ``upper_fraction`` of the box height; ``factor`` is added to the right
    and bottom edges only.
    """
    upper_bbox_height = bbox[1] + int((bbox[3] - bbox[1]) * upper_fraction)
    cv2.rectangle(
        img,
        (bbox[0], bbox[1]),
        (bbox[2] + factor, upper_bbox_height + factor),
        (0, 0, 0),
        -1,
    )


def _replay_recent_boxes(img, bbox_buffer, count, factor):
    """Mask ``img`` with the ``count`` most recently buffered boxes (no-op if count <= 0)."""
    if count <= 0:
        # Guard against list[-0:], which would select the whole buffer.
        return
    for bbox in list(bbox_buffer)[-count:]:
        _mask_upper_region(img, bbox, factor)


def hide_faces_using_yolo_new_buffer(
    video_in,
    video_out,
    model='yolov10n',
    expand_factor=0,
    size_bb_buffer=15,
    frame_count_threshold=5,
    ):
    """
    Black out the upper 30% of every detected person in a video using a YOLO model.

    Frames on which no detection is run (or on which no person is found) are masked
    with the most recent buffered bounding boxes instead.

    Args:
        video_in (str): path of the video file to process.
        video_out (str): path of the video file to write.
        model (str, optional): YOLO weights name passed to ``YOLO()``. Defaults to 'yolov10n'.
        expand_factor (int, optional): pixels added to the right and bottom edges of each
            mask rectangle. Defaults to 0.
        size_bb_buffer (int, optional): how many of the most recent bounding boxes are
            replayed on non-detection frames. Defaults to 15.
        frame_count_threshold (int, optional): during the first ``2 * size_bb_buffer``
            frames, detection runs only on every ``frame_count_threshold``-th frame.
            Defaults to 5.

    Raises:
        ValueError: if ``frame_count_threshold`` is smaller than 1.
        IOError: if ``video_in`` cannot be opened.
    """
    if frame_count_threshold < 1:
        # The value is used as a modulus below; 0 would raise ZeroDivisionError mid-run.
        raise ValueError("frame_count_threshold must be a positive integer")

    print("-> hide_faces_using_yolo")
    counter = 0
    miss_replay_count = 5  # boxes replayed when a detection frame finds no person

    cap = cv2.VideoCapture(video_in)
    out = None
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_in}")

        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        video_fps = int(source_fps)

        # NOTE(review): the halved values are only printed; frames are written at
        # their original size because the writer is not given a resize argument.
        if video_width >= 540:
            video_height = video_height // 2
            video_width = video_width // 2

        print(f"-> {video_in} video_height: {video_height} video_width: {video_width}")
        print(f"-> {video_in} video_fps: {video_fps}")

        # Write with the untruncated frame rate (e.g. 29.97, not 29) so the output
        # keeps the duration of the input.
        writer_fps = source_fps if source_fps > 0 else video_fps
        out = ffmpegcv.VideoWriter(video_out, 'h264', writer_fps)

        yolo = YOLO(model)
        # The class-id lookup does not depend on the frame, so do it once.
        names = yolo.names
        person_id = list(names)[list(names.values()).index('person')]

        # Only the last few boxes are ever read back, so bound the buffer instead of
        # letting it grow for the whole video.
        bbox_buffer = deque(maxlen=max(size_bb_buffer, miss_replay_count))
        no_person_detected_count = 0  # consecutive detection frames without a person
        # Counted in detection frames (incremented only when inference runs).
        max_no_detection_frames = video_fps * 6

        frame_count = 0
        factor = expand_factor

        while True:
            ret, img = cap.read()
            if not ret:
                break

            # NOTE(review): once frame_count reaches 2 * size_bb_buffer this condition
            # is always true, so inference runs on every frame from then on — confirm
            # that frame skipping is intended only for the first frames.
            if frame_count >= size_bb_buffer * 2 or frame_count % frame_count_threshold == 0:
                results = yolo.predict(img)

                found_person = False
                for box in results[0].boxes:
                    if box.cls == person_id:
                        # xyxy is a tensor; take its first row as integer pixel coords.
                        bbox = box.xyxy.cpu().numpy()[0].astype(int)
                        bbox_buffer.append(bbox)
                        found_person = True
                        _mask_upper_region(img, bbox, factor)

                if found_person:
                    # A hit ends the current run of misses.
                    no_person_detected_count = 0
                else:
                    no_person_detected_count += 1
                    counter += 1
                    # Cover a likely missed detection with the latest known boxes.
                    _replay_recent_boxes(img, bbox_buffer, miss_replay_count, factor)

                # Drop stale boxes after a long run of detection frames without a person.
                if no_person_detected_count > max_no_detection_frames:
                    bbox_buffer.clear()
                    no_person_detected_count = 0
            else:
                # Not a detection frame: reuse the latest boxes (they may belong to
                # several people, hence more than one).
                _replay_recent_boxes(img, bbox_buffer, size_bb_buffer, factor)

            out.write(img)
            frame_count += 1

        print(f'times of failed detection: {counter}')
    finally:
        # Release the reader and finalize the output file even if processing failed.
        cap.release()
        if out is not None:
            out.release()


In [None]:
# Collection video: process the full recording with the yolov10n weights.
hide_faces_using_yolo_new_buffer(
    video_in='video_8710996766432314030.mp4',
    video_out='output_video_8710996766432314030.mp4',
    model='yolov10n',
)


cutting

In [7]:
from moviepy.video.io.VideoFileClip import VideoFileClip

def cut_video(input_path, output_path, start_time, end_time):
    """Write the [start_time, end_time] section of a video to a new file.

    Args:
        input_path (str): path of the source video.
        output_path (str): path of the file to write.
        start_time: start of the section, passed to ``VideoFileClip.subclip``.
        end_time: end of the section, passed to ``VideoFileClip.subclip``.
    """
    # The context manager closes the clip (and the resources it holds) even if
    # writing fails; the original never closed it.
    with VideoFileClip(input_path) as video:
        subclip = video.subclip(start_time, end_time)

        # Re-encode the section; resolution is left untouched.
        subclip.write_videofile(
            output_path,
            codec="libx264",        # Codec for mp4
            audio_codec="aac",      # Audio is encoded as AAC
            threads=4,              # Encoder threads
            bitrate="5000k"         # Target video bitrate (adjust as needed)
        )
# Example usage: extract the 23-second section 07:35-07:58 of the full recording.
input_video = "video_8710996766432314030.mp4"   # Path to your input video
output_video = "cut_video_8710996766432314030.mp4" # Path for the output video
start_time = 455                   # Start time in seconds (7 min 35 s)
end_time = 478                    # End time in seconds (7 min 58 s)

cut_video(input_video, output_video, start_time, end_time)


t:   0%|          | 1/690 [01:13<14:08:30, 73.89s/it, now=None]

Moviepy - Building video cut_video_8710996766432314030.mp4.
Moviepy - Writing video cut_video_8710996766432314030.mp4



t:   0%|          | 1/690 [01:25<16:26:43, 85.93s/it, now=None]

Moviepy - Done !
Moviepy - video ready cut_video_8710996766432314030.mp4


In [None]:
# Shell command: the leading "!" makes IPython run it in the shell instead of parsing it as Python.
!ffmpeg -i "video_8710996766432314030.mp4" -ss 00:07:35 -to 00:07:58 "cut_video_8710996766432314030.mp4"

In [8]:
# Process the cut clip with the yolov9e weights.
hide_faces_using_yolo_new_buffer(
    video_in="cut_video_8710996766432314030.mp4",
    video_out="output_v9e_cut_video_8710996766432314030.mp4",
    model='yolov9e',
)

-> hide_faces_using_yolo
-> cut_video_8710996766432314030.mp4 video_height: 720 video_width: 480
-> cut_video_8710996766432314030.mp4 video_fps: 30

0: 640x448 1 person, 176.7ms
Speed: 1.9ms preprocess, 176.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 176.3ms
Speed: 2.5ms preprocess, 176.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 161.8ms
Speed: 2.4ms preprocess, 161.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 backpack, 160.9ms
Speed: 2.5ms preprocess, 160.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 backpack, 161.3ms
Speed: 2.9ms preprocess, 161.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 162.2ms
Speed: 2.3ms preprocess, 162.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 162.2ms
Speed: 3.0ms preprocess, 162.2

In [20]:
# Process the cut clip with the yolov10n weights.
# NOTE(review): this output name ends in "_.mp4", whereas the GIF conversion cell
# reads "output_v10n_cut_video_8710996766432314030.mp4" — confirm which is intended.
hide_faces_using_yolo_new_buffer(
    video_in="cut_video_8710996766432314030.mp4",
    video_out="output_v10n_cut_video_8710996766432314030_.mp4",
    model='yolov10n',
)

-> hide_faces_using_yolo
-> cut_video_8710996766432314030.mp4 video_height: 720 video_width: 480
-> cut_video_8710996766432314030.mp4 video_fps: 30

0: 640x448 1 person, 13.8ms
Speed: 1.5ms preprocess, 13.8ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.5ms
Speed: 1.8ms preprocess, 13.5ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.4ms
Speed: 1.5ms preprocess, 13.4ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.4ms
Speed: 1.6ms preprocess, 13.4ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.4ms
Speed: 2.1ms preprocess, 13.4ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.4ms
Speed: 1.7ms preprocess, 13.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 12.9ms
Speed: 2.9ms preprocess, 12.9ms inference, 0.6ms postprocess per im

In [13]:
def video_to_gif(input_path, output_path, start_time=None, end_time=None, fps=10):
    """Convert a video (or a section of it) to an animated GIF.

    Args:
        input_path (str): path of the source video.
        output_path (str): path of the GIF to write.
        start_time (optional): start of the section; beginning of the video if None.
        end_time (optional): end of the section; end of the video if None.
        fps (optional): frame rate of the GIF. Defaults to 10.
    """
    # Close the clip when done, even if writing the GIF fails.
    with VideoFileClip(input_path) as video:
        clip = video
        # Trim when either bound is given; a missing bound falls back to the
        # corresponding end of the video instead of being silently ignored.
        if start_time is not None or end_time is not None:
            clip = video.subclip(0 if start_time is None else start_time, end_time)

        clip.write_gif(output_path, fps=fps)

# Example usage: convert the mediapipe result to a GIF.
input_video = "output_mediapipefaces_cut_video_8710996766432314030.mp4"   # Path to your input video
output_gif = "output_animation_mediapipe.gif" # Path for the output GIF
# NOTE: start_time/end_time are assigned but NOT passed to video_to_gif below,
# so the whole clip is converted.
start_time = 10                 # Start time in seconds (currently unused by the call)
end_time = 20                     # End time in seconds (currently unused by the call)

video_to_gif(input_video, output_gif)

t:   0%|          | 1/690 [59:37<684:39:03, 3577.28s/it, now=None]

MoviePy - Building file output_animation_mediapipe.gif with imageio.




In [14]:
# Convert the v8nfaces result to a GIF (whole clip).
input_video, output_gif = (
    "output_v8nfaces_cut_video_8710996766432314030.mp4",
    "output_animation_v8nfaces.gif",
)
video_to_gif(input_video, output_gif)

t:   0%|          | 1/690 [1:02:18<715:25:07, 3738.04s/it, now=None]

MoviePy - Building file output_animation_v8nfaces.gif with imageio.




In [15]:
# Convert the yolov9e result to a GIF (whole clip).
input_video, output_gif = (
    "output_v9e_cut_video_8710996766432314030.mp4",
    "output_animation_v9e.gif",
)
video_to_gif(input_video, output_gif)

t:   0%|          | 1/690 [1:03:12<725:51:49, 3792.61s/it, now=None]

MoviePy - Building file output_animation_v9e.gif with imageio.




In [16]:
# Convert the yolov10n result to a GIF (whole clip).
input_video, output_gif = (
    "output_v10n_cut_video_8710996766432314030.mp4",
    "output_animation_10n.gif",
)
video_to_gif(input_video, output_gif)

t:   0%|          | 1/690 [1:04:37<742:11:13, 3877.90s/it, now=None]

MoviePy - Building file output_animation_10n.gif with imageio.




v9e 2m5 s 165

In [16]:
# Process the cut_test_hubert clip with the yolov10n weights.
hide_faces_using_yolo_new_buffer(
    video_in='cut_test_hubert.mp4',
    video_out='output_cut_test_hubert_10n.mp4',
    model='yolov10n',
)


-> hide_faces_using_yolo
-> cut_test_hubert.mp4 video_height: 720 video_width: 480
-> cut_test_hubert.mp4 video_fps: 30

0: 640x448 4 persons, 2 chairs, 13.8ms
Speed: 2.3ms preprocess, 13.8ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 4 chairs, 13.4ms
Speed: 1.7ms preprocess, 13.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 4 persons, 2 chairs, 13.4ms
Speed: 1.6ms preprocess, 13.4ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 4 persons, 2 chairs, 13.4ms
Speed: 1.7ms preprocess, 13.4ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 4 persons, 2 chairs, 13.4ms
Speed: 1.5ms preprocess, 13.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 4 persons, 2 chairs, 13.4ms
Speed: 1.6ms preprocess, 13.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 4 persons, 2 chairs, 13.4ms
Speed: 1.7ms prepr

In [2]:
# Collection video: process the full recording with the yolov10n weights.
hide_faces_using_yolo_new_buffer(
    video_in='video_8710996766432314030.mp4',
    video_out='output_video_8710996766432314030.mp4',
    model='yolov10n',
)


-> hide_faces_using_yolo
-> video_8710996766432314030.mp4 video_height: 720 video_width: 480
-> video_8710996766432314030.mp4 video_fps: 29



73 segundos vídeo de 180 segundos, com skip a cada 2

45 segundos, com skip a cada 5

replicando as últimas 144

com v9e e todos os frames, pior cenário 17 minutos


In [17]:
# On all frames: process the cut_test_hubert clip with the yolov9e weights.
hide_faces_using_yolo_new_buffer(
    video_in='cut_test_hubert.mp4',
    video_out='output_cut_test_hubert_9e.mp4',
    model='yolov9e',
)


-> hide_faces_using_yolo
-> cut_test_hubert.mp4 video_height: 720 video_width: 480
-> cut_test_hubert.mp4 video_fps: 30

0: 640x448 3 persons, 1 backpack, 6 chairs, 181.3ms
Speed: 2.3ms preprocess, 181.3ms inference, 394.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 1 backpack, 1 sports ball, 5 chairs, 1 remote, 169.4ms
Speed: 1.9ms preprocess, 169.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 1 backpack, 5 chairs, 172.2ms
Speed: 2.1ms preprocess, 172.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 1 backpack, 1 bowl, 5 chairs, 171.9ms
Speed: 1.7ms preprocess, 171.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 1 backpack, 1 bowl, 5 chairs, 172.0ms
Speed: 1.6ms preprocess, 172.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 3 persons, 1 backpack, 1 bowl, 5 chairs, 172.6ms
Speed: 1.7ms preproce

In [2]:
# Process the full recording with the yolov10n weights.
hide_faces_using_yolo_new_buffer(
    video_in='video_8710996766432314030.mp4',
    video_out='output_video_8710996766432314030.mp4',
    model='yolov10n',
)


-> hide_faces_using_yolo
-> video_8710996766432314030.mp4 video_height: 720 video_width: 480
-> video_8710996766432314030.mp4 video_fps: 29

0: 640x448 1 person, 1 chair, 62.8ms
Speed: 4.9ms preprocess, 62.8ms inference, 48.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 2 chairs, 13.5ms
Speed: 2.4ms preprocess, 13.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 chair, 1 remote, 13.5ms
Speed: 2.6ms preprocess, 13.5ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.4ms
Speed: 2.8ms preprocess, 13.4ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 chair, 13.5ms
Speed: 2.6ms preprocess, 13.5ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 13.6ms
Speed: 3.2ms preprocess, 13.6ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 chair, 13.4ms
Speed: 2.7ms prepr

12 minutos vídeo de 17