# Face Cropping, Live Subtitles and pop-up Images

## Installing necessary libraries

* `moviepy.editor` was removed from the current version of `moviepy`. Therefore, we are using `moviepy==1.0.3`.

In [None]:
pip install numpy opencv-python moviepy==1.0.3 pydub webrtcvad torch torchaudio




## Smooth Face Tracking and Cropping

In [2]:
import cv2
import numpy as np
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import webrtcvad
import os
from collections import deque

def extract_audio(video_path, audio_output="temp.wav"):
    """Write a video's audio track to a 16-bit PCM file.

    Args:
        video_path: Path of the video whose audio should be extracted.
        audio_output: Destination path for the written audio file.

    Returns:
        A ``(audio_output, audio_clip)`` tuple: the path that was written and
        the audio object taken from the opened video clip.
    """
    source_clip = VideoFileClip(video_path)
    audio_track = source_clip.audio
    audio_track.write_audiofile(audio_output, codec="pcm_s16le")
    # The source clip is deliberately not closed here: the audio object that
    # is handed back to the caller was obtained from it.
    return audio_output, audio_track

def detect_speech(audio_path, aggressiveness=3):
    """Run voice-activity detection over a WAV file in 30 ms windows.

    The audio is converted to 16 kHz, mono, 16-bit PCM before being handed to
    the VAD, so the byte offsets used for slicing and for timestamps stay
    correct regardless of the bit depth of the input file.

    Args:
        audio_path: Path to a WAV file.
        aggressiveness: Value passed straight to ``webrtcvad.Vad``.

    Returns:
        A list of ``(start_seconds, end_seconds)`` tuples, one per 30 ms
        window classified as speech. Adjacent windows are not merged.
    """
    audio = AudioSegment.from_file(audio_path, format="wav")
    # Force 2 bytes per sample: the slicing below counts bytes, and the VAD is
    # fed raw PCM, so a 24/32-bit source would otherwise be mis-framed.
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    vad = webrtcvad.Vad(aggressiveness)

    window_duration = 0.03
    frame_rate = audio.frame_rate
    bytes_per_sample = audio.sample_width
    bytes_per_window = int(frame_rate * window_duration) * bytes_per_sample
    raw = audio.raw_data

    speech_intervals = []
    # The range stops early enough that a trailing partial window is never
    # produced (the VAD needs complete windows).
    for offset in range(0, len(raw) - bytes_per_window + 1, bytes_per_window):
        chunk = raw[offset:offset + bytes_per_window]
        if vad.is_speech(chunk, frame_rate):
            timestamp = offset / (frame_rate * bytes_per_sample)
            speech_intervals.append((timestamp, timestamp + window_duration))

    return speech_intervals

def load_face_detector():
    """Load the Caffe face-detection network.

    Both model files are referenced by relative path, so they are looked up
    relative to the current working directory.

    Returns:
        The network object returned by ``cv2.dnn.readNetFromCaffe``.
    """
    prototxt_path = "deploy.prototxt"
    weights_path = "res10_300x300_ssd_iter_140000.caffemodel"
    return cv2.dnn.readNetFromCaffe(prototxt_path, weights_path)

def detect_faces(frame, net):
    """Detect faces in a frame and return their bounding boxes.

    Args:
        frame: Image array; only ``frame.shape[:2]`` (height, width) and the
            pixel data are used.
        net: Network object exposing ``setInput`` and ``forward``.

    Returns:
        A list of ``(x, y, width, height)`` tuples of plain ints for every
        detection with confidence above 0.5. Boxes are clamped to the frame on
        all four sides, and boxes that end up with no area are dropped so that
        callers never receive an empty crop region.
    """
    (h, w) = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
    net.setInput(blob)
    detections = net.forward()

    # Detections are multiplied by the frame size to get pixel coordinates.
    scale = np.array([w, h, w, h])
    faces = []
    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        if confidence > 0.5:
            x1, y1, x2, y2 = (detections[0, 0, i, 3:7] * scale).astype("int")
            # Clamp both corners into the frame; a box reported partly or
            # fully outside the image must not yield negative sizes.
            x1 = int(max(0, min(x1, w)))
            x2 = int(max(0, min(x2, w)))
            y1 = int(max(0, min(y1, h)))
            y2 = int(max(0, min(y2, h)))
            if x2 <= x1 or y2 <= y1:
                continue
            faces.append((x1, y1, x2 - x1, y2 - y1))
    return faces

def moving_average(values, window_size=5):
    """Average a sequence of equal-length numeric tuples element-wise.

    Args:
        values: Sequence of tuples (e.g. bounding boxes) to average.
        window_size: Accepted for interface compatibility; the body does not
            use it, so every entry in ``values`` contributes to the mean.

    Returns:
        ``None`` when ``values`` is empty, otherwise a tuple holding the
        per-position mean truncated to integers.
    """
    if values:
        column_means = np.mean(values, axis=0)
        return tuple(column_means.astype(int))
    return None

def _expand_bbox(face, frame_width, frame_height, expand_factor=3.2):
    """Grow a face box by ``expand_factor`` around its centre, clamped to the frame."""
    x, y, w, h = face
    x_new = max(0, int(x - (w * (expand_factor - 1) / 2)))
    w_new = min(frame_width - x_new, int(w * expand_factor))
    y_new = max(0, int(y - (h * (expand_factor - 1) / 2)))
    h_new = min(frame_height - y_new, int(h * expand_factor))
    return (x_new, y_new, w_new, h_new)


def _fit_inside(src_width, src_height, box_width, box_height):
    """Return the largest (width, height) with the source aspect ratio that fits the box."""
    aspect_ratio = src_width / src_height
    if aspect_ratio > box_width / box_height:
        return box_width, max(1, int(box_width / aspect_ratio))
    return max(1, int(box_height * aspect_ratio)), box_height


def _center_on_canvas(image, canvas_width, canvas_height):
    """Paste ``image`` centred on a black canvas of the given size."""
    canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
    img_h, img_w = image.shape[:2]
    start_x = (canvas_width - img_w) // 2
    start_y = (canvas_height - img_h) // 2
    canvas[start_y:start_y + img_h, start_x:start_x + img_w] = image
    return canvas


def process_video(input_path, output_path, speech_intervals, detection_skip=5, smoothing_window=5):
    """Re-frame a video into a 720x1280 portrait clip that follows detected faces.

    For every frame that falls inside a speech interval, faces are detected
    (re-running the detector every ``detection_skip`` frames, or on every
    frame while no face is known):

    * one face: the expanded, smoothed box is cropped and letterboxed;
    * two or more faces: the first two are cropped and stacked vertically,
      each stretched to half of the output height;
    * no face, or outside speech: the last single-face box is reused, or the
      whole frame is letterboxed when no box has been seen yet.

    The result is written WITHOUT audio to the hard-coded path
    ``temp_video.mp4``.

    Args:
        input_path: Path of the source video.
        output_path: Currently unused; kept for interface compatibility
            because existing callers read ``temp_video.mp4`` afterwards.
        speech_intervals: Iterable of ``(start, end)`` second pairs.
        detection_skip: Run the detector on every N-th frame.
        smoothing_window: Number of recent boxes averaged for smoothing.
    """
    output_width = 720
    output_height = 1280

    cap = cv2.VideoCapture(input_path)
    out = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter("temp_video.mp4", fourcc, fps, (output_width, output_height))

        net = load_face_detector()
        frame_number = 0
        last_detected_faces = []
        bbox_history = deque(maxlen=smoothing_window)
        # The stacked two-face layout gets one history per slot. Feeding both
        # faces into a single deque would average the two positions together
        # and pull both crops towards the midpoint between the speakers.
        pair_histories = [deque(maxlen=smoothing_window), deque(maxlen=smoothing_window)]
        last_speaker_bbox = None

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            current_time = frame_number / fps
            in_speech = any(start <= current_time <= end for (start, end) in speech_intervals)

            # Default for "not speaking", "no face" and every fallback below.
            smoothed_bbox = last_speaker_bbox

            if in_speech:
                if frame_number % detection_skip == 0 or not last_detected_faces:
                    last_detected_faces = detect_faces(frame, net)

                if len(last_detected_faces) == 1:
                    expanded = _expand_bbox(last_detected_faces[0], width, height)
                    # Boxes without area would give an empty crop; skip them
                    # and keep showing the previous box instead.
                    if expanded[2] > 0 and expanded[3] > 0:
                        bbox_history.append(expanded)
                        smoothed_bbox = moving_average(list(bbox_history))
                        last_speaker_bbox = smoothed_bbox
                elif len(last_detected_faces) >= 2:
                    # Order the pair left-to-right so each slot keeps tracking
                    # the same person even if the detector reorders its output.
                    ordered_pair = sorted(last_detected_faces[:2], key=lambda face: face[0])
                    crops = []
                    for history, face in zip(pair_histories, ordered_pair):
                        expanded = _expand_bbox(face, width, height)
                        if expanded[2] <= 0 or expanded[3] <= 0:
                            break
                        history.append(expanded)
                        x_s, y_s, w_s, h_s = moving_average(list(history))
                        person_roi = frame[y_s:y_s + h_s, x_s:x_s + w_s]
                        crops.append(cv2.resize(person_roi, (output_width, output_height // 2)))

                    if len(crops) == 2:
                        out.write(np.vstack(crops))
                        frame_number += 1
                        continue
                    # Otherwise fall through to the single-box / full-frame path.

            if smoothed_bbox:
                x_s, y_s, w_s, h_s = smoothed_bbox
                source = frame[y_s:y_s + h_s, x_s:x_s + w_s]
                target_size = _fit_inside(w_s, h_s, output_width, output_height)
            else:
                source = frame
                full_height = int(output_width * height / width)
                if 0 < full_height <= output_height:
                    target_size = (output_width, full_height)
                else:
                    # Source is taller than the output aspect ratio: fit by
                    # height so the frame cannot overflow the canvas.
                    target_size = _fit_inside(width, height, output_width, output_height)

            resized = cv2.resize(source, target_size)
            out.write(_center_on_canvas(resized, output_width, output_height))
            frame_number += 1
    finally:
        # Release the capture and writer even when detection or resizing fails.
        cap.release()
        if out is not None:
            out.release()



def merge_audio(video_path, audio_clip, output_path):
    """Attach an audio clip to a video file and encode the result.

    Args:
        video_path: Path of the video to read.
        audio_clip: Audio clip object to set on the video. It is owned by the
            caller and is not closed here.
        output_path: Destination file, encoded with libx264 video and AAC audio.
    """
    video = VideoFileClip(video_path)
    try:
        video.set_audio(audio_clip).write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # Close the reader so the input file is no longer held open once this
        # function returns (the caller deletes it right afterwards).
        video.close()

# Source clip and final output file, both relative to the working directory.
input_video = "output.mp4"
output_video = "face_tracked_output.mp4"

# Dump the audio track to a WAV file and find the windows that contain speech.
audio_file, audio_clip = extract_audio(input_video)
speech_times = detect_speech(audio_file)

# process_video writes its silent result to the hard-coded "temp_video.mp4".
process_video(input_video, output_video, speech_times, detection_skip=5, smoothing_window=10)

# Re-attach the original audio, then delete the intermediate silent video.
merge_audio("temp_video.mp4", audio_clip, output_video)
os.remove("temp_video.mp4")
print("Video processing complete!")


  if event.key is 'enter':



MoviePy - Writing audio in temp.wav




MoviePy - Done.
Moviepy - Building video face_tracked_output.mp4.
MoviePy - Writing audio in face_tracked_outputTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video face_tracked_output.mp4






Moviepy - Done !
Moviepy - video ready face_tracked_output.mp4
Video processing complete!


## Adding subtitles

In [3]:
!pip install openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m26.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuildin

In [4]:
import torch
# Report whether a GPU is usable. The device name is only queried when CUDA is
# present, so this cell also runs on a CPU-only runtime.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available; falling back to CPU")


True
Tesla T4


In [7]:
import whisper
import cv2
import moviepy.editor as mp
import numpy as np
import torch

def transcribe_with_word_timestamps(audio_path):
    """Transcribe a media file with Whisper and return per-word timings.

    The "large" model is loaded and moved to CUDA when available, otherwise
    to the CPU; the chosen device is printed.

    Args:
        audio_path: Path passed to ``model.transcribe``.

    Returns:
        A flat list of ``(start, end, word)`` tuples taken from the ``words``
        entries of every segment in the transcription result.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    model = whisper.load_model("large").to(device)
    transcription = model.transcribe(audio_path, word_timestamps=True)

    return [
        (entry["start"], entry["end"], entry["word"])
        for segment in transcription["segments"]
        for entry in segment["words"]
    ]

def overlay_live_subtitles(video_path, subtitles, output_path):
    """Burn word-level subtitles into a video.

    For each frame, every word whose ``[start, end]`` window contains the
    frame time is drawn in white on a semi-transparent black box, centred
    horizontally and 200 px above the bottom edge.

    Args:
        video_path: Path of the video to read.
        subtitles: Iterable of ``(start, end, word)`` tuples (seconds).
        output_path: Destination video file.
    """
    video = mp.VideoFileClip(video_path)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.5
    font_thickness = 3
    padding = 20
    alpha = 0.5

    def add_text(get_frame, t):
        frame = get_frame(t)

        # Words are stripped before joining so tokens that carry their own
        # leading space do not produce doubled or leading blanks, which would
        # also skew the centring below.
        active_words = (word.strip() for start, end, word in subtitles if start <= t <= end)
        current_text = " ".join(word for word in active_words if word)
        if not current_text:
            return frame

        h, w, _ = frame.shape
        text_size = cv2.getTextSize(current_text, font, font_scale, font_thickness)[0]
        # Never start left of the frame when the text is wider than the video.
        text_x = max(0, (w - text_size[0]) // 2)
        text_y = h - 200

        # Only copy the frame when there is actually something to draw.
        overlay = frame.copy()
        bg_x1, bg_y1 = text_x - padding, text_y - text_size[1] - padding
        bg_x2, bg_y2 = text_x + text_size[0] + padding, text_y + padding
        cv2.rectangle(overlay, (bg_x1, bg_y1), (bg_x2, bg_y2), (0, 0, 0), -1)

        frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)
        cv2.putText(frame, current_text, (text_x, text_y), font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)
        return frame

    try:
        new_video = video.fl(add_text)
        new_video.write_videofile(output_path, fps=video.fps)
    finally:
        # Release the underlying file readers once encoding has finished or failed.
        video.close()

# Transcribe the face-tracked clip word by word, then burn those words into a
# new file as live subtitles.
word_subtitles = transcribe_with_word_timestamps("face_tracked_output.mp4")
overlay_live_subtitles("face_tracked_output.mp4", word_subtitles, "subtitled_output.mp4")


Using device: cuda



100%|█████████████████████████████████████| 2.88G/2.88G [00:38<00:00, 80.5MiB/s]
  checkpoint = torch.load(fp, map_location=device)



Moviepy - Building video subtitled_output.mp4.
MoviePy - Writing audio in subtitled_outputTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video subtitled_output.mp4





Moviepy - Done !
Moviepy - video ready subtitled_output.mp4


In [8]:
!pip install groq

Collecting groq
  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.18.0-py3-none-any.whl (121 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/121.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.18.0


In [9]:

import os
from groq import Groq

# Read the API key from the GROQ_API_KEY environment variable instead of
# embedding a secret in the notebook.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
filename = 'subtitled_output.mp4'

# Upload the subtitled clip and request a verbose JSON transcription, which is
# the format the later cells read `.text` and `.segments` from.
with open(filename, "rb") as file:
    transcription = client.audio.transcriptions.create(
        file=(filename, file.read()),
        model="whisper-large-v3",
        response_format="verbose_json",
    )

print(transcription.text)


 and difficult and has it got better or is it something you're always working on so on one side of things struggle that school as i said so the teachers would write these reports that i was not focused or i wasn't doing well and i remember the fear of that report every year and i try so hard to do well and then these teachers i don't know if teachers out there realize when they write those reports what's happening back home whether you have an abusive household you know the the stress of that was was difficult i feel in racing if i if i would win i could see a smile on my that's fair and it was really like okay if I do well at this I know that I'll be accepted um but I've got to work double hard to be I've got to always be first I always laugh about the whole if you're not first you're last because I'm literally say I've not been first my whole life first was everything yeah um in order to be accepted in order to and maybe to be appreciated um not only in within my relationship perhaps

In [10]:
# Keep the full transcript text and the per-segment entries from the Groq
# response for the keyword extraction and timestamp lookup cells below.
transcription_text = transcription.text
transcription_segments = transcription.segments

In [11]:
from groq import Groq
import json

def extract_keywords(transcribed_text):
    """Ask a Groq-hosted chat model for image-search keywords from a transcript.

    The model is instructed (system message) to reply with a JSON object
    containing an ``image_keyword`` entry; the transcript itself is sent as
    the user message.

    Args:
        transcribed_text: Transcript to extract keywords from.

    Returns:
        The value stored under ``image_keyword`` in the model's JSON reply.
        A single string is wrapped in a list so callers can always iterate
        over keywords, and a missing or empty value yields ``[]``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    import os  # local import so this cell does not depend on earlier cells

    # The key comes from the GROQ_API_KEY environment variable rather than a
    # literal stored in the notebook.
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {
                "role": "system",
                "content": "you are a image's keyword provider assistent, you whole task is to look into the transcribed text and by understanding the transcribed text you need to look for the important keywords or phrases and give me the 'image_keyword' in JSON such that i can use those keyword to search the web and find images of that. and those keywords should be or phrases should be in the transcribed text."
            },
            # The transcript is input for the model to analyse, so it is sent
            # as the user turn rather than as a previous assistant reply.
            {
                "role": "user",
                "content": transcribed_text
            },
        ],
        temperature=1,
        max_completion_tokens=1024,
        top_p=1,
        stream=False,
        response_format={"type": "json_object"},
        stop=None,
    )

    response_text = completion.choices[0].message.content
    keyword_data = json.loads(response_text)
    keywords = keyword_data.get("image_keyword", [])
    if not keywords:
        return []
    if isinstance(keywords, str):
        # Iterating a bare string downstream would treat every character as a keyword.
        keywords = [keywords]
    return keywords



In [12]:
!pip install bing-image-downloader

Collecting bing-image-downloader
  Downloading bing_image_downloader-1.1.2-py3-none-any.whl.metadata (2.8 kB)
Downloading bing_image_downloader-1.1.2-py3-none-any.whl (5.9 kB)
Installing collected packages: bing-image-downloader
Successfully installed bing-image-downloader-1.1.2


In [13]:
from bing_image_downloader import downloader

def download_images(keywords, limit=1):
    """Download images for each keyword and report where they were saved.

    Args:
        keywords: Iterable of search terms.
        limit: Maximum number of images requested per keyword.

    Returns:
        A dict mapping every keyword to the folder path ``images/<keyword>``.
    """
    output_root = "images"
    # NOTE(review): adult_filter_off=True is passed to the downloader, which by
    # its name switches the adult-content filter off - confirm this is intended.
    download_options = {
        "limit": limit,
        "output_dir": output_root,
        "adult_filter_off": True,
        "force_replace": False,
        "timeout": 60,
    }

    saved_folders = {}
    for term in keywords:
        downloader.download(term, **download_options)
        saved_folders[term] = f"{output_root}/{term}"
    return saved_folders


In [14]:

# Derive image keywords from the transcript and download images for each one.
keywords = extract_keywords(transcription_text)
image_paths = download_images(keywords)

print(image_paths)

[%] Downloading Images to /content/images/racing


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://images.pexels.com/photos/158971/pexels-photo-158971.jpeg?cs=srgb&amp;dl=auto-racing-car-championship-158971.jpg&amp;fm=jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
[%] Downloading Images to /content/images/school reports


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from http://www.samplestemplates.org/wp-content/uploads/2015/05/school-Report-Template-image-.png
[%] File Downloaded !



[%] Done. Downloaded 1 images.
[%] Downloading Images to /content/images/abusive household


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://domestic-violence.laws.com/wp-content/uploads/sites/79/2019/12/4dbb42db00924.jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
[%] Downloading Images to /content/images/stress


[!!]Indexing page: 1

[%] Indexed 1 Im

In [15]:
def get_keyword_timestamps(transcription_segments, keywords):
    """Find the time windows in which each keyword is spoken.

    A keyword matches a segment when it occurs in the segment text as a
    case-insensitive substring. Each match contributes the segment's time
    span widened by half a second on both sides, with the start floored at 0.

    Args:
        transcription_segments: Iterable of mappings with ``"text"``,
            ``"start"`` and ``"end"`` keys.
        keywords: Iterable of keyword strings.

    Returns:
        A dict mapping each matched keyword to a list of ``(start, end)``
        tuples in segment order. Keywords that never match are absent.
    """
    padding = 0.5
    matches = {}

    for segment in transcription_segments:
        haystack = segment["text"].lower()
        window = (max(0, segment["start"] - padding), segment["end"] + padding)

        for keyword in keywords:
            if keyword.lower() in haystack:
                matches.setdefault(keyword, []).append(window)

    return matches


In [16]:
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
import os

def get_first_image_path(folder):
    """Return the path of the first image file in a folder, or ``None``.

    Files are considered images when their name ends in ``.jpg``, ``.jpeg``
    or ``.png`` (case-insensitive). Candidates are sorted by name so the
    choice is stable from run to run instead of depending on directory
    listing order.

    Args:
        folder: Directory to search.

    Returns:
        ``os.path.join(folder, name)`` for the first matching file, or
        ``None`` when the folder is not a directory or holds no such file.
    """
    # isdir (not exists) so a path that points at a regular file returns None
    # instead of failing inside os.listdir.
    if not os.path.isdir(folder):
        return None

    image_extensions = (".jpg", ".png", ".jpeg")
    candidates = sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith(image_extensions)
    )
    return os.path.join(folder, candidates[0]) if candidates else None

def add_images_to_video(video_path, keyword_timestamps, image_paths, output_path):
    """Overlay keyword images on a video during the keyword's time windows.

    For every keyword, the first image found in its folder is scaled to 80%
    of the video width, placed at the top centre, and shown with a 0.2 s
    cross-fade in and out for each of its ``(start, end)`` windows.

    Args:
        video_path: Path of the video to read.
        keyword_timestamps: Dict mapping keyword -> list of ``(start, end)``
            second pairs.
        image_paths: Dict mapping keyword -> folder containing its image.
            Keywords without an entry are reported and skipped.
        output_path: Destination video file (libx264).
    """
    video = VideoFileClip(video_path)
    try:
        video_w, video_h = video.size
        fade_duration = 0.2
        overlays = []

        for keyword, timestamps in keyword_timestamps.items():
            # A keyword may have timestamps but no downloaded folder; treat
            # that like an empty folder instead of raising KeyError.
            folder = image_paths.get(keyword)
            img_path = get_first_image_path(folder) if folder else None

            if not img_path:
                print(f"No valid image found for {keyword}")
                continue

            img_clip = (ImageClip(img_path, transparent=True)
                        .set_duration(1.0)
                        .resize(width=video_w * 0.8)
                        .set_opacity(1))

            for start, end in timestamps:
                # Keep every overlay inside the base video: a window that runs
                # past the end would otherwise lengthen the composite clip.
                start = max(0, start)
                end = min(end, video.duration)
                if end <= start:
                    continue

                print(f"Overlaying {keyword} from {start}s to {end}s")

                clip = (img_clip
                        .set_position(("center", "top"))
                        .set_start(start)
                        .set_duration(end - start)
                        .crossfadein(fade_duration)
                        .crossfadeout(fade_duration))

                overlays.append(clip)

        if not overlays:
            print("No overlays were added. Check timestamps and image paths.")

        final_video = CompositeVideoClip([video] + overlays, size=(video_w, video_h))
        final_video.write_videofile(output_path, fps=video.fps, codec="libx264", threads=4)
    finally:
        # Release the underlying file readers once encoding has finished or failed.
        video.close()



In [18]:
# Locate when each keyword is spoken, then overlay its image on the subtitled
# video during those windows.
keyword_timestamps = get_keyword_timestamps(transcription_segments, keywords)

add_images_to_video("subtitled_output.mp4", keyword_timestamps, image_paths, "final_output.mp4")

✅ Overlaying abusive household from 17.04s to 22.4s
✅ Overlaying racing from 21.4s to 28.24s
✅ Overlaying stress from 21.4s to 28.24s
✅ Overlaying stress from 61.04s to 66.08s
⚠️ No valid image found for bigger picture
✅ Overlaying emotional roller coaster from 68.48s to 74.82s
✅ Overlaying winning a race from 86.16s to 88.52s
Moviepy - Building video final_output.mp4.
MoviePy - Writing audio in final_outputTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video final_output.mp4






Moviepy - Done !
Moviepy - video ready final_output.mp4
