In [2]:
# Install the third-party packages imported by the next cell (cv2, mediapipe, numpy).
pip install opencv-python mediapipe numpy




In [5]:
import cv2
import mediapipe as mp
import numpy as np

# Import the new API components for MediaPipe Tasks
from mediapipe.tasks import python
from mediapipe.tasks.python.vision import FaceLandmarker, FaceLandmarkerOptions, RunningMode

# Download the FaceLandmarker model bundle (.task file) that the detector setup below loads
# through `model_asset_path`.
# This model is the successor to the older Face Mesh.
# More details: https://developers.google.com/mediapipe/solutions/vision/face_landmarker/python
# NOTE(review): the local filename says "with_attention", but the URL below fetches the
# float16 `face_landmarker.task` bundle; the name is only a local label -- confirm intent.
model_path = 'face_landmarker_with_attention.task'
# `!wget` is a notebook shell escape: -q silences progress output and -O writes the download
# to {model_path} (IPython substitutes the Python variable into the shell command).
!wget -q -O {model_path} https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task

# -----------------------------
# Load video & overlay image
# -----------------------------
video_path = "input_video.mp4"
overlay_path = "overlay.png"   # Must be a PNG with an alpha channel (4 channels once loaded)

cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise on a missing or unreadable file; without this check the
# script would silently produce an empty 0x0 output video.
if not cap.isOpened():
    raise IOError(f"Could not open input video: {video_path}")

# IMREAD_UNCHANGED keeps the alpha channel instead of dropping it.
overlay_img = cv2.imread(overlay_path, cv2.IMREAD_UNCHANGED)

# imread returns None on failure and a 2-D array for grayscale images, so the number of
# dimensions must be checked before indexing shape[2].
if overlay_img is None or overlay_img.ndim != 3 or overlay_img.shape[2] != 4:
    cap.release()
    raise ValueError("Overlay image must be a transparent PNG with an alpha channel")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
# Some containers report 0, a negative value or NaN for the frame rate; fall back to a
# sane default because fps is used both by the writer and for frame timestamps.
if not np.isfinite(fps) or fps <= 0:
    fps = 30.0

out = cv2.VideoWriter(
    "output_video.mp4",
    cv2.VideoWriter_fourcc(*"mp4v"),  # Codec for output video
    fps,
    (width, height),
)
# Like VideoCapture, VideoWriter fails silently (e.g. codec unavailable); every later
# out.write() would then be a no-op.
if not out.isOpened():
    cap.release()
    raise IOError("Could not open output_video.mp4 for writing")

# -----------------------------
# MediaPipe Face Landmarker (New API Setup)
# -----------------------------
# Point the task at the downloaded model bundle.
base_options = python.BaseOptions(model_asset_path=model_path)

# Landmarker settings gathered in one place:
#   - VIDEO running mode, because frames are fed in sequence with timestamps
#   - a single face is tracked
#   - blendshapes and transformation matrices are switched off (unused here)
_landmarker_settings = {
    "running_mode": RunningMode.VIDEO,
    "num_faces": 1,
    "output_face_blendshapes": False,
    "output_facial_transformation_matrixes": False,
}
options = FaceLandmarkerOptions(base_options=base_options, **_landmarker_settings)

# Instantiate the landmarker used by the frame loop.
detector = FaceLandmarker.create_from_options(options)

# -----------------------------
# Overlay function (alpha blend with robust boundary handling)
# -----------------------------
def overlay_transparent(bg, overlay, x, y, w, h):
    """Alpha-blend ``overlay`` onto ``bg`` with its top-left corner at (x, y).

    The overlay is scaled to ``w`` x ``h`` pixels and clipped to the bounds of
    ``bg``, so it may lie partly (or entirely) outside the frame.

    Args:
        bg: Background image array of shape (H, W, >=3). Modified in place.
        overlay: Overlay image array of shape (h0, w0, >=4); channel 3 is
            used as the alpha channel on a 0-255 scale.
        x, y: Top-left position of the scaled overlay in ``bg`` pixel
            coordinates; may be negative.
        w, h: Target width and height of the overlay in pixels.

    Returns:
        ``bg`` (the same array object). It is returned unchanged when the
        target size is not positive, the overlay has no alpha channel, or
        the overlay falls completely outside the frame.
    """
    # Nothing to draw for a degenerate target size.
    if w <= 0 or h <= 0:
        return bg

    # A usable overlay is 3-D with at least four channels. Checking ndim first
    # avoids an IndexError on 2-D (grayscale) images, and checking before the
    # resize avoids scaling an image that will be discarded anyway.
    if overlay.ndim != 3 or overlay.shape[2] < 4:
        return bg

    # Region of the background actually covered by the overlay.
    y1, y2 = max(0, y), min(bg.shape[0], y + h)
    x1, x2 = max(0, x), min(bg.shape[1], x + w)
    if y1 >= y2 or x1 >= x2:
        return bg  # Overlay lies completely outside the frame

    # Resizing to the current size yields the same pixels, so skip the copy.
    if overlay.shape[0] == h and overlay.shape[1] == w:
        overlay_resized = overlay
    else:
        overlay_resized = cv2.resize(overlay, (w, h))

    # Matching region of the overlay: shift the clipped background window back
    # into overlay coordinates.
    patch = overlay_resized[y1 - y:y2 - y, x1 - x:x2 - x]
    patch_rgb = patch[:, :, :3]
    alpha = patch[:, :, 3:4] / 255.0  # Kept 3-D so it broadcasts over the colour channels

    roi = bg[y1:y2, x1:x2, :3]
    blended = alpha * patch_rgb + (1.0 - alpha) * roi

    # Assigning floats to an integer image truncates, which can darken a pixel
    # by one level through rounding error alone; round to nearest instead.
    if np.issubdtype(bg.dtype, np.integer):
        limits = np.iinfo(bg.dtype)
        blended = np.clip(np.rint(blended), limits.min, limits.max)

    bg[y1:y2, x1:x2, :3] = blended
    return bg


# -----------------------------
# Smoothing variables for stable overlay placement
# -----------------------------
# Previous smoothed centre (x, y) and eye distance; all stay None until the
# first frame in which a face is detected.
prev_cx = prev_cy = prev_eye_dist = None

# Weight of the previous value in the moving average (0.0 to 1.0); the closer
# to 1.0, the heavier the smoothing.
alpha_smooth = 0.7


# -----------------------------
# Process video frame by frame
# -----------------------------
timestamp_ms = 0  # Timestamp of the frame currently being processed (VIDEO mode needs one)
frame_index = 0   # Number of frames read so far; timestamps are derived from it

# try/finally guarantees the capture, writer and detector are released even if a
# frame fails mid-way; an unreleased VideoWriter leaves an unplayable output file.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # End of video or read error
            break

        # MediaPipe expects RGB, OpenCV decodes to BGR.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

        # Derive the timestamp from the frame index instead of accumulating
        # int(1000 / fps): the accumulated truncation drifts away from the real
        # timeline (33 ms steps at 30 fps lose 600 ms per minute).
        timestamp_ms = int(frame_index * 1000 / fps)
        frame_index += 1
        face_landmarker_result = detector.detect_for_video(mp_image, timestamp_ms)

        if face_landmarker_result.face_landmarks:  # At least one face detected
            lm = face_landmarker_result.face_landmarks[0]  # Landmarks of the first face

            # Landmarks used as the two eye anchor points.
            left_eye_landmark = lm[33]    # Approximate left eye outer corner
            right_eye_landmark = lm[263]  # Approximate right eye outer corner

            # Normalized coordinates -> pixel coordinates. Everything stays in
            # floats until the final placement so the smoothing below does not
            # lose sub-pixel motion.
            lx, ly = left_eye_landmark.x * width, left_eye_landmark.y * height
            rx, ry = right_eye_landmark.x * width, right_eye_landmark.y * height

            # Midpoint between the eyes (placement) and their distance (scale).
            cx = (lx + rx) / 2.0
            cy = (ly + ry) / 2.0
            eye_dist = float(np.hypot(rx - lx, ry - ly))

            # Exponential moving average to reduce jitter. The state is kept as
            # floats: truncating it to int every frame makes the average stall
            # short of the target whenever the remaining step is under one pixel.
            if prev_cx is not None:
                cx = alpha_smooth * prev_cx + (1 - alpha_smooth) * cx
                cy = alpha_smooth * prev_cy + (1 - alpha_smooth) * cy
                eye_dist = alpha_smooth * prev_eye_dist + (1 - alpha_smooth) * eye_dist
            prev_cx, prev_cy, prev_eye_dist = cx, cy, eye_dist

            # Overlay size: width scales with eye distance, height keeps the
            # overlay image's aspect ratio.
            overlay_w = int(round(2.0 * eye_dist))  # Scale factor for width, adjust as needed
            overlay_h = int(round(
                overlay_w * overlay_img.shape[0] / overlay_img.shape[1]
            ))

            # Top-left corner: centred between the eyes and lifted by 15% of the
            # overlay height.
            x = int(round(cx - overlay_w / 2.0))
            y = int(round(cy - overlay_h / 2.0 - 0.15 * overlay_h))

            # The position is deliberately NOT clamped into the frame:
            # overlay_transparent clips the overlay itself, whereas clamping
            # would slide the glasses off the eyes near the frame edges.
            frame = overlay_transparent(
                frame, overlay_img, x, y, overlay_w, overlay_h
            )

        out.write(frame)  # Write the processed frame to the output video
        # No cv2.imshow / cv2.waitKey preview here: they do not work in Colab
        # environments, which have no display.
finally:
    # -----------------------------
    # Cleanup
    # -----------------------------
    cap.release()     # Release video capture object
    out.release()     # Release video writer object (finalizes the output file)
    # cv2.destroyAllWindows() is not needed: no windows are opened.
    detector.close()  # Close the MediaPipe FaceLandmarker detector to release resources