In [None]:
import os
from pathlib import Path

import cv2
import mediapipe as mp
import numpy as np
from IPython.display import Video


# ==== BASIC PARAMETERS ====
SEQ_LEN = 32        # fixed number of frames kept per video
FEATURE_DIM = 63    # 21 hand landmarks * 3 (x, y, z) for one hand
# FEATURE_DIM_TWO_HANDS = 126  # 21 hand landmarks * 3 (x, y, z) for two hands
# ==========================

# The project root defaults to the original checkout location; set the
# WORLDLEVEL_DIP_ROOT environment variable to run the notebook from another
# machine or directory without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get("WORLDLEVEL_DIP_ROOT", "/home/dangkhoi/dev/Projects/WorldLevel_DIP")
)
DATA_RAW_ROOT = PROJECT_ROOT / "data_raw"   # raw input videos (e.g. data_raw/happy/26531.mp4)
DATA_NPY_ROOT = PROJECT_ROOT / "data_npy"   # presumably the extracted .npy sequences — not used in the visible cells
META_DIR = PROJECT_ROOT / "meta"
FILELIST_PATH = META_DIR / "filelist.csv"

# One shared MediaPipe Hands detector for the whole notebook.
# static_image_mode=False makes MediaPipe treat successive process() calls as a
# video stream (detect once, then track); max_num_hands=1 matches FEATURE_DIM,
# which only has room for a single hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)


I0000 00:00:1764428298.502109  104558 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764428298.504072  109206 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: AMD Radeon 780M (radeonsi, phoenix, LLVM 20.1.2, DRM 3.61, 6.14.0-36-generic)


W0000 00:00:1764428298.530897  109196 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764428298.550474  109188 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [16]:
def preview_with_landmarks(video_path: str, max_frames=100, out_path="debug_hand.mp4"):
    """Render a short preview of a video with detected hand landmarks drawn on it.

    Reads up to ``max_frames`` frames from ``video_path``, runs the shared
    ``hands`` detector on each one, draws every detected hand onto the frame,
    writes the annotated frames to ``out_path`` (mp4v codec, 20 fps) and
    returns an embedded player for that file.

    Args:
        video_path: Path of the video to read.
        max_frames: Maximum number of frames to read and annotate.
        out_path: Where the annotated preview video is written.

    Returns:
        An ``IPython.display.Video`` embedding ``out_path``, or ``None`` when
        the video cannot be opened, yields no frames, or the output file
        cannot be opened for writing.
    """
    # Resolve the drawing helper here instead of relying on a module-level
    # `mp_drawing`, so the function works as soon as `mp` is imported.
    drawing = mp.solutions.drawing_utils

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print("Cannot open video:", video_path)
            return None

        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # The detector is fed RGB; OpenCV hands back BGR frames.
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(image_rgb)

            if result.multi_hand_landmarks:
                for hand_lms in result.multi_hand_landmarks:
                    # Draws in place on the BGR frame that is written out below.
                    drawing.draw_landmarks(
                        frame,
                        hand_lms,
                        mp_hands.HAND_CONNECTIONS
                    )

            frames.append(frame)
    finally:
        # Release the capture even if detection or drawing raised.
        cap.release()

    print("Captured frames:", len(frames))
    if len(frames) == 0:
        print("Không lấy được frame nào – check lại video_path hoặc OpenCV.")
        return None

    # Write the annotated frames to an mp4 file sized like the first frame.
    h, w, _ = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(out_path, fourcc, 20, (w, h))
    if not writer.isOpened():
        writer.release()
        print("Cannot open video writer:", out_path)
        return None
    try:
        for f in frames:
            writer.write(f)
    finally:
        writer.release()

    # Display through IPython's Video widget with the file embedded.
    return Video(out_path, embed=True)


In [17]:
def extract_keypoints_from_frame(image_bgr: np.ndarray) -> np.ndarray:
    """Turn one BGR frame into a flat float32 vector of hand keypoints.

    The frame is converted to RGB and passed to the shared ``hands`` detector.
    When at least one hand is found, the x, y, z values of every landmark of
    the first hand are concatenated in landmark order. When nothing is found,
    a zero vector of length ``FEATURE_DIM`` is returned instead.
    """
    detection = hands.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    detected_hands = detection.multi_hand_landmarks

    # Only the first detected hand contributes to the feature vector.
    coords = (
        [value for lm in detected_hands[0].landmark for value in (lm.x, lm.y, lm.z)]
        if detected_hands
        else []
    )

    if not coords:
        # No hand in this frame → all-zero placeholder vector.
        coords = [0.0] * FEATURE_DIM

    return np.array(coords, dtype=np.float32)


def video_to_seq(video_path: str, seq_len: int = SEQ_LEN) -> np.ndarray | None:
    """Read one video and return a fixed-length keypoint sequence.

    Every frame of the video is passed through ``extract_keypoints_from_frame``.
    The per-frame vectors are then brought to exactly ``seq_len`` rows:

    * with ``seq_len`` or more frames, ``seq_len`` frames are picked at evenly
      spaced indices from the first to the last frame;
    * with fewer frames, zero rows are appended at the end.

    Args:
        video_path: Path of the video to read.
        seq_len: Number of rows in the returned sequence.

    Returns:
        Array of shape ``(seq_len, feature_dim)``, or ``None`` when the video
        cannot be opened or contains no readable frame.
    """
    cap = cv2.VideoCapture(video_path)
    per_frame = []

    try:
        if not cap.isOpened():
            print(f"Cannot open video: {video_path}")
            return None

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            per_frame.append(extract_keypoints_from_frame(frame))
    finally:
        # Always give the capture back, including when extraction raises or
        # the video could not be opened.
        cap.release()

    if len(per_frame) == 0:
        print(f"No frames extracted from: {video_path}")
        return None

    seq = np.stack(per_frame, axis=0)  # (num_frames, feature_dim)
    num_frames = seq.shape[0]

    if num_frames >= seq_len:
        # Uniform temporal subsampling that keeps the first and last frame.
        idxs = np.linspace(0, num_frames - 1, seq_len).astype(int)
        seq = seq[idxs]
    else:
        # Zero-pad at the end. Note that zero rows are also what
        # extract_keypoints_from_frame returns for frames without a hand.
        pad = np.zeros((seq_len - num_frames, seq.shape[1]), dtype=seq.dtype)
        seq = np.concatenate([seq, pad], axis=0)

    return seq  # (seq_len, feature_dim)


In [18]:
# Sample clip used as a smoke test. Built from DATA_RAW_ROOT so it follows the
# configured project location; kept as a str because later cells pass it on.
video_path = str(DATA_RAW_ROOT / "happy" / "26531.mp4")

print("Test video:", video_path)

seq = video_to_seq(str(video_path))
# video_to_seq returns None when the file cannot be opened or has no frames;
# stop here with a clear message instead of an AttributeError on `.shape`.
assert seq is not None, f"Could not extract a keypoint sequence from {video_path}"
print("seq shape:", seq.shape)           # expected (32, 63)
print("First frame (10 values):", seq[0][:10])

# Count how many frames actually contain a hand (vector != 0).
nonzero_frames = np.sum(np.any(seq != 0.0, axis=1))
print("Frames with hand landmarks:", nonzero_frames, "/", SEQ_LEN)


Test video: /home/dangkhoi/dev/Projects/WorldLevel_DIP/data_raw/happy/26531.mp4
seq shape: (32, 63)
First frame (10 values): [ 5.8695352e-01  8.5566169e-01  1.9220279e-07  5.7223064e-01
  8.2091111e-01 -9.1049075e-03  5.4879409e-01  8.0311847e-01
 -1.3833984e-02  5.2890819e-01]
Frames with hand landmarks: 32 / 32


In [19]:
from IPython.display import HTML
from base64 import b64encode

# MediaPipe drawing helpers; draw_landmarks() overlays detected hand landmarks on a frame.
mp_drawing = mp.solutions.drawing_utils

def preview_with_landmarks(video_path: str, max_frames=100, out_path="debug_hand.mp4"):
    """Show a video's first frames with hand landmarks drawn, inline as HTML.

    Reads up to ``max_frames`` frames from ``video_path``, runs the shared
    ``hands`` detector on each one, draws every detected hand onto the frame,
    writes the annotated frames to ``out_path`` (mp4v codec, 20 fps) and
    returns an HTML ``<video>`` element carrying that file as a base64 data URL.

    Args:
        video_path: Path of the video to read.
        max_frames: Maximum number of frames to read and annotate.
        out_path: Temporary file the annotated preview is written to.

    Returns:
        An ``IPython.display.HTML`` object, or ``None`` when the video cannot
        be opened or yields no frames.

    NOTE(review): browsers may not be able to play the mp4v codec inside an
    HTML5 <video> tag — confirm that the embedded preview actually renders.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    try:
        if not cap.isOpened():
            print("Cannot open video:", video_path)
            return None

        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # The detector is fed RGB; OpenCV hands back BGR frames.
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(image_rgb)

            if result.multi_hand_landmarks:
                for hand_lms in result.multi_hand_landmarks:
                    # Draws in place on the BGR frame that is written out below.
                    mp_drawing.draw_landmarks(
                        frame,
                        hand_lms,
                        mp_hands.HAND_CONNECTIONS
                    )

            frames.append(frame)
    finally:
        # Release the capture even if detection or drawing raised.
        cap.release()

    if not frames:
        # Without this guard frames[0] below would raise IndexError.
        print("No frames captured - check video_path or the OpenCV build.")
        return None

    # Write a temporary mp4 file, then embed it into the notebook.
    h, w, _ = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(out_path, fourcc, 20, (w, h))
    try:
        for f in frames:
            out.write(f)
    finally:
        out.release()

    # Context manager so the file handle is closed deterministically.
    with open(out_path, "rb") as fh:
        mp4 = fh.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML(f'<video controls src="{data_url}" />')

# Preview the sample clip chosen in the test cell; as the cell's last expression
# the returned object is rendered inline by the notebook.
preview_with_landmarks(str(video_path))
