# Touchless HCI for Media Control Using Hand Gestures (VLC)

**Objective:**  
Develop a real-time touchless Human–Computer Interaction (HCI) system using MediaPipe Hands to control VLC Media Player via hand gestures.

**Key Features:**
- MediaPipe Hands for hand landmark detection
- Gesture-based VLC control using keyboard shortcuts
- Real-time FPS, accuracy, and end-to-end latency measurement
- Optimized for real-time performance (<200 ms latency)


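Before running the notebook, start VLC from Command Prompt with: `"C:\Program Files\VideoLAN\VLC\vlc.exe" --extraintf http --http-port 8090 --http-password vlc123`. The code cells below control VLC by sending keyboard shortcuts, so keep the VLC window focused while the main loop runs.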

## 1. Imports and Setup

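Import all libraries required by the pipeline. OpenCV (`cv2`), MediaPipe, and pynput must already be installed in the environment; this cell only imports them.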

In [8]:
import cv2
import time
import math
import mediapipe as mp
from pynput.keyboard import Controller, Key

CELL 2 — VLC KEYBOARD CONTROL LOGIC

In [9]:
# Shared pynput keyboard controller; send_vlc_key() uses it to emit key events.
keyboard = Controller()

def send_vlc_key(gesture):
    """Emit the keyboard shortcut mapped to *gesture*.

    The keys are sent through the module-level ``keyboard`` controller, so
    they reach whichever window currently has keyboard focus.

    Args:
        gesture: Gesture name. Recognized values are ``"play_pause"``,
            ``"stop"``, ``"volume_up"``, ``"volume_down"``, ``"next"``,
            ``"previous"``, ``"minimize"`` and ``"close"``. Any other value
            is ignored and no key is sent.

    Returns:
        None.
    """
    # Each chord lists its keys in press order; they are released in reverse
    # order. NOTE(review): these look like VLC's default hotkeys plus the
    # Windows window-manager shortcuts (Win+Down, Alt+F4) -- confirm against
    # the hotkey configuration of the target machine.
    chords = {
        "play_pause": (Key.space,),
        "stop": ('s',),
        "volume_up": (Key.ctrl, Key.up),
        "volume_down": (Key.ctrl, Key.down),
        "next": ('n',),
        "previous": ('p',),
        "minimize": (Key.cmd, Key.down),
        "close": (Key.alt, Key.f4),
    }

    chord = chords.get(gesture)
    if chord is None:
        return

    held = []
    try:
        for key in chord:
            keyboard.press(key)
            held.append(key)
    finally:
        # Always release what was pressed, even if a later press failed, so a
        # modifier such as Ctrl or Alt is never left stuck down.
        for key in reversed(held):
            keyboard.release(key)


CELL 3 — MEDIAPIPE INITIALIZATION

In [10]:
# Shorthand handles for the MediaPipe Hands solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Single hand-tracker instance; the main loop calls hands.process() on every
# frame and hands.close() on exit.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,  # the main loop only reads the first detected hand
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)

CELL 4 — GESTURE & FEATURE EXTRACTION

In [11]:
# Minimum time, in seconds, that must pass after the last dispatched action
# before the main loop dispatches another one (compared against differences
# of time.time() values).
GENERAL_COOLDOWN = 0.25  # used for every gesture except "next" / "previous"
NAV_COOLDOWN = 1.2  # used for "next" and "previous"

def distance(a, b):
    """Return the 2D Euclidean distance between points *a* and *b*.

    Only the ``x`` and ``y`` attributes of each point are read; any other
    attribute (for example a depth value) is ignored.
    """
    delta_x = a.x - b.x
    delta_y = a.y - b.y
    return math.hypot(delta_x, delta_y)

def get_finger_states(landmarks):
    """Classify each of the five fingers as extended (1) or folded (0).

    A finger counts as extended when its tip landmark is more than 1.15 times
    as far from the wrist (landmark 0) as its reference joint landmark.

    Args:
        landmarks: Object whose ``landmark`` attribute is an indexable
            sequence of points with ``x`` and ``y`` attributes; indices up
            to 20 are read.

    Returns:
        A list of five ints (0 or 1), one per (tip, joint) pair in the order
        listed below. The first entry is the thumb; the remaining entries are
        presumably index, middle, ring and pinky per MediaPipe's hand
        landmark numbering -- confirm.
    """
    points = landmarks.landmark
    origin = points[0]

    # (tip index, reference joint index) for each finger.
    joint_pairs = ((4, 2), (8, 6), (12, 10), (16, 14), (20, 18))

    return [
        int(distance(points[tip], origin) > distance(points[joint], origin) * 1.15)
        for tip, joint in joint_pairs
    ]

def recognize_gesture(states, landmarks):
    """Map a finger-state list to a gesture name.

    Args:
        states: List of five 0/1 flags, thumb first, as returned by
            ``get_finger_states``.
        landmarks: Object whose ``landmark`` attribute is an indexable
            sequence of points; the ``y`` of landmark 0 (wrist) and landmark 4
            (thumb tip) decide between volume up and volume down.

    Returns:
        One of ``"play_pause"``, ``"stop"``, ``"volume_up"``,
        ``"volume_down"``, ``"next"``, ``"previous"``, ``"minimize"``,
        ``"close"``, or ``"unknown"`` when no pattern matches.
    """
    points = landmarks.landmark
    wrist = points[0]
    thumb_tip = points[4]

    # Thumb-only pose: direction is decided by the thumb tip's y relative to
    # the wrist (smaller y presumably means higher in the image -- confirm).
    # Equal y values match neither direction.
    if states == [1, 0, 0, 0, 0]:
        if thumb_tip.y < wrist.y:
            return "volume_up"
        if thumb_tip.y > wrist.y:
            return "volume_down"
        return "unknown"

    # Poses that depend on the finger pattern alone.
    fixed_poses = (
        ([1, 1, 1, 1, 1], "play_pause"),
        ([0, 0, 0, 0, 0], "stop"),
        ([1, 1, 0, 0, 1], "next"),
        ([1, 0, 0, 0, 1], "previous"),
        ([1, 1, 1, 1, 0], "minimize"),
        ([1, 1, 1, 0, 1], "close"),
    )
    for pattern, name in fixed_poses:
        if states == pattern:
            return name

    return "unknown"


CELL 5 — GESTURE STABILIZATION

In [12]:
class GestureStabilizer:
    """Report a gesture only after it has filled the whole recent window.

    The stabilizer keeps the most recent ``frames`` gestures in ``history``
    and confirms a gesture once every slot of that window holds it.
    """

    def __init__(self, frames=2):
        """Create a stabilizer with a window of *frames* updates."""
        self.frames = frames
        # Most recent gestures, oldest first.
        self.history = []

    def update(self, gesture):
        """Record *gesture* and return it if it is now stable.

        Args:
            gesture: The gesture observed on the current update.

        Returns:
            *gesture* when it occupies all ``frames`` slots of the window,
            otherwise ``None``.
        """
        window = self.history
        window.append(gesture)

        # Drop the oldest entry once the window has overflowed.
        if len(window) > self.frames:
            del window[0]

        is_stable = window.count(gesture) == self.frames
        return gesture if is_stable else None

# Module-level stabilizer used by the main loop; a gesture must be seen on
# 2 consecutive updates before it is reported as stable.
stabilizer = GestureStabilizer(frames=2)


CELL 6 — CAMERA & PERFORMANCE METRICS

In [13]:
# Open the camera at device index 0 and request a 640x480 capture size.
# Named property constants replace the raw ids 3 and 4 (same values).
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# FPS bookkeeping: frames are counted and published about once per second.
fps_start = time.time()
fps_counter = 0
fps = 0

# Latency display value (milliseconds) and the timestamp its timer started
# at; None means no measurement is in progress.
latency_ms = 0
latency_start_time = None

# Last gesture that triggered an action and when it was dispatched
# (time.time() value), used for de-duplication and cooldowns.
last_gesture = None
last_action_time = 0

# Recognition statistics: count of stable results, how many of them were not
# "unknown", and the resulting percentage shown on screen.
total_gestures = 0
valid_gestures = 0
accuracy = 0

# Upper bound on the processing rate; FRAME_DELAY is the per-frame time
# budget in seconds.
TARGET_FPS = 18
FRAME_DELAY = 1 / TARGET_FPS

print("Touchless VLC Control Started | Focus VLC window | Press 'q' to exit")


Touchless VLC Control Started | Focus VLC window | Press 'q' to exit


CELL 7 — MAIN EXECUTION LOOP

In [14]:
# Main loop: capture a frame, detect the hand, stabilize the recognized
# gesture, dispatch the mapped shortcut, and draw the performance overlays.

# Gestures that may repeat while held and use the longer NAV_COOLDOWN.
NAV_GESTURES = ("next", "previous")

# Stabilized gesture the latency timer was last (re)started for. Tracking it
# lets the timer restart whenever the stable gesture changes, so time spent
# holding an earlier gesture is not counted as latency of the next action.
latency_gesture = None

try:
    while True:
        frame_start = time.time()

        ret, frame = cap.read()
        if not ret:
            # No frame could be read; stop the loop.
            break

        # Mirror the image horizontally.
        frame = cv2.flip(frame, 1)

        # FPS: count frames and publish the count about once per second.
        fps_counter += 1
        if time.time() - fps_start >= 1:
            fps = fps_counter
            fps_counter = 0
            fps_start = time.time()

        # The camera frame is BGR; it is converted to RGB for hands.process().
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks:
            hand = results.multi_hand_landmarks[0]
            mp_draw.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)

            states = get_finger_states(hand)
            gesture = recognize_gesture(states, hand)
            stable = stabilizer.update(gesture)

            # Start the latency timer at the moment a new actionable gesture
            # becomes stable; clear it when stability is lost or the result
            # is "unknown".
            if stable != latency_gesture:
                latency_gesture = stable
                if stable and stable != "unknown":
                    latency_start_time = time.time()
                else:
                    latency_start_time = None

            if stable:
                total_gestures += 1
                if stable != "unknown":
                    valid_gestures += 1

                    now = time.time()
                    is_nav = stable in NAV_GESTURES
                    cooldown = NAV_COOLDOWN if is_nav else GENERAL_COOLDOWN

                    # Non-navigation gestures fire once per hold; navigation
                    # gestures may repeat, limited only by their cooldown.
                    if (
                        (stable != last_gesture or is_nav)
                        and (now - last_action_time) > cooldown
                    ):
                        send_vlc_key(stable)

                        # Stop the timer: stable gesture -> shortcut dispatched.
                        if latency_start_time is not None:
                            latency_ms = int((time.time() - latency_start_time) * 1000)
                            latency_start_time = None

                        last_gesture = stable
                        last_action_time = now

                # Share of stable results that were a known gesture.
                accuracy = int((valid_gestures / total_gestures) * 100)

                cv2.putText(frame, f"Gesture: {stable}", (10, 120),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
        else:
            # Hand lost: forget history so the same gesture can trigger again.
            stabilizer.history.clear()
            last_gesture = None
            latency_start_time = None
            latency_gesture = None

        # ================= OVERLAYS =================
        cv2.putText(frame, f"FPS: {fps}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0,255,0), 2)

        cv2.putText(frame, f"Latency (E2E): {latency_ms} ms", (10, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0,255,255), 2)

        cv2.putText(frame, f"Accuracy: {accuracy}%", (10, 90),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,200,0), 2)

        cv2.imshow("Touchless VLC Control", frame)

        # Pace the loop to TARGET_FPS by waiting inside waitKey rather than
        # sleeping first, so the frame just rendered is shown without delay.
        # waitKey(0) would block indefinitely, hence the 1 ms floor.
        remaining = FRAME_DELAY - (time.time() - frame_start)
        delay_ms = max(1, int(remaining * 1000))
        if cv2.waitKey(delay_ms) & 0xFF == ord('q'):
            break
finally:
    # Release the camera, the MediaPipe graph, and the windows even when the
    # loop exits through an exception or a notebook interrupt.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()