In [1]:
import mediapipe as mp
import cv2
import numpy as np

# MediaPipe handles used by this notebook: the drawing helpers, the hands
# solution module, and a detector instance that reports at most two hands.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.7,
)

def extract_hand_landmarks(image):
    """Extract a fixed-length landmark feature vector for one hand.

    Converts the BGR frame to RGB, runs the module-level MediaPipe ``hands``
    detector on it, and draws the skeleton of every detected hand onto
    ``image`` in place.

    Args:
        image: BGR frame (it is passed through ``cv2.COLOR_BGR2RGB``). The
            frame is annotated in place with the detected hand skeletons.

    Returns:
        A flat NumPy array ``[x0, y0, z0, x1, y1, z1, ...]`` with the
        coordinates reported by MediaPipe for the first detected hand, or
        ``None`` when no hand is detected.
    """
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)

    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return None

    # Every visible hand is still drawn so the user sees what was detected.
    for hand_landmarks in detected_hands:
        mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    # Only the first hand contributes features. Concatenating all hands made
    # the vector length depend on how many hands were in view (the detector
    # allows two), which produced ragged CSV rows and samples that do not
    # match the per-hand vectors classified at inference time.
    primary_hand = detected_hands[0]
    # Coordinates are used exactly as MediaPipe reports them; no extra
    # normalization is applied here.
    return np.array([[lm.x, lm.y, lm.z] for lm in primary_hand.landmark]).flatten()


I0000 00:00:1737281237.395905 1120784 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M2


In [3]:
import os
import pandas as pd

# Parameters
gestures = ['up', 'down', 'flip', 'stop', 'throttle']  # Labels for gestures
data_dir = './gesture_data'

# One hand = 21 MediaPipe landmarks x (x, y, z). Every saved row is cut to
# this width so each CSV stays rectangular even if the extractor returns the
# landmarks of more than one hand.
FEATURES_PER_HAND = 21 * 3

# Create data directory
os.makedirs(data_dir, exist_ok=True)

cap = cv2.VideoCapture(0)
current_gesture = 0

print(f"Press 's' to save landmarks for gesture: {gestures[current_gesture]}.")
print("Press 'n' to switch to the next gesture, 'q' to quit.")
if not cap.isOpened():
    # Without this message the loop below is skipped silently.
    print("Could not open camera 0; nothing will be recorded.")

# try/finally guarantees the camera and the preview window are released even
# when the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # Flip for mirror effect
        landmarks = extract_hand_landmarks(frame)

        # The gesture caption doubles as a "hand detected" indicator.
        if landmarks is not None:
            cv2.putText(frame, f"Gesture: {gestures[current_gesture]}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("Collecting Data", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('s') and landmarks is not None:
            # Append one sample (first hand only) to this gesture's CSV
            sample = landmarks[:FEATURES_PER_HAND]
            save_path = os.path.join(data_dir, f"{gestures[current_gesture]}_data.csv")
            pd.DataFrame([sample]).to_csv(save_path, mode='a', index=False, header=False)
            print(f"Saved data for gesture: {gestures[current_gesture]}")
        elif key == ord('n'):
            # Switch to the next gesture, wrapping around at the end of the list
            current_gesture = (current_gesture + 1) % len(gestures)
            print(f"Switched to gesture: {gestures[current_gesture]}")
        elif key == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


Press 's' to save landmarks for gesture: up.


W0000 00:00:1737281290.936206 1121746 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
Saved data for gesture: up
S

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import joblib

# Define gestures
gestures = ['up', 'down', 'flip', 'stop', 'throttle']

# Define data directory
data_dir = './gesture_data'

# One hand = 21 MediaPipe landmarks x (x, y, z). The inference cells classify
# one hand at a time, so the model must be trained on exactly this many columns.
N_FEATURES = 21 * 3

# Load Dataset
data = []
labels = []

for gesture in gestures:
    file_path = os.path.join(data_dir, f"{gesture}_data.csv")
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist. Skipping...")
        continue

    try:
        # Rows with MORE fields than the first row are skipped by pandas;
        # rows with FEWER fields are padded with NaN and handled below.
        df = pd.read_csv(file_path, header=None, on_bad_lines='skip')
    except Exception as e:  # best effort: report and move on to the next gesture
        print(f"Error loading {file_path}: {e}")
        continue

    if df.shape[1] < N_FEATURES:
        print(f"File {file_path} has {df.shape[1]} columns, expected at least {N_FEATURES}. Skipping...")
        continue

    # Keep the first hand's features only, so every file contributes blocks of
    # the same width (np.vstack below fails otherwise), and drop rows that were
    # NaN-padded because they were shorter than expected.
    samples = df.iloc[:, :N_FEATURES].dropna()
    dropped = len(df) - len(samples)
    if dropped:
        print(f"Dropped {dropped} incomplete row(s) from {file_path}")
    if samples.empty:
        continue

    data.append(samples.values)
    labels.extend([gesture] * len(samples))

# Combine data into numpy arrays
if data:
    data = np.vstack(data)
    labels = np.array(labels)

    # Split Data. Stratify so every gesture appears in both partitions; when a
    # gesture has too few samples for that, fall back to the plain random split.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.2, random_state=42, stratify=labels
        )
    except ValueError:
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.2, random_state=42
        )

    # Train Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate Model
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

    # Save the model
    joblib.dump(model, 'gesture_classifier.pkl')
    print("Model saved as 'gesture_classifier.pkl'")
else:
    print("No data found. Ensure your dataset is prepared correctly.")


Accuracy: 100.00%
Model saved as 'gesture_classifier.pkl'


In [12]:
import cv2

In [2]:
##This code is with the safety feature.
import mediapipe as mp
import cv2
import numpy as np
import joblib

# Classifier written to disk by the training cell.
# NOTE: joblib files are pickle-based; only load files you trust.
model = joblib.load('gesture_classifier.pkl')

# Gesture names the classifier was trained on
gestures = ['up', 'down', 'flip', 'stop', 'throttle']

# MediaPipe handles: drawing helpers, the hands solution module, and a
# detector instance limited to a single hand.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
)

# Landmark extraction for the first detected hand
def extract_hand_landmarks(image):
    """
    Run the MediaPipe detector on a BGR frame and describe the first hand found.

    Returns a pair ``(features, landmark_list)``: ``features`` is the flattened
    NumPy array of per-landmark ``[x, y, z]`` values and ``landmark_list`` is
    the MediaPipe landmark container of that same hand. Both elements are
    ``None`` when no hand is detected.
    """
    frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    detection = hands.process(frame_rgb)

    found = detection.multi_hand_landmarks
    if not found:
        return None, None

    # Only the first hand is ever used; any further detections are ignored.
    first_hand = found[0].landmark
    coords = np.array([[pt.x, pt.y, pt.z] for pt in first_hand])
    return coords.flatten(), first_hand

# Real-time Gesture Recognition
cap = cv2.VideoCapture(0)
print("Press 'q' to quit.")

# try/finally guarantees the camera and the preview window are released even
# when the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # Flip for mirror effect
        landmarks, hand_landmarks = extract_hand_landmarks(frame)

        if landmarks is not None:
            # Single forward pass: for a scikit-learn forest, predict() is the
            # class with the highest predict_proba(), so derive both from it.
            prediction_proba = model.predict_proba([landmarks])[0]
            best = int(np.argmax(prediction_proba))
            label = model.classes_[best]
            # This is the model's confidence in this one prediction, not an
            # accuracy figure.
            confidence = prediction_proba[best] * 100

            # Landmark coordinates are fractions of the frame size; convert
            # them to pixels once and reuse for the dots and the bounding box.
            frame_h, frame_w = frame.shape[:2]
            xs = [int(landmark.x * frame_w) for landmark in hand_landmarks]
            ys = [int(landmark.y * frame_h) for landmark in hand_landmarks]

            # Draw hand landmarks
            for x, y in zip(xs, ys):
                cv2.circle(frame, (x, y), 5, (0, 255, 0), -1)

            # Draw a bounding box around the hand
            x_min, x_max = min(xs), max(xs)
            y_min, y_max = min(ys), max(ys)
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

            # Display label and confidence. The baseline is clamped so both
            # caption lines stay inside the frame when the hand is near the
            # top edge.
            text_y = max(y_min - 10, 60)
            cv2.putText(frame, f"Gesture: {label}", (x_min, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(frame, f"Confidence: {confidence:.2f}%", (x_min, text_y - 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow("Real-Time Hand Gesture Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1737444216.647158 1759777 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1737444216.675577 1760253 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1737444216.684919 1760253 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Press 'q' to quit.


W0000 00:00:1737444220.191433 1760250 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


In [None]:
##This code is optimised for 2 hands
import mediapipe as mp
import cv2
import numpy as np
import joblib

# Gesture classifier written to disk by the training cell.
# NOTE: joblib files are pickle-based; only load files you trust.
model = joblib.load('gesture_classifier.pkl')

# Gesture names the classifier was trained on
gestures = ['up', 'down', 'flip', 'stop', 'throttle']

# MediaPipe handles: drawing helpers, the hands solution module, and a
# detector instance that reports at most two hands.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.7,
)

def extract_hand_landmarks(image):
    """
    Detect hands in a BGR frame and build one feature vector per hand.

    Returns a list with one ``(features, hand_landmarks)`` tuple per detected
    hand, where ``features`` is the flattened NumPy array of per-landmark
    ``[x, y, z]`` values and ``hand_landmarks`` is the MediaPipe object for
    that hand. Returns ``None`` when no hand is detected.
    """
    frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    detection = hands.process(frame_rgb)

    if not detection.multi_hand_landmarks:
        return None

    def _flatten(hand):
        # Coordinates are taken exactly as MediaPipe reports them.
        return np.array([[pt.x, pt.y, pt.z] for pt in hand.landmark]).flatten()

    return [(_flatten(hand), hand) for hand in detection.multi_hand_landmarks]

# Real-time Gesture Recognition
cap = cv2.VideoCapture(0)
print("Press 'q' to quit.")

# try/finally guarantees the camera and the preview window are released even
# when the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # Flip for mirror effect
        hands_data = extract_hand_landmarks(frame)

        if hands_data:
            frame_h, frame_w = frame.shape[:2]

            for hand_idx, (landmarks, hand_landmarks) in enumerate(hands_data):
                # Single forward pass: for a scikit-learn forest, predict() is
                # the class with the highest predict_proba(), so derive both
                # the label and its score from one call.
                prediction_proba = model.predict_proba([landmarks])[0]
                best = int(np.argmax(prediction_proba))
                label = model.classes_[best]
                # This is the model's confidence in this one prediction, not
                # an accuracy figure.
                confidence = prediction_proba[best] * 100

                # Draw hand landmarks
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Draw a bounding box around the hand. Landmark coordinates are
                # fractions of the frame size; convert them to pixels once.
                xs = [int(landmark.x * frame_w) for landmark in hand_landmarks.landmark]
                ys = [int(landmark.y * frame_h) for landmark in hand_landmarks.landmark]
                x_min, x_max = min(xs), max(xs)
                y_min, y_max = min(ys), max(ys)
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

                # Display label and confidence. The baseline is clamped so both
                # caption lines stay inside the frame when the hand is near
                # the top edge.
                text_y = max(y_min - 10, 60)
                cv2.putText(
                    frame,
                    f"Hand {hand_idx + 1}: {label}",
                    (x_min, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (0, 0, 255),
                    2
                )
                cv2.putText(
                    frame,
                    f"Confidence: {confidence:.2f}%",
                    (x_min, text_y - 30),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (0, 0, 255),
                    2
                )

        cv2.imshow("Real-Time Hand Gesture Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1737486953.944868 1890129 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1737486953.975358 1890651 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1737486953.981546 1890651 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Press 'q' to quit.


W0000 00:00:1737486956.147318 1890647 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
