In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image

# Define the CNN model structure (matching training)
class RockPaperScissorsCNN(nn.Module):
    """Small two-stage CNN that maps an RGB image to per-class logits.

    Layout: conv(3->32) -> ReLU -> 2x2 max-pool -> conv(32->64) -> ReLU ->
    2x2 max-pool -> flatten -> linear(128) -> ReLU -> linear(num_classes).

    The attribute names (conv1, conv2, pool, fc1, fc2) are part of the saved
    state-dict keys and must not be renamed.

    Args:
        num_classes: Number of output logits. Defaults to 4
            (rock, paper, scissors, unknown).
        input_size: Height/width in pixels of the square input image.
            Defaults to 128, which yields the 64 * 32 * 32 flattened features
            used by the default configuration.
    """

    def __init__(self, num_classes=4, input_size=128):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # The 3x3 convolutions with padding=1 keep the spatial size; each of the
        # two pooling steps halves it, so the feature map side is input_size // 4.
        pooled_side = input_size // 4
        self.fc1 = nn.Linear(64 * pooled_side * pooled_side, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Float tensor of shape (batch, 3, input_size, input_size).

        Returns:
            Tensor of shape (batch, num_classes) holding raw logits; no softmax
            is applied here.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # flatten (unlike view) also works when the pooled tensor is not contiguous.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Load trained model
# Checkpoint file read by torch.load below. The path is relative, so it is
# resolved against the kernel's current working directory.
model_path = "rps_model_improved.pth"

def get_device():
    """Pick the torch device to run on, preferring CUDA, then MPS, then CPU.

    Returns:
        torch.device: "cuda" if CUDA is available, otherwise "mps" if the MPS
        backend exists and reports itself available, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Not every PyTorch build ships torch.backends.mps; look it up defensively
    # so such builds fall through to CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

device = get_device()
print(f"Using device: {device}")
model = RockPaperScissorsCNN().to(device)
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state-dict checkpoint needs, and makes the loading mode explicit instead
# of relying on the library default (the bare call printed a warning for this line).
model.load_state_dict(
    torch.load(model_path, map_location=device, weights_only=True)['model_state_dict']
)
model.eval()  # Set to evaluation mode

# Define transformation for image preprocessing
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Must match training normalization
])

# Class mapping: list index == model output index
classes = ['rock', 'paper', 'scissors', 'unknown']
print(f"Class mapping: {dict(enumerate(classes))}")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Function to predict the class of a detected hand
def predict_hand(image):
    """Classify a cropped hand image with the loaded CNN.

    Args:
        image: NumPy image in OpenCV's BGR channel order; it is converted to
            RGB and resized to 128x128 before being fed to the model.

    Returns:
        A tuple ``(predicted_class, percentages)`` where ``predicted_class`` is
        the entry of ``classes`` with the highest logit and ``percentages``
        maps every class name to its softmax probability scaled to 0-100.
    """
    # OpenCV BGR array -> RGB -> PIL image at the model's input resolution
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels).resize((128, 128))

    # Preprocess and add the batch dimension the model expects
    batch = transform(pil_image).unsqueeze(0).to(device)

    # Inference only: gradients are not needed
    with torch.no_grad():
        logits = model(batch)
        _, best_index = torch.max(logits, 1)
        class_probs = F.softmax(logits, dim=1)[0]

    winner = classes[best_index.item()]

    percentages = {}
    for index, class_name in enumerate(classes):
        percentages[class_name] = float(class_probs[index]) * 100

    return winner, percentages

Using device: mps
Class mapping: {0: 'rock', 1: 'paper', 2: 'scissors', 3: 'unknown'}


  model.load_state_dict(torch.load(model_path, map_location=device)['model_state_dict'])
I0000 00:00:1742003928.573944  819426 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2 Pro
W0000 00:00:1742003928.582478  820374 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742003928.588452  820374 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [4]:
# Start webcam capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Could not open webcam (device 0); nothing to display.")

# try/finally guarantees the camera and the preview window are released even if
# the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        h, w, _ = frame.shape

        # Overlays are drawn on a copy so that crops handed to the classifier
        # never contain boxes, text or landmarks drawn for an earlier hand.
        annotated = frame.copy()

        # Convert to RGB and process with MediaPipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        for hand_landmarks in results.multi_hand_landmarks or []:
            xs = [lm.x for lm in hand_landmarks.landmark]
            ys = [lm.y for lm in hand_landmarks.landmark]

            # Bounding box around the landmarks: scale by the frame size to get
            # pixels, then clamp so the box stays within the frame.
            x_min, y_min = max(0, int(min(xs) * w)), max(0, int(min(ys) * h))
            x_max, y_max = min(w, int(max(xs) * w)), min(h, int(max(ys) * h))

            # Extract and predict hand gesture (from the clean frame)
            hand_crop = frame[y_min:y_max, x_min:x_max]
            if hand_crop.size > 0:
                predicted_class, probabilities = predict_hand(hand_crop)

                # Draw bounding box & label
                cv2.rectangle(annotated, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                label = f"{predicted_class} ({probabilities[predicted_class]:.2f}%)"
                # Keep the text baseline inside the frame when the box touches the top edge.
                label_y = max(y_min - 10, 20)
                cv2.putText(annotated, label, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            # Draw hand landmarks
            mp_drawing.draw_landmarks(annotated, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Show live feed
        cv2.imshow("Rock-Paper-Scissors Detection", annotated)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

W0000 00:00:1742003931.346879  820378 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
2025-03-14 18:58:51.574 python[17609:819426] +[IMKClient subclass]: chose IMKClient_Legacy
2025-03-14 18:58:51.574 python[17609:819426] +[IMKInputSession subclass]: chose IMKInputSession_Legacy


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image

# Define CNN Model (Must match training)
class RockPaperScissorsCNN(nn.Module):
    """Two-convolution image classifier producing one logit per gesture class.

    Pipeline: conv(3->32) + ReLU + 2x2 max-pool, conv(32->64) + ReLU + 2x2
    max-pool, flatten, linear to 128 + ReLU, linear to ``num_classes``.

    Attribute names (conv1, conv2, pool, fc1, fc2) determine the state-dict
    keys, so they must stay as they are for saved checkpoints to load.

    Args:
        num_classes: Size of the output layer. Defaults to 4
            (rock, paper, scissors, unknown).
        input_size: Side length in pixels of the square input image. Defaults
            to 128, giving 64 * 32 * 32 features after the two pooling steps.
    """

    def __init__(self, num_classes=4, input_size=128):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # padding=1 with a 3x3 kernel preserves height/width; the two 2x2 pools
        # each halve it, leaving a side of input_size // 4.
        pooled_side = input_size // 4
        self.fc1 = nn.Linear(64 * pooled_side * pooled_side, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Float tensor of shape (batch, 3, input_size, input_size).

        Returns:
            Tensor of shape (batch, num_classes) with raw (un-normalized) logits.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # flatten handles non-contiguous tensors, where view would raise.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Load trained model
# Relative path of the checkpoint passed to torch.load below; it is resolved
# against the kernel's current working directory.
model_path = "rps_model_improved.pth"
def get_device():
    """Return the preferred torch device: CUDA first, then MPS, then CPU.

    Returns:
        torch.device: "cuda" when CUDA is available; otherwise "mps" when the
        MPS backend is present and available; otherwise "cpu".
    """
    if torch.cuda.is_available():
        return torch.device("cuda")

    # torch.backends.mps is absent from some PyTorch builds; a guarded lookup
    # lets those builds fall back to CPU rather than raise AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

device = get_device()
print(f"Using device: {device}")
model = RockPaperScissorsCNN().to(device)
# weights_only=True restricts unpickling to tensors and plain containers (enough
# for a state-dict checkpoint) and states the loading mode explicitly rather
# than depending on the library default.
model.load_state_dict(
    torch.load(model_path, map_location=device, weights_only=True)['model_state_dict']
)
model.eval()  # Inference mode

# Define transformation for input images
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Class labels: list index == model output index
classes = ['rock', 'paper', 'scissors', 'unknown']
print(f"Class mapping: {dict(enumerate(classes))}")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Fixed Bounding Box Size
BOX_SIZE = 400  # Side length in pixels of the square crop centered on the hand

# Function to classify hand gesture
def predict_hand(image):
    """Run the gesture classifier on a single hand crop.

    Args:
        image: NumPy image in OpenCV's BGR channel order.

    Returns:
        ``(predicted_class, percentages)``: the name from ``classes`` with the
        largest logit, and a dict mapping each class name to its softmax
        probability expressed as a percentage (0-100).
    """
    # BGR -> RGB, then to a 128x128 PIL image
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    pil_image = pil_image.resize((128, 128))

    # Preprocess and prepend the batch dimension
    batch = transform(pil_image).unsqueeze(0).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(batch)
        _, best_index = torch.max(logits, 1)
        class_probs = F.softmax(logits, dim=1)[0]

    winner = classes[best_index.item()]
    percentages = {
        class_name: float(class_probs[index]) * 100
        for index, class_name in enumerate(classes)
    }
    return winner, percentages

In [None]:
# Start webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Could not open webcam (device 0); nothing to display.")

half_box = BOX_SIZE // 2

# try/finally guarantees the camera and the preview window are released even if
# the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        h, w, _ = frame.shape  # Get frame dimensions

        # Overlays are drawn on a copy so that crops handed to the classifier
        # never contain boxes, text or landmarks drawn for an earlier hand.
        annotated = frame.copy()

        # Convert to RGB and process with MediaPipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        for hand_landmarks in results.multi_hand_landmarks or []:
            # Get center of hand: mean landmark position scaled to pixels
            x_center = int(np.mean([lm.x for lm in hand_landmarks.landmark]) * w)
            y_center = int(np.mean([lm.y for lm in hand_landmarks.landmark]) * h)

            # Fixed-size box around the center, clamped to the frame
            # (so it is smaller than BOX_SIZE near the edges)
            x_min = max(0, x_center - half_box)
            y_min = max(0, y_center - half_box)
            x_max = min(w, x_center + half_box)
            y_max = min(h, y_center + half_box)

            # Extract hand region inside fixed-size box (from the clean frame)
            hand_crop = frame[y_min:y_max, x_min:x_max]

            # Ensure valid extraction
            if hand_crop.shape[0] > 0 and hand_crop.shape[1] > 0:
                predicted_class, probabilities = predict_hand(hand_crop)

                # Display classification result
                label = f"{predicted_class} ({probabilities[predicted_class]:.2f}%)"
                # Keep the text baseline inside the frame when the box touches the top edge.
                label_y = max(y_min - 10, 20)
                cv2.putText(annotated, label, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

            # Draw the fixed-size bounding box
            cv2.rectangle(annotated, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

            # Draw hand landmarks
            mp_drawing.draw_landmarks(annotated, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Display the frame
        cv2.imshow("Rock-Paper-Scissors Detection", annotated)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()