# Setting up the environment

In [1]:
## Windows
# python -m venv .venv

## MacOS
# python3 -m venv .venv

## Linux
# python3 -m venv .venv

# Activating the virtual environment
## Windows
# .venv\Scripts\activate

## MacOS
# source .venv/bin/activate

## Install the dependencies
# pip install ultralytics numpy opencv-python jupyter


# Importing the necessary libraries

In [1]:
import cv2
import numpy as np
import os
from ultralytics import YOLO
import time

# Opening your own camera

In [2]:
## Explain how this works through frame by frame reading and displaying

def capture_video():
    # cap is a VideoCapture object
    cap = cv2.VideoCapture(0)   # 0 is the default camera

    while True:
        ret, frame = cap.read()  # ret is a boolean value that returns true if the frame is read correctly
        if not ret:
            break
        cv2.imshow('Camera', frame)

        # cv2.waitKey(1) & 0xFF == ord('q') is a condition that waits for 1 millisecond and checks if the 'q' key is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

capture_video()

2025-02-25 19:42:32.108 python[26863:772428] +[IMKClient subclass]: chose IMKClient_Legacy
2025-02-25 19:42:32.108 python[26863:772428] +[IMKInputSession subclass]: chose IMKInputSession_Legacy


: 

# Loading the model

In [3]:
# Explain how the model was previously trained
# Explain how a pretrained model is loaded
model = YOLO('sign_language.pt')

# Example detection of an image
image_path = 'samples/hand10.png'
image = cv2.imread(image_path)

cv2.imshow('Image', image)
while True:
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break
cv2.destroyAllWindows()

# Explain how the model returns the bounding boxes and the confidence scores and show the image with the bounding boxes
results = model.predict(image)
print(results)



0: 640x448 1 W, 39.4ms
Speed: 3.0ms preprocess, 39.4ms inference, 3.9ms postprocess per image at shape (1, 3, 640, 448)
[ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'O', 15: 'P', 16: 'Q', 17: 'R', 18: 'S', 19: 'T', 20: 'U', 21: 'V', 22: 'W', 23: 'X', 24: 'Y', 25: 'Z'}
obb: None
orig_img: array([[[151, 147, 100],
        [151, 147, 100],
        [151, 147, 100],
        ...,
        [153, 149, 103],
        [153, 149, 103],
        [153, 149, 103]],

       [[151, 147, 100],
        [151, 147, 100],
        [151, 147, 100],
        ...,
        [153, 149, 103],
        [153, 149, 103],
        [153, 149, 103]],

       [[151, 147, 100],
        [151, 147, 100],
        [151, 147, 100],
        ...,
        [153, 149, 103],
        [153, 149, 103],
        [153, 149, 103]

## Drawing the bounding boxes

In [4]:
def draw_bounding_box(frame, box, names):
    '''
    This function draws a thicker bounding box on the frame and returns the frame.
    The letter and confidence score are also displayed in larger text.
    '''
    confidence = box.conf[0].item()

    x1, y1, x2, y2 = map(int, box.xyxy[0])
    label = int(box.cls[0].item())
    detected_letter = names[label]
    
    # Parameters for a thicker bounding box and larger text
    box_thickness = 3         # Increased thickness for the bounding box
    font_scale = 1.0          # Larger font scale for the text
    text_thickness = 2        # Increased text thickness for better visibility
    
    # Draw the thick bounding box
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), box_thickness)
    
    # Draw the text with the detected letter and confidence score
    cv2.putText(frame, f'{detected_letter} {confidence:.2f}', (x1, y1 - 10), 
                cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), text_thickness)
    
    return frame

In [5]:
# Apply to the previous image and display it
results = results[0]
frame = draw_bounding_box(image, results.boxes[0], results.names)

cv2.imshow('Image', frame)
while True:
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break

## Incorporating the model

In [6]:
# Function to find the best detection

def find_best_detection(results):
    max_conf = -1
    best_box = None
    
    for result in results:
        for box in result.boxes:
            conf = box.conf[0].item()
            if conf > max_conf:
                max_conf = conf
                best_box = box
    
    if max_conf > 0.4:
        return best_box
    else:
        return None


In [7]:
def detect_video(model):

    # Start capturing video input
    cap = cv2.VideoCapture(0)

    while cap.isOpened():
        
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break
        
        # Flip the frame for a mirror effect
        frame = cv2.flip(frame, 1)
        
        # Perform detection
        results = model.predict(frame)
        
        # Find the best detection
        best_box = find_best_detection(results)
        
        if best_box is not None:
            # If we found a valid detection, draw the bounding box and the confidence score
            frame = draw_bounding_box(frame, best_box, results[0].names)
        
        # Display the video frame with the detected letter
        cv2.imshow("Sign Language Detection", frame)
        
        # Break loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


In [8]:
detect_video(model)


0: 384x640 (no detections), 51.9ms
Speed: 2.1ms preprocess, 51.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.0ms
Speed: 1.3ms preprocess, 39.0ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.2ms
Speed: 1.3ms preprocess, 40.2ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.1ms
Speed: 1.2ms preprocess, 38.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 35.0ms
Speed: 1.4ms preprocess, 35.0ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.0ms
Speed: 1.5ms preprocess, 36.0ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.7ms
Speed: 1.2ms preprocess, 33.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 37.7ms
Speed: 1.2ms preprocess, 37.7ms i

## Adding some extra features with the detected letter

Ideas:
1. Spelling with the detections
2. Display random letter and match with sign language


In [9]:

def detect_video_with_spelling(model):
    '''
    This function detects the video, draws the bounding box, shows the current spelling and displays it.
    '''
    # Start capturing video input
    cap = cv2.VideoCapture(0)
    spelling = ''
    last_detection_time = time.time()

    while cap.isOpened():
        
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break
        
        # Flip the frame for a mirror effect
        frame = cv2.flip(frame, 1)
        
        # Perform detection
        results = model.predict(frame)
        
        # Find the best detection
        best_box = find_best_detection(results)
        
        current_time = time.time()

        if best_box is not None:
            # If we found a valid detection, draw the bounding box and the confidence score
            frame = draw_bounding_box(frame, best_box, results[0].names)
            
            # Add the detected letter to the spelling if 3 seconds have passed
            if current_time - last_detection_time >= 3:
                spelling += results[0].names[int(best_box.cls[0].item())]
                last_detection_time = current_time

        # Get frame dimensions
        height, width = frame.shape[:2]
        
        # Calculate text size and position for centered text
        text = f'Spelling: {spelling}'
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 2.0
        thickness = 3
        text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
        text_x = (width - text_size[0]) // 2
        text_y = height - 50  # 50 pixels from bottom
        
        # Display the spelling centered and in black
        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (0, 0, 0), thickness)
        
        # Display the video frame with the detected letter
        cv2.imshow("Sign Language Detection", frame)
        
        # Handle key presses
        key = cv2.waitKey(1) & 0xFF

        if key == ord('q'):
            break
        elif key == ord('d') and len(spelling) > 0:
            spelling = spelling[:-1]  # Remove the last letter

    cap.release()
    cv2.destroyAllWindows()

In [10]:
detect_video_with_spelling(model)


0: 384x640 1 G, 46.4ms
Speed: 1.7ms preprocess, 46.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.6ms
Speed: 1.1ms preprocess, 41.6ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.7ms
Speed: 1.3ms preprocess, 36.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.5ms
Speed: 1.2ms preprocess, 40.5ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.0ms
Speed: 1.3ms preprocess, 39.0ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 43.3ms
Speed: 1.2ms preprocess, 43.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 35.4ms
Speed: 1.6ms preprocess, 35.4ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 35.1ms
Speed: 1.2ms preprocess, 35.1ms inference, 0.

: 