In [1]:
import cv2
import torch
import torch.nn as nn
from ultralytics import YOLO
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
class_names = [
    "0","1","2","3","4","5","6","7","8","9",
    "A","B","C","D","E","F","G","H","I","J",
    "K","L","M","N","O","P","Q","R","S","T",
    "U","V","W","X","Y","Z"
]


In [4]:
yolo_model = YOLO("yolo_models/asl_yolo_v8/weights/best.pt")


In [5]:
class ASLCNN(nn.Module):
    def __init__(self, num_classes=26):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),     # 14√ó14

            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),     # 7√ó7

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),     # 3√ó3
        )

        self.classifier = nn.Sequential(
            nn.Linear(128 * 3 * 3, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)


In [6]:
cnn_model = ASLCNN(num_classes=36).to(device)
cnn_model.load_state_dict(torch.load(
    "d:/Computer Vision/ASL Recognition/models/cnn/asl_cnn.pth",
    map_location=device
))
cnn_model.eval()


NVIDIA GeForce RTX 5060 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5060 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



ASLCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=1152, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=36, bias=True)
  )
)

In [7]:
cnn_transform = transforms.Compose([
    transforms.Resize((28, 28)),   # üî• FIX
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.5, 0.5, 0.5],
        std=[0.5, 0.5, 0.5]
    )
])


In [8]:
def yolo_cnn_image(image_path, conf_yolo=0.5):
    image = cv2.imread(image_path)

    results = yolo_model(image, conf=conf_yolo)[0]

    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        crop = image[y1:y2, x1:x2]

        if crop.size == 0:
            continue

        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        crop_pil = Image.fromarray(crop_rgb)
        input_tensor = cnn_transform(crop_pil).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = cnn_model(input_tensor)
            probs = torch.softmax(outputs, dim=1)
            conf_cnn, pred = torch.max(probs, 1)

        label = class_names[pred.item()]

        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            image,
            f"{label} ({conf_cnn.item():.2f})",
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9,
            (0, 255, 0),
            2
        )

    return image


In [10]:
import cv2
import torch
import sys
from torchvision import transforms
last_letter = None

# -------------------- MODELS --------------------
cnn_model.eval()
yolo_model.to("cpu")          # keep CPU until everything is stable
device = next(cnn_model.parameters()).device

# -------------------- TRANSFORM --------------------
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((28, 28)),   # ‚úÖ MUST BE 28
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5],
                         [0.5, 0.5, 0.5])
])


# -------------------- CAMERA --------------------
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)  # CAP_DSHOW fixes Windows issues
window_name = "ASL Detection"
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

if not cap.isOpened():
    print("‚ùå Camera not opened")
    sys.exit(1)

print("‚úÖ Camera started")

# -------------------- MAIN LOOP --------------------
while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)
    frame = cv2.resize(frame, (640, 480))

    # ---------- YOLO ----------
    results = yolo_model(frame, verbose=False)[0]

    for box in results.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])

        h, w, _ = frame.shape
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w, x2), min(h, y2)

        hand = frame[y1:y2, x1:x2]
        if hand.size == 0:
            continue

        # BGR ‚Üí RGB
        hand = cv2.cvtColor(hand, cv2.COLOR_BGR2RGB)

        # ---------- CNN ----------
        hand_tensor = transform(hand).unsqueeze(0).to(device)
        with torch.no_grad():
            pred = cnn_model(hand_tensor).argmax(dim=1).item()

        label = class_names[pred]
        current_letter = class_names[pred]

        if current_letter != last_letter:
            print("Detected:", current_letter)
            last_letter = current_letter


        # ---------- DRAW ----------
        label_text = f"{class_names[pred]}"

        # Move text INSIDE the box to ensure visibility
        text_x = max(x1, 10)
        text_y = max(y1 + 30, 30)

        # Solid background for text (VERY IMPORTANT)
        (text_w, text_h), _ = cv2.getTextSize(
            label_text,
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            2
        )

        cv2.rectangle(
            frame,
            (text_x, text_y - text_h - 10),
            (text_x + text_w + 10, text_y),
            (0, 0, 0),      # black background
            -1
        )

        cv2.putText(
            frame,
            label_text,
            (text_x + 5, text_y - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),    # bright green text
            2,
            cv2.LINE_AA
        )

        # Bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)


    cv2.imshow(window_name, frame)

    # ---------- EXIT CONDITIONS ----------
    # ESC key
    if cv2.waitKey(1) == 27:
        break

    # Window close button (‚ùå)
    if cv2.getWindowProperty(window_name, cv2.WND_PROP_VISIBLE) < 1:
        break

# -------------------- CLEAN EXIT --------------------
print("üõë Exiting program")
cap.release()
cv2.destroyAllWindows()
sys.exit(0)


‚úÖ Camera started
Detected: Y
Detected: P
Detected: Y
Detected: P
Detected: Y
Detected: P
Detected: Y
Detected: A
Detected: J
Detected: D
Detected: L
Detected: X
Detected: A
Detected: X
Detected: A
Detected: O
Detected: J
Detected: P
Detected: Y
Detected: O
Detected: P
Detected: J
Detected: P
Detected: F
Detected: P
Detected: H
Detected: P
üõë Exiting program


SystemExit: 0