In [1]:
import cv2
import numpy as np
import mediapipe as mp
import time
import handTracker as htm

In [2]:
import torch
import torchvision.transforms as transforms
import torch.nn as nn

In [3]:
class CNNModel(nn.Module):
    """Four-stage convolutional classifier for 3-channel images.

    Each stage is Conv2d -> BatchNorm2d -> ReLU; the first three stages are
    followed by 2x2 max pooling. The flattened feature map goes through
    dropout and three fully connected layers that emit raw class scores.

    ``fc1`` is sized for a 64 x 27 x 27 feature map, which is what the
    convolution/pooling stack produces for 224 x 224 inputs
    (224 -> 112 -> 110 -> 55 -> 27); other spatial sizes will not match.

    Sub-module names and creation order define the ``state_dict`` keys, so
    they must stay as they are for saved checkpoints to load.
    """

    def __init__(self, num_classes):
        """Create the layers.

        Args:
            num_classes: Number of output scores produced by ``fc3``.
        """
        super().__init__()

        # Convolutional feature extractor (conv2 is the only unpadded layer).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1)

        # Shared 2x2 down-sampling layer (it has no parameters).
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head.
        self.fc1 = nn.Linear(in_features=64 * 27 * 27, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=num_classes)

        self.dropout = nn.Dropout(p=0.5)

        # One batch-norm layer per convolution, matching its channel count.
        self.batch_normalization_1 = nn.BatchNorm2d(num_features=32)
        self.batch_normalization_2 = nn.BatchNorm2d(num_features=32)
        self.batch_normalization_3 = nn.BatchNorm2d(num_features=64)
        self.batch_normalization_4 = nn.BatchNorm2d(num_features=64)

    def forward(self, x):
        """Compute raw class scores for a batch of images.

        Args:
            x: Image batch with 3 channels; spatial size must yield a
                27 x 27 map after the conv stack (e.g. 224 x 224).

        Returns:
            Tensor of un-normalised scores, one row per input image and
            ``num_classes`` columns.
        """
        # (convolution, batch norm, apply pooling afterwards?)
        stages = (
            (self.conv1, self.batch_normalization_1, True),
            (self.conv2, self.batch_normalization_2, True),
            (self.conv3, self.batch_normalization_3, True),
            (self.conv4, self.batch_normalization_4, False),
        )
        for conv, norm, downsample in stages:
            x = torch.relu(norm(conv(x)))
            if downsample:
                x = self.pool(x)

        # Keep the batch dimension, collapse everything else.
        x = self.dropout(torch.flatten(x, 1))

        for hidden in (self.fc1, self.fc2):
            x = torch.relu(hidden(x))

        return self.fc3(x)

In [4]:
# Checkpoint holding the trained weights (a state_dict for CNNModel).
model_path = 'sign_language_detection.pth'

# 29 output classes; this must equal the number of labels in the
# index -> label mapping used when decoding predictions.
model = CNNModel(29)

# map_location forces every tensor onto the CPU, so the checkpoint loads on
# machines without a GPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source, or pass
# weights_only=True if the installed PyTorch version supports it.
state_dict = torch.load(model_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

# Inference mode: dropout is disabled and batch norm uses its running stats.
model.eval()

CNNModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=46656, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=29, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (batch_normalization_1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_normalization_2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_normalization_3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_normalization_4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [5]:
# Preprocessing applied to every cropped hand image before inference.

# Spatial size fed to the network. CNNModel.fc1 expects 64*27*27 features,
# which is what its conv/pool stack produces for a 224 x 224 input.
INPUT_SIZE = (224, 224)

# Per-channel normalisation statistics (the widely used ImageNet values).
# NOTE(review): presumably these match what was used at training time -
# confirm against the training pipeline.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.ToPILImage(),          # ndarray / tensor -> PIL image
    transforms.Resize(INPUT_SIZE),
    transforms.ToTensor(),            # PIL image -> float tensor in [0, 1], C x H x W
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

In [6]:
# Class index -> label lookup used to decode the network's argmax output.
# Indices 0-25 are the letters A-Z in order, followed by three control labels.
mapping = dict(
    enumerate(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') + ['del', 'nothing', 'space'])
)

In [9]:
# Live inference loop: grab webcam frames, crop the hand, classify it and
# overlay the predicted label. Press 'q' in an OpenCV window to stop.
cap = cv2.VideoCapture(0)
detector = htm.handDetector()
try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame was delivered (camera missing, busy or disconnected);
            # continuing would pass an invalid image to the detector.
            break

        # findHands lives in the handTracker module (not shown here). From
        # its use below it yields a cropped hand image plus the crop's
        # bounding-box corners in frame coordinates.
        hands, x_min, y_min, x_max, y_max = detector.findHands(img)

        # NOTE(review): assumes a missing hand is reported as None or an
        # empty crop - confirm against handTracker. Skipping classification
        # avoids a crash in imshow / transform in that case.
        if hands is not None and np.size(hands) > 0:
            cv2.imshow('Cropped Image', hands)

            # NOTE(review): OpenCV frames are BGR, while ToPILImage treats the
            # array as RGB. Confirm the channel order matches training.
            hand_tensor = transform(hands).unsqueeze(0)  # add batch dimension
            with torch.no_grad():
                predictions = model(hand_tensor)
            pred = torch.argmax(predictions, dim=1)

            cv2.putText(img, str(mapping[pred.item()]), (x_min, y_min),
                        cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 255), 3)
            cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        cv2.imshow('Image', img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the windows, even after an exception,
    # so the device is not left locked for the next run of this cell.
    cap.release()
    cv2.destroyAllWindows()