In [1]:
import cv2
import mediapipe as mp
import matplotlib.pyplot as plt
import numpy as np
import os
import time

In [2]:
mp_holistic= mp.solutions.holistic #Holistic model
mp_drawing= mp.solutions.drawing_utils #Drawing Skeleton in feed

In [3]:
# we are use the model to get holistic skeleton from each frame or image
def mediapipe_holistic_detection(image,model):
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB) # Convert BGR(cv2 feed) to RGB
    image.flags.writeable= False
    results = model.process(image)
    image.flags.writeable= True
    image=cv2.cvtColor(image,cv2.COLOR_RGB2BGR) # Convert RGB to BGR
    return image, results

In [4]:
mp_face_mesh = mp.solutions.face_mesh 
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)# Draw pose landmarks
    mp_drawing.draw_landmarks(image, results.face_landmarks,mp_face_mesh.FACEMESH_TESSELATION)# Draw face landmarks
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)# Draw hand landmarks
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [5]:
mp_face_mesh = mp.solutions.face_mesh 
DrawingSpec = mp_drawing.DrawingSpec
def draw_design_landmarks(image, results):
    pose_style = DrawingSpec(color=(255, 255, 255), thickness=3, circle_radius=2)  # White lines, blue dots
    face_style = DrawingSpec(color=(255, 0, 0), thickness=1, circle_radius=2)  # Smaller blue dots for face
    hand_style = DrawingSpec(color=(255, 255, 255), thickness=3, circle_radius=2)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,landmark_drawing_spec=pose_style)# Draw pose landmarks
    mp_drawing.draw_landmarks(image, results.face_landmarks,mp_face_mesh.FACEMESH_TESSELATION,landmark_drawing_spec=face_style)# Draw face landmarks
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,landmark_drawing_spec=hand_style)# Draw hand landmarks
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,landmark_drawing_spec=hand_style)

In [6]:
def extract_keypoints(results):
    #pose
    pose=[] #list
    if results.pose_landmarks:
            for res in results.pose_landmarks.landmark: #extracting each list from landmark list 
                temp = np.array([res.x,res.y,res.z,res.visibility])# coverting each sub list in to a np array
                pose.append(temp)# appending all np array to pose list
            pose=np.array(pose) # list to np array
            pose = pose.flatten() # from (33,4) to (132)
    else:
        pose= np.zeros(33*4) # handelling the empty values

    #face
    face=[] 
    if results.face_landmarks:
            for res in results.face_landmarks.landmark:
                temp = np.array([res.x,res.y,res.z])
                face.append(temp)
            face = np.array(face)
            face = face.flatten() 
    else:
        face= np.zeros(468*3) 

    #left hand
    left_hand=[] 
    if results.left_hand_landmarks:
            for res in results.left_hand_landmarks.landmark:
                temp = np.array([res.x,res.y,res.z])
                left_hand.append(temp)
            left_hand = np.array(left_hand)
            left_hand = left_hand.flatten() 
    else:
        left_hand= np.zeros(21*3) 
    
    #right hand
    right_hand=[] 
    if results.right_hand_landmarks:
            for res in results.right_hand_landmarks.landmark:
                temp = np.array([res.x,res.y,res.z])
                right_hand.append(temp)
            right_hand = np.array(right_hand)
            right_hand = right_hand.flatten() 
    else:
        right_hand= np.zeros(21*3) 


    return np.concatenate([pose,face, left_hand, right_hand])

In [7]:
keypoints_data = os.path.join('Keypoints_Data')

# Create the folder if it doesn't exist
if not os.path.exists(keypoints_data):
    os.makedirs(keypoints_data)

# Actions
actions = ['Hello', 'Thanks', 'I_Love_You', 'Yes', 'No', 'Help', 'Please']
no_sequences = 40  # Number of videos
sequence_length = 30  # Frames per video

In [9]:
for action in actions:
    for sequence in range(no_sequences):
        folder_path = os.path.join(keypoints_data, action, str(sequence))
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

In [11]:
cap = cv2.VideoCapture(0)

#setting the size of the feed video
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1000)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)
       
# set up the holistic model with the function
with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic :
    for action in actions:
        for sequence in range(no_sequences):
            for frame_no in range(sequence_length):
                ret, frame= cap.read()# read frame
                image,results=mediapipe_holistic_detection(frame,holistic) #holistic detection
                draw_landmarks(image,results)# drawing landmarks

                # Wait logic
                if frame_no == 0:
                    cv2.putText(image, 'Start Collection', (20, 40), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
                    cv2.putText(image, f'Collecting frames for {action} Video no {sequence}', (20, 80), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(image, f'Collecting frames for {action} Video no {sequence}', (20, 40), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

                #export keypoints
                keypoints = extract_keypoints(results)
                npy_path= os.path.join(keypoints_data,action,str(sequence),str(frame_no))
                np.save(npy_path,keypoints)
                
                cv2.imshow("OpenCV", image)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        print(f"Completed sequence {sequence} for action {action}. Waiting for 5 seconds...")
        cv2.putText(image, 'Sequence Completed. Waiting...', (20, 120),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
        cv2.imshow("OpenCV", image)
        cv2.waitKey(5000)
        
cap.release()
cv2.destroyAllWindows() 

Completed sequence 39 for action Hello. Waiting for 5 seconds...
Completed sequence 39 for action Thanks. Waiting for 5 seconds...
Completed sequence 39 for action I_Love_You. Waiting for 5 seconds...
Completed sequence 39 for action Yes. Waiting for 5 seconds...
Completed sequence 39 for action No. Waiting for 5 seconds...
Completed sequence 39 for action Help. Waiting for 5 seconds...
Completed sequence 39 for action Please. Waiting for 5 seconds...


In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [9]:
label_map = {} 
for num, label in enumerate(actions):#enumarate provides both index & item in the list
    label_map[label] = num # Each action is added to the dictionary as a key & value = num


In [10]:
sequences, labels= [], []
for action in actions:
    for sequence in range(no_sequences):
        window=[]
        for frame_no in range(sequence_length):
            res= np.load(os.path.join(keypoints_data,action,str(sequence),"{}.npy".format(frame_no)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [11]:
X=np.array(sequences)
y= to_categorical(labels).astype(int)

In [25]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.05)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, BatchNormalization, Dense, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.callbacks import TensorBoard

In [27]:
log_dir=os.path.join('Logs')
tb_callback= TensorBoard(log_dir=log_dir)

In [42]:
model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True), input_shape=(30, 1662)),
    Dropout(0.3),
    BatchNormalization(),
    
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(len(actions), activation='softmax')
])


  super().__init__(**kwargs)


In [29]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

In [17]:
model.fit(X_train,y_train,epochs=35,callbacks=[tb_callback])

Epoch 1/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 189ms/step - categorical_accuracy: 0.3257 - loss: 1.6952
Epoch 2/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 188ms/step - categorical_accuracy: 0.5538 - loss: 1.1789
Epoch 3/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 186ms/step - categorical_accuracy: 0.7139 - loss: 0.8095
Epoch 4/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 187ms/step - categorical_accuracy: 0.7848 - loss: 0.6430
Epoch 5/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 184ms/step - categorical_accuracy: 0.8245 - loss: 0.5009
Epoch 6/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 180ms/step - categorical_accuracy: 0.8489 - loss: 0.4114
Epoch 7/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 165ms/step - categorical_accuracy: 0.8792 - loss: 0.3602
Epoch 8/35
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 162ms/step - cat

<keras.src.callbacks.history.History at 0x2a0fdc76810>

In [24]:
del model

In [30]:
model.load_weights('my_model.keras')

In [31]:
res=model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [32]:
print(actions[np.argmax(res[1])])
print(actions[np.argmax(res[6])])

Yes
Help


In [33]:
print(actions[np.argmax(y_test[1])])
print(actions[np.argmax(y_test[6])])

Yes
Help


In [34]:
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score

res= np.argmax(res, axis=1).tolist()  # Convert predicted results to a list of indices
true = np.argmax(y_test, axis=1).tolist() 

In [35]:
multilabel_confusion_matrix(res,true)

array([[[12,  0],
        [ 0,  2]],

       [[13,  1],
        [ 0,  0]],

       [[13,  0],
        [ 0,  1]],

       [[11,  0],
        [ 0,  3]],

       [[13,  0],
        [ 0,  1]],

       [[10,  0],
        [ 0,  4]],

       [[11,  0],
        [ 1,  2]]], dtype=int64)

In [36]:
accuracy_score(res,true)

0.9285714285714286

In [25]:
model.save('my_modell.keras')

In [41]:
sequence = []
sentence = []
predictions = []
threshold = 0.5

colors = [
    (255, 0, 0), (0, 255, 0), (0, 0, 255), 
    (255, 255, 0), (255, 0, 255), (0, 255, 255), 
    (128, 128, 128)
]

cap = cv2.VideoCapture(0)

#setting the size of the feed video
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1000)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)

with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret: 
            print("Error: Unable to read from camera")
            break

        image, results = mediapipe_holistic_detection(frame, holistic)
        draw_landmarks(image, results)

        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))

            
            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) == 0 or actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5:
                sentence = sentence[-5:]

            for idx, prob in enumerate(res):
                cv2.rectangle(image, (0, 60 + idx * 40), (int(prob * 200), 90 + idx * 40), colors[idx % len(colors)], -1)
                cv2.putText(
                image, f"{actions[idx]}: {prob:.2f}",
                (10, 85 + idx * 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA
                )

            
        cv2.putText(
            image, ' '.join(sentence),
            (image.shape[1] // 2 - len(' '.join(sentence)) * 7, image.shape[0] - 20),
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA
        )

        # Show the feed
        cv2.imshow('Sign Language Detection', image)

        # Graceful exit
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42