In [1]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp
import sklearn
#import tensorflow

import tensorflow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM ,Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model

from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [2]:
# Short aliases for the two MediaPipe sub-modules used throughout the notebook.
# mp_holistic supplies the Holistic model class and the POSE_/HAND_CONNECTIONS constants.
mp_holistic = mp.solutions.holistic #makes detections
# mp_drawing supplies draw_landmarks() and DrawingSpec for rendering the detections.
mp_drawing = mp.solutions.drawing_utils #draws detections

In [4]:
# Load the trained Keras classifier from an .h5 file located in the working directory.
# NOTE(review): later cells feed it arrays of shape (1, 30, 195) and index `actions`
# with the argmax of its output, so it presumably has 26 output classes — confirm.
model = load_model("model_new_final.h5")

In [5]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one BGR frame.

    Args:
        image: frame in OpenCV's BGR channel order.
        model: object exposing ``process(rgb_image)`` (a Holistic instance in
            this notebook).

    Returns:
        Tuple ``(bgr_image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model runs, then restore it.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [6]:
def draw_landmarks(image, results):
    """Draw pose and both hand skeletons from ``results`` onto ``image``.

    Args:
        image: BGR frame handed to ``mp_drawing.draw_landmarks``.
        results: detection result exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        None.
    """
    # One spec for the landmark dots and one for the connecting lines, shared by
    # all three skeletons. Colour channels are 8-bit, so the maximum is 255
    # (the previous value of 256 was out of range).
    landmark_spec = mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1)
    connection_spec = mp_drawing.DrawingSpec(color=(80,255,121), thickness=1, circle_radius=1)

    skeletons = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmark_list, connections in skeletons:
        mp_drawing.draw_landmarks(image, landmark_list, connections, landmark_spec, connection_spec)
    

In [7]:
def extract_keypoints(results):
    """Flatten pose and hand landmarks into a single 1-D feature vector.

    Only the first 69 pose values (23 landmarks * x, y, z) are kept, followed
    by 63 values for the left hand and 63 for the right hand. A missing
    landmark group is replaced by zeros of the matching size.

    Args:
        results: object with ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is falsy or has a ``.landmark``
            iterable of points with ``x``, ``y``, ``z``.

    Returns:
        numpy array: pose (truncated to 69 values) + left hand + right hand.
    """
    def _flat_xyz(group, landmark_count):
        # Missing detection -> zero placeholder of the expected length.
        if not group:
            return np.zeros(landmark_count * 3)
        return np.array([[point.x, point.y, point.z] for point in group.landmark]).flatten()

    pose_values = _flat_xyz(results.pose_landmarks, 33)[:69]
    left_hand_values = _flat_xyz(results.left_hand_landmarks, 21)
    right_hand_values = _flat_xyz(results.right_hand_landmarks, 21)

    return np.concatenate([pose_values, left_hand_values, right_hand_values])

In [8]:
# Root folder of the recorded keypoint clips: <Data_Path>/<action>/<sequence>/<frame>.npy
# Pass the components separately so os.path.join actually builds the path
# (a single pre-joined argument made the call a no-op).
Data_Path = os.path.join("test video_all", "test video")
#Data_Path = os.path.join("test video")

In [None]:
# Sign vocabulary. Predictions are decoded with actions[argmax], so the order
# matters. NOTE(review): presumably this must match the label order used when
# the model was trained — confirm.
actions = np.array([
    "Book", "Do", "Eat", "Go", "Good", "Hello", "Home", "Hungry", "I",
    "Morning", "No", "Not", "Pizza", "Place", "Read", "School", "Student",
    "Teacher", "Thank You", "This", "Tomorrow", "Want", "What", "Yes",
    "Yesterday", "You",
])


In [10]:
# Sanity check: number of entries in the sign vocabulary.
len(actions)

26

In [11]:
# Map each sign label to its integer index in `actions`.
label_map = dict(zip(actions, range(len(actions))))

In [12]:
# Number of clips (sequences) handled per action; every range(length) loop below uses it.
length = 3

In [68]:
# Sign currently being recorded / inspected; used as the sub-folder name under Data_Path.
action = "Place"

In [12]:
# Create <Data_Path>/<action>/<sequence>/ for every clip that will be recorded.
# exist_ok=True tolerates folders left over from an earlier run, while real
# failures (permissions, invalid path) are no longer silently swallowed.
for sequence in range(length):
    os.makedirs(os.path.join(Data_Path, action, str(sequence)), exist_ok=True)

In [71]:
# Record `length` clips of 30 frames each for the current `action` from the
# default webcam, saving one keypoint vector per frame to
# <Data_Path>/<action>/<sequence>/<frame_num>.npy.
cap = cv2.VideoCapture(0)
stop_requested = False   # set when the user presses 'q'

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        for sequence in range(length):

            for frame_num in range(30):

                ret, frame = cap.read()
                if not ret:
                    # No frame delivered (camera missing/busy/unplugged). Fail with a
                    # clear message instead of crashing inside cvtColor on None.
                    raise RuntimeError(
                        "Could not read a frame from the camera "
                        "(sequence {}, frame {})".format(sequence, frame_num)
                    )

                image, results = mediapipe_detection(frame, holistic)

                draw_landmarks(image, results)

                if frame_num == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting Video Number {}'.format(sequence), (15,12),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                cv2.imshow('OpenCV Feed', image)
                if frame_num == 0:
                    # Pause on the first frame so the signer can get ready.
                    cv2.waitKey(1500)

                keypoints = extract_keypoints(results)

                npy_path = os.path.join(Data_Path, action , (str(sequence)), str(frame_num))
                np.save(npy_path, keypoints)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    stop_requested = True
                    break

            # 'q' aborts the whole collection, not just the current clip.
            if stop_requested:
                break
finally:
    # Always free the camera and close the preview window, even on errors.
    cap.release()
    cv2.destroyAllWindows()

In [13]:
def normalize_keypoints(keypoints, center_keypoint, reference_distance):
    """Express keypoints relative to a centre point and divide by a scale.

    Args:
        keypoints: 1-D array whose length is a multiple of 3, laid out as
            x, y, z triples.
        center_keypoint: (x, y, z) subtracted from every triple.
        reference_distance: divisor applied to the centred coordinates; the
            caller is responsible for passing a non-zero value.

    Returns:
        Flat 1-D array with the same number of values as ``keypoints``.
    """
    xyz = keypoints.reshape(-1, 3)                      # one row per keypoint
    scaled = (xyz - center_keypoint) / reference_distance
    return scaled.flatten()

In [14]:
# Load every saved frame of the current `action` and normalise the whole
# keypoint vector: centred on the nose and scaled by the shoulder distance.
sequences = []

for sequence in range(length):
    window = []
    sequence_dir = os.path.join(Data_Path, action, str(sequence))

    for frame_num in range(30):
        frame = np.load(os.path.join(sequence_dir, f"{frame_num}.npy"))

        center_keypoint = frame[:3]        # Nose keypoint (x, y, z)
        left_shoulder = frame[33:36]       # pose landmark 11
        right_shoulder = frame[36:39]      # pose landmark 12

        # A zero distance (e.g. no pose detected) falls back to 1 to avoid dividing by zero.
        reference_distance = np.linalg.norm(left_shoulder - right_shoulder) or 1

        frame = normalize_keypoints(frame, center_keypoint, reference_distance)
        window.append(frame)

    sequences.append(window)

' sequences = []\n\n\nfor sequence in range(length):\n    window = []\n    for frame_num in range(30):\n        frame = np.load(os.path.join(Data_Path, action,str(sequence), f"{frame_num}.npy"))\n        center_keypoint = frame[0:3]  # Nose keypoint (x, y, z)\n        left_shoulder = frame[11*3:11*3+3]\n        right_shoulder = frame[12*3:12*3+3]\n        reference_distance = np.linalg.norm(left_shoulder - right_shoulder)\n        if not reference_distance:\n            reference_distance=1\n        \n\n        frame = normalize_keypoints(frame, center_keypoint, reference_distance)\n        window.append(frame)\n    sequences.append(window) '

In [15]:
import numpy as np
from scipy.ndimage import uniform_filter1d  # For temporal smoothing

# Function to calculate relative hand keypoints
def preprocess_hand_keypoints(hand_keypoints):
    """Return hand keypoints expressed relative to the first keypoint (wrist).

    Args:
        hand_keypoints: flat array of x, y, z triples for one hand.

    Returns:
        Flat array of wrist-relative coordinates, or 63 zeros when the input
        contains no non-zero value (hand not detected).
    """
    # An all-zero vector is the "no hand" placeholder: return zeros.
    if not np.any(hand_keypoints):
        return np.zeros(21 * 3)

    wrist_xyz = hand_keypoints[0:3]  # Wrist is the first keypoint in MediaPipe
    return (hand_keypoints.reshape(-1, 3) - wrist_xyz).flatten()


In [16]:
""" 


# Preprocessing: Including relative hand keypoints and temporal smoothing
sequences, labels = [], []


for sequence in range(length):
    window = []
    for frame_num in range(30):
        frame = np.load(os.path.join(Data_Path,action,str(sequence), f"{frame_num}.npy"))
        
        # Center keypoint (nose) and shoulder distance (for normalization)
        center_keypoint = frame[0:3]  # Nose keypoint (x, y, z)
        left_shoulder = frame[11*3:11*3+3]
        right_shoulder = frame[12*3:12*3+3]
        reference_distance = np.linalg.norm(left_shoulder - right_shoulder)
        if not reference_distance:
            reference_distance = 1
        
        # Normalize the pose keypoints relative to the nose
        normalized_pose = normalize_keypoints(frame[:69], center_keypoint, reference_distance)
        
        # Preprocess left hand keypoints (relative to wrist)
        left_hand = frame[69:69 + 21*3]
        relative_left_hand = preprocess_hand_keypoints(left_hand)
        
        # Preprocess right hand keypoints (relative to wrist)
        right_hand = frame[69 + 21*3:]
        relative_right_hand = preprocess_hand_keypoints(right_hand)
        
        # Concatenate normalized pose, relative left hand, and relative right hand keypoints
        full_frame = np.concatenate([normalized_pose, relative_left_hand, relative_right_hand])
        
        window.append(full_frame)
    
    # Convert the window into a numpy array for smoothing
    window = np.array(window)
    
    # Apply temporal smoothing using a moving average filter
    smoothed_window = uniform_filter1d(window, size=3, axis=0)
    
    sequences.append(smoothed_window)

 """

' \n\n\n# Preprocessing: Including relative hand keypoints and temporal smoothing\nsequences, labels = [], []\n\n\nfor sequence in range(length):\n    window = []\n    for frame_num in range(30):\n        frame = np.load(os.path.join(Data_Path,action,str(sequence), f"{frame_num}.npy"))\n        \n        # Center keypoint (nose) and shoulder distance (for normalization)\n        center_keypoint = frame[0:3]  # Nose keypoint (x, y, z)\n        left_shoulder = frame[11*3:11*3+3]\n        right_shoulder = frame[12*3:12*3+3]\n        reference_distance = np.linalg.norm(left_shoulder - right_shoulder)\n        if not reference_distance:\n            reference_distance = 1\n        \n        # Normalize the pose keypoints relative to the nose\n        normalized_pose = normalize_keypoints(frame[:69], center_keypoint, reference_distance)\n        \n        # Preprocess left hand keypoints (relative to wrist)\n        left_hand = frame[69:69 + 21*3]\n        relative_left_hand = preprocess_han

In [17]:



# Per-action evaluation: rebuild every recorded clip with relative hand keypoints
# and temporal smoothing, then print the model's top prediction for each clip.

def _preprocess_frame(frame):
    """Convert one saved keypoint vector (69 pose + 63 left + 63 right hand values)
    into the feature vector fed to the model."""
    # Center keypoint (nose) and shoulder distance (for normalization)
    center_keypoint = frame[0:3]
    left_shoulder = frame[11*3:11*3+3]
    right_shoulder = frame[12*3:12*3+3]
    reference_distance = np.linalg.norm(left_shoulder - right_shoulder)
    if not reference_distance:
        reference_distance = 1   # avoid dividing by zero when no pose was detected

    # Pose relative to the nose; each hand relative to its own wrist.
    normalized_pose = normalize_keypoints(frame[:69], center_keypoint, reference_distance)
    relative_left_hand = preprocess_hand_keypoints(frame[69:69 + 21*3])
    relative_right_hand = preprocess_hand_keypoints(frame[69 + 21*3:])

    return np.concatenate([normalized_pose, relative_left_hand, relative_right_hand])


def _load_smoothed_sequence(action_name, sequence, frames_per_sequence=30):
    """Load one clip from disk, preprocess every frame and smooth it over time."""
    window = np.array([
        _preprocess_frame(
            np.load(os.path.join(Data_Path, action_name, str(sequence), f"{frame_num}.npy"))
        )
        for frame_num in range(frames_per_sequence)
    ])
    # Moving average over 3 consecutive frames along the time axis.
    return uniform_filter1d(window, size=3, axis=0)


sequences, labels = [], []

# The loop variable is deliberately NOT called `action`: that name holds the sign
# selected in the configuration cell and must not be overwritten by this sweep.
for action_name in actions:
    sequences = [_load_smoothed_sequence(action_name, sequence) for sequence in range(length)]

    ans = ""

    for sign in range(length):
        res = model.predict(np.expand_dims(sequences[sign], axis=0))
        predicted_label = actions[np.argmax(res)]
        ans = ans + str(predicted_label).upper() + " "
        print(predicted_label , np.max(res))




Book 0.580436
Book 0.93534654
Book 0.99996626
Do 0.99912757
Do 0.9999665
Do 0.99998283
Eat 0.9564631
Eat 0.998604
Eat 0.9999187
Good 0.9508843
Go 0.9999794
Go 0.99953735
Good 0.99978393
Good 0.9959163
Good 0.9997267
Hello 0.9999902
Hello 0.9994954
Hello 0.9956357
Home 0.9999206
Home 0.99992955
Home 0.99991214
Hungry 0.99584997
Hungry 0.9980263
Hungry 0.9978848
I 0.8527386
I 0.9985366
I 0.9975884
Morning 0.86046827
Morning 0.9828462
Morning 0.97989094
No 0.9998191
No 0.9998306
No 0.9998073
Not 0.99982136
Not 0.99985206
Not 0.99960655
Pizza 0.9881457
Pizza 0.9996012
Pizza 0.8172637
Place 0.99997604
Place 0.99999917
Place 0.9999987
Read 0.99999785
Read 0.99999857
Read 0.9999969
School 0.99996567
School 0.99996364
School 0.9999684
Student 0.999813
Student 0.998547
Student 0.99992955
Teacher 0.999995
Teacher 0.99999595
Teacher 0.99999535
Thank You 0.99993086
Thank You 0.99991226
Thank You 0.99984455
This 0.99996793
This 0.99954766
This 0.9999707
Tomorrow 0.9999801
Tomorrow 0.9999664
Tomorro

In [76]:
# Show the shape of the currently loaded sequences.
print(np.shape(sequences))

(3, 30, 195)


In [77]:
# Classify each loaded clip, print the label with its score, and build the
# upper-cased, space-separated gloss string `ans`.
predicted_words = []

for sign in range(length):
    res = model.predict(np.expand_dims(sequences[sign], axis=0))
    best_index = np.argmax(res)
    predicted_words.append(str(actions[best_index]))
    print(actions[best_index] , np.max(res))

# Every word is followed by a single space, including the last one.
ans = "".join(word + " " for word in predicted_words).upper()

print(ans)

Place 0.99863917
Place 0.9999423
Place 0.99993014
PLACE PLACE PLACE 


You (99, 98, 85) -> Make sure you point at or above chest level; otherwise it can be misclassified as "This".
Yesterday (99,98,99)
Yes (99) -> Use a fast movement, as the wrist needs to be rotated.
What (99)
Want (99)
Tomorrow (99) -> Watch the reference video before signing; it helps.
This (99)
Thank You (99)
Teacher (99)
Student (89, 45 , 99) (Sometimes "Hungry", so check sign video)
School (99)
Read (99)
Place (X) ->  misclassified as Teacher 



model new-
Student -> Hungry (1/3)
Teacher -> Place (3/3)
Yesterday-> Yes(1/3)
You -> Yes (1/3)



In [111]:
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain_community.llms import Ollama
from langchain_openai import OpenAI
from langchain.chains import LLMChain


from openai import OpenAI

import os
import sys

from gtts import gTTS

# OpenAI-compatible client pointed at the Hugging Face inference endpoint.
# The token is read from the HF_TOKEN environment variable so that it is never
# stored in (or committed with) the notebook; set it before starting Jupyter.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.environ.get("HF_TOKEN", ""),
)

# Space-separated sign glosses to translate. Another sample input: "YOU WANT EAT"
raw = "STUDENT TOMORROW SCHOOL READ BOOK BOOK"


# System instruction, few-shot examples, and finally the input to translate.
messages = [
    {"role": "system", "content": "You are an AI skilled at translating raw sign language input into grammatically correct English sentences. Remember that when a word is repeated twice, it means that the word is in plural form not that it is 2 in quantity."},
    {"role": "user", "content": "Translate the following sign language into proper English sentences."},

    {"role": "assistant", "content": "Raw Input: 'HOME RAIN HEAVY'\nTranslation: 'It is raining heavily in my home area'"},

    {"role": "assistant", "content": "Raw Input: 'I TOMORROW EAT FRUIT FRUIT'\nTranslation: 'Tomorrow I will eat fruits.'"},

    {"role": "assistant", "content": "Raw Input: 'CLASS STUDENTS SIT'\nTranslation: 'There are students sitting in the class.'"},

    {"role": "assistant", "content": "Raw Input: 'I TONIGHT HOME GO LATE'\nTranslation: 'I will go home late tonight.'"},

    {"role": "assistant", "content": "Raw Input: 'YOU HUNGRY'\nTranslation: 'Are you feeling hungry?'"},

    {"role": "user", "content": f"Raw Input: {raw}"},
]


# NOTE(review): max_tokens=20 caps the reply length; long translations may be cut off.
llm = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=messages,
    max_tokens=20
)

final = str((llm.choices[0].message.content))
print(final)

# Convert the translated sentence to speech and save it as an MP3 file.
language = 'en'

myobj = gTTS(text=final, lang=language, slow=False)

myobj.save("welcome.mp3")

# Open the file with the platform's default handler ("start" exists only on Windows).
if sys.platform.startswith("win"):
    opener = "start"
elif sys.platform == "darwin":
    opener = "open"
else:
    opener = "xdg-open"

os.system(f"{opener} welcome.mp3")


 Hello, good morning, teacher.


0

Book 0.99619746
Book 0.9999957
Book 0.99999547

Do 0.98608255
Do 0.9999471
Do 0.99994814

Eat 0.99525696
Eat 0.9999442
Eat 0.9999498

Go 0.99149173
Go 0.99988604
Go 0.99987805

Good -> I

Hello 0.99971336
Hello 0.99986565
Hello 0.9998723

Home 0.99402666
Home 0.9989083
Home 0.99521095

Hungry 0.98818445
Hungry 0.9999211
Hungry 0.99994934

I 0.8913519
I 0.99081624
I 0.52419513

Morning -> Do 0.7095311
Morning 0.9977514
Morning 0.99484104

No 0.99256283
No 0.99942327
No 0.9998179

Not ->Tomorrow , You , I

Pizza 0.98818153
Pizza 0.9998086
Pizza 0.9991148

Place 0.99283713
Place 0.9999306
Place 0.9960641

Read 0.9882073
Read 0.9999994
Read 0.9999994

School 0.9965444
School 0.99957186
School 0.9998172

Student 0.98731625
Student 0.99818146
Student 0.535804

Teacher 0.9144656
Teacher 0.9998091
Teacher 0.9999212

Thank You 0.9967784
Thank You 0.99974054
Thank You 0.99701047

This 0.99154836
This 0.98547655
This 0.89454705

Tomorrow 0.8973631
Tomorrow 0.90961677
Tomorrow 0.89528126

Want 0.9954282
Want 0.98475176
Want 0.99558383

What 0.99124473
What 0.9941282
What 0.99150753

Yes 0.98449403
Yes 0.9991202
Yes 0.9855783

Yesterday 0.7214897
Yes 0.60461485
Yesterday 0.9743711

You 0.9918717
You 0.9999629
You 0.9998995

