# In [1]:  (Jupyter cell prompt left over from the notebook export)
import cv2 #to access these functions and perform operations such as image manipulation, object detection, and feature extraction
import mediapipe as mp # provides pre-built solutions for tasks like hand tracking, facial keypoints detection, and pose estimation
from google.protobuf.json_format import MessageToDict #converts a protobuf message into a Python dictionary. This allows us to work with protobuf data in a more human-readable and manipulatable format. By importing MessageToDict, we can easily convert protobuf data to a dictionary, enabling us to process and analyze it more efficiently.

# MediaPipe helper modules used by the rest of this script.
mp_drawing = mp.solutions.drawing_utils  # drawing helpers; draw_landmarks() is used below
mp_drawing_styles = mp.solutions.drawing_styles  # predefined drawing specs (default hand landmark style is used below)
mp_hands = mp.solutions.hands  # hand-tracking solution: the Hands class and HAND_CONNECTIONS

# Open the default webcam (device index 0). Fail fast with a clear message if
# it cannot be opened, instead of crashing later on a frame that is None.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Could not open the default webcam (device index 0).")

# Hand tracker configured to report at most two hands per frame. It is created
# after the camera check so nothing needs cleaning up when the camera is missing.
mpHands = mp_hands.Hands(max_num_hands=2)

def _count_raised_fingers(points, label):
    """Return how many fingers of a single hand appear extended.

    Args:
        points: list of ``[x, y]`` pairs, indexed by MediaPipe hand landmark
            id (4 = thumb tip, 3 = thumb IP joint, 8/12/16/20 = finger tips,
            6/10/14/18 = the matching PIP joints).
        label: handedness label of the hand, ``"Left"`` or ``"Right"``.

    Returns:
        An int between 0 and 5.
    """
    count = 0

    # Thumb: compared horizontally, and the direction depends on which hand
    # it is. NOTE(review): this heuristic presumably assumes the palm faces
    # the camera -- confirm.
    thumb_tip_x, thumb_ip_x = points[4][0], points[3][0]
    if label == "Left" and thumb_tip_x > thumb_ip_x:
        count += 1
    elif label == "Right" and thumb_tip_x < thumb_ip_x:
        count += 1

    # Index, middle, ring, pinky: a finger counts as raised when its tip has
    # a smaller y (is higher in the image) than its PIP joint.
    for tip, pip in ((8, 6), (12, 10), (16, 14), (20, 18)):
        if points[tip][1] < points[pip][1]:
            count += 1

    return count


# Continuous capture/annotate/display loop. The try/finally guarantees that the
# webcam, the MediaPipe graph and the OpenCV windows are released even when the
# loop is interrupted or raises.
try:
    while True:
        # Grab the next frame; `success` is False when no frame could be read.
        success, img = cap.read()
        if not success:
            print("Failed to read a frame from the webcam; stopping.")
            break

        # Mirror the frame horizontally so the preview behaves like a mirror,
        # then convert OpenCV's BGR layout to the RGB layout MediaPipe expects.
        img = cv2.flip(img, 1)
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Run hand detection/tracking on the RGB frame.
        hands_results = mpHands.process(imgRGB)

        # Per-frame totals: fingers are summed over every detected hand, while
        # hand_label ends up holding the label of the last hand processed.
        finger_count = 0
        hand_label = None

        if hands_results.multi_hand_landmarks:
            # Landmarks and handedness entries are paired up hand by hand.
            for hand_landmarks, handedness in zip(hands_results.multi_hand_landmarks,
                                                  hands_results.multi_handedness):
                # Convert the protobuf message to a dict and read the label of
                # its first classification entry.
                label = MessageToDict(handedness)['classification'][0]['label']
                hand_label = label

                # Keep only the x and y coordinates of every landmark of this hand.
                hand_landmarks_list = [[landmark.x, landmark.y]
                                       for landmark in hand_landmarks.landmark]

                finger_count += _count_raised_fingers(hand_landmarks_list, label)

                # Draw the landmarks and their connections onto the BGR frame.
                mp_drawing.draw_landmarks(
                    img,  # image that is drawn on (modified in place)
                    hand_landmarks,  # landmarks of the current hand
                    mp_hands.HAND_CONNECTIONS,  # which landmark pairs to connect
                    mp_drawing_styles.get_default_hand_landmarks_style())  # style of the landmark points

        # Overlay the hand label (if any hand was seen) and the finger total.
        if hand_label:
            cv2.putText(img, hand_label, (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 3, (0, 0, 255), 2)

        cv2.putText(img, str(finger_count), (50, 450), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 0, 0), 10)

        cv2.imshow('Hand Gesture', img)
        # Poll the keyboard for 1 ms; pressing 'q' ends the loop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    mpHands.close()
    cv2.destroyAllWindows()