# Volume Control using OpenCV and MediaPipe

This code captures video from a camera, detects hand landmarks, and changes the computer's volume based on a hand gesture. It uses the OpenCV, MediaPipe, ctypes, comtypes and pycaw libraries. The volume follows the distance between the thumb and index fingertips while the hand is held in the upper-left quarter of the mirrored image: bringing the two fingertips together lowers the volume, and spreading them apart raises it. The current volume level is displayed on the screen as a bar and a percentage.

## Documentation


The first thing we do is import the necessary libraries (sys, cv2, mediapipe, math, ctypes, comtypes, pycaw and numpy) using `import` statements.

In [None]:
from ctypes import cast, POINTER
from math import hypot
from sys import argv

import cv2
import mediapipe as mp
import numpy as np
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

We then define a boolean variable called `DEBUG` which will be used later to print some debug information if the value is true.

In [None]:
# Enable debug output when the first command-line argument is -d or --debug.
# Without any argument DEBUG is False.
DEBUG = len(argv) > 1 and argv[1] in ("-d", "--debug")

The program then creates a capture object using `cv2.VideoCapture(0)`, which opens the camera at device index 0 (normally the default webcam).

In [None]:
cap = cv2.VideoCapture(0)  # Open the capture device at index 0

Next, we set the resolution and frame rate of the video feed using the following code:

In [None]:
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)  # Request a frame width of 1280
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)  # Request a frame height of 720
cap.set(cv2.CAP_PROP_FPS, 30)  # Request 30 frames per second

We then initialize the MediaPipe hands object and configure its options using `mp_hands = mp.solutions.hands` and `hands = mp_hands.Hands()`.

In [None]:
mp_hands = mp.solutions.hands  # MediaPipe hands solution module
hands = mp_hands.Hands()  # Hand detector created with its default options
mpDraw = mp.solutions.drawing_utils  # Helpers used to draw landmarks on the frame

In this step, we access the speaker through the pycaw library.

In [None]:
# Access the speaker through the pycaw library
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
volbar = 400  # y-coordinate of the top of the filled volume bar; 400 draws an empty bar
volper = 0  # volume percentage shown on screen

We get some important configuration values for audio volume from our speaker interface using `volMin, volMax = volume.GetVolumeRange()[:2]`.

In [None]:
# First two values reported by GetVolumeRange(): the minimum and maximum volume level
volMin, volMax = volume.GetVolumeRange()[:2]

The remaining steps run inside a `while cap.isOpened():` loop. On each iteration we first read a frame from the camera and convert it from BGR to RGB.

In [None]:
# NOTE(review): `success` is not checked here, so a failed read presumably
# reaches cvtColor with an invalid frame -- confirm and guard in the loop.
success, frame = cap.read()  # Read the next frame from the capture device
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB

We flip the image horizontally to make it easier for us to interact with the right hand.

In [None]:
# Flip the image horizontally (flip code 1)
frame = cv2.flip(frame, 1)

We use `hands.process(frame)` to process the frame and detect hand landmarks. If landmarks are available, we proceed to check for specific gestures.

In [None]:
# Run hand detection on the current frame
results = hands.process(frame)  # holds the detected hand landmarks, if any

We collect the pixel coordinates of every hand landmark and then take the thumb tip (landmark 4) and the index fingertip (landmark 8). If both lie in the upper-left quarter of the frame, the distance between them is mapped onto the speaker's volume range and applied using `volume.SetMasterVolumeLevel(vol, None)`.

In [None]:
lmList = []  # empty list
    if results.multi_hand_landmarks:  # list of all hands detected.
        
        # By accessing the list, we can get the information of each hand's corresponding flag bit
        for hand_landmarks in results.multi_hand_landmarks:
                        
            # adding counter and returning it
            for id, lm in enumerate(hand_landmarks.landmark):
                # Get finger joint points
                h, w, _ = frame.shape
                cx, cy = int(lm.x * w), int(lm.y * h)
                # adding to the empty list 'lmList'
                lmList.append([id, cx, cy])
            
            if DEBUG:
                print(lmList)        
                mpDraw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    if lmList != []:
        
        # if the index finger and middle finger are in the middle left of the screen
        # increase the volume
        # getting the value at a point as x, y
        x1, y1 = lmList[4][1], lmList[4][2]  # thumb
        x2, y2 = lmList[8][1], lmList[8][2]  # index finger

        if x1 < frame.shape[1] // 2 and x2 < frame.shape[1] // 2 and y1 < frame.shape[0] // 2 and y2 < frame.shape[0] // 2:

            # creating circle at the tips of thumb and index finger
            # image #fingers #radius #rgb
            cv2.circle(frame, (x1, y1), 13, (255, 0, 0), cv2.FILLED)
            # image #fingers #radius #rgb
            cv2.circle(frame, (x2, y2), 13, (255, 0, 0), cv2.FILLED)
            # create a line b/w tips of index finger and thumb
            cv2.line(frame, (x1, y1), (x2, y2), (255, 0, 0), 3)

            length = hypot(x2 - x1, y2 - y1)  
            # distance b/w tips using hypotenuse
            # from numpy we find our length by converting hand range in terms of volume range ie b/w -63.5 to 0
            vol = np.interp(length, [30, 350], [volMin, volMax])
            volbar = np.interp(length, [30, 350], [400, 150])
            volper = np.interp(length, [30, 350], [0, 100])

            volume.SetMasterVolumeLevel(vol, None)

We create a rectangle and fill it up to the current volume level on the frame.

In [None]:
# Fingertip distance range used for the mapping: 30 - 350
    # Volume range noted by the author: -63.5 - 0.0 (presumably device-specific; the real range comes from GetVolumeRange())
    # Outline of the volume bar: image, first corner, opposite corner, colour, line thickness
    cv2.rectangle(frame, (50, 150), (85, 400), (0, 0, 255), 4)
    # Filled part of the bar, from y = volbar down to y = 400
    cv2.rectangle(frame, (50, int(volbar)), (85, 400),
                (0, 0, 255), cv2.FILLED)
    # Volume percentage: image, text, position, font, font scale, colour, thickness
    cv2.putText(frame, f"{int(volper)}%", (10, 40),
                cv2.FONT_ITALIC, 1, (0, 255, 98), 3)

Finally, we display the video output on the screen.

In [None]:
cv2.imshow('Image', frame)  # Show the current frame in a window titled 'Image'

The loop runs for as long as the capture device is open. The user can stop it by pressing the 'q' key on the keyboard.

In [None]:
# Poll the keyboard for 1 ms on every iteration and leave the loop when 'q' is pressed
if cv2.waitKey(1) & 0xff == ord('q'):
        break


After the while loop ends, we release the camera resource and destroy all windows using `cap.release()` and `cv2.destroyAllWindows()`, respectively.


In [None]:
cap.release()  # Release the capture device
cv2.destroyAllWindows()  # Close the OpenCV windows

## Complete Python Code

In [None]:
from sys import argv
import cv2
import mediapipe as mp
from math import hypot
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
import numpy as np

# Enable debug output when the first command-line argument is -d or --debug.
# The length check keeps the script from raising IndexError when it is
# started without any arguments.
DEBUG = len(argv) > 1 and argv[1] in ('-d', '--debug')

cap = cv2.VideoCapture(0)  # Open the capture device at index 0
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)  # Request a frame width of 1280
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)  # Request a frame height of 720
cap.set(cv2.CAP_PROP_FPS, 30)  # Request 30 frames per second

mp_hands = mp.solutions.hands  # MediaPipe hands solution module
hands = mp_hands.Hands()  # Hand detector created with its default options
mpDraw = mp.solutions.drawing_utils

# Access the speaker through the pycaw library
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
volbar = 400  # y-coordinate of the top of the filled bar; 400 draws an empty bar
volper = 0  # volume percentage shown on screen

# Minimum and maximum volume level reported by the speaker
volMin, volMax = volume.GetVolumeRange()[:2]

try:
    # Run the loop while the capture device is open
    while cap.isOpened():

        success, frame = cap.read()
        if not success:
            # No frame was delivered; stop instead of handing an invalid
            # frame to the OpenCV calls below.
            break

        # Mirror the image
        frame = cv2.flip(frame, 1)
        # `frame` stays in the camera's BGR order because it is what
        # cv2.imshow displays; hand detection gets a separate RGB copy.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Collection of gesture information
        results = hands.process(rgb_frame)

        frame_h, frame_w, _ = frame.shape
        half_w, half_h = frame_w // 2, frame_h // 2

        lmList = []  # [landmark index, x, y] entries in pixels
        if results.multi_hand_landmarks:  # list of all hands detected

            for hand_landmarks in results.multi_hand_landmarks:

                # Get the landmarks for the index and middle fingers
                index_finger = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
                middle_finger = hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]

                # Landmark positions are fractions of the frame size, so
                # scale them to pixels before comparing them with the
                # pixel half-width / half-height.
                fingers_in_region = (
                    int(index_finger.x * frame_w) < half_w
                    and int(middle_finger.x * frame_w) < half_w
                    and int(index_finger.y * frame_h) < half_h
                    and int(middle_finger.y * frame_h) < half_h
                )
                if fingers_in_region:
                    print("Fingers location is middle left of the screen. Increase volume.")

                    # Store every landmark of this hand in pixel coordinates
                    lmList.extend(
                        [idx, int(lm.x * frame_w), int(lm.y * frame_h)]
                        for idx, lm in enumerate(hand_landmarks.landmark)
                    )

                if DEBUG:
                    print(lmList)
                    mpDraw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        if lmList:

            # Pixel coordinates of the two fingertips that drive the volume
            x1, y1 = lmList[4][1], lmList[4][2]  # thumb
            x2, y2 = lmList[8][1], lmList[8][2]  # index finger

            # Only react while both fingertips are in the upper-left quarter
            if x1 < half_w and x2 < half_w and y1 < half_h and y2 < half_h:

                # Mark both fingertips with a filled circle
                # (image, centre, radius, colour, thickness)
                cv2.circle(frame, (x1, y1), 13, (255, 0, 0), cv2.FILLED)
                cv2.circle(frame, (x2, y2), 13, (255, 0, 0), cv2.FILLED)
                # Draw a line between the tips of the thumb and index finger
                cv2.line(frame, (x1, y1), (x2, y2), (255, 0, 0), 3)

                # Distance between the two fingertips in pixels
                length = hypot(x2 - x1, y2 - y1)
                # Map the distance range 30..350 onto the speaker's volume
                # range, the top of the on-screen bar (400 = empty,
                # 150 = full) and a percentage; np.interp clamps distances
                # outside 30..350.
                vol = np.interp(length, [30, 350], [volMin, volMax])
                volbar = np.interp(length, [30, 350], [400, 150])
                volper = np.interp(length, [30, 350], [0, 100])

                volume.SetMasterVolumeLevel(vol, None)

        # Outline of the volume bar:
        # image, first corner, opposite corner, colour, line thickness
        cv2.rectangle(frame, (50, 150), (85, 400), (0, 0, 255), 4)
        # Filled part of the bar, from y = volbar down to y = 400
        cv2.rectangle(frame, (50, int(volbar)), (85, 400),
                      (0, 0, 255), cv2.FILLED)
        # Volume percentage:
        # image, text, position, font, font scale, colour, thickness
        cv2.putText(frame, f"{int(volper)}%", (10, 40),
                    cv2.FONT_ITALIC, 1, (0, 255, 98), 3)

        cv2.imshow('Image', frame)  # Show the video
        if cv2.waitKey(1) & 0xff == ord('q'):  # Leave the loop when 'q' is pressed
            break
finally:
    # Release the camera and close the window even if the loop raised
    cap.release()
    cv2.destroyAllWindows()
