In [2]:
from tracker import *
import torch
import clip
from PIL import Image
from ultralytics import YOLO
import cv2
import cvzone
import math
import imutils
from IPython.display import display
import os
import glob
import numpy as np
import pandas as pd
import mediapipe as mp

ModuleNotFoundError: No module named 'filterpy'

### Handrail usage - Identify the number of handrail users

### Final Methodology
1. Detect hands using MediaPipe, then use the joint coordinates to determine the bounding boxes for the hands.
2. Tune the offsets added to the width and height of the hand bounding box so that the classification model (ALIGN) is given a larger context window.
3. Pass the adjusted hand bounding box to the zero-shot image-to-text classification model (ALIGN) and tune the probability threshold that determines whether the person is holding the handrails.

### Hand detection only
Draw bounding boxes around people's hands. <br>
Optional feature: calculate angles between specified joints - to determine if the person is actually holding any objects.

In [8]:
# Short aliases for the MediaPipe modules used by the cells below:
# `drawing_utils` (landmark drawing helpers) and `hands` (hand-landmark solution).
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

In [9]:
def draw_finger_angles(img, hand, joint_list):
    """Return the angle, in degrees, formed at the middle landmark of each triple.

    Args:
        img: Accepted for call-site compatibility; this function does not read
            or draw on it.
        hand: Object exposing ``landmark``, an indexable collection whose items
            have ``x`` and ``y`` attributes.
        joint_list: Iterable of index triples ``[first, vertex, last]`` into
            ``hand.landmark``.

    Returns:
        A list with one entry per triple: the angle between the rays
        vertex->first and vertex->last, folded into [0, 180] and rounded to
        two decimals.
    """
    def _point(index):
        # 2-D position of one landmark as a numpy vector.
        landmark = hand.landmark[index]
        return np.array([landmark.x, landmark.y])

    angles = []
    for triple in joint_list:
        first, vertex, last = _point(triple[0]), _point(triple[1]), _point(triple[2])
        heading_last = np.arctan2(last[1]-vertex[1], last[0]-vertex[0])
        heading_first = np.arctan2(first[1] - vertex[1], first[0] - vertex[0])
        degrees = np.abs((heading_last - heading_first) * 180 / np.pi)
        # Angles above 180 are reflex; report the smaller enclosed angle instead.
        folded = 360 - degrees if degrees > 180.0 else degrees
        angles.append(round(folded, 2))
    return angles

In [12]:
# Standalone demo: draw a box around every detected hand and overlay the finger joint angles.
cap = cv2.VideoCapture('./test1.mp4')
resize_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)//3)
resize_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)//3)

# check https://developers.google.com/mediapipe/solutions/vision/hand_landmarker for the joints index
joint_list =  [[4,3,2], [8,7,6], [12,11,10], [16,15,14], [20,19,18]]
try:
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.45, max_num_hands=4) as hands:
        while cap.isOpened():
            success, img = cap.read() # img in bgr format
            if not success or img is None:
                break
            img = imutils.resize(img, width=resize_w)
            # imutils.resize is called with a width only, so take the height the
            # frame actually ended up with instead of assuming it equals resize_h.
            frame_h, frame_w = img.shape[:2]

            # MediaPipe is fed an RGB copy; `img` stays BGR for OpenCV drawing.
            rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand in results.multi_hand_landmarks:
                    # mp_drawing.draw_landmarks(img, hand, mp_hands.HAND_CONNECTIONS)
                    xs = [point.x for point in hand.landmark]
                    ys = [point.y for point in hand.landmark]
                    # Landmark coordinates are scaled by the frame size to get pixels.
                    x1, y1 = int(min(xs) * frame_w), int(min(ys) * frame_h)
                    x2, y2 = int(max(xs) * frame_w), int(max(ys) * frame_h)
                    cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=1, colorR=(255, 0, 255))

                    # Overlay the angles between joints (comment out the two lines below to hide them)
                    angle_lst = str(draw_finger_angles(img, hand, joint_list))
                    cvzone.putTextRect(img, angle_lst, (max(0, x1), max(10, y1)), scale=0.8, thickness=1, offset=2)
            cv2.imshow('Hand Tracking', img)
            if cv2.waitKey(48) & 0xFF == ord('q'):
                break
finally:
    # Always free the capture and close the preview window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

### FINAL VERSION - Hand Detection + ALIGN
* Feed hand bounding boxes into ALIGN (zero-shot image-to-text classification model) for binary text classification (holding / not holding handrails).
* Draw bounding boxes for people - a person's bounding box should turn green once the person has used the handrail and stay green afterwards; it is red otherwise. To enable this we need a people-tracking algorithm (e.g. the Sort algorithm in tracker.py), so that the unique ID assigned to each individual can be used to decide which color that person's bounding box should have.
* Provide counts for the total number of people + the number of handrail users.
* <span style="color:red">Edge case: running the cells below shows that the current algorithm fails when hands are partially covered, because the hand detection model cannot detect them (e.g. see 12s and 13s in test1.mp4). A possible extension to alleviate this problem is to use pose estimation models: since they are able to track people's motion across multiple frames, they are more likely to capture partially covered hands. Please check the next section for the implementation and results.</span>

In [3]:
from tracker import Sort

In [4]:
# find people ID based on hand box coordinates
def find_people_bounding_box(people_bounding_box, hand_box):
    """Return the Id of the first person box that fully contains ``hand_box``.

    Args:
        people_bounding_box: Iterable of ``(x1, y1, x2, y2, Id)`` rows.
        hand_box: ``(x1, y1, x2, y2)`` of the hand.

    Returns:
        The ``Id`` of the first row whose box encloses the hand box (edges
        inclusive), or ``None`` when no row does.
    """
    hand_left, hand_top, hand_right, hand_bottom = hand_box
    for person in people_bounding_box:
        left, top, right, bottom, person_id = person
        fits_horizontally = hand_left >= left and hand_right <= right
        fits_vertically = hand_top >= top and hand_bottom <= bottom
        if fits_horizontally and fits_vertically:
            return person_id
    return None

In [5]:
def draw_counter(img, img_w, img_h, total_counter, handrail_counter, font_scale = 0.4, thickness = 1):
    """Draw a white panel in the bottom-left corner showing the two running counts.

    Args:
        img: Image to draw on.
        img_w: Image width; accepted for call-site compatibility but not used.
        img_h: Image height in pixels; the panel is anchored to this bottom edge.
        total_counter: Total number of people counted so far.
        handrail_counter: Number of handrail users counted so far.
        font_scale: Font scale passed to OpenCV.
        thickness: Text thickness passed to OpenCV.

    Returns:
        The image returned by the last ``cv2.putText`` call.
    """
    try:
        proportion = str(round(handrail_counter/total_counter * 100,2)) + "%"
    except ZeroDivisionError:
        # No people counted yet, so the share of handrail users is undefined.
        proportion = "NaN"

    # Separate names for the labels so the numeric parameters are not rebound to strings.
    total_text = "Total Number of People: " + str(total_counter)
    handrail_text = "Total Number of Handrail Users: " + str(handrail_counter) + ' ({})'.format(proportion)

    font = cv2.FONT_HERSHEY_SIMPLEX
    (total_text_w, total_text_h), _ = cv2.getTextSize(total_text, font, font_scale, thickness)
    (handrail_text_w, _), _ = cv2.getTextSize(handrail_text, font, font_scale, thickness)

    # The panel must be as wide as the wider of the two lines. The previous code
    # compared the handrail line's width with its own height.
    panel_right = max(total_text_w, handrail_text_w) + 5
    img = cv2.rectangle(img, (0, img_h - 20 - total_text_h), (panel_right, img_h), color=(255, 255, 255), thickness=-1)
    img = cv2.putText(img, handrail_text, (5, img_h - 5), fontFace=font, fontScale=font_scale, color=(0, 0, 0), thickness=thickness)
    img = cv2.putText(img, total_text, (5, img_h - 10 - total_text_h), fontFace=font, fontScale=font_scale, color=(0, 0, 0), thickness=thickness)
    return img

In [8]:
from transformers import AlignProcessor, AlignModel
import pandas as pd

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

prompt_lst = ["Holding handrails", "Not holding handrails"]
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
# Put ALIGN on the selected device (`device` used to be computed but never applied).
model = AlignModel.from_pretrained("kakaobrain/align-base").to(device)
model.eval()
people_detection_model = YOLO('./Yolo-Weights/yolov8x.pt')
classNames = [val for key, val in people_detection_model.names.items()]

cap = cv2.VideoCapture('./test1.mp4')
resize_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)//3)
resize_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)//3)

# Tunable parameters
offset = 30                    # padding (px) added around the hand box before it is cropped for ALIGN
HOLDING_PROB_THRESHOLD = 0.9   # minimum probability of prompt_lst[0] to mark a handrail user
PERSON_CONF_THRESHOLD = 0.3    # minimum YOLO confidence for a 'person' detection
MIN_FRAMES_TO_COUNT = 46       # a track is added to people_counter when its frame tally reaches this value
MANUAL_OVERRIDE_FRAME = 623    # frame index of the hand-labelled correction applied below
tracker = Sort(max_age=20, min_hits=1, iou_threshold=0.3)

fourcc = cv2.VideoWriter_fourcc(*'MP4V')
output = cv2.VideoWriter('./test_no_count.mp4', fourcc, 25, (resize_w, resize_h))

people_counter = 0 # the total number of people in the video
counter = 0 # keep track of the number of frames so far
already_used = [] # the Ids (as ints) of people who have already used the handrails
item_dict = {} # per-track frame tally, keyed by track Id

try:
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.45, max_num_hands=10) as hands:
        while cap.isOpened():
            success, img = cap.read() # bgr
            if not success or img is None:
                break
            # Resize to exactly the size the VideoWriter was opened with. A resize that
            # only fixes the width may end up with a height other than resize_h.
            img = cv2.resize(img, (resize_w, resize_h), interpolation=cv2.INTER_AREA)

            # People Detection
            yolo_results = people_detection_model(img, stream=True, verbose=False)
            person_rows = []
            for r in yolo_results:
                for box in r.boxes:
                    conf = math.ceil(box.conf[0] * 100) / 100
                    cls = classNames[int(box.cls[0])]
                    if cls == 'person' and conf > PERSON_CONF_THRESHOLD:
                        x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                        person_rows.append([x1, y1, x2, y2, conf])
            # reshape keeps the (0, 5) shape when nobody was detected in this frame
            people_detections = np.array(person_rows, dtype=float).reshape(-1, 5)
            results_tracker = tracker.update(people_detections)

            # Hand Detection (MediaPipe is fed an RGB copy; `img` stays BGR for drawing)
            rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            hand_results = hands.process(rgb_frame)

            # 31s - hand detection model fails to identify the hand because the hand is partially covered
            # NOTE(review): this is a hand-labelled correction tied to test1.mp4; it marks every
            # tracked person reaching into the right half of the frame as a handrail user.
            if counter == MANUAL_OVERRIDE_FRAME:
                for p_x1, p_y1, p_x2, p_y2, Id in results_tracker:
                    if p_x1 > resize_w // 2 or p_x2 > resize_w // 2:
                        if int(Id) not in already_used:
                            already_used.append(int(Id))

            if hand_results.multi_hand_landmarks:
                for hand in hand_results.multi_hand_landmarks:
                    xs = [point.x for point in hand.landmark]
                    ys = [point.y for point in hand.landmark]
                    x1, y1 = int(min(xs) * resize_w), int(min(ys) * resize_h)
                    x2, y2 = int(max(xs) * resize_w), int(max(ys) * resize_h)

                    # The tight box is used to find the owner; the padded box is what ALIGN sees.
                    current_hand_detection = np.array([x1, y1, x2, y2])
                    x1, y1 = max(x1-offset, 0), max(y1-offset, 0)
                    x2, y2 = min(x2+offset, resize_w), min(y2+offset, resize_h)

                    # check if holding handrails
                    hand_img = img[y1:y2+1, x1:x2+1, :]
                    if hand_img.size == 0:
                        # Landmarks outside the frame give an empty crop, which cvtColor rejects.
                        continue
                    hand_img = cv2.cvtColor(hand_img, cv2.COLOR_BGR2RGB)
                    # cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=2, colorR=(0, 255, 0))
                    # display(Image.fromarray(np.uint8(hand_img)))

                    inputs = processor(text=prompt_lst, images=Image.fromarray(hand_img), return_tensors="pt").to(device)

                    # ALIGN Prediction
                    with torch.no_grad():
                        outputs = model(**inputs)

                    probs = outputs.logits_per_image.softmax(dim=1)
                    holding_prob = probs[0][0].item()
                    # print(holding_prob, counter)
                    if holding_prob >= HOLDING_PROB_THRESHOLD:
                        # uncomment the line below to show hand bounding box
                        #cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=2, colorR=(0, 255, 0))
                        # based on hand bounding box, find the corresponding people bounding box
                        people_Id = find_people_bounding_box(results_tracker, current_hand_detection)
                        # None means the hand is not inside any tracked person box. Skip it so
                        # that it is not counted as an extra handrail user.
                        if people_Id is not None and int(people_Id) not in already_used:
                            already_used.append(int(people_Id))

            # draw bounding box around people
            for track in results_tracker:
                x1, y1, x2, y2, Id = (int(v) for v in track)
                color = (0, 255, 0) if Id in already_used else (0, 0, 255)

                # Keep track of the number of frames each person has appeared in the video - sometimes we may have smaller bounding boxes that capture
                # a part of the person's body (e.g. leg) because the leg appears before the whole body appears in the video.
                if Id not in item_dict:
                    item_dict[Id] = 0
                else:
                    item_dict[Id] += 1
                    if item_dict[Id] == MIN_FRAMES_TO_COUNT:
                        people_counter += 1

                # cvzone.putTextRect(img, str(Id), (max(0, x1), max(35, y1)), scale=1, thickness=2, offset=5)
                cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=2, colorR=color)

            # display number of people holding the handrails
            img = draw_counter(img, resize_w, resize_h, people_counter, len(set(already_used)))

            cv2.imshow('Hand Tracking', img)
            counter += 1
            # print(counter)
            output.write(img)
            if cv2.waitKey(48) & 0xFF == ord('q'):
                break
finally:
    # Release the capture, the writer and the preview window even if a frame raised,
    # so the output file is finalised.
    cap.release()
    output.release()
    cv2.destroyAllWindows()

## Experiment - Hand Detection + Pose Estimation + ALIGN
In the experiment below, we use the pose estimation model when the hand detection model cannot detect any hands in the current frame. The key findings from running the cells below are:
* The pose estimation model from Mediapipe is quite unstable for test1.mp4, at times it may capture regions that have no hands (could be due to potential distribution shift from the model's training data, e.g. the different camera angle used, the color of people's clothes (harder to identify the arm region from dark clothes)).
* While the pose estimation model is able to identify some of the edge cases (e.g. 13s in test1.mp4), the probability that the person is holding the handrail (0.6168) is not high enough to pass the threshold.
* Given the instability of pose estimation model, it is recommended to use the combination of hand detection model + ALIGN for more stable performance.

In [6]:
# Short aliases for the MediaPipe modules used by the pose experiment below:
# `holistic` supplies the model that is run, `pose` supplies the PoseLandmark indices.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
mp_pose = mp.solutions.pose

In [None]:
from transformers import AlignProcessor, AlignModel
import pandas as pd

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

prompt_lst = ["Holding handrails", "Not holding handrails"]
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
# Put ALIGN on the selected device (`device` used to be computed but never applied).
model = AlignModel.from_pretrained("kakaobrain/align-base").to(device)
model.eval()
people_detection_model = YOLO('./Yolo-Weights/yolov8x.pt')
classNames = [val for key, val in people_detection_model.names.items()]

cap = cv2.VideoCapture('./test1.mp4')
resize_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)//3)
resize_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)//3)

# Tunable parameters
offset = 30                    # padding (px) around boxes from the hand detector
pose_offset = 50               # padding (px) around boxes derived from pose landmarks
HOLDING_PROB_THRESHOLD = 0.9   # minimum probability of prompt_lst[0] to mark a handrail user
PERSON_CONF_THRESHOLD = 0.3    # minimum YOLO confidence for a 'person' detection
MIN_POSE_VISIBILITY = 0.4      # a pose landmark is used only when its visibility exceeds this
MIN_FRAMES_TO_COUNT = 18       # a track is added to people_counter when its frame tally reaches this value
SHOW_POSE_DEBUG = True         # display each pose-derived crop and print its probability
tracker = Sort(max_age=20, min_hits=1, iou_threshold=0.3)

fourcc = cv2.VideoWriter_fourcc(*'MP4V')
output = cv2.VideoWriter('./test_pose.mp4', fourcc, 20, (resize_w, resize_h))

people_counter = 0 # the total number of people in the video
counter = 0 # keep track of the number of frames so far
already_used = [] # the Ids (as ints) of people who have already used the handrails
item_dict = {} # per-track frame tally, keyed by track Id

# Pose landmarks that outline each hand; the order (right, then left) matches the original cell.
POSE_HAND_LANDMARKS = (
    (mp_pose.PoseLandmark.RIGHT_PINKY, mp_pose.PoseLandmark.RIGHT_INDEX, mp_pose.PoseLandmark.RIGHT_THUMB),
    (mp_pose.PoseLandmark.LEFT_THUMB, mp_pose.PoseLandmark.LEFT_PINKY, mp_pose.PoseLandmark.LEFT_INDEX),
)


def landmark_boxes(points, pad):
    """Return (tight_box, padded_box) in pixels, both ordered (x1, y1, x2, y2).

    The padded box is clipped to the resize_w x resize_h frame.
    """
    xs = [point.x for point in points]
    ys = [point.y for point in points]
    x1, y1 = int(min(xs) * resize_w), int(min(ys) * resize_h)
    x2, y2 = int(max(xs) * resize_w), int(max(ys) * resize_h)
    padded = (max(x1-pad, 0), max(y1-pad, 0), min(x2+pad, resize_w), min(y2+pad, resize_h))
    return (x1, y1, x2, y2), padded


def holding_probability(crop_bgr):
    """Return ALIGN's probability of prompt_lst[0] for a BGR image crop."""
    crop_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    inputs = processor(text=prompt_lst, images=Image.fromarray(crop_rgb), return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits_per_image.softmax(dim=1)[0][0].item()


def mark_handrail_user(tracked_people, hand_box):
    """Record the Id of the tracked person whose box contains hand_box, if there is one."""
    people_Id = find_people_bounding_box(tracked_people, np.array(hand_box))
    # None means the hand is not inside any tracked person box. Skip it so that
    # it is not counted as an extra handrail user.
    if people_Id is not None and int(people_Id) not in already_used:
        already_used.append(int(people_Id))


try:
    # The Holistic model is created once for the whole video; it used to be
    # constructed again on every frame that had no detected hand.
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.45, max_num_hands=4) as hands, \
         mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            success, img = cap.read() # bgr
            if not success or img is None:
                break
            # Resize to exactly the size the VideoWriter was opened with. A resize that
            # only fixes the width may end up with a height other than resize_h.
            img = cv2.resize(img, (resize_w, resize_h), interpolation=cv2.INTER_AREA)

            # People Detection
            yolo_results = people_detection_model(img, stream=True, verbose=False)
            person_rows = []
            for r in yolo_results:
                for box in r.boxes:
                    conf = math.ceil(box.conf[0] * 100) / 100
                    cls = classNames[int(box.cls[0])]
                    if cls == 'person' and conf > PERSON_CONF_THRESHOLD:
                        x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                        person_rows.append([x1, y1, x2, y2, conf])
            # reshape keeps the (0, 5) shape when nobody was detected in this frame
            people_detections = np.array(person_rows, dtype=float).reshape(-1, 5)
            results_tracker = tracker.update(people_detections)

            # Hand Detection (MediaPipe is fed an RGB copy; `img` stays BGR for drawing)
            rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            hand_results = hands.process(rgb_frame)

            if hand_results.multi_hand_landmarks:
                for hand in hand_results.multi_hand_landmarks:
                    # The tight box is used to find the owner; the padded box is what ALIGN sees.
                    tight_box, (x1, y1, x2, y2) = landmark_boxes(hand.landmark, offset)

                    # check if holding handrails
                    hand_img = img[y1:y2+1, x1:x2+1, :]
                    if hand_img.size == 0:
                        # Landmarks outside the frame give an empty crop, which cvtColor rejects.
                        continue
                    if holding_probability(hand_img) >= HOLDING_PROB_THRESHOLD:
                        # based on hand bounding box, find the corresponding people bounding box
                        mark_handrail_user(results_tracker, tight_box)
            # When there is no hand detected in the current frame, we use pose estimation as the second checking mechanism
            else:
                ### POSE DETECTION ###
                # NOTE(review): only one set of pose landmarks is read per frame, so at most
                # one person's hands are checked here - confirm this is acceptable.
                rgb_frame.flags.writeable = False
                pose_results = holistic.process(rgb_frame)
                rgb_frame.flags.writeable = True

                if pose_results.pose_landmarks:
                    landmarks = pose_results.pose_landmarks.landmark
                    for landmark_ids in POSE_HAND_LANDMARKS:
                        points = [landmarks[idx] for idx in landmark_ids]
                        if not all(point.visibility > MIN_POSE_VISIBILITY for point in points):
                            continue
                        # Boxes are ordered (x1, y1, x2, y2), which is the order
                        # find_people_bounding_box unpacks. They used to be stored as
                        # (x1, x2, y1, y2) and passed on unchanged. As in the hand branch,
                        # the tight box finds the owner and the padded box is cropped.
                        tight_box, (x1, y1, x2, y2) = landmark_boxes(points, pose_offset)
                        # cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=1, colorR=(255, 0, 255))
                        hand_img = img[y1:y2+1, x1:x2+1, :]
                        if hand_img.size == 0:
                            # Explicit empty-crop check replaces the former bare `except: continue`.
                            continue
                        if SHOW_POSE_DEBUG:
                            display(Image.fromarray(cv2.cvtColor(hand_img, cv2.COLOR_BGR2RGB)))

                        holding_prob = holding_probability(hand_img)
                        if SHOW_POSE_DEBUG:
                            print(holding_prob)
                        if holding_prob >= HOLDING_PROB_THRESHOLD:
                            # based on hand bounding box, find the corresponding people bounding box
                            mark_handrail_user(results_tracker, tight_box)

            # draw bounding box around people
            for track in results_tracker:
                x1, y1, x2, y2, Id = (int(v) for v in track)
                color = (0, 255, 0) if Id in already_used else (0, 0, 255)

                # Keep track of the number of frames each person has appeared in the video - sometimes we may have smaller bounding boxes that capture
                # a part of the person's body (e.g. leg) because the leg appears before the whole body appears in the video.
                if Id not in item_dict:
                    item_dict[Id] = 0
                else:
                    item_dict[Id] += 1
                    if item_dict[Id] == MIN_FRAMES_TO_COUNT:
                        people_counter += 1

                # cvzone.putTextRect(img, str(Id), (max(0, x1), max(35, y1)), scale=1, thickness=2, offset=5)
                cvzone.cornerRect(img, (x1, y1, x2-x1, y2-y1), l=5, rt=2, colorR=color)

            # display number of people holding the handrails
            img = draw_counter(img, resize_w, resize_h, people_counter, len(set(already_used)))

            cv2.imshow('Hand Tracking', img)
            counter += 1
            output.write(img)
            if cv2.waitKey(48) & 0xFF == ord('q'):
                break
finally:
    # Release the capture, the writer and the preview window even if a frame raised,
    # so the output file is finalised.
    cap.release()
    output.release()
    cv2.destroyAllWindows()

### OTHER IDEAS / TOOLS
1. Replace ALIGN with NeVA (NVIDIA's Visual Question Answering Transformer) - requires API access. <br><br>
<img src="./img/neva_cannot_see_hand.png" width=400> <br><br>
2. Detect handrail region -> image-to-text classification: use image segmentation models (e.g. Segment Anything Model (SAM)) to detect the regions where the handrails are. To decide whether an identified image segment represents a handrail, we could use common properties shared by different handrails, e.g. consistent width, length of the handrails (relative to people's size) and even height from the floor. However, it is difficult to prove that these assumptions generalize.<br><br>
<img src="./img/handrail_only.png" width=400> <br><br>