In [2]:
import matplotlib.pyplot as plt
import torch

In [12]:
import cv2
import torch
from torchvision import models
import torchvision.transforms as T

# Keypoint R-CNN (ResNet-50 FPN) with pre-trained weights, switched to
# inference mode.
model = models.detection.keypointrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Grab the default camera (device 0, usually the built-in webcam).
cap = cv2.VideoCapture(0)

# Skeleton edges: each pair holds the indices of two keypoints that get
# joined by a line when both are confident enough.
keypoint_connections = [
    # Face: nose to each eye, then each eye to an ear.
    (0, 1), (0, 2),
    (1, 3), (2, 4),
    # Arms: shoulders, elbows and wrists.
    (5, 6),
    (5, 7), (7, 9),
    (6, 8), (8, 10),
    # Torso sides: shoulder keypoints (5, 6) down to hip keypoints (11, 12).
    (5, 11), (6, 12),
    # Legs: hips, knees and ankles.
    (11, 12),
    (11, 13), (13, 15),
    (12, 14), (14, 16),
]

# Keypoint indices used for the feet.
left_foot_index, right_foot_index = 15, 16

# Feet are only re-drawn when they lie below this y coordinate (in pixels);
# tune it to the frame height of your camera.
min_y_leg_roi = 300

# The preprocessing pipeline is identical for every frame, so build it once
# instead of on each loop iteration.
transform = T.Compose([T.ToTensor()])

# try/finally guarantees the camera and the preview window are released even
# if inference or drawing raises; otherwise the kernel keeps the webcam locked.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR frames; the model is fed RGB.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        input_tensor = transform(rgb_frame).unsqueeze(0)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            predictions = model(input_tensor)

        # Draw only when the model returned at least one detected instance.
        if (predictions
                and 'keypoints' in predictions[0]
                and predictions[0]['keypoints'].shape[0] > 0):
            # Keypoints of the first detection only; each row is (x, y, value).
            # NOTE(review): torchvision documents the third column of
            # 'keypoints' as a visibility flag and exposes per-keypoint scores
            # separately under 'keypoints_scores' - confirm which of the two
            # the 0.5 threshold is meant for.
            keypoints = predictions[0]['keypoints'][0].numpy()

            # Draw keypoints on the frame.
            for x, y, prob in keypoints:
                # Compare the raw float value: casting it to int first floors
                # everything below 1.0 to 0, and the line/feet checks below
                # already use the float.
                if prob > 0.5:
                    cv2.circle(frame, (int(x), int(y)), 5, (0, 255, 0), -1)

            # Draw lines connecting keypoints when both ends pass the threshold.
            for start_point, end_point in keypoint_connections:
                if keypoints[start_point][2] > 0.5 and keypoints[end_point][2] > 0.5:
                    start_coord = tuple(map(int, keypoints[start_point][:2]))
                    end_coord = tuple(map(int, keypoints[end_point][:2]))
                    cv2.line(frame, start_coord, end_coord, (0, 255, 0), 2)

            # Re-draw the feet keypoints when they pass the threshold and lie
            # below the leg ROI boundary.
            for foot_index in (left_foot_index, right_foot_index):
                foot = keypoints[foot_index]
                if foot[2] > 0.5 and foot[1] > min_y_leg_roi:
                    cv2.circle(frame, tuple(map(int, foot[:2])), 5, (0, 255, 0), -1)

        # Display the frame
        cv2.imshow('Pose Estimation', frame)

        # Break the loop when 'q' key is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close the window
    cap.release()
    cv2.destroyAllWindows()

In [6]:
from transformers import pipeline

# Hugging Face Hub id of the checkpoint handed to the depth-estimation pipeline.
checkpoint = "vinvino02/glpn-nyu"

# Build the pipeline once; later cells call `depth_estimator(image)` on frames.
depth_estimator = pipeline(task="depth-estimation", model=checkpoint)

config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/245M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [14]:
from PIL import Image
import time
import torch
import cv2
import numpy as np
from torchvision import transforms

from utils.datasets import letterbox
from utils.general import non_max_suppression_kpt
from utils.plots import output_to_keypoint, plot_skeleton_kpts

def pose_video(frame):
    """Run the pose model on one frame and draw the detected skeletons.

    Uses the notebook-level ``model``, ``device`` and ``input_size``.

    NOTE(review): the RGB2BGR conversion applied to the output implies that
    ``frame`` is expected in RGB channel order; frames taken straight from
    ``cv2.VideoCapture.read`` are BGR - confirm with the caller.

    Args:
        frame: Image array (H x W x 3) to run pose estimation on.

    Returns:
        tuple: ``(nimg, fps)`` where ``nimg`` is the letterboxed uint8
        H x W x 3 image with the skeletons drawn on it, and ``fps`` is the
        reciprocal of the model forward-pass time (``inf`` if the measured
        time is zero).
    """
    mapped_img = frame.copy()
    # Letterbox resizing.
    img = letterbox(frame, input_size, stride=64, auto=True)[0]
    # Array -> tensor, add the leading batch axis, move to the compute device.
    img = transforms.ToTensor()(img).unsqueeze(0).to(device)

    # Gradients are only needed for training, not for inference.
    with torch.no_grad():
        # perf_counter is monotonic and high-resolution; time.time() can
        # return the same value twice in a row on coarse clocks.
        t1 = time.perf_counter()
        output, _ = model(img)
        t2 = time.perf_counter()

        elapsed = t2 - t1
        # Guard against a zero interval instead of raising ZeroDivisionError.
        fps = 1 / elapsed if elapsed > 0 else float('inf')

        output = non_max_suppression_kpt(output,
                                         0.25,    # Conf. Threshold.
                                         0.65,    # IoU Threshold.
                                         nc=1,    # Number of classes.
                                         nkpt=17, # Number of keypoints.
                                         kpt_label=True)

        output = output_to_keypoint(output)
        print(output.shape)

    # Change format [b, c, h, w] to [h, w, c] for displaying the image.
    nimg = img[0].permute(1, 2, 0) * 255
    nimg = nimg.cpu().numpy().astype(np.uint8)
    nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)

    # One skeleton per detection row; columns from index 7 on are passed to
    # the plotting helper as the keypoint data.
    for idx in range(output.shape[0]):
        plot_skeleton_kpts(nimg, mapped_img, input_size, output[idx, 7:].T, 3)

    return nimg, fps
import os

# Change forward pass input size.
input_size = 960

# Select the device based on hardware configs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Selected Device : ', device)

# Load keypoint detection model.
# NOTE(security): torch.load unpickles the checkpoint, and unpickling can
# execute arbitrary code - only load weight files from a trusted source.
weights = torch.load('yolov7-w6-pose.pt', map_location=device)
# Full precision, evaluation mode, on the selected computation device.
model = weights['model'].float().eval().to(device)

# Webcam capture
cap = cv2.VideoCapture(0)  # 0 corresponds to the default camera (usually the built-in webcam)
if not cap.isOpened():
    print('Warning: could not open the webcam (device 0).')

# Frame size the video writer is opened with. VideoWriter only accepts frames
# of exactly this size, and letterbox reshapes the image, so these may need
# changing (camera values: cap.get(cv2.CAP_PROP_FRAME_WIDTH / _HEIGHT)).
w = 1920
h = 1080

# Video writer initialization
output_path = 'pose_outputs/webcam_output.mp4'
# VideoWriter does not create missing directories; without this the writer
# fails to open and every write is dropped.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
out = cv2.VideoWriter(output_path,
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      30, (w, h))
if not out.isOpened():
    print('Warning: could not open video writer for', output_path)

if __name__ == '__main__':
    # try/finally guarantees the camera, the video file and the preview
    # window are released even if a frame fails to process.
    try:
        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                print('Unable to read frame. Exiting ..')
                break

            mapped_img = frame.copy()
            # cap.read() yields BGR, but the pose model and PIL both assume
            # RGB, so convert once up front.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Letterbox resizing.
            img = letterbox(rgb_frame, input_size, stride=64, auto=True)[0]
            # Array -> tensor, add the batch axis, move to the compute device.
            img = transforms.ToTensor()(img).unsqueeze(0).to(device)

            # Depth map of the current frame; the value from the last
            # processed frame stays available in `depth` after the loop.
            pil_img = Image.fromarray(rgb_frame)
            depth = depth_estimator(pil_img)["predicted_depth"]

            # Gradients are only needed for training, not for inference.
            with torch.no_grad():
                # perf_counter is monotonic and high-resolution; guard the
                # division so a zero interval cannot raise ZeroDivisionError.
                t1 = time.perf_counter()
                output, _ = model(img)
                t2 = time.perf_counter()
                elapsed = t2 - t1
                fps = 1 / elapsed if elapsed > 0 else float('inf')

                output = non_max_suppression_kpt(output,
                                                 0.25,    # Conf. Threshold.
                                                 0.65,    # IoU Threshold.
                                                 nc=1,    # Number of classes.
                                                 nkpt=17, # Number of keypoints.
                                                 kpt_label=True)

                output = output_to_keypoint(output)
                print(output.shape)

            # Change format [b, c, h, w] to [h, w, c] for displaying the image.
            nimg = img[0].permute(1, 2, 0) * 255
            nimg = nimg.cpu().numpy().astype(np.uint8)
            # The tensor is RGB, so this yields the BGR image OpenCV expects
            # for drawing, display and writing.
            nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)

            for idx in range(output.shape[0]):
                plot_skeleton_kpts(nimg, mapped_img, input_size, output[idx, 7:].T, 3)
            cv2.putText(nimg, 'FPS : {:.2f}'.format(fps), (200, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2,
                        cv2.LINE_AA)
            cv2.putText(nimg, 'YOLOv7', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            cv2.imshow('Output', nimg)

            # The writer only accepts frames of the (w, h) it was opened
            # with; the letterboxed image usually differs, so resize it.
            # This stretches the picture when the aspect ratios differ - set
            # w and h to the letterboxed size to avoid that.
            if (nimg.shape[1], nimg.shape[0]) != (w, h):
                out.write(cv2.resize(nimg, (w, h)))
            else:
                out.write(nimg)

            # Mask to the low byte so the key test also works where waitKey
            # returns extra high bits.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        out.release()
        cv2.destroyAllWindows()

Selected Device :  cuda:0
(0,)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)
(1, 58)


In [23]:
%matplotlib inline
import matplotlib.pyplot as plt
import torchvision.transforms as T
# ToPILImage scales float tensors by 255 and casts them to bytes, so the raw
# depth values must be brought into [0, 1] first or they overflow.
# assumes `depth` is the float "predicted_depth" tensor left over from the
# webcam loop - TODO confirm
depth_map = depth.detach().float().cpu()
depth_min = depth_map.min()
# clamp_min keeps the division defined for a completely flat depth map.
depth_span = (depth_map.max() - depth_min).clamp_min(1e-8)
depth_normalized = (depth_map - depth_min) / depth_span

transform = T.ToPILImage()
transform(depth_normalized).show()