# MoveNet Pose Estimation

## MoveNet pose estimation on still image

In [1]:
%pip install tensorflow
%pip install tensorflow_hub
%pip install numpy
%pip install opencv-python


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
import time

# Load the MoveNet single-pose "Thunder" model (version 4) from TensorFlow Hub.
# The resulting module-level `movenet` is the object pose_estimate() below
# looks up when it runs inference.
movenet = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")

# Function to perform pose detection on a static image
def pose_estimate(image_path):
    """Run MoveNet single-pose estimation on one still image.

    The image is read with OpenCV, converted from BGR to RGB, letterboxed to
    256x256 with ``tf.image.resize_with_pad`` and cast to int32 before being
    passed to the ``serving_default`` signature of the module-level
    ``movenet`` model. Preprocessing and inference times are printed in
    milliseconds.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The model's ``output_0`` as a NumPy array. Per the MoveNet model card
        this has shape [1, 1, 17, 3] with each keypoint stored as
        (y, x, score), normalized to the padded square input.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    preprocess_start = time.perf_counter()

    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than inside cvtColor.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert image to RGB (MoveNet expects RGB images)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Letterbox to the expected input size of MoveNet (256 for Thunder)
    image_resized = tf.image.resize_with_pad(tf.expand_dims(image_rgb, axis=0), 256, 256)
    # The serving signature takes an int32 tensor (not uint8)
    image_np = image_resized.numpy().astype(np.int32)

    preprocess_time = (time.perf_counter() - preprocess_start) * 1000
    print(f"Preprocess time: {preprocess_time:.1f}ms")

    inference_start = time.perf_counter()

    # Perform inference and pull the keypoints back to host memory; .numpy()
    # is inside the timed region, as in the original measurement.
    outputs = movenet.signatures["serving_default"](tf.constant(image_np))
    keypoints = outputs['output_0'].numpy()

    inference_time = (time.perf_counter() - inference_start) * 1000
    print(f"Inference time: {inference_time:.1f}ms")

    return keypoints


# NOTE(review): "/content/..." looks like a Colab-style absolute path; adjust
# it when running elsewhere.
image_path = "/content/frame1.jpeg"

# Perform pose detection on static image
keypoints = pose_estimate(image_path)

# Print one "[x y]" pair per keypoint: index 1 of each row is printed first,
# then index 0, each with four decimals.
for keypoint in keypoints[0][0]:
  print(f'[{keypoint[1]:.4f} {keypoint[0]:.4f}]')


## MoveNet pose estimation on video

In [None]:
# Quietly (-q) download the sample dance GIF and save it as dance.gif (-O).
!wget -q -O dance.gif https://github.com/tensorflow/tfjs-models/raw/master/pose-detection/assets/dance_input.gif

In [None]:
# Path of the sample GIF (the file name the wget cell writes to). Nothing is
# loaded here; this only records the path.
# NOTE(review): later cells reassign vid_path, so this value is not the one
# they end up processing.
vid_path = 'dance.gif'
# Disabled GIF-decoding experiment, kept for reference:
# image = tf.io.read_file(image_path)
# image = tf.image.decode_gif(image)

In [5]:
# NOTE(review): machine-specific absolute path; it only resolves on the
# author's computer. The next code cell reassigns vid_path to "ADL.mp4".
vid_path = "/Users/nick/Documents/GitHub/MoveNet/ADL.mp4"

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
import pandas as pd

# Relative path: resolved against the kernel's current working directory.
vid_path = "ADL.mp4"  # Replace with the path to your input video

# Function to perform pose estimation on a video
def MoveNet_detect_pose_sequence(vid_path):
    """Run MoveNet single-pose estimation on every frame of a video.

    Each frame is converted to RGB, letterboxed to 256x256 with
    ``tf.image.resize_with_pad``, cast to int32 and passed to the model's
    ``serving_default`` signature. Frames whose mean keypoint confidence is
    below 0.25 contribute an empty record.

    Args:
        vid_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        pandas.DataFrame with one row per frame read and columns
        ``"<Joint>_X"`` / ``"<Joint>_Y"`` for the 13 joints listed in
        ``keypoint_names``. Values are the model's normalized coordinates.
        NOTE(review): because of resize_with_pad these are relative to the
        padded square input, not to the original frame, for non-square video.

    Raises:
        Exception: Whatever ``hub.load`` raises; it is reported, then re-raised.
    """
    # Load the MoveNet model from TensorFlow Hub
    try:
        movenet = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")
        print("MoveNet Pose Estimation Model loaded successfully.")
    except Exception as error:
        print(f"Failed to load the model: {error}")
        # Nothing below can work without the model; the original fell through
        # and died later with an unrelated NameError.
        raise

    # Set a threshold for average confidence
    POSE_CONFIDENCE_THRESHOLD = 0.25

    # Joints to export, excluding 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear'
    keypoint_names = [
        'Nose', 'Left Shoulder', 'Right Shoulder', 'Left Elbow', 'Right Elbow',
        'Left Wrist', 'Right Wrist', 'Left Hip', 'Right Hip',
        'Left Knee', 'Right Knee', 'Left Ankle', 'Right Ankle'
    ]
    # Index of each joint above in MoveNet's 17-keypoint output, where
    # indices 1-4 are the eyes and ears. The original used 0..12, which stored
    # eye/ear coordinates under the shoulder/elbow/... labels.
    keypoint_indices = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

    # One dict of "<Joint>_X"/"<Joint>_Y" values per frame
    all_keypoints = []

    vid = cv2.VideoCapture(vid_path)
    try:
        # Process frames as they are decoded instead of buffering the whole
        # video in memory first.
        while True:
            ret, frame = vid.read()
            if not ret:
                break

            # Convert frame to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Letterbox to the expected input size of MoveNet (256 for thunder)
            frame_resized = tf.image.resize_with_pad(tf.expand_dims(frame_rgb, axis=0), 256, 256)
            # The serving signature takes an int32 tensor (not uint8)
            frame_np = frame_resized.numpy().astype(np.int32)
            # Perform inference
            outputs = movenet.signatures["serving_default"](tf.constant(frame_np))
            keypoints = outputs['output_0'].numpy()

            # Stays empty when the detection is discarded
            frame_keypoints = {}

            # Mean score over all keypoints decides whether a person was found
            average_confidence = np.mean(keypoints[:, :, :, 2])

            if average_confidence >= POSE_CONFIDENCE_THRESHOLD:
                for name, idx in zip(keypoint_names, keypoint_indices):
                    keypoint = keypoints[0][0][idx]
                    # Model rows are (y, x, score)
                    frame_keypoints[f'{name}_X'] = keypoint[1]
                    frame_keypoints[f'{name}_Y'] = keypoint[0]

            all_keypoints.append(frame_keypoints)

            print(f"Processed frame {len(all_keypoints)}")
    finally:
        # Always free the decoder, even if inference fails mid-video
        vid.release()

    # Return keypoints for all frames
    return pd.DataFrame(all_keypoints)

# Perform pose estimation on the input video; `result` is the DataFrame
# returned by MoveNet_detect_pose_sequence.
result = MoveNet_detect_pose_sequence(vid_path)
print(result)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
# from google.colab.patches import cv2_imshow
import time
import imageio

# vid_path = "/content/ADL.mp4"  # Replace with the path to your input video

# Function to perform pose estimation on a video
def detect_pose_sequence(vid_path):
    """Run MoveNet on every frame of a video and print the keypoints.

    Each frame is converted to RGB, letterboxed to 256x256, cast to int32 and
    passed to the model's ``serving_default`` signature. For every frame the
    index is printed, followed by either the ``[x y score]`` values of each
    keypoint or a "no person" message when the mean keypoint confidence is
    below 0.25.

    Args:
        vid_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        List with one ``output_0`` NumPy array per frame read. Arrays for
        frames rejected by the confidence threshold are all zeros.
    """
    # Load the MoveNet model from TensorFlow Hub
    movenet = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")

    # Set a threshold for average confidence
    POSE_CONFIDENCE_THRESHOLD = 0.25

    # Keypoints for each frame, in frame order
    all_keypoints = []

    vid = cv2.VideoCapture(vid_path)
    try:
        frame_index = 0
        # Process frames as they are decoded instead of buffering the whole
        # video in memory first.
        while True:
            ret, frame = vid.read()
            if not ret:
                break

            # Convert frame to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Letterbox to the expected input size of MoveNet (256 for thunder)
            frame_resized = tf.image.resize_with_pad(tf.expand_dims(frame_rgb, axis=0), 256, 256)
            # The serving signature takes an int32 tensor (not uint8)
            frame_np = frame_resized.numpy().astype(np.int32)
            # Perform inference
            outputs = movenet.signatures["serving_default"](tf.constant(frame_np))
            keypoints = outputs['output_0'].numpy()

            # Mean score over all keypoints decides whether a person was found
            average_confidence = np.mean(keypoints[:, :, :, 2])

            print(f'Frame: {frame_index}')
            if average_confidence < POSE_CONFIDENCE_THRESHOLD:
                keypoints = np.zeros_like(keypoints)  # Discard detection
                print("No person detected in this frame.")
            else:
                for keypoint in keypoints[0][0]:
                    print(f'[{keypoint[1]:.4f} {keypoint[0]:.4f} {keypoint[2]:.4f}]')

            # Store only after the discard decision. The original appended
            # first, so the zeroed array never reached the returned list.
            all_keypoints.append(keypoints)
            frame_index += 1
    finally:
        # Always free the decoder, even if inference fails mid-video
        vid.release()

    # Return keypoints for all frames
    return all_keypoints

# NOTE(review): the vid_path assignment in this cell is commented out, so this
# call uses whatever vid_path an earlier cell left in the kernel.
sequence_keypoints = detect_pose_sequence(vid_path)

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
import pandas as pd

# Relative path: resolved against the kernel's current working directory.
vid_path = "ADL.mp4"  # Replace with the path to your input video

def draw_keypoints(frame, keypoints, conf, threshold=0.5):
    """Overlay confident keypoints and skeleton edges on ``frame`` in place.

    Args:
        frame: Image array to draw on; its shape supplies the pixel scale.
        keypoints: Sequence of 2-element rows in normalized coordinates. The
            first element is multiplied by the frame width (horizontal
            position), the second by the frame height (vertical position).
            NOTE(review): MoveNet emits (y, x), so callers must pass the
            columns in (x, y) order.
        conf: Per-keypoint confidence, indexed like ``keypoints``.
        threshold: A keypoint is drawn only if its confidence exceeds this;
            an edge is drawn only if both of its endpoints do.
    """
    green = (0, 255, 0)
    frame_h, frame_w = frame.shape[0], frame.shape[1]

    # Pairs of keypoint indices joined by a line
    skeleton_edges = (
        (0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (0, 6), (5, 6), (5, 7), (6, 8),
        (7, 9), (8, 10), (5, 11), (6, 12), (11, 12), (11, 13), (12, 14),
        (13, 15), (14, 16),
    )

    def _pixel(index):
        # Normalized pair -> integer pixel position of one keypoint
        return (int(keypoints[index][0] * frame_w), int(keypoints[index][1] * frame_h))

    # Filled dot for every confident keypoint
    for index, (first, second) in enumerate(keypoints):
        if conf[index] > threshold:
            cv2.circle(frame, (int(first * frame_w), int(second * frame_h)), 3, green, -1)

    # Line for every edge whose two endpoints are confident
    for edge_start, edge_end in skeleton_edges:
        if conf[edge_start] > threshold and conf[edge_end] > threshold:
            cv2.line(frame, _pixel(edge_start), _pixel(edge_end), green, 2)

# Function to perform pose estimation on a video
def MoveNet_detect_pose_sequence(vid_path):
    """Run MoveNet on every frame of a video, show the overlay, collect keypoints.

    Each frame is converted to RGB, letterboxed to 256x256 with
    ``tf.image.resize_with_pad``, cast to int32 and passed to the model's
    ``serving_default`` signature. Frames whose mean keypoint confidence is at
    least 0.25 get the skeleton drawn on them. Every frame is shown in an
    OpenCV window; pressing 'q' stops early.

    Args:
        vid_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        pandas.DataFrame with one row per processed frame and columns
        ``"<Joint>_X"`` / ``"<Joint>_Y"`` for the 13 joints listed in
        ``keypoint_names``. Values are the model's normalized coordinates.
        NOTE(review): these are relative to the padded square input, not to
        the original frame, for non-square video.

    Raises:
        Exception: Whatever ``hub.load`` raises; it is reported, then re-raised.
    """
    # Load the MoveNet model from TensorFlow Hub
    try:
        movenet = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")
        print("MoveNet Pose Estimation Model loaded successfully.")
    except Exception as error:
        print(f"Failed to load the model: {error}")
        # Nothing below can work without the model; the original fell through
        # and died later with an unrelated NameError.
        raise

    # Set a threshold for average confidence
    POSE_CONFIDENCE_THRESHOLD = 0.25

    # Joints to export, excluding 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear'
    keypoint_names = [
        'Nose', 'Left Shoulder', 'Right Shoulder', 'Left Elbow', 'Right Elbow',
        'Left Wrist', 'Right Wrist', 'Left Hip', 'Right Hip',
        'Left Knee', 'Right Knee', 'Left Ankle', 'Right Ankle'
    ]
    # Index of each joint above in MoveNet's 17-keypoint output, where
    # indices 1-4 are the eyes and ears. The original used 0..12, which stored
    # eye/ear coordinates under the shoulder/elbow/... labels.
    keypoint_indices = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

    # One dict of "<Joint>_X"/"<Joint>_Y" values per frame
    all_keypoints = []

    vid = cv2.VideoCapture(vid_path)
    try:
        # Process frames as they are decoded instead of buffering the whole
        # video in memory first.
        while True:
            ret, frame = vid.read()
            if not ret:
                break

            # Convert frame to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Letterbox to the expected input size of MoveNet (256 for thunder)
            frame_resized = tf.image.resize_with_pad(tf.expand_dims(frame_rgb, axis=0), 256, 256)
            # The serving signature takes an int32 tensor (not uint8)
            frame_np = frame_resized.numpy().astype(np.int32)
            # Perform inference
            outputs = movenet.signatures["serving_default"](tf.constant(frame_np))
            keypoints = outputs['output_0'].numpy()

            # Stays empty when the detection is discarded
            frame_keypoints = {}

            # Mean score over all keypoints decides whether a person was found
            average_confidence = np.mean(keypoints[:, :, :, 2])

            if average_confidence >= POSE_CONFIDENCE_THRESHOLD:
                for name, idx in zip(keypoint_names, keypoint_indices):
                    keypoint = keypoints[0][0][idx]
                    # Model rows are (y, x, score)
                    frame_keypoints[f'{name}_X'] = keypoint[1]
                    frame_keypoints[f'{name}_Y'] = keypoint[0]

                # The model saw the frame centred on a square canvas whose
                # side is the longer frame edge. Map back to the frame by
                # scaling by that side and subtracting the padding, then
                # renormalize by the frame size, which draw_keypoints expects.
                frame_h, frame_w = frame.shape[:2]
                side = max(frame_h, frame_w)
                # Reorder columns from the model's (y, x) to (x, y)
                xy_px = keypoints[0][0][:, [1, 0]] * side - [(side - frame_w) / 2, (side - frame_h) / 2]
                xy_norm = xy_px / [frame_w, frame_h]

                # Draw keypoints and connections on the frame
                draw_keypoints(frame, xy_norm, keypoints[0][0][:, 2])

            all_keypoints.append(frame_keypoints)

            print(f"Processed frame {len(all_keypoints)}")

            # Display the frame with keypoints; 'q' aborts the run
            cv2.imshow('Pose Estimation', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the decoder, even on early exit or failure
        vid.release()

    # Return keypoints for all frames
    return pd.DataFrame(all_keypoints)

# Perform pose estimation on the input video
result = MoveNet_detect_pose_sequence(vid_path)
print(result)

# Close all OpenCV windows. This call does not release any VideoCapture; the
# capture is local to MoveNet_detect_pose_sequence.
cv2.destroyAllWindows()

2024-10-01 17:33:40.270095: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-10-01 17:33:40.270121: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-01 17:33:40.270124: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-01 17:33:40.270137: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-01 17:33:40.270146: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


MoveNet Pose Estimation Model loaded successfully.


2024-10-01 17:33:46.556072: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Processed frame 1
Processed frame 2
Processed frame 3
Processed frame 4
Processed frame 5
Processed frame 6
Processed frame 7
Processed frame 8
Processed frame 9
Processed frame 10
Processed frame 11
Processed frame 12
Processed frame 13
Processed frame 14
Processed frame 15
Processed frame 16
Processed frame 17
Processed frame 18
Processed frame 19
Processed frame 20
Processed frame 21
Processed frame 22
Processed frame 23
Processed frame 24
Processed frame 25
Processed frame 26
Processed frame 27
Processed frame 28
Processed frame 29
Processed frame 30
Processed frame 31
Processed frame 32
Processed frame 33
Processed frame 34
Processed frame 35
Processed frame 36
Processed frame 37
Processed frame 38
Processed frame 39
Processed frame 40
Processed frame 41
Processed frame 42
Processed frame 43
Processed frame 44
Processed frame 45
Processed frame 46
Processed frame 47
Processed frame 48
Processed frame 49
Processed frame 50
Processed frame 51
Processed frame 52
Processed frame 53
Pr

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
import pandas as pd

# Relative path: resolved against the kernel's current working directory.
vid_path = "ADL.mp4"  # Replace with the path to your input video

def draw_keypoints(frame, keypoints, conf, threshold=0.5):
    """Overlay confident keypoints and skeleton edges on ``frame`` in place.

    Args:
        frame: Image array to draw on.
        keypoints: Sequence of 2-element rows already in pixel units. The
            first element is used as the horizontal position and the second
            as the vertical position, i.e. (x, y).
        conf: Per-keypoint confidence, indexed like ``keypoints``.
        threshold: A keypoint is drawn only if its confidence exceeds this;
            an edge is drawn only if both of its endpoints do.
    """
    green = (0, 255, 0)

    # Pairs of keypoint indices joined by a line (no leg edges in this list)
    skeleton_edges = (
        (0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8),
        (7, 9), (8, 10), (5, 11), (6, 12), (11, 12),
    )

    def _pixel(index):
        # Truncate one keypoint's pair to an integer pixel position
        return (int(keypoints[index][0]), int(keypoints[index][1]))

    # Filled dot for every confident keypoint
    for index, (first, second) in enumerate(keypoints):
        if conf[index] > threshold:
            cv2.circle(frame, (int(first), int(second)), 3, green, -1)

    # Line for every edge whose two endpoints are confident
    for edge_start, edge_end in skeleton_edges:
        if conf[edge_start] > threshold and conf[edge_end] > threshold:
            cv2.line(frame, _pixel(edge_start), _pixel(edge_end), green, 2)

# Function to perform pose estimation on a video
def MoveNet_detect_pose_sequence(vid_path):
    """Run MoveNet on every frame of a video, show the overlay, collect pixel keypoints.

    Each frame is converted to RGB, letterboxed to 256x256 with
    ``tf.image.resize_with_pad``, cast to int32 and passed to the model's
    ``serving_default`` signature. Frames whose mean keypoint confidence is at
    least 0.25 get the skeleton drawn on them. Every frame is shown in an
    OpenCV window; pressing 'q' stops early.

    Args:
        vid_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        pandas.DataFrame with one row per processed frame and columns
        ``"<Joint>_X"`` / ``"<Joint>_Y"`` for the 13 joints listed in
        ``keypoint_names``, in pixel coordinates of the original frame.

    Raises:
        Exception: Whatever ``hub.load`` raises; it is reported, then re-raised.
    """
    # Load the MoveNet model from TensorFlow Hub
    try:
        movenet = hub.load("https://tfhub.dev/google/movenet/singlepose/thunder/4")
        print("MoveNet Pose Estimation Model loaded successfully.")
    except Exception as error:
        print(f"Failed to load the model: {error}")
        # Nothing below can work without the model; the original fell through
        # and died later with an unrelated NameError.
        raise

    # Set a threshold for average confidence
    POSE_CONFIDENCE_THRESHOLD = 0.25

    # Joints to export, excluding 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear'
    keypoint_names = [
        'Nose', 'Left Shoulder', 'Right Shoulder', 'Left Elbow', 'Right Elbow',
        'Left Wrist', 'Right Wrist', 'Left Hip', 'Right Hip',
        'Left Knee', 'Right Knee', 'Left Ankle', 'Right Ankle'
    ]
    # Index of each joint above in MoveNet's 17-keypoint output, where
    # indices 1-4 are the eyes and ears. The original used 0..12, which stored
    # eye/ear coordinates under the shoulder/elbow/... labels.
    keypoint_indices = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

    # One dict of "<Joint>_X"/"<Joint>_Y" values per frame
    all_keypoints = []

    vid = cv2.VideoCapture(vid_path)
    try:
        # Process frames as they are decoded instead of buffering the whole
        # video in memory first.
        while True:
            ret, frame = vid.read()
            if not ret:
                break

            # Convert frame to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Letterbox to the expected input size of MoveNet (256 for thunder)
            frame_resized = tf.image.resize_with_pad(tf.expand_dims(frame_rgb, axis=0), 256, 256)
            # The serving signature takes an int32 tensor (not uint8)
            frame_np = frame_resized.numpy().astype(np.int32)
            # Perform inference
            outputs = movenet.signatures["serving_default"](tf.constant(frame_np))
            keypoints = outputs['output_0'].numpy()

            # Stays empty when the detection is discarded
            frame_keypoints = {}

            # Mean score over all keypoints decides whether a person was found
            average_confidence = np.mean(keypoints[:, :, :, 2])

            if average_confidence >= POSE_CONFIDENCE_THRESHOLD:
                # Scale keypoints to original frame size. The model saw the
                # frame centred on a square canvas whose side is the longer
                # frame edge, so scale by that side and subtract the padding.
                frame_h, frame_w = frame.shape[:2]
                side = max(frame_h, frame_w)
                # Reorder columns from the model's (y, x) to (x, y). The
                # original multiplied the (y, x) pair by (width, height),
                # which transposed the overlay.
                xy_px = keypoints[0][0][:, [1, 0]] * side - [(side - frame_w) / 2, (side - frame_h) / 2]

                for name, idx in zip(keypoint_names, keypoint_indices):
                    frame_keypoints[f'{name}_X'] = xy_px[idx][0]
                    frame_keypoints[f'{name}_Y'] = xy_px[idx][1]

                # Draw keypoints and connections on the frame
                draw_keypoints(frame, xy_px, keypoints[0][0][:, 2])

            all_keypoints.append(frame_keypoints)

            print(f"Processed frame {len(all_keypoints)}")

            # Display the frame with keypoints; 'q' aborts the run
            cv2.imshow('Pose Estimation', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the decoder, even on early exit or failure
        vid.release()

    # Return keypoints for all frames
    return pd.DataFrame(all_keypoints)

# Perform pose estimation on the input video
result = MoveNet_detect_pose_sequence(vid_path)
print(result)

# Close all OpenCV windows. This call does not release any VideoCapture; the
# capture is local to MoveNet_detect_pose_sequence.
cv2.destroyAllWindows()

2024-10-01 17:35:51.536341: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-10-01 17:35:51.536360: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-01 17:35:51.536363: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-01 17:35:51.536376: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-01 17:35:51.536386: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


MoveNet Pose Estimation Model loaded successfully.


2024-10-01 17:35:57.669869: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Processed frame 1
Processed frame 2
Processed frame 3
Processed frame 4
Processed frame 5
Processed frame 6
Processed frame 7
Processed frame 8
Processed frame 9
Processed frame 10
Processed frame 11
Processed frame 12
Processed frame 13
Processed frame 14
Processed frame 15
Processed frame 16
Processed frame 17
Processed frame 18
Processed frame 19
Processed frame 20
Processed frame 21
Processed frame 22
Processed frame 23
Processed frame 24
Processed frame 25
Processed frame 26
Processed frame 27
Processed frame 28
Processed frame 29
Processed frame 30
Processed frame 31
Processed frame 32
Processed frame 33
Processed frame 34
Processed frame 35
Processed frame 36
Processed frame 37
Processed frame 38
Processed frame 39
Processed frame 40
Processed frame 41
Processed frame 42
Processed frame 43
Processed frame 44
Processed frame 45
Processed frame 46
Processed frame 47
Processed frame 48
Processed frame 49
Processed frame 50
Processed frame 51
Processed frame 52
Processed frame 53
Pr

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2

def draw_keypoints(frame, keypoints, conf, threshold=0.5):
    """Overlay confident keypoints and skeleton edges on ``frame`` in place.

    Args:
        frame: Image array to draw on.
        keypoints: Sequence of 2-element rows in pixel units. The SECOND
            element is used as the horizontal position and the FIRST as the
            vertical position, i.e. rows are (y, x).
        conf: Per-keypoint confidence, indexed like ``keypoints``.
        threshold: A keypoint is drawn only if its confidence exceeds this;
            an edge is drawn only if both of its endpoints do.
    """
    green = (0, 255, 0)

    # Pairs of keypoint indices joined by a line
    skeleton_edges = (
        (0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (0, 6), (5, 6), (5, 7), (6, 8),
        (7, 9), (8, 10), (5, 11), (6, 12), (11, 12), (11, 13), (12, 14),
        (13, 15), (14, 16),
    )

    def _point(index):
        # OpenCV wants (horizontal, vertical), so swap the stored pair
        return (int(keypoints[index][1]), int(keypoints[index][0]))

    # Filled dot for every confident keypoint
    for index, (row_px, col_px) in enumerate(keypoints):
        if conf[index] > threshold:
            cv2.circle(frame, (int(col_px), int(row_px)), 3, green, -1)

    # Line for every edge whose two endpoints are confident
    for edge_start, edge_end in skeleton_edges:
        if conf[edge_start] > threshold and conf[edge_end] > threshold:
            cv2.line(frame, _point(edge_start), _point(edge_end), green, 2)

def pose_estimate(video_path):
    """Play a video with the MoveNet skeleton drawn on every frame.

    Each frame is converted to RGB, letterboxed to 256x256 with
    ``tf.image.resize_with_pad``, cast to int32 and passed to the model's
    ``serving_default`` signature. The keypoints are mapped back to pixel
    positions in the original frame, drawn with ``draw_keypoints`` and shown
    in an OpenCV window. The frame index is printed for every frame.
    Pressing 'q' stops playback.

    Args:
        video_path: Path of the video file, opened with ``cv2.VideoCapture``.

    Returns:
        None. The capture is released and all OpenCV windows are closed
        before returning, including when an exception occurs.
    """
    # Load the MoveNet Thunder model from TensorFlow Hub
    model = hub.load("https://tfhub.dev/google/movenet/singlepose/thunder/4")

    # Open the video file
    cap = cv2.VideoCapture(video_path)

    # Frame index counter
    frame_index = 0

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                # End of the video (or a read failure)
                break

            # Convert frame to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Letterbox to the expected input size of MoveNet
            frame_resized = tf.image.resize_with_pad(tf.expand_dims(frame_rgb, axis=0), 256, 256)
            # The serving signature takes an int32 tensor (not uint8)
            frame_np = frame_resized.numpy().astype(np.int32)
            # Perform inference; fetch the result once and slice it
            detections = model.signatures["serving_default"](tf.constant(frame_np))['output_0'].numpy()
            conf = detections[0, 0, :, 2]

            # Scale keypoints to original frame size. Rows are (y, x), so y
            # goes with the frame height and x with the width; the original
            # had these swapped. The model saw the frame centred on a square
            # canvas whose side is the longer frame edge, so scale by that
            # side and subtract the padding. A new array is built rather than
            # mutating the .numpy() result in place.
            frame_h, frame_w = frame.shape[:2]
            side = max(frame_h, frame_w)
            keypoints = detections[0, 0, :, :2] * side - [(side - frame_h) / 2, (side - frame_w) / 2]

            # Draw keypoints and connections on the frame
            draw_keypoints(frame, keypoints, conf)

            # Print frame index
            print(f"Frame index: {frame_index}")

            # Display the frame with keypoints
            cv2.imshow('Pose Estimation', frame)

            # Increment frame index
            frame_index += 1

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the video capture object and close all OpenCV windows
        cap.release()
        cv2.destroyAllWindows()

# Perform pose estimation on the input video. pose_estimate has no return
# statement: it shows the annotated frames in a window and prints frame indices.
vid_path = "ADL.mp4"  # Replace with the path to your input video
pose_estimate(vid_path)

2024-10-01 19:04:28.063306: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Frame index: 0
Frame index: 1
Frame index: 2
Frame index: 3
Frame index: 4
Frame index: 5
Frame index: 6
Frame index: 7
Frame index: 8
Frame index: 9
Frame index: 10
Frame index: 11
Frame index: 12
Frame index: 13
Frame index: 14
Frame index: 15
Frame index: 16
Frame index: 17
Frame index: 18
Frame index: 19
Frame index: 20
Frame index: 21
Frame index: 22
Frame index: 23
Frame index: 24
Frame index: 25
Frame index: 26
Frame index: 27
Frame index: 28
Frame index: 29
Frame index: 30
Frame index: 31
Frame index: 32
Frame index: 33
Frame index: 34
Frame index: 35
Frame index: 36
Frame index: 37
Frame index: 38
Frame index: 39
Frame index: 40
Frame index: 41
Frame index: 42
Frame index: 43
Frame index: 44
Frame index: 45
Frame index: 46
Frame index: 47
Frame index: 48
Frame index: 49
Frame index: 50
Frame index: 51
Frame index: 52
Frame index: 53
Frame index: 54
Frame index: 55
Frame index: 56
Frame index: 57
Frame index: 58
Frame index: 59
Frame index: 60
Frame index: 61
Frame index: 62
Fr

: 

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
from IPython.display import display, Image
from tqdm.notebook import tqdm

# Confidence score to determine whether a keypoint prediction is reliable.
# Score threshold used by the crop helpers below: torso_visible requires a
# score strictly above it, and determine_torso_and_body_range skips joints
# whose score is below it.
MIN_CROP_KEYPOINT_SCORE = 0.2

# Define the keypoint dictionary: joint name -> index along the third axis of
# the keypoint array, as used in keypoints[0, 0, KEYPOINT_DICT[joint], ...].
KEYPOINT_DICT = {
    'nose': 0, 'left_eye': 1, 'right_eye': 2, 'left_ear': 3, 'right_ear': 4,
    'left_shoulder': 5, 'right_shoulder': 6, 'left_elbow': 7, 'right_elbow': 8,
    'left_wrist': 9, 'right_wrist': 10, 'left_hip': 11, 'right_hip': 12,
    'left_knee': 13, 'right_knee': 14, 'left_ankle': 15, 'right_ankle': 16
}

def init_crop_region(image_height, image_width):
    """Return the default square crop that contains the whole image.

    The box is expressed in coordinates normalized by the image size. Its
    longer normalized side equals the aspect ratio and its shorter side is
    1.0, so it extends past the image (negative minimum) along the shorter
    dimension and is centred on the image.

    Args:
        image_height: Image height in pixels.
        image_width: Image width in pixels.

    Returns:
        Dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width'.
    """
    if image_width > image_height:
        # Landscape: the box overhangs above and below the image
        height, width = image_width / image_height, 1.0
        top = (image_height / 2 - image_width / 2) / image_height
        left = 0.0
    else:
        # Portrait or square: the box overhangs left and right (or not at all)
        height, width = 1.0, image_height / image_width
        top = 0.0
        left = (image_width / 2 - image_height / 2) / image_width

    return dict(
        y_min=top,
        x_min=left,
        y_max=top + height,
        x_max=left + width,
        height=height,
        width=width,
    )

def torso_visible(keypoints):
    """Tell whether at least one hip and at least one shoulder were detected.

    Args:
        keypoints: Array indexed as keypoints[0, 0, joint_index, 2] for the
            score of a joint.

    Returns:
        Truthy when some hip score AND some shoulder score are strictly above
        MIN_CROP_KEYPOINT_SCORE.
    """
    scores = keypoints[0, 0, :, 2]

    def _detected(joint):
        # Strictly-greater comparison against the crop threshold
        return scores[KEYPOINT_DICT[joint]] > MIN_CROP_KEYPOINT_SCORE

    hip_found = _detected('left_hip') or _detected('right_hip')
    shoulder_found = _detected('left_shoulder') or _detected('right_shoulder')
    return hip_found and shoulder_found

def determine_torso_and_body_range(keypoints, target_keypoints, center_y, center_x):
    """Measure how far the torso joints and the detected joints lie from a centre.

    Args:
        keypoints: Array indexed as keypoints[0, 0, joint_index, 2] for the
            score of a joint.
        target_keypoints: Mapping joint name -> [y, x] position.
        center_y: Vertical coordinate of the reference centre.
        center_x: Horizontal coordinate of the reference centre.

    Returns:
        [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]:
        the largest absolute vertical/horizontal offsets from the centre, first
        over the four torso joints (regardless of score), then over every joint
        whose score is not below MIN_CROP_KEYPOINT_SCORE. Each is at least 0.0.
    """
    def _offsets(joint):
        # (vertical, horizontal) absolute distance of one joint from the centre
        position = target_keypoints[joint]
        return abs(center_y - position[0]), abs(center_x - position[1])

    torso_offsets = [
        _offsets(joint)
        for joint in ('left_shoulder', 'right_shoulder', 'left_hip', 'right_hip')
    ]
    max_torso_yrange = max([0.0] + [dy for dy, _ in torso_offsets])
    max_torso_xrange = max([0.0] + [dx for _, dx in torso_offsets])

    # Only joints the model is reasonably sure about widen the body range
    body_offsets = [
        _offsets(joint)
        for joint, index in KEYPOINT_DICT.items()
        if not keypoints[0, 0, index, 2] < MIN_CROP_KEYPOINT_SCORE
    ]
    max_body_yrange = max([0.0] + [dy for dy, _ in body_offsets])
    max_body_xrange = max([0.0] + [dx for _, dx in body_offsets])

    return [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]

def determine_crop_region(keypoints, image_height, image_width):
    """Choose the square region to crop for the next inference.

    When the torso is visible, the crop is centred on the midpoint of the two
    hips and sized from the torso/body extents returned by
    determine_torso_and_body_range. If the torso is not visible, or the
    computed half-length exceeds half of the longer image side, the default
    region from init_crop_region is returned.

    Args:
        keypoints: Array indexed as keypoints[0, 0, joint_index, k], with
            k = 0/1 multiplied here by image height/width and k = 2 the score.
        image_height: Image height in pixels.
        image_width: Image width in pixels.

    Returns:
        Dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width',
        normalized by the image size.
    """
    # Joint positions in pixel units, stored as [y, x]
    target_keypoints = {
        joint: [
            keypoints[0, 0, index, 0] * image_height,
            keypoints[0, 0, index, 1] * image_width,
        ]
        for joint, index in KEYPOINT_DICT.items()
    }

    if not torso_visible(keypoints):
        return init_crop_region(image_height, image_width)

    left_hip, right_hip = target_keypoints['left_hip'], target_keypoints['right_hip']
    center_y = (left_hip[0] + right_hip[0]) / 2
    center_x = (left_hip[1] + right_hip[1]) / 2

    torso_y, torso_x, body_y, body_x = determine_torso_and_body_range(
        keypoints, target_keypoints, center_y, center_x)

    # Pad the torso extent by 1.9x and the body extent by 1.2x, take the largest
    crop_length_half = np.amax([torso_x * 1.9, torso_y * 1.9, body_y * 1.2, body_x * 1.2])

    # Cap at the distance from the centre to the farthest image edge
    edge_distances = np.array([center_x, image_width - center_x, center_y, image_height - center_y])
    crop_length_half = np.amin([crop_length_half, np.amax(edge_distances)])

    if crop_length_half > max(image_width, image_height) / 2:
        return init_crop_region(image_height, image_width)

    top = center_y - crop_length_half
    left = center_x - crop_length_half
    crop_length = crop_length_half * 2

    y_min = top / image_height
    x_min = left / image_width
    y_max = (top + crop_length) / image_height
    x_max = (left + crop_length) / image_width
    return {
        'y_min': y_min,
        'x_min': x_min,
        'y_max': y_max,
        'x_max': x_max,
        'height': y_max - y_min,
        'width': x_max - x_min,
    }

def crop_and_resize(image, crop_region, crop_size):
    """Crop ``image`` to ``crop_region`` and resize the crop to ``crop_size``.

    Args:
        image: Batched image tensor; the single box is taken from batch entry 0.
        crop_region: Dict providing 'y_min', 'x_min', 'y_max', 'x_max'.
        crop_size: Output size passed through to tf.image.crop_and_resize.

    Returns:
        The tensor returned by tf.image.crop_and_resize.
    """
    box = [crop_region[edge] for edge in ('y_min', 'x_min', 'y_max', 'x_max')]
    return tf.image.crop_and_resize(image, box_indices=[0], boxes=[box], crop_size=crop_size)

def run_inference(movenet, image, crop_region, crop_size):
    """Run the model on a crop of ``image`` and map keypoints back to the image.

    Args:
        movenet: Either a loaded TF Hub module (anything exposing
            ``.signatures``, as returned by ``hub.load``) or a plain callable
            that takes the cropped batch and returns the keypoint array.
        image: Unbatched image; its first two shape entries are read as
            height and width.
        crop_region: Dict with 'y_min', 'x_min', 'height', 'width' (plus the
            keys crop_and_resize needs), normalized by the image size.
        crop_size: Model input size, e.g. [256, 256].

    Returns:
        A writable NumPy array indexed as [0, 0, keypoint, (y, x, score)],
        with y and x expressed in coordinates normalized by the full image
        rather than by the crop.
    """
    image_height, image_width, _ = image.shape
    input_image = crop_and_resize(tf.expand_dims(image, axis=0), crop_region, crop_size=crop_size)

    # Run model inference. A hub-loaded module is driven the same way as in
    # the other cells of this notebook: int32 input through the
    # "serving_default" signature, reading 'output_0'. A plain callable is
    # still called directly, as before.
    if hasattr(movenet, "signatures"):
        outputs = movenet.signatures["serving_default"](tf.cast(input_image, dtype=tf.int32))
        raw_keypoints = outputs['output_0']
    else:
        raw_keypoints = movenet(input_image)

    # Tensors do not support item assignment; work on a writable NumPy copy.
    keypoints_with_scores = np.array(raw_keypoints)

    # Update the coordinates: crop-relative -> image-relative. The keypoint
    # count is taken from the array instead of a hard-coded 17.
    for idx in range(keypoints_with_scores.shape[2]):
        keypoints_with_scores[0, 0, idx, 0] = (
            crop_region['y_min'] * image_height +
            crop_region['height'] * image_height *
            keypoints_with_scores[0, 0, idx, 0]) / image_height
        keypoints_with_scores[0, 0, idx, 1] = (
            crop_region['x_min'] * image_width +
            crop_region['width'] * image_width *
            keypoints_with_scores[0, 0, idx, 1]) / image_width
    return keypoints_with_scores

def draw_keypoints(frame, keypoints, conf, threshold=0.5):
    """Overlay confident keypoints and skeleton edges on ``frame`` in place.

    Args:
        frame: Image array to draw on.
        keypoints: Sequence of 2-element rows in pixel units. The SECOND
            element is used as the horizontal position and the FIRST as the
            vertical position, i.e. rows are (y, x).
        conf: Per-keypoint confidence, indexed like ``keypoints``.
        threshold: A keypoint is drawn only if its confidence exceeds this;
            an edge is drawn only if both of its endpoints do.
    """
    green = (0, 255, 0)

    # Pairs of keypoint indices joined by a line
    skeleton_edges = (
        (0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (0, 6), (5, 6), (5, 7), (6, 8),
        (7, 9), (8, 10), (5, 11), (6, 12), (11, 12), (11, 13), (12, 14),
        (13, 15), (14, 16),
    )

    def _point(index):
        # OpenCV wants (horizontal, vertical), so swap the stored pair
        return (int(keypoints[index][1]), int(keypoints[index][0]))

    # Filled dot for every confident keypoint
    for index, (row_px, col_px) in enumerate(keypoints):
        if conf[index] > threshold:
            cv2.circle(frame, (int(col_px), int(row_px)), 3, green, -1)

    # Line for every edge whose two endpoints are confident
    for edge_start, edge_end in skeleton_edges:
        if conf[edge_start] > threshold and conf[edge_end] > threshold:
            cv2.line(frame, _point(edge_start), _point(edge_end), green, 2)

def draw_prediction_on_image(image, keypoints_with_scores, crop_region=None, close_figure=True, output_image_height=None):
    """Draws the keypoints and edges on the image.

    `keypoints_with_scores[0, 0, :, :2]` holds coordinates normalised to the
    full image (first value along the height, second along the width), while
    `draw_keypoints` truncates its inputs with `int()` and uses them directly
    as pixels. The coordinates are therefore scaled by the image height and
    width here; without that every keypoint collapses to pixel (0, 0).

    Args:
        image: Array-like image with `shape[:2] == (height, width)`; it is
            drawn on in place.
        keypoints_with_scores: Array indexable as `[0, 0, keypoint, 3]`, with
            the confidence stored at index 2 of the last axis.
        crop_region: Unused; kept so existing callers keep working.
        close_figure: Unused; kept so existing callers keep working.
        output_image_height: Unused; kept so existing callers keep working.

    Returns:
        The same `image` object that was passed in.
    """
    image_height, image_width = image.shape[:2]
    keypoints_with_scores = np.asarray(keypoints_with_scores)
    # Normalised (row, col) -> pixel (row, col).
    keypoints = keypoints_with_scores[0, 0, :, :2] * np.array(
        [image_height, image_width])
    conf = keypoints_with_scores[0, 0, :, 2]
    draw_keypoints(image, keypoints, conf)
    return image

def to_gif(images, duration=100):
    """Write `images` as an animated GIF named 'output.gif'.

    Args:
        images: Frames, forwarded unchanged to `imageio.mimsave`.
        duration: Forwarded unchanged as the `duration` keyword of
            `imageio.mimsave`.
    """
    # Imported lazily, so imageio is only required when a GIF is written.
    from imageio import mimsave

    gif_path = 'output.gif'
    mimsave(gif_path, images, duration=duration)

# Load the MoveNet Thunder model from TensorFlow Hub
movenet = hub.load("https://tfhub.dev/google/movenet/singlepose/thunder/4")

# Load the input video using OpenCV
video_path = "ADL.mp4"  # Replace with the path to your input video
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail here with a clear message instead of later, on an empty frame list.
    raise FileNotFoundError(f"Could not open video: {video_path}")

# Set a threshold for average confidence
POSE_CONFIDENCE_THRESHOLD = 0.25

output_images = []

try:
    # Get video properties
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    image_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    image_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

    crop_region = init_crop_region(image_height, image_width)

    with tqdm(total=num_frames) as bar:
        for frame_idx in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes to BGR; the model input and the GIF writer are
            # both fed the RGB version.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            keypoints_with_scores = run_inference(
                movenet, frame_rgb, crop_region,
                crop_size=[256, 256])

            # Draw on the RGB frame (inference is already done) so the saved
            # GIF does not end up with red and blue swapped.
            output_images.append(draw_prediction_on_image(
                frame_rgb, keypoints_with_scores, crop_region=None,
                close_figure=True, output_image_height=300))
            crop_region = determine_crop_region(
                keypoints_with_scores, image_height, image_width)
            bar.update(1)
            print('Frame: ', frame_idx)
            print(keypoints_with_scores)

            # Same data with the first two values of the last axis swapped,
            # i.e. (x, y, score) instead of (y, x, score).
            keypoints_with_scores_xy = keypoints_with_scores[..., [1, 0, 2]]

            average_confidence = np.mean(keypoints_with_scores[:, :, :, 2])  # Calculate average confidence
            if average_confidence < POSE_CONFIDENCE_THRESHOLD:
                # NOTE: this only affects the variable for the remainder of
                # this iteration; the frame was already drawn and the crop
                # region already updated above.
                keypoints_with_scores = np.zeros_like(keypoints_with_scores)  # Discard detection
                print("No person detected in this frame.")
            else:
                print('X and Y', keypoints_with_scores_xy)
finally:
    # Release the video capture object even if a frame fails mid-way
    cap.release()

if not output_images:
    raise RuntimeError(f"No frames could be read from {video_path}; nothing to visualize.")

# Prepare gif visualization.
output = np.stack(output_images, axis=0)
to_gif(output, duration=100)

# Display the GIF
with open("output.gif", "rb") as f:
    # The data is a GIF, so declare it as one rather than as PNG.
    display(Image(data=f.read(), format='gif'))

2024-10-01 18:20:29.329217: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-10-01 18:20:29.329243: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-01 18:20:29.329246: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-01 18:20:29.329261: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-01 18:20:29.329271: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

# YOLOv8 Pose Estimation

## YOLOv8 pose estimation on still image

In [None]:
# Install the Ultralytics package via pip, which allows downloading the latest pre-trained models
%pip install ultralytics

In [None]:
import ultralytics
from ultralytics import YOLO

# Pose-estimation weights; other variants (e.g. yolov8m-pose) can be used as well.
model = YOLO('yolov8n-pose.pt')

# Image to analyse -- replace with your own image path.
image_pathYOLO = '/content/frame1.jpeg'

# Inference on the image; the call yields an iterable of result objects.
results = model(image_pathYOLO)

# Print the keypoints of every person found in every result.
for result in results:
    person_keypoints = result.keypoints
    for kp in person_keypoints:
        # kp.xyn contains (x normalized, y normalized) for each keypoint
        print(f"Keypoints for a person: {kp.xyn}")


## YOLOv8 pose estimation on video

In [None]:
%pip install ultralytics

In [1]:
from ultralytics import YOLO
import cv2
import pandas as pd


def pose_estimate(video_path):
  """Run YOLOv8 pose estimation on every frame of a video and print keypoints.

  For each frame the frame index is printed, followed by the normalized
  keypoints (`keypoints.xyn`) and confidences of the first entry of each
  result.

  Args:
    video_path: Path handed to `cv2.VideoCapture`.

  Returns:
    The model output for the last frame that was read, or None when no frame
    could be read at all.
  """
  # Load the YOLOv8 pose estimation model
  model = YOLO('yolov8n-pose.pt')

  # Open the video file
  cap = cv2.VideoCapture(video_path)

  # Stays None when the video yields no frames, instead of being unbound.
  results = None
  frame_index = 0

  try:
    while cap.isOpened():
      success, frame = cap.read()
      if not success:
        # End of the video (or a read error).
        break

      # Run YOLOv8 inference on the frame
      results = model(frame)
      print(f"Frame index: {frame_index}")

      for result in results:
        keypoints = result.keypoints.xyn
        conf = result.keypoints.conf
        if len(keypoints) == 0:
          continue
        for i in range(len(keypoints[0])):
          # `conf` may be None; print None rather than failing on conf[0].
          confidence = conf[0][i] if conf is not None else None
          print(f"Keypoint {i}: {keypoints[0][i]}, Confidence: {confidence}")

      frame_index += 1
  finally:
    # Previously placed after the return statement and therefore never run.
    cap.release()

  return results

# NOTE(review): absolute path on one specific machine -- adjust before running elsewhere.
vid_path = "/Users/nick/Documents/GitHub/MoveNet/ADL.mp4"

# pose_estimate (as defined earlier in this cell) returns its `results` variable,
# so `keypoints` receives that object rather than a table of keypoints.
keypoints = pose_estimate(vid_path)



0: 384x640 (no detections), 81.6ms
Speed: 11.3ms preprocess, 81.6ms inference, 5.1ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 0

0: 384x640 (no detections), 57.4ms
Speed: 3.0ms preprocess, 57.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 1

0: 384x640 (no detections), 70.7ms
Speed: 1.5ms preprocess, 70.7ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 2

0: 384x640 (no detections), 122.1ms
Speed: 2.3ms preprocess, 122.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 3

0: 384x640 (no detections), 141.9ms
Speed: 2.8ms preprocess, 141.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 4

0: 384x640 (no detections), 63.3ms
Speed: 1.5ms preprocess, 63.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 5

0: 384x640 (no detections), 70.3ms
Speed: 1.8ms preprocess, 70.3ms inference, 0.9ms postprocess per image 

In [34]:
from ultralytics import YOLO
import cv2
import pandas as pd

# Define a dictionary that maps keypoint indices to their names
# Keypoint names listed in index order; position in the tuple == keypoint index.
keypoints_dict = dict(enumerate((
    'Nose',
    'Left Eye', 'Right Eye',
    'Left Ear', 'Right Ear',
    'Left Shoulder', 'Right Shoulder',
    'Left Elbow', 'Right Elbow',
    'Left Wrist', 'Right Wrist',
    'Left Hip', 'Right Hip',
    'Left Knee', 'Right Knee',
    'Left Ankle', 'Right Ankle',
)))


def draw_keypoints(frame, keypoints, conf, threshold=0.5):
    """Draw confident keypoints and skeleton edges on `frame`, in place.

    Keypoints are (x, y) pairs that are multiplied by the frame width
    (`frame.shape[1]`) and height (`frame.shape[0]`) respectively and then
    truncated with `int()` to obtain pixel positions.

    Args:
        frame: Image exposing `.shape`; passed to `cv2.circle` / `cv2.line`.
        keypoints: Sequence of 2-element (x, y) pairs, indexable by keypoint
            index.
        conf: Per-keypoint confidence, indexable by keypoint index.
        threshold: A keypoint is drawn only if its confidence is strictly
            greater than this value; an edge needs both of its ends to pass.
    """
    green = (0, 255, 0)

    # Pairs of keypoint indices that are joined by a line.
    skeleton_edges = [
        (0, 1), (0, 2), (1, 3), (2, 4),
        (0, 5), (0, 6), (5, 6),
        (5, 7), (6, 8), (7, 9), (8, 10),
        (5, 11), (6, 12), (11, 12),
        (11, 13), (12, 14), (13, 15), (14, 16),
    ]

    def to_pixel(x_value, y_value):
        # The frame size is read only when something is actually drawn.
        return (int(x_value * frame.shape[1]), int(y_value * frame.shape[0]))

    # Joints: one filled dot per confident keypoint.
    for index, (x_value, y_value) in enumerate(keypoints):
        if conf[index] > threshold:
            cv2.circle(frame, to_pixel(x_value, y_value), 3, green, -1)

    # Limbs: one segment per edge whose two ends are both confident.
    for first, second in skeleton_edges:
        if not (conf[first] > threshold and conf[second] > threshold):
            continue
        tail = to_pixel(keypoints[first][0], keypoints[first][1])
        head = to_pixel(keypoints[second][0], keypoints[second][1])
        cv2.line(frame, tail, head, green, 2)

In [2]:
def pose_estimate(video_path):
    """Run YOLOv8 pose estimation on a video, show it, and tabulate keypoints.

    Every frame is passed through the model; when confidences are available
    the first entry's keypoints are drawn on the frame with `draw_keypoints`.
    The frame is shown in an OpenCV window, and pressing 'q' stops early.

    Args:
        video_path: Path handed to `cv2.VideoCapture`.

    Returns:
        A pandas DataFrame with one row per result per frame and the columns
        '<name>_X' / '<name>_Y' for every keypoint, where names come from
        `keypoints_dict` (falling back to 'Keypoint_<i>'). Values are the
        normalized coordinates from `keypoints.xyn`. A result with no
        keypoints contributes an empty row.
    """
    # Load the YOLOv8 pose estimation model
    model = YOLO('yolov8n-pose.pt')

    # Open the video file
    cap = cv2.VideoCapture(video_path)

    frame_index = 0
    all_keypoints = []

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                # End of the video (or a read error).
                break

            # Run YOLOv8 inference on the frame
            results = model(frame)
            print(f"Frame index: {frame_index}")

            for result in results:
                keypoints = result.keypoints.xyn[0].tolist()  # Convert tensor to list
                conf = result.keypoints.conf
                if conf is not None:
                    conf = conf[0].tolist()  # Convert to list
                    draw_keypoints(frame, keypoints, conf)

                # One flat record per result: {'Nose_X': ..., 'Nose_Y': ..., ...}
                keypoints_dict_frame = {}
                for i, (x, y) in enumerate(keypoints):
                    keypoint_name = keypoints_dict.get(i, f'Keypoint_{i}')
                    keypoints_dict_frame[f'{keypoint_name}_X'] = x
                    keypoints_dict_frame[f'{keypoint_name}_Y'] = y
                all_keypoints.append(keypoints_dict_frame)

            frame_index += 1

            # Display the frame with keypoints
            cv2.imshow('Pose Estimation', frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture and close the windows, including when
        # inference or drawing raises part-way through the video.
        cap.release()
        cv2.destroyAllWindows()

    # Convert the list of keypoints to a DataFrame
    df = pd.DataFrame(all_keypoints)
    return df

# NOTE(review): absolute path on one specific machine -- adjust before running elsewhere.
vid_path = "/Users/nick/Documents/GitHub/MoveNet/ADL.mp4"

In [3]:
# Run the DataFrame-returning pose_estimate on the video; `YOLOv8` is the
# resulting table of per-frame keypoint coordinates.
# NOTE(review): pose_estimate calls draw_keypoints, so the cell defining it must be run first.
YOLOv8 = pose_estimate(vid_path)
print(YOLOv8)


0: 384x640 (no detections), 44.5ms
Speed: 1.6ms preprocess, 44.5ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 0

0: 384x640 (no detections), 40.3ms
Speed: 1.6ms preprocess, 40.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 1

0: 384x640 (no detections), 42.9ms
Speed: 1.9ms preprocess, 42.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 2

0: 384x640 (no detections), 44.7ms
Speed: 1.5ms preprocess, 44.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 3

0: 384x640 (no detections), 41.7ms
Speed: 1.5ms preprocess, 41.7ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 4

0: 384x640 (no detections), 37.1ms
Speed: 1.6ms preprocess, 37.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)
Frame index: 5

0: 384x640 (no detections), 43.5ms
Speed: 1.6ms preprocess, 43.5ms inference, 0.2ms postprocess per image at sh

NameError: name 'draw_keypoints' is not defined

: 