# MoveNet Pose Estimation Adaptation with Visualisation

## Original MoveNet with Video

Custom implementation of MoveNet, adapted from the original MoveNet codebase.

Refactored the helper functions and added functions for DataFrame conversion.

Restructured inference functions to load video files.

### Preparing functions

#### Install required packages

In [3]:
%pip install opencv-python
%pip install tensorflow
%pip install tensorflow-hub
%pip install numpy
%pip install pandas


You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.

#### Import required packages

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2
import pandas as pd

# Import matplotlib libraries
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
import matplotlib.patches as patches

#### Helper functions

In [5]:
# Maps each joint name to its keypoint index. The names are listed in index
# order, so a joint's position in the tuple is its index.
KEYPOINT_DICT = {
    joint_name: joint_index
    for joint_index, joint_name in enumerate((
        'nose',
        'left_eye', 'right_eye',
        'left_ear', 'right_ear',
        'left_shoulder', 'right_shoulder',
        'left_elbow', 'right_elbow',
        'left_wrist', 'right_wrist',
        'left_hip', 'right_hip',
        'left_knee', 'right_knee',
        'left_ankle', 'right_ankle',
    ))
}


# Maps each skeleton edge, given as a pair of keypoint indices, to the colour
# code used when the edge is drawn: 'm' for edges ending on a left-side joint,
# 'c' for edges ending on a right-side joint, 'y' for the two edges that join
# the left and right sides (shoulders, hips).
KEYPOINT_EDGE_INDS_TO_COLOR = dict((
    ((0, 1), 'm'),
    ((0, 2), 'c'),
    ((1, 3), 'm'),
    ((2, 4), 'c'),
    ((0, 5), 'm'),
    ((0, 6), 'c'),
    ((5, 7), 'm'),
    ((7, 9), 'm'),
    ((6, 8), 'c'),
    ((8, 10), 'c'),
    ((5, 6), 'y'),
    ((5, 11), 'm'),
    ((6, 12), 'c'),
    ((11, 12), 'y'),
    ((11, 13), 'm'),
    ((13, 15), 'm'),
    ((12, 14), 'c'),
    ((14, 16), 'c'),
))

def _keypoints_and_edges_for_display(keypoints_with_scores,
                                     height,
                                     width,
                                     keypoint_threshold=0.3):
  """Returns high confidence keypoints and edges for visualization.

  Args:
    keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
      the keypoint coordinates and scores returned from the MoveNet model.
    height: height of the image in pixels.
    width: width of the image in pixels.
    keypoint_threshold: minimum confidence score for a keypoint to be
      visualized.

  Returns:
    A (keypoints_xy, edges_xy, edge_colors) containing:
      * the coordinates of all keypoints of all detected entities;
      * the coordinates of all skeleton edges of all detected entities;
      * the colors in which the edges should be plotted.
  """
  keypoints_all = []
  keypoint_edges_all = []
  edge_colors = []
  num_instances, _, _, _ = keypoints_with_scores.shape
  for idx in range(num_instances):
    kpts_x = keypoints_with_scores[0, idx, :, 1]
    kpts_y = keypoints_with_scores[0, idx, :, 0]
    kpts_scores = keypoints_with_scores[0, idx, :, 2]
    kpts_absolute_xy = np.stack(
        [width * np.array(kpts_x), height * np.array(kpts_y)], axis=-1)
    kpts_above_thresh_absolute = kpts_absolute_xy[
        kpts_scores > keypoint_threshold, :]
    keypoints_all.append(kpts_above_thresh_absolute)

    for edge_pair, color in KEYPOINT_EDGE_INDS_TO_COLOR.items():
      if (kpts_scores[edge_pair[0]] > keypoint_threshold and
          kpts_scores[edge_pair[1]] > keypoint_threshold):
        x_start = kpts_absolute_xy[edge_pair[0], 0]
        y_start = kpts_absolute_xy[edge_pair[0], 1]
        x_end = kpts_absolute_xy[edge_pair[1], 0]
        y_end = kpts_absolute_xy[edge_pair[1], 1]
        line_seg = np.array([[x_start, y_start], [x_end, y_end]])
        keypoint_edges_all.append(line_seg)
        edge_colors.append(color)
  if keypoints_all:
    keypoints_xy = np.concatenate(keypoints_all, axis=0)
  else:
    keypoints_xy = np.zeros((0, 17, 2))

  if keypoint_edges_all:
    edges_xy = np.stack(keypoint_edges_all, axis=0)
  else:
    edges_xy = np.zeros((0, 2, 2))
  return keypoints_xy, edges_xy, edge_colors

def draw_prediction_on_image(
    image, keypoints_with_scores, crop_region=None, close_figure=False,
    output_image_height=None):
  """Draws the keypoint predictions on image.

  The image is rendered on a matplotlib figure, the confident keypoints and
  skeleton edges are overlaid, and the rendered canvas is read back as an
  array.

  Args:
    image: A numpy array with shape [height, width, channel] representing the
      pixel values of the input image.
    keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
      the keypoint coordinates and scores returned from the MoveNet model.
    crop_region: Accepted for interface compatibility only. The crop-box
      overlay is disabled, so this argument is currently ignored.
    close_figure: Accepted for interface compatibility only. The figure is
      always closed before returning, whatever value is passed.
    output_image_height: An integer indicating the height of the output image.
      Note that the image aspect ratio will be the same as the input image.
      When None the rendered canvas is returned at its own size.

  Returns:
    A uint8 numpy array with shape [out_height, out_width, 4] (RGBA)
    representing the image overlaid with keypoint predictions. The array owns
    its data and can be modified by the caller.
  """
  height, width, _ = image.shape
  aspect_ratio = float(width) / height
  fig, ax = plt.subplots(figsize=(12 * aspect_ratio, 12))
  try:
    # To remove the huge white borders
    fig.tight_layout(pad=0)
    ax.margins(0)
    # Turn off tick labels
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    plt.axis('off')

    ax.imshow(image)
    line_segments = LineCollection([], linewidths=(4), linestyle='solid')
    ax.add_collection(line_segments)
    scat = ax.scatter([], [], s=60, color='#FF1493', zorder=3)

    (keypoint_locs, keypoint_edges,
     edge_colors) = _keypoints_and_edges_for_display(
         keypoints_with_scores, height, width)

    line_segments.set_segments(keypoint_edges)
    line_segments.set_color(edge_colors)
    if keypoint_locs.shape[0]:
      scat.set_offsets(keypoint_locs)

    fig.canvas.draw()
    canvas_width, canvas_height = fig.canvas.get_width_height()
    # np.frombuffer only wraps the canvas buffer; copy it so the result does
    # not keep referring to the buffer of a figure that is about to be closed.
    image_from_plot = np.frombuffer(
        fig.canvas.buffer_rgba(), dtype=np.uint8
    ).reshape((canvas_height, canvas_width, 4)).copy()
  finally:
    # Always release the figure, even when rendering fails, so that calling
    # this once per video frame cannot pile up open figures.
    plt.close(fig)

  if output_image_height is not None:
    output_image_width = int(output_image_height / height * width)
    image_from_plot = cv2.resize(
        image_from_plot, dsize=(output_image_width, output_image_height),
        interpolation=cv2.INTER_CUBIC)
  return image_from_plot

#### Loading MoveNet - Single Pose Thunder 4

In [6]:
# Side length, in pixels, of the square image fed to the model. Defined
# outside the try block so it exists even when the download fails.
input_size = 256

# Load the MoveNet model from TensorFlow Hub.
# `module` stays None when loading fails, which lets movenet() report the real
# cause instead of failing later with a NameError.
module = None
try:
    module = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")
except Exception as error:
    print(f"Failed to load the model: {error}")

def movenet(input_image):
    """Runs detection on an input image.

    Args:
        input_image: A [1, height, width, 3] tensor represents the input image
        pixels. Note that the height/width should already be resized and match the
        expected input resolution of the model before passing into this function.

    Returns:
        A [1, 1, 17, 3] float numpy array representing the predicted keypoint
        coordinates and scores.

    Raises:
        RuntimeError: If the model could not be loaded when this cell ran.
    """
    if module is None:
        raise RuntimeError(
            "The MoveNet model is not loaded; re-run the model loading step "
            "and check the reported error.")

    model = module.signatures['serving_default']

    # SavedModel format expects tensor type of int32.
    input_image = tf.cast(input_image, dtype=tf.int32)
    # Run model inference.
    outputs = model(input_image)
    # Output is a [1, 1, 17, 3] tensor.
    keypoints_with_scores = outputs['output_0'].numpy()

    return keypoints_with_scores

2024-10-03 16:15:34.822508: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-10-03 16:15:34.822535: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-03 16:15:34.822544: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-03 16:15:34.822560: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-03 16:15:34.822570: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Video (Image Sequence) Example

#### Cropping Algorithm

In [7]:
# Confidence score to determine whether a keypoint prediction is reliable.
# Keypoint scores are compared against this value when deciding whether the
# torso is visible and which keypoints may contribute to the crop size.
MIN_CROP_KEYPOINT_SCORE = 0.2

def init_crop_region(image_height, image_width):
  """Builds the fallback crop region covering the whole image.

  The shorter image side is padded equally on both ends so that the region is
  square in pixels. It is used whenever a crop cannot be derived reliably from
  the previous frame.

  Args:
    image_height: height of the image in pixels.
    image_width: width of the image in pixels.

  Returns:
    A dict with the keys 'y_min', 'x_min', 'y_max', 'x_max', 'height' and
    'width', all expressed as fractions of the image size. The minimum along
    the padded axis is negative and the maximum exceeds 1.
  """
  if image_width > image_height:
    # Landscape: the full width is kept, the height is padded.
    region_height, region_width = image_width / image_height, 1.0
    top = (image_height / 2 - image_width / 2) / image_height
    left = 0.0
  else:
    # Portrait or square: the full height is kept, the width is padded.
    region_height, region_width = 1.0, image_height / image_width
    top = 0.0
    left = (image_width / 2 - image_height / 2) / image_width

  region = {'y_min': top, 'x_min': left}
  region['y_max'] = top + region_height
  region['x_max'] = left + region_width
  region['height'] = region_height
  region['width'] = region_width
  return region

def torso_visible(keypoints):
  """Tells whether enough of the torso was detected to place a crop.

  The torso counts as visible when at least one hip and at least one shoulder
  have a score above MIN_CROP_KEYPOINT_SCORE.

  Args:
    keypoints: array indexed as [0, 0, keypoint_index, 2] to read each
      keypoint's score.

  Returns:
    A truthy value when the torso is visible, a falsy one otherwise.
  """
  def confident(joint):
    return keypoints[0, 0, KEYPOINT_DICT[joint], 2] > MIN_CROP_KEYPOINT_SCORE

  hip_found = confident('left_hip') or confident('right_hip')
  shoulder_found = confident('left_shoulder') or confident('right_shoulder')
  return hip_found and shoulder_found

def determine_torso_and_body_range(
    keypoints, target_keypoints, center_y, center_x):
  """Measures how far the torso and the whole body reach from a center point.

  Two groups of joints are measured: the four torso joints (both shoulders
  and both hips), always; and every joint of KEYPOINT_DICT whose score is not
  below MIN_CROP_KEYPOINT_SCORE. For each group the largest vertical and the
  largest horizontal distance to the center are reported. The caller uses
  them to size the crop region.

  Args:
    keypoints: array indexed as [0, 0, keypoint_index, 2] to read each
      keypoint's score.
    target_keypoints: dict mapping a joint name to its [y, x] position.
    center_y: y coordinate of the center point.
    center_x: x coordinate of the center point.

  Returns:
    The list [max_torso_yrange, max_torso_xrange, max_body_yrange,
    max_body_xrange]. A value is 0.0 when no joint of its group lies away
    from the center.
  """
  torso_y_reach = 0.0
  torso_x_reach = 0.0
  for name in ('left_shoulder', 'right_shoulder', 'left_hip', 'right_hip'):
    position = target_keypoints[name]
    torso_y_reach = max(torso_y_reach, abs(center_y - position[0]))
    torso_x_reach = max(torso_x_reach, abs(center_x - position[1]))

  body_y_reach = 0.0
  body_x_reach = 0.0
  for name, index in KEYPOINT_DICT.items():
    # Unreliable joints must not inflate the body extent.
    if keypoints[0, 0, index, 2] < MIN_CROP_KEYPOINT_SCORE:
      continue
    position = target_keypoints[name]
    body_y_reach = max(body_y_reach, abs(center_y - position[0]))
    body_x_reach = max(body_x_reach, abs(center_x - position[1]))

  return [torso_y_reach, torso_x_reach, body_y_reach, body_x_reach]

def determine_crop_region(
      keypoints, image_height,
      image_width):
  """Chooses the image region the model should look at for the next frame.

  The joints detected in the current frame are used to estimate a square
  region, centered on the midpoint of the two hips, that encloses the whole
  body. Its size is derived from the distances between the joints and that
  center. When the torso is not reliably detected, or the estimated square is
  larger than the image, the default region (the full image padded to a
  square) is returned instead.

  Args:
    keypoints: array indexed as [0, 0, keypoint_index, :] holding the
      normalized y, normalized x and score of each keypoint.
    image_height: height of the image in pixels.
    image_width: width of the image in pixels.

  Returns:
    A dict with the keys 'y_min', 'x_min', 'y_max', 'x_max', 'height' and
    'width', expressed as fractions of the image size.
  """
  # Joint positions in pixels, stored as [y, x].
  joints_px = {
      name: [keypoints[0, 0, index, 0] * image_height,
             keypoints[0, 0, index, 1] * image_width]
      for name, index in KEYPOINT_DICT.items()
  }

  if not torso_visible(keypoints):
    return init_crop_region(image_height, image_width)

  left_hip = joints_px['left_hip']
  right_hip = joints_px['right_hip']
  center_y = (left_hip[0] + right_hip[0]) / 2
  center_x = (left_hip[1] + right_hip[1]) / 2

  torso_y, torso_x, body_y, body_x = determine_torso_and_body_range(
      keypoints, joints_px, center_y, center_x)

  # The torso extent is enlarged more than the full-body extent; the larger
  # of the candidates wins.
  half_side = np.amax(
      [torso_x * 1.9, torso_y * 1.9, body_y * 1.2, body_x * 1.2])

  # Never reach further than the most distant image border.
  border_distances = np.array(
      [center_x, image_width - center_x, center_y, image_height - center_y])
  half_side = np.amin([half_side, np.amax(border_distances)])

  if half_side > max(image_width, image_height) / 2:
    return init_crop_region(image_height, image_width)

  top = center_y - half_side
  left = center_x - half_side
  side = half_side * 2
  y_min = top / image_height
  x_min = left / image_width
  y_max = (top + side) / image_height
  x_max = (left + side) / image_width
  return {
    'y_min': y_min,
    'x_min': x_min,
    'y_max': y_max,
    'x_max': x_max,
    'height': y_max - y_min,
    'width': x_max - x_min
  }

def crop_and_resize(image, crop_region, crop_size):
  """Cuts the crop region out of the image and scales it to the model input.

  Args:
    image: batched image tensor; the crop is taken from its first entry.
    crop_region: dict providing 'y_min', 'x_min', 'y_max' and 'x_max'.
    crop_size: size the crop is resized to, forwarded unchanged to
      tf.image.crop_and_resize.

  Returns:
    The tensor produced by tf.image.crop_and_resize.
  """
  box = [crop_region[edge] for edge in ('y_min', 'x_min', 'y_max', 'x_max')]
  return tf.image.crop_and_resize(
      image, box_indices=[0], boxes=[box], crop_size=crop_size)

def run_inference(movenet, image, crop_region, crop_size):
  """Runs the model on the cropped region of one image.

  The keypoints returned by the model are relative to the crop; they are
  converted, in place, to coordinates relative to the whole image.

  Args:
    movenet: callable taking the cropped, resized, batched image and
      returning the keypoints-with-scores array.
    image: a single image with shape [height, width, channels].
    crop_region: dict providing 'y_min', 'x_min', 'y_max', 'x_max', 'height'
      and 'width'.
    crop_size: size the crop is resized to before inference.

  Returns:
    The array returned by `movenet`, with the y and x of its 17 keypoints
    rewritten in whole-image coordinates.
  """
  image_height, image_width, _ = image.shape
  batched_image = tf.expand_dims(image, axis=0)
  # Run model inference.
  keypoints_with_scores = movenet(
      crop_and_resize(batched_image, crop_region, crop_size=crop_size))

  # Position and size of the crop, in pixels of the original image.
  y_offset = crop_region['y_min'] * image_height
  y_scale = crop_region['height'] * image_height
  x_offset = crop_region['x_min'] * image_width
  x_scale = crop_region['width'] * image_width

  # Update the coordinates.
  for joint in range(17):
    y_in_crop = keypoints_with_scores[0, 0, joint, 0]
    x_in_crop = keypoints_with_scores[0, 0, joint, 1]
    keypoints_with_scores[0, 0, joint, 0] = (
        y_offset + y_scale * y_in_crop) / image_height
    keypoints_with_scores[0, 0, joint, 1] = (
        x_offset + x_scale * x_in_crop) / image_width
  return keypoints_with_scores

#### Load the Video

In [21]:
# Load the input video.
# This cell leaves `video_path`, `cap` and `frames` (plus the loop variables
# `ret` and `frame`) as notebook-level names.
video_path = 'ADL.mp4'
cap = cv2.VideoCapture(video_path)

# NOTE(review): a failed open is only reported; execution continues.
# Presumably cap.read() then returns ret == False straight away, leaving
# `frames` empty - confirm.
if not cap.isOpened():
    print("Error: Could not open video.")

# Read the video frame by frame
frames = []
while True:
    ret, frame = cap.read()
    # ret is falsy once no further frame can be read.
    if not ret:
        break
    # Convert the frame from BGR to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frames.append(frame)
# Release the capture now that every frame is held in memory.
cap.release()

#### Convert the List of Frames to a 4-D Tensor

In [22]:
# Convert the frames to a 4-D tensor
# np.array stacks the list of frames into one array; this assumes every frame
# has the same shape, giving (num_frames, height, width, 3) - TODO confirm.
frames_np = np.array(frames)
# Hand the stacked frames to TensorFlow as unsigned 8-bit values.
image = tf.convert_to_tensor(frames_np, dtype=tf.uint8)

#### Adding bounding box

In [8]:
def add_bounding_box(image, keypoints_with_scores, threshold=0.3, margin=0.13):
    """
    Add a bounding box to the image based on the keypoints with an additional margin.

    The box encloses every keypoint whose score is above `threshold`, is
    enlarged by `margin` on each side and is clipped to the image. It is
    drawn directly on `image` (the array is modified in place) as a green
    outline 2 pixels thick.

    Args:
        image (numpy.ndarray): The image with keypoints, with shape
            [height, width, channels].
        keypoints_with_scores (numpy.ndarray): The keypoints with scores, read
            as keypoints_with_scores[0, 0, :, :] holding the normalized y,
            normalized x and score of each keypoint.
        threshold (float): The confidence threshold to consider a keypoint.
        margin (float): The margin to add around the bounding box as a percentage of the box dimensions.

    Returns:
        numpy.ndarray: The image with the bounding box. This is the same
        array object that was passed in; it is returned untouched when no
        keypoint is above the threshold.
    """
    # Extract keypoints
    keypoints = keypoints_with_scores[0, 0, :, :2]
    scores = keypoints_with_scores[0, 0, :, 2]

    # Filter keypoints based on the confidence threshold
    valid_keypoints = keypoints[scores > threshold]

    if valid_keypoints.size == 0:
        return image

    image_height, image_width = image.shape[0], image.shape[1]

    # Calculate the bounding box coordinates (keypoints are stored as y, x)
    # and convert them to integer pixel coordinates.
    x_min = int(np.min(valid_keypoints[:, 1]) * image_width)
    y_min = int(np.min(valid_keypoints[:, 0]) * image_height)
    x_max = int(np.max(valid_keypoints[:, 1]) * image_width)
    y_max = int(np.max(valid_keypoints[:, 0]) * image_height)

    # Add margin to the bounding box, keeping it inside the image
    margin_x = int(margin * (x_max - x_min))
    margin_y = int(margin * (y_max - y_min))
    x_min = max(0, x_min - margin_x)
    y_min = max(0, y_min - margin_y)
    x_max = min(image_width, x_max + margin_x)
    y_max = min(image_height, y_max + margin_y)

    # On a 4-channel (RGBA) image a 3-value colour leaves the alpha channel at
    # 0, i.e. a fully transparent box; give it an explicit opaque alpha.
    if image.ndim == 3 and image.shape[2] == 4:
        color = (0, 255, 0, 255)
    else:
        color = (0, 255, 0)

    # Draw the bounding box on the image
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

    return image

#### DataFrame

In [24]:
def keypoints_to_dataframe(keypoints_with_scores, frame_index):
    """
    Converts keypoints with scores to a single-row pandas DataFrame, with the
    x column placed before the y column for each keypoint and the eye and ear
    columns removed.

    Args:
      keypoints_with_scores (numpy.ndarray): A numpy array of shape
      (1, 1, 17, 3) containing keypoints and their scores. The first
      dimension is the batch size, the second dimension is the number
      of instances, the third dimension is the number of keypoints
      (17 for MoveNet), and the fourth dimension contains the
      coordinates in (y, x) order followed by the score.
      frame_index (int): The index of the frame. Currently unused; kept so
      that existing callers keep working.

    Returns:
      pandas.DataFrame: A one-row DataFrame containing the keypoints'
      coordinates with columns named after the keypoint names followed by
      '_X' and '_Y' for the x and y coordinates respectively, ordered
      such that the x-coordinate comes before the y-coordinate for each
      keypoint, and with eye and ear columns removed (13 keypoints, 26
      columns). Scores are not included.
    """
    keypoints = keypoints_with_scores[0, 0, :, :2]  # One (y, x) row per keypoint
    keypoint_names = [
        'Nose', 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear', 'Left Shoulder', 'Right Shoulder',
        'Left Elbow', 'Right Elbow', 'Left Wrist', 'Right Wrist', 'Left Hip', 'Right Hip',
        'Left Knee', 'Right Knee', 'Left Ankle', 'Right Ankle'
    ]
    excluded_keypoints = {'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear'}

    # The flattened array reads y0, x0, y1, x1, ... so the source columns
    # must be labelled Y first, then X.
    source_columns = []
    for name in keypoint_names:
        source_columns.append(f'{name}_Y')
        source_columns.append(f'{name}_X')
    df = pd.DataFrame([keypoints.flatten()], columns=source_columns)

    # Select the columns in the documented order: x before y, without the
    # eyes and ears.
    output_columns = []
    for name in keypoint_names:
        if name in excluded_keypoints:
            continue
        output_columns.append(f'{name}_X')
        output_columns.append(f'{name}_Y')

    return df[output_columns]

#### Pose Estimation Inference with Cropping Algorithm

In [12]:
import cv2

def load_video(video_path):
    """
    Read every frame of a video file into memory.

    Args:
      video_path (str): Path to the input video file.

    Returns:
      list: The frames in reading order, each converted from BGR to RGB, or
      None when the video could not be opened (an error is printed).
    """
    # Load the input video.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not load video.")
            return None

        # Read the video frame by frame
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Convert the frame from BGR to RGB
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the video capture object on every path, including a failed
        # open or an error while converting a frame.
        cap.release()

def infer_from_video(frames, movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region, confidence_threshold=0.28):
    """
    Perform inference on video frames using the MoveNet model and return keypoints as a DataFrame.

    Each frame is run through the model on the crop region derived from the
    previous frame, its low-confidence keypoints are masked, and the result
    is rendered with the skeleton and a bounding box. The rendered images are
    not kept or returned.

    Args:
      frames (list): The RGB frames of the video, all with the same shape
        [height, width, 3].
      movenet (function): Function running the MoveNet model on one image.
      input_size (int): The size of the input image for the model.
      init_crop_region (function): Function to initialize the crop region.
      run_inference (function): Function to run inference on a single frame.
      draw_prediction_on_image (function): Function to draw predictions on an image.
      determine_crop_region (function): Function to determine the crop region for the next frame.
      confidence_threshold (float): Keypoints scoring below this value have
        their coordinates replaced by -1.
    Returns:
      pd.DataFrame: DataFrame containing the keypoint coordinates, one row per
      frame. It is empty when `frames` is None or holds no frame.
    """
    if frames is None or len(frames) == 0:
        return pd.DataFrame()

    # Convert the frames to a 4-D tensor
    frames_np = np.array(frames)
    image = tf.convert_to_tensor(frames_np, dtype=tf.uint8)

    num_frames, image_height, image_width, _ = image.shape
    crop_region = init_crop_region(image_height, image_width)

    # Run the model on the video frames one by one
    keypoints_list = []
    for frame_idx in range(num_frames):
        frame = image[frame_idx, :, :, :]

        # Run inference on the frame
        keypoints_with_scores = run_inference(
            movenet, frame, crop_region,
            crop_size=[input_size, input_size])

        # Determine the crop region for the next frame from the untouched
        # model output: once the masking below has replaced coordinates with
        # -1 they can no longer be used to locate the body.
        next_crop_region = determine_crop_region(
            keypoints_with_scores, image_height, image_width)

        # Set coordinates with confidence scores below the threshold to -1
        low_confidence = keypoints_with_scores[0, 0, :, 2] < confidence_threshold
        keypoints_with_scores[0, 0, low_confidence, :2] = -1

        # Convert keypoints to DataFrame
        keypoints_list.append(
            keypoints_to_dataframe(keypoints_with_scores, frame_idx))

        # Draw predictions and the bounding box on the frame. The result is
        # deliberately not stored: it was never returned, and keeping one
        # rendered image per frame only consumed memory.
        output_image = draw_prediction_on_image(
            frame.numpy().astype(np.int32),
            keypoints_with_scores, crop_region=None,
            close_figure=True, output_image_height=300)
        add_bounding_box(output_image, keypoints_with_scores)

        crop_region = next_crop_region

    # Concatenate all keypoints DataFrames
    return pd.concat(keypoints_list, ignore_index=True)

In [20]:
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import time

def load_video(video_path):
    """
    Open a video file for frame-by-frame reading.

    Args:
        video_path (str): Path to the input video file.

    Returns:
        The opened cv2.VideoCapture, or None when the video could not be
        opened (an error is printed). The caller reads from the capture.
    """
    capture = cv2.VideoCapture(video_path)
    if capture.isOpened():
        return capture

    print("Error: Could not load video.")
    return None

def display_processed_frame(frame, delay):
    """
    Show a frame in the 'Processed Frame' window, then wait `delay` milliseconds.

    Args:
        frame (np.ndarray): The frame to display.
        delay: Time to pause after showing the frame, in milliseconds.

    Returns:
        bool: False when the 'q' key was pressed (stop displaying), True
        otherwise.
    """
    cv2.imshow('Processed Frame', frame)
    # time.sleep expects seconds.
    time.sleep(delay / 1000.0)

    # Poll the keyboard; only the low byte of the key code is compared.
    pressed_key = cv2.waitKey(1) & 0xFF
    return pressed_key != ord('q')

def infer_from_video_in_batches(video_path, movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region, confidence_threshold=0.28, batch_size=10):
    """
    Run MoveNet on a video read in batches of frames, displaying each result.

    Frames are read from the file and collected until `batch_size` frames are
    available or the video ends. Every frame of the batch is then run through
    the model on the crop region derived from the previous frame, its
    low-confidence keypoints are masked, and the frame is shown with the
    skeleton and a bounding box. Pressing 'q' in the display window stops the
    processing; the keypoints gathered so far are still returned.

    Args:
        video_path (str): Path to the input video file.
        movenet (function): Function running the MoveNet model on one image.
        input_size (int): The size of the input image for the model.
        init_crop_region (function): Function to initialize the crop region.
        run_inference (function): Function to run inference on a single frame.
        draw_prediction_on_image (function): Function to draw predictions on an image.
        determine_crop_region (function): Function to determine the crop region for the next frame.
        confidence_threshold (float): Keypoints scoring below this value have
            their coordinates replaced by -1.
        batch_size (int): Number of frames collected before they are processed.

    Returns:
        pd.DataFrame: One row per processed frame, holding 'batch_idx' (index
        of the batch), 'batch_frame' (first and last frame index of the batch,
        formatted "first-last") and the keypoint coordinate columns. The
        DataFrame is empty when no frame was processed. None is returned when
        the video could not be opened.
    """
    stream = load_video(video_path)
    if stream is None:
        return

    keypoints_list = []
    try:
        # Get video properties
        image_height = int(stream.get(cv2.CAP_PROP_FRAME_HEIGHT))
        image_width = int(stream.get(cv2.CAP_PROP_FRAME_WIDTH))
        fps = stream.get(cv2.CAP_PROP_FPS)
        # Delay in milliseconds for real-time display; no delay when the
        # frame rate is not reported.
        delay = int(1000 / fps) if fps > 0 else 0

        crop_region = init_crop_region(image_height, image_width)

        frames_batch = []
        frame_idx = 0
        batch_idx = 0
        keep_displaying = True

        while keep_displaying:
            ret, frame = stream.read()
            if ret:
                # Convert frame to RGB
                frames_batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                frame_idx += 1

            # Process the batch once it is full, or when the video ended with
            # frames still pending. The end is detected from the read itself
            # rather than from the reported frame count.
            if frames_batch and (len(frames_batch) == batch_size or not ret):
                # Convert frames batch to a tensor
                frames_tensor = tf.convert_to_tensor(
                    np.array(frames_batch), dtype=tf.uint8)
                batch_start = frame_idx - len(frames_batch)
                batch_label = f"{batch_start}-{frame_idx - 1}"

                # Process each frame in the batch
                for i, frame_rgb in enumerate(frames_batch):
                    # Run inference on the frame
                    keypoints_with_scores = run_inference(
                        movenet, frames_tensor[i], crop_region, crop_size=[input_size, input_size])

                    # Determine the crop region for the next frame from the
                    # untouched model output, before coordinates are masked.
                    next_crop_region = determine_crop_region(
                        keypoints_with_scores, image_height, image_width)

                    # Set coordinates with low confidence to -1
                    low_confidence = keypoints_with_scores[0, 0, :, 2] < confidence_threshold
                    keypoints_with_scores[0, 0, low_confidence, :2] = -1

                    # Convert keypoints to DataFrame
                    keypoints_df = keypoints_to_dataframe(keypoints_with_scores, batch_start + i)
                    keypoints_df.insert(0, 'batch_idx', batch_idx)
                    keypoints_df.insert(1, 'batch_frame', batch_label)
                    keypoints_list.append(keypoints_df)

                    # Draw predictions and add bounding box
                    output_image = draw_prediction_on_image(frame_rgb, keypoints_with_scores)
                    output_image_with_bbox = add_bounding_box(output_image, keypoints_with_scores)

                    # Convert frame back to BGR for display
                    output_image_with_bbox_bgr = cv2.cvtColor(output_image_with_bbox, cv2.COLOR_RGB2BGR)

                    crop_region = next_crop_region

                    # Display the frame using the separate display function
                    keep_displaying = display_processed_frame(output_image_with_bbox_bgr, delay)
                    if not keep_displaying:
                        # The outer loop tests keep_displaying as well, so
                        # this stops the whole run, not just the batch.
                        break

                # Clear the batch for the next set of frames
                frames_batch = []
                batch_idx += 1

            if not ret:
                break
    finally:
        # Release the video file on every path.
        stream.release()

    if not keypoints_list:
        return pd.DataFrame()

    # Concatenate all keypoints DataFrames
    return pd.concat(keypoints_list, ignore_index=True)

In [29]:
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import time

def load_video(video_path):
    """
    Open a video file so that its frames can be read one by one.

    Args:
        video_path (str): Path to the input video file.

    Returns:
        The opened cv2.VideoCapture, or None when the video could not be
        opened (an error is printed).
    """
    capture = cv2.VideoCapture(video_path)
    opened = capture.isOpened()
    if not opened:
        print("Error: Could not load video.")
        capture = None
    return capture

def display_processed_frame(frame):
    """
    Show a frame in the 'Processed Frame' window without any added delay.

    Args:
        frame (np.ndarray): The frame to display.

    Returns:
        bool: False when the 'q' key was pressed (stop displaying), True
        otherwise.
    """
    cv2.imshow('Processed Frame', frame)

    # Poll the keyboard; only the low byte of the key code is compared.
    pressed_key = cv2.waitKey(1) & 0xFF
    return pressed_key != ord('q')

def infer_from_video_in_batches(video_path, movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region, confidence_threshold=0.28, batch_size=10):
    """
    Run pose inference over a video in batches, showing every annotated frame.

    This is a generator: nothing is read or processed until it is iterated.
    Frames are collected into groups of ``batch_size``; each frame of a group
    is run through the model, drawn, displayed, and converted to a DataFrame.
    The frames left over when the stream ends are processed as a final,
    shorter batch. Pressing 'q' in the display window stops processing after
    the current frame; the rows gathered so far for that batch are still
    yielded.

    The module-level helpers ``load_video``, ``keypoints_to_dataframe``,
    ``add_bounding_box`` and ``display_processed_frame`` are looked up at call
    time.

    Args:
        video_path: Source passed to ``load_video``.
        movenet: Model callable forwarded to ``run_inference``.
        input_size: Side length used for ``crop_size=[input_size, input_size]``.
        init_crop_region: Callable ``(image_height, image_width)`` returning the
            crop region used for every frame.
        run_inference: Callable ``(movenet, image, crop_region, crop_size=...)``
            returning the keypoints-with-scores array for one frame.
        draw_prediction_on_image: Callable ``(image, keypoints_with_scores)``
            returning the rendered image.
        determine_crop_region: Accepted for interface compatibility but not
            called; the initial crop region is reused for all frames.
        confidence_threshold: Keypoints scoring below this value get their two
            coordinates overwritten with -1 before conversion and drawing.
        batch_size: Number of frames gathered before a batch is processed.

    Yields:
        pandas.DataFrame: The concatenated per-frame DataFrames of one batch,
        with 'frame_idx' and 'batch_idx' inserted as the first two columns.
    """
    stream = load_video(video_path)
    if stream is None:
        return

    # try/finally guarantees the capture and the window are released even when
    # the consumer stops iterating early or an exception escapes mid-video.
    try:
        # Get video properties
        image_height = int(stream.get(cv2.CAP_PROP_FRAME_HEIGHT))
        image_width = int(stream.get(cv2.CAP_PROP_FRAME_WIDTH))
        fps = stream.get(cv2.CAP_PROP_FPS)
        # A source may report 0 fps; in that case skip pacing instead of
        # dividing by zero.
        seconds_per_frame = 1.0 / fps if fps > 0 else 0.0

        crop_region = init_crop_region(image_height, image_width)

        frames_batch = []
        frame_idx = 0
        batch_idx = 0
        keep_displaying = True

        while keep_displaying:
            ret, frame = stream.read()
            end_of_stream = not ret
            if ret:
                # Convert frame to RGB
                frames_batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                frame_idx += 1

            # Process when the batch is full, or flush whatever is left once
            # the stream ends. Relying on the failed read rather than on the
            # reported frame count keeps the last frames from being dropped
            # when that count is inaccurate.
            if frames_batch and (len(frames_batch) == batch_size or end_of_stream):
                # Convert frames batch to a tensor
                frames_tensor = tf.convert_to_tensor(np.array(frames_batch), dtype=tf.uint8)
                first_frame_idx = frame_idx - len(frames_batch)

                batch_keypoints_list = []
                for i, frame_rgb in enumerate(frames_batch):
                    # Time each displayed frame on its own so that every
                    # frame of the batch is paced, not only the first one.
                    frame_start = time.perf_counter()
                    absolute_idx = first_frame_idx + i

                    # Run inference on the frame
                    keypoints_with_scores = run_inference(
                        movenet, frames_tensor[i], crop_region, crop_size=[input_size, input_size])

                    # Set coordinates with low confidence to -1
                    low_confidence = keypoints_with_scores[0, 0, :, 2] < confidence_threshold
                    keypoints_with_scores[0, 0, low_confidence, :2] = -1

                    # Convert keypoints to DataFrame
                    keypoints_df = keypoints_to_dataframe(keypoints_with_scores, absolute_idx)
                    keypoints_df.insert(0, 'frame_idx', absolute_idx)
                    keypoints_df.insert(1, 'batch_idx', batch_idx)
                    batch_keypoints_list.append(keypoints_df)

                    # Draw predictions and add bounding box
                    output_image = draw_prediction_on_image(frame_rgb, keypoints_with_scores)
                    output_image_with_bbox = add_bounding_box(output_image, keypoints_with_scores)

                    # Convert frame back to BGR for display
                    output_image_with_bbox_bgr = cv2.cvtColor(output_image_with_bbox, cv2.COLOR_RGB2BGR)

                    keep_displaying = display_processed_frame(output_image_with_bbox_bgr)
                    if not keep_displaying:
                        break

                    # Wait out the rest of the frame period if we are ahead
                    # of the video's own frame rate.
                    remaining = seconds_per_frame - (time.perf_counter() - frame_start)
                    if remaining > 0:
                        time.sleep(remaining)

                # Clear the batch for the next set of frames
                frames_batch = []
                batch_idx += 1

                # Hand the batch to the caller for further processing
                yield pd.concat(batch_keypoints_list, ignore_index=True)

            if end_of_stream:
                break
    finally:
        # Release the video capture object
        stream.release()
        cv2.destroyAllWindows()

In [34]:
# Run batched pose inference over a video file and consume each batch's
# DataFrame as soon as the generator yields it. Nothing is initialized here:
# the model wrapper and helper functions passed in must already be defined.
video_path = "ADL.mp4"
for batch_df in infer_from_video_in_batches(video_path, movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region):
    # Perform further calculations with batch_df
    # NOTE: len(batch_df) is the number of rows in the batch DataFrame, which
    # the message below labels "keypoints".
    print(f"Processed batch with {len(batch_df)} keypoints")
    # Example: You could calculate velocity/acceleration, save the batch, etc.

Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints
Processed batch with 10 keypoints


: 

In [31]:
# Example usage: calling the function only creates a generator object; no
# frame is read or processed until the generator is iterated (for-loop or
# next()), which is why the print below shows a generator rather than data.
MoveNet = infer_from_video_in_batches('ADL.mp4', movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region, confidence_threshold=0.28, batch_size=10)
print(MoveNet)

<generator object infer_from_video_in_batches at 0x360292580>


#### Combining pose estimated frames into video

In [27]:
# Play back already-rendered frames in an OpenCV window.
# NOTE(review): `output_images` is not defined in this cell — presumably a
# sequence of RGB frames produced by an earlier cell; verify before running.
for output_image in output_images:
    # Frames are converted from RGB to BGR before being handed to imshow.
    cv2.imshow('Processed Frame', cv2.cvtColor(output_image, cv2.COLOR_RGB2BGR))
    # Wait 30 ms between frames; pressing 'q' stops the playback early.
    if cv2.waitKey(30) & 0xFF == ord('q'):
        break

# Close the display window once playback ends or is interrupted.
cv2.destroyAllWindows()

## i42 Adaption

### Preparing functions

#### Install required packages

In [1]:
%pip install opencv-python
%pip install tensorflow
%pip install tensorflow-hub
%pip install numpy
%pip install pandas


You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/nick/Documents/GitHub/MoveNet/MoveNet/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to us

#### Import required packages

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import cv2

# Import matplotlib libraries
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
import matplotlib.patches as patches

#### Helper functions

In [3]:
# Joint names listed in keypoint-index order: a name's position in this tuple
# is the index used to look the joint up along the keypoint axis.
_KEYPOINT_NAMES = (
    'nose',
    'left_eye', 'right_eye',
    'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder',
    'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist',
    'left_hip', 'right_hip',
    'left_knee', 'right_knee',
    'left_ankle', 'right_ankle',
)

# Dictionary that maps from joint names to keypoint indices.
KEYPOINT_DICT = {name: index for index, name in enumerate(_KEYPOINT_NAMES)}

# Single-letter colour codes for the skeleton edges: edges touching a left-side
# joint use 'm', edges touching a right-side joint use 'c', and the edges that
# join the left and right side (shoulder-shoulder, hip-hip) use 'y'.
_LEFT, _RIGHT, _ACROSS = 'm', 'c', 'y'

# Maps a skeleton edge, given as a pair of keypoint indices, to its colour.
KEYPOINT_EDGE_INDS_TO_COLOR = {
    (0, 1): _LEFT,
    (0, 2): _RIGHT,
    (1, 3): _LEFT,
    (2, 4): _RIGHT,
    (0, 5): _LEFT,
    (0, 6): _RIGHT,
    (5, 7): _LEFT,
    (7, 9): _LEFT,
    (6, 8): _RIGHT,
    (8, 10): _RIGHT,
    (5, 6): _ACROSS,
    (5, 11): _LEFT,
    (6, 12): _RIGHT,
    (11, 12): _ACROSS,
    (11, 13): _LEFT,
    (13, 15): _LEFT,
    (12, 14): _RIGHT,
    (14, 16): _RIGHT,
}

def _keypoints_and_edges_for_display(keypoints_with_scores,
                                     height,
                                     width,
                                     keypoint_threshold=0.25):
  """Returns high confidence keypoints and edges for visualization.

  Args:
    keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
      the keypoint coordinates and scores returned from the MoveNet model.
    height: height of the image in pixels.
    width: width of the image in pixels.
    keypoint_threshold: minimum confidence score for a keypoint to be
      visualized.

  Returns:
    A (keypoints_xy, edges_xy, edge_colors) containing:
      * the coordinates of all keypoints of all detected entities;
      * the coordinates of all skeleton edges of all detected entities;
      * the colors in which the edges should be plotted.
  """
  keypoints_all = []
  keypoint_edges_all = []
  edge_colors = []
  num_instances, _, _, _ = keypoints_with_scores.shape
  for idx in range(num_instances):
    kpts_x = keypoints_with_scores[0, idx, :, 1]
    kpts_y = keypoints_with_scores[0, idx, :, 0]
    kpts_scores = keypoints_with_scores[0, idx, :, 2]
    kpts_absolute_xy = np.stack(
        [width * np.array(kpts_x), height * np.array(kpts_y)], axis=-1)
    kpts_above_thresh_absolute = kpts_absolute_xy[
        kpts_scores > keypoint_threshold, :]
    keypoints_all.append(kpts_above_thresh_absolute)

    for edge_pair, color in KEYPOINT_EDGE_INDS_TO_COLOR.items():
      if (kpts_scores[edge_pair[0]] > keypoint_threshold and
          kpts_scores[edge_pair[1]] > keypoint_threshold):
        x_start = kpts_absolute_xy[edge_pair[0], 0]
        y_start = kpts_absolute_xy[edge_pair[0], 1]
        x_end = kpts_absolute_xy[edge_pair[1], 0]
        y_end = kpts_absolute_xy[edge_pair[1], 1]
        line_seg = np.array([[x_start, y_start], [x_end, y_end]])
        keypoint_edges_all.append(line_seg)
        edge_colors.append(color)
  if keypoints_all:
    keypoints_xy = np.concatenate(keypoints_all, axis=0)
  else:
    keypoints_xy = np.zeros((0, 17, 2))

  if keypoint_edges_all:
    edges_xy = np.stack(keypoint_edges_all, axis=0)
  else:
    edges_xy = np.zeros((0, 2, 2))
  return keypoints_xy, edges_xy, edge_colors

def draw_prediction_on_image(
    image, keypoints_with_scores, crop_region=None, close_figure=False,
    output_image_height=None):
  """Renders the pose skeleton on top of an image.

  Args:
    image: A numpy array with shape [height, width, channel] holding the pixel
      values of the input image.
    keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] holding the
      keypoint coordinates and scores returned from the MoveNet model.
    crop_region: Optional dictionary with the normalized 'x_min', 'y_min',
      'x_max' and 'y_max' of the crop region. When given, the region is drawn
      as a blue rectangle as well.
    close_figure: Not read by this function; the figure is always closed once
      it has been rendered.
    output_image_height: Optional integer height of the returned image. The
      width is derived from it so the input aspect ratio is kept.

  Returns:
    A numpy array with shape [out_height, out_width, 4]: the rendered figure,
    read from the canvas's RGBA buffer.
  """
  height, width, _ = image.shape
  figure, axes = plt.subplots(figsize=(12 * (float(width) / height), 12))
  # To remove the huge white borders
  figure.tight_layout(pad=0)
  axes.margins(0)
  axes.set_yticklabels([])
  axes.set_xticklabels([])
  plt.axis('off')

  # Artists are created empty and filled in below.
  axes.imshow(image)
  skeleton = LineCollection([], linewidths=(4), linestyle='solid')
  axes.add_collection(skeleton)
  joints = axes.scatter([], [], s=60, color='#FF1493', zorder=3)

  keypoint_locs, keypoint_edges, edge_colors = (
      _keypoints_and_edges_for_display(keypoints_with_scores, height, width))

  # Segments and colors are assigned whether or not any edge was found.
  skeleton.set_segments(keypoint_edges)
  skeleton.set_color(edge_colors)
  if keypoint_locs.shape[0]:
    joints.set_offsets(keypoint_locs)

  if crop_region is not None:
    left = max(crop_region['x_min'] * width, 0.0)
    top = max(crop_region['y_min'] * height, 0.0)
    axes.add_patch(patches.Rectangle(
        (left, top),
        min(crop_region['x_max'], 0.99) * width - left,
        min(crop_region['y_max'], 0.99) * height - top,
        linewidth=1, edgecolor='b', facecolor='none'))

  # Rasterize the figure and view the canvas buffer as a (rows, cols, 4) array.
  figure.canvas.draw()
  rendered = np.frombuffer(figure.canvas.buffer_rgba(), dtype=np.uint8)
  canvas_width, canvas_height = figure.canvas.get_width_height()
  rendered = rendered.reshape((canvas_height, canvas_width) + (4,))
  plt.close(figure)

  if output_image_height is None:
    return rendered

  output_image_width = int(output_image_height / height * width)
  return cv2.resize(
      rendered, dsize=(output_image_width, output_image_height),
      interpolation=cv2.INTER_CUBIC)

In [4]:
# Joint names listed in keypoint-index order: a name's position in this tuple
# is the index used to look the joint up along the keypoint axis.
_KEYPOINT_NAMES = (
    'nose',
    'left_eye', 'right_eye',
    'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder',
    'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist',
    'left_hip', 'right_hip',
    'left_knee', 'right_knee',
    'left_ankle', 'right_ankle',
)

# Dictionary that maps from joint names to keypoint indices.
KEYPOINT_DICT = {name: index for index, name in enumerate(_KEYPOINT_NAMES)}

# Single-letter colour codes for the skeleton edges: edges touching a left-side
# joint use 'm', edges touching a right-side joint use 'c', and the edges that
# join the left and right side (shoulder-shoulder, hip-hip) use 'y'.
_LEFT, _RIGHT, _ACROSS = 'm', 'c', 'y'

# Maps a skeleton edge, given as a pair of keypoint indices, to its colour.
KEYPOINT_EDGE_INDS_TO_COLOR = {
    (0, 1): _LEFT,
    (0, 2): _RIGHT,
    (1, 3): _LEFT,
    (2, 4): _RIGHT,
    (0, 5): _LEFT,
    (0, 6): _RIGHT,
    (5, 7): _LEFT,
    (7, 9): _LEFT,
    (6, 8): _RIGHT,
    (8, 10): _RIGHT,
    (5, 6): _ACROSS,
    (5, 11): _LEFT,
    (6, 12): _RIGHT,
    (11, 12): _ACROSS,
    (11, 13): _LEFT,
    (13, 15): _LEFT,
    (12, 14): _RIGHT,
    (14, 16): _RIGHT,
}

def _keypoints_and_edges_for_display(keypoints_with_scores,
                                     height,
                                     width,
                                     keypoint_threshold=0.25):
  """Returns high confidence keypoints and edges for visualization.

  Args:
    keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
      the keypoint coordinates and scores returned from the MoveNet model.
    height: height of the image in pixels.
    width: width of the image in pixels.
    keypoint_threshold: minimum confidence score for a keypoint to be
      visualized.

  Returns:
    A (keypoints_xy, edges_xy, edge_colors) containing:
      * the coordinates of all keypoints of all detected entities;
      * the coordinates of all skeleton edges of all detected entities;
      * the colors in which the edges should be plotted.
  """
  keypoints_all = []
  keypoint_edges_all = []
  edge_colors = []
  num_instances, _, _, _ = keypoints_with_scores.shape
  for idx in range(num_instances):
    kpts_x = keypoints_with_scores[0, idx, :, 1]
    kpts_y = keypoints_with_scores[0, idx, :, 0]
    kpts_scores = keypoints_with_scores[0, idx, :, 2]
    kpts_absolute_xy = np.stack(
        [width * np.array(kpts_x), height * np.array(kpts_y)], axis=-1)
    kpts_above_thresh_absolute = kpts_absolute_xy[
        kpts_scores > keypoint_threshold, :]
    keypoints_all.append(kpts_above_thresh_absolute)

    for edge_pair, color in KEYPOINT_EDGE_INDS_TO_COLOR.items():
      if (kpts_scores[edge_pair[0]] > keypoint_threshold and
          kpts_scores[edge_pair[1]] > keypoint_threshold):
        x_start = kpts_absolute_xy[edge_pair[0], 0]
        y_start = kpts_absolute_xy[edge_pair[0], 1]
        x_end = kpts_absolute_xy[edge_pair[1], 0]
        y_end = kpts_absolute_xy[edge_pair[1], 1]
        line_seg = np.array([[x_start, y_start], [x_end, y_end]])
        keypoint_edges_all.append(line_seg)
        edge_colors.append(color)
  if keypoints_all:
    keypoints_xy = np.concatenate(keypoints_all, axis=0)
  else:
    keypoints_xy = np.zeros((0, 17, 2))

  if keypoint_edges_all:
    edges_xy = np.stack(keypoint_edges_all, axis=0)
  else:
    edges_xy = np.zeros((0, 2, 2))
  return keypoints_xy, edges_xy, edge_colors

def draw_prediction_on_image(image, keypoints_with_scores, crop_region=None, close_figure=False, output_image_height=None):
    """
    Renders keypoints, skeleton edges and (optionally) the crop rectangle over an image.

    Args:
        image (np.ndarray): Input image with shape [height, width, channel].
        keypoints_with_scores (np.ndarray): Array containing keypoints and their scores.
        crop_region (dict, optional): Normalized crop region with keys 'x_min', 'y_min', 'x_max', and 'y_max'. When given, it is drawn as a blue rectangle. Defaults to None.
        close_figure (bool, optional): Not read by this function; the figure is always closed once it has been rendered. Defaults to False.
        output_image_height (int, optional): Desired height of the returned image. The width is derived from it so the input aspect ratio is kept. Defaults to None.

    Returns:
        np.ndarray: The rendered figure with shape [out_height, out_width, 4], read from the canvas's RGBA buffer.
    """
    height, width, _ = image.shape
    figure, axes = plt.subplots(figsize=(12 * (float(width) / height), 12))
    # To remove the huge white borders
    figure.tight_layout(pad=0)
    axes.margins(0)
    axes.set_yticklabels([])
    axes.set_xticklabels([])
    plt.axis('off')

    # Artists are created empty and filled in below.
    axes.imshow(image)
    skeleton = LineCollection([], linewidths=(4), linestyle='solid')
    axes.add_collection(skeleton)
    joints = axes.scatter([], [], s=60, color='#FF1493', zorder=3)

    keypoint_locs, keypoint_edges, edge_colors = (
        _keypoints_and_edges_for_display(keypoints_with_scores, height, width))

    # Segments and colors are assigned whether or not any edge was found.
    skeleton.set_segments(keypoint_edges)
    skeleton.set_color(edge_colors)
    if keypoint_locs.shape[0]:
        joints.set_offsets(keypoint_locs)

    if crop_region is not None:
        left = max(crop_region['x_min'] * width, 0.0)
        top = max(crop_region['y_min'] * height, 0.0)
        axes.add_patch(patches.Rectangle(
            (left, top),
            min(crop_region['x_max'], 0.99) * width - left,
            min(crop_region['y_max'], 0.99) * height - top,
            linewidth=1, edgecolor='b', facecolor='none'))

    # Rasterize the figure and view the canvas buffer as a (rows, cols, 4) array.
    figure.canvas.draw()
    rendered = np.frombuffer(figure.canvas.buffer_rgba(), dtype=np.uint8)
    canvas_width, canvas_height = figure.canvas.get_width_height()
    rendered = rendered.reshape((canvas_height, canvas_width) + (4,))
    plt.close(figure)

    if output_image_height is None:
        return rendered

    output_image_width = int(output_image_height / height * width)
    return cv2.resize(
        rendered, dsize=(output_image_width, output_image_height),
        interpolation=cv2.INTER_CUBIC)

#### Loading MoveNet - Single Pose Thunder 4

In [5]:
# Load the MoveNet model from TensorFlow Hub
# NOTE: a failed load is only printed. In that case neither `module` nor
# `input_size` is bound, so later cells that use them will raise NameError.
try:
    module = hub.load("https://www.kaggle.com/models/google/movenet/TensorFlow2/singlepose-thunder/4")
    # Side length later passed to the inference helpers as
    # crop_size=[input_size, input_size].
    input_size = 256
except Exception as error:
    print(f"Failed to load the model: {error}")

def movenet(input_image):
    """Runs the loaded MoveNet module on one prepared input image.

    Args:
        input_image: A [1, height, width, 3] tensor with the input image
        pixels. The height/width must already match the input resolution the
        model expects; no resizing happens here.

    Returns:
        A [1, 1, 17, 3] float numpy array representing the predicted keypoint
        coordinates and scores.
    """
    serving_fn = module.signatures['serving_default']

    # SavedModel format expects tensor type of int32.
    int_image = tf.cast(input_image, dtype=tf.int32)

    # 'output_0' is a [1, 1, 17, 3] tensor; hand it back as a numpy array.
    return serving_fn(int_image)['output_0'].numpy()

2024-10-02 19:25:02.260586: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-10-02 19:25:02.260609: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-02 19:25:02.260636: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-02 19:25:02.260660: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-02 19:25:02.260674: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Live Stream (Image Sequence)

#### Cropping Algorithm

In [6]:
# Minimum confidence score a keypoint needs in order to be treated as reliable
# by the cropping helpers below (torso visibility check and body-range
# computation both compare keypoint scores against it).
MIN_CROP_KEYPOINT_SCORE = 0.2

def init_crop_region(image_height, image_width):
  """Builds the default, full-image crop region.

  The region is the whole image padded along its shorter side so that it
  becomes square. It is used whenever no crop can be derived reliably from
  the previous frame. All values are normalized by the image size, so the
  padded side extends below 0 and above 1.

  Returns:
    A dict with 'y_min', 'x_min', 'y_max', 'x_max', 'height' and 'width'.
  """
  if image_width > image_height:
    # Wider than tall: pad vertically.
    x_min, box_width = 0.0, 1.0
    box_height = image_width / image_height
    y_min = (image_height / 2 - image_width / 2) / image_height
  else:
    # Taller than wide (or square): pad horizontally.
    y_min, box_height = 0.0, 1.0
    box_width = image_height / image_width
    x_min = (image_width / 2 - image_height / 2) / image_width

  region = {'y_min': y_min, 'x_min': x_min}
  region['y_max'] = y_min + box_height
  region['x_max'] = x_min + box_width
  region['height'] = box_height
  region['width'] = box_width
  return region

def torso_visible(keypoints):
  """Tells whether enough of the torso was detected to derive a crop.

  The torso counts as visible when at least one hip and at least one shoulder
  have a score above MIN_CROP_KEYPOINT_SCORE.
  """
  def is_confident(joint):
    return keypoints[0, 0, KEYPOINT_DICT[joint], 2] > MIN_CROP_KEYPOINT_SCORE

  hip_seen = is_confident('left_hip') or is_confident('right_hip')
  shoulder_seen = (
      is_confident('left_shoulder') or is_confident('right_shoulder'))
  return hip_seen and shoulder_seen

def determine_torso_and_body_range(
    keypoints, target_keypoints, center_y, center_x):
  """Measures how far the torso and the whole body reach from a center point.

  For the four torso joints, and separately for every joint whose score is
  not below MIN_CROP_KEYPOINT_SCORE, the largest absolute vertical and
  horizontal distance to (center_y, center_x) is computed.
  `target_keypoints` maps each joint name to its [y, x] position.

  Returns:
    [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange].
  """
  max_torso_yrange = max_torso_xrange = 0.0
  for joint in ('left_shoulder', 'right_shoulder', 'left_hip', 'right_hip'):
    position = target_keypoints[joint]
    max_torso_yrange = max(max_torso_yrange, abs(center_y - position[0]))
    max_torso_xrange = max(max_torso_xrange, abs(center_x - position[1]))

  max_body_yrange = max_body_xrange = 0.0
  for joint, index in KEYPOINT_DICT.items():
    # Joints the model is not confident about must not widen the body range.
    if keypoints[0, 0, index, 2] < MIN_CROP_KEYPOINT_SCORE:
      continue
    position = target_keypoints[joint]
    max_body_yrange = max(max_body_yrange, abs(center_y - position[0]))
    max_body_xrange = max(max_body_xrange, abs(center_x - position[1]))

  return [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]

def determine_crop_region(
      keypoints, image_height,
      image_width):
  """Picks the square image region the model should look at next.

  The joints detected in the previous frame are used to place a square that
  is centered on the midpoint of the two hips and sized from the distances
  between the joints and that center. When the torso is not detected
  confidently, or the resulting square would exceed half of the longer image
  side in half-length, the default crop (full image padded to a square) is
  returned instead.
  """
  # Joint positions in pixels, stored as [y, x].
  target_keypoints = {
      joint: [
          keypoints[0, 0, index, 0] * image_height,
          keypoints[0, 0, index, 1] * image_width,
      ]
      for joint, index in KEYPOINT_DICT.items()
  }

  if not torso_visible(keypoints):
    return init_crop_region(image_height, image_width)

  left_hip = target_keypoints['left_hip']
  right_hip = target_keypoints['right_hip']
  center_y = (left_hip[0] + right_hip[0]) / 2
  center_x = (left_hip[1] + right_hip[1]) / 2

  torso_y, torso_x, body_y, body_x = determine_torso_and_body_range(
      keypoints, target_keypoints, center_y, center_x)

  # Torso extents are enlarged more than whole-body extents.
  crop_length_half = np.amax(
      [torso_x * 1.9, torso_y * 1.9, body_y * 1.2, body_x * 1.2])

  # Cap the half-length at the distance from the center to the farthest
  # image border.
  border_distances = np.array(
      [center_x, image_width - center_x, center_y, image_height - center_y])
  crop_length_half = np.amin([crop_length_half, np.amax(border_distances)])

  if crop_length_half > max(image_width, image_height) / 2:
    return init_crop_region(image_height, image_width)

  top = center_y - crop_length_half
  left = center_x - crop_length_half
  crop_length = crop_length_half * 2

  y_min = top / image_height
  x_min = left / image_width
  y_max = (top + crop_length) / image_height
  x_max = (left + crop_length) / image_width
  return {
    'y_min': y_min,
    'x_min': x_min,
    'y_max': y_max,
    'x_max': x_max,
    'height': y_max - y_min,
    'width': x_max - x_min
  }

def crop_and_resize(image, crop_region, crop_size):
  """Cuts the crop region out of the image and resizes it for the model.

  The region is passed to TensorFlow as a single [y_min, x_min, y_max, x_max]
  box applied to the first image of the batch.
  """
  box = [crop_region[key] for key in ('y_min', 'x_min', 'y_max', 'x_max')]
  return tf.image.crop_and_resize(
      image, box_indices=[0], boxes=[box], crop_size=crop_size)

def run_inference(movenet, image, crop_region, crop_size):
  """Runs the model on the cropped region of one image.

  The image is cropped and resized, passed through `movenet`, and the
  predicted coordinates of the 17 keypoints are mapped from the crop's
  coordinate system back to that of the original image (in place, on the
  array returned by the model).
  """
  image_height, image_width, _ = image.shape
  batched_image = tf.expand_dims(image, axis=0)
  input_image = crop_and_resize(batched_image, crop_region, crop_size=crop_size)
  # Run model inference.
  keypoints_with_scores = movenet(input_image)

  # Crop origin and crop size in pixels; identical for every keypoint.
  y_offset = crop_region['y_min'] * image_height
  y_scale = crop_region['height'] * image_height
  x_offset = crop_region['x_min'] * image_width
  x_scale = crop_region['width'] * image_width

  # Update the coordinates.
  for idx in range(17):
    keypoints_with_scores[0, 0, idx, 0] = (
        y_offset + y_scale * keypoints_with_scores[0, 0, idx, 0]) / image_height
    keypoints_with_scores[0, 0, idx, 1] = (
        x_offset + x_scale * keypoints_with_scores[0, 0, idx, 1]) / image_width
  return keypoints_with_scores

#### Convert the keypoints into Dataframe

In [7]:
def keypoints_to_dataframe(keypoints_with_scores):
  """
  Builds a one-row pandas DataFrame with the body keypoint coordinates.

  Args:
    keypoints_with_scores (numpy.ndarray): A numpy array of shape
    (1, 1, 17, 3). Only entry [0, 0] is read; for each of the 17 keypoints
    the first two values are taken as the (y, x) coordinates and the third
    value (the score) is ignored.

  Returns:
    pandas.DataFrame: A single-row DataFrame with two columns per keypoint,
    named '<Keypoint Name>_X' and '<Keypoint Name>_Y' in that order. The eye
    and ear keypoints are left out, so 13 keypoints (26 columns) remain.
  """
  joint_names = [
    'Nose', 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear',
    'Left Shoulder', 'Right Shoulder', 'Left Elbow', 'Right Elbow',
    'Left Wrist', 'Right Wrist', 'Left Hip', 'Right Hip',
    'Left Knee', 'Right Knee', 'Left Ankle', 'Right Ankle'
  ]
  face_detail_names = ('Left Eye', 'Right Eye', 'Left Ear', 'Right Ear')

  # Each keypoint is stored as (y, x), so the flattened vector alternates
  # Y, X and the full frame is labelled in that stored order first.
  yx_pairs = keypoints_with_scores[0, 0, :, :2]
  stored_order = []
  for name in joint_names:
    stored_order += [f'{name}_Y', f'{name}_X']
  frame = pd.DataFrame([yx_pairs.flatten()], columns=stored_order)

  # Select the columns to keep, X before Y, skipping eyes and ears.
  output_order = []
  for name in joint_names:
    if name in face_detail_names:
      continue
    output_order += [f'{name}_X', f'{name}_Y']

  return frame[output_order]

#### Inference on Live Video Stream Frame from OpenCV

##### Adding Bounding Box

In [8]:
def add_bounding_box(image, keypoints_with_scores, threshold=0.4, margin=0.2):
    """
    Draw a green box around the confidently detected keypoints.

    The box encloses every keypoint whose score is above ``threshold`` and is
    then grown on each side by ``margin`` times the box's own width/height,
    limited to the image bounds. The rectangle is drawn directly onto
    ``image``, which is also the object returned.

    Args:
        image (numpy.ndarray): The image to draw on.
        keypoints_with_scores (numpy.ndarray): The keypoints with scores; entry
            [0, 0] is read as rows of (y, x, score) with normalized coordinates.
        threshold (float): The confidence threshold to consider a keypoint.
        margin (float): The margin to add around the bounding box as a percentage of the box dimensions.

    Returns:
        numpy.ndarray: The image with the bounding box, or the untouched image
        when no keypoint is above the threshold.
    """
    person = keypoints_with_scores[0, 0]
    confident_yx = person[person[:, 2] > threshold, :2]

    # Nothing trustworthy to enclose.
    if confident_yx.size == 0:
        return image

    img_h, img_w = image.shape[0], image.shape[1]

    # Extremes of the normalized coordinates, converted to whole pixels.
    left = int(np.min(confident_yx[:, 1]) * img_w)
    top = int(np.min(confident_yx[:, 0]) * img_h)
    right = int(np.max(confident_yx[:, 1]) * img_w)
    bottom = int(np.max(confident_yx[:, 0]) * img_h)

    # The padding is based on the box size before any margin is applied.
    pad_x = int(margin * (right - left))
    pad_y = int(margin * (bottom - top))
    top_left = (max(0, left - pad_x), max(0, top - pad_y))
    bottom_right = (min(img_w, right + pad_x), min(img_h, bottom + pad_y))

    # Draw the bounding box on the image
    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    return image

In [9]:
import cv2
import tensorflow as tf
import numpy as np

def stream_and_infer_from_camera(movenet, input_size, init_crop_region, run_inference, draw_prediction_on_image, determine_crop_region, camera_index=0):
    """
    Capture a live stream from a camera and run pose inference on each frame.

    Every frame is converted to RGB, passed to `run_inference`, rendered with
    `draw_prediction_on_image`, decorated with a bounding box and shown in an
    OpenCV window titled 'Live Stream'. The loop ends when a frame cannot be
    read or when the user presses 'q'.

    Args:
        movenet: The pose estimation model, forwarded to `run_inference`.
        input_size: Side length used to build `crop_size=[input_size, input_size]`.
        init_crop_region: Callable `(image_height, image_width)` returning the
            initial crop region.
        run_inference: Callable `(movenet, image, crop_region, crop_size=...)`
            returning keypoints with scores.
        draw_prediction_on_image: Callable that renders the keypoints onto a
            frame and returns the rendered image.
        determine_crop_region: Callable `(keypoints_with_scores, image_height,
            image_width)` returning the crop region for the next frame.
        camera_index (int): Index of the capture device to open. Defaults to
            0, the system default camera.

    Returns:
        None
    """
    # Open the camera by integer index. Passing the string '0' would make
    # OpenCV treat the argument as a file name / URL instead of a device.
    stream = cv2.VideoCapture(camera_index)

    try:
        if not stream.isOpened():
            print("Error: Could not open camera.")
            return

        # The first frame is only used to learn the frame dimensions needed
        # to initialise the crop region.
        ret, frame = stream.read()
        if not ret:
            print("Error: Could not read frame from camera.")
            return

        image_height, image_width, _ = frame.shape
        crop_region = init_crop_region(image_height, image_width)

        try:
            while True:
                ret, frame = stream.read()
                if not ret:
                    break

                # OpenCV delivers BGR; the rest of the pipeline works in RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_tensor = tf.convert_to_tensor(frame_rgb, dtype=tf.uint8)

                # Run inference on the single (un-batched) frame
                keypoints_with_scores = run_inference(
                    movenet, frame_tensor, crop_region,
                    crop_size=[input_size, input_size])

                # Draw predictions on the frame
                output_frame = draw_prediction_on_image(
                    frame_rgb.astype(np.int32),
                    keypoints_with_scores, crop_region=None,
                    close_figure=True, output_image_height=300)

                # Update crop region for the next frame
                crop_region = determine_crop_region(
                    keypoints_with_scores, image_height, image_width)

                # Add bounding box to the rendered frame
                output_frame = add_bounding_box(output_frame, keypoints_with_scores)

                # Display the live stream with predictions (back to BGR for imshow)
                cv2.imshow('Live Stream', cv2.cvtColor(output_frame, cv2.COLOR_RGB2BGR))
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Close the preview window even if a callback raised mid-loop.
            cv2.destroyAllWindows()
    finally:
        # Always give the device back, including on the early-return paths.
        stream.release()