# Test the trained prototype model

In [39]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader


## Create a simple pipeline
This will include:
- Using raw video data.
- Processing the raw video data.
- Passing the processed video data into the model.
- Displaying the results.

In [40]:

from keypoint_extractor import KeypointExtractorV2

In [41]:
def load_and_extract(path: str, max_frames: int = 200) -> np.ndarray:
    """
    Extract pose keypoints from a video and zero-pad them along the frame axis.

    Args:
        path: Path to the input video file.
        max_frames: Minimum number of frames in the returned array. Shorter
            sequences are zero-padded at the end; sequences that are already
            at least this long are returned unchanged (no truncation).

    Returns:
        np.ndarray whose first axis has length max(num_frames, max_frames).
        The trailing axes and the dtype are those of the extractor output.
    """
    extractor = KeypointExtractorV2(r"models/mediapipe/pose_landmarker_full.task")
    keypoints = np.asarray(extractor.extract(path))

    pad_len = max_frames - keypoints.shape[0]
    if pad_len <= 0:
        # Already long enough: nothing to pad.
        return keypoints

    # Build the pad with the keypoints' own dtype and trailing shape so that
    # concatenation does not silently promote float32 data to float64.
    pad = np.zeros((pad_len, *keypoints.shape[1:]), dtype=keypoints.dtype)
    return np.concatenate((keypoints, pad), axis=0)
        

In [42]:
def load_and_extract_for_inference(video_path: str, max_frames: int = 200) -> tuple[np.ndarray, np.ndarray]:
    """
    Extract keypoints from a video and fit them to a fixed sequence length.

    Sequences shorter than ``max_frames`` are zero-padded at the end; longer
    ones are cut to the first ``max_frames`` frames and a warning is printed.
    Only the first three values of the last axis are kept.

    Args:
        video_path: Path to the input video file.
        max_frames: Fixed sequence length of the returned arrays.

    Returns:
        A tuple ``(padded_keypoints, attention_mask)``:
            - padded_keypoints: np.ndarray of shape (max_frames, num_keypoints, coordinates)
            - attention_mask: float32 np.ndarray of shape (max_frames,),
              1.0 for frames holding real data and 0.0 for padding.
    """
    landmarker = KeypointExtractorV2(r"models/mediapipe/pose_landmarker_full.task")
    raw = landmarker.extract(video_path)  # (frames, num_keypoints, coords)

    n_frames = raw.shape[0]
    mask = np.zeros(max_frames, dtype=np.float32)

    if n_frames <= max_frames:
        # Short (or exact-length) clip: copy into a zero-filled buffer.
        frames = np.zeros((max_frames, raw.shape[1], raw.shape[2]), dtype=raw.dtype)
        frames[:n_frames, :, :] = raw
        mask[:n_frames] = 1.0
    else:
        # Long clip: keep the leading max_frames frames, all of which are valid.
        frames = raw[:max_frames, :, :]
        mask[:] = 1.0
        print(f"Warning: Video '{video_path}' was truncated from {n_frames} to {max_frames} frames.")

    # Keep the first three coordinate values; any further channel is dropped.
    return frames[:, :, :3], mask

In [43]:
# Clip used for both model checks below.
video_to_test_path = 'data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4'

# Predicted class index -> exercise name.
labels_map = dict(enumerate(("Squats", "Deadlifts", "Shoulder Press")))

In [44]:
from hierarchical_transformer_prototype import HierarchicalTransformer

# Rebuild the prototype ("old") architecture so the saved weights can be loaded into it.
# NOTE(review): num_frames=378 here, while the input fed to this model below is
# padded to 200 frames -- confirm this matches how the checkpoint was trained.
model = HierarchicalTransformer(
    num_joints=33,
    num_frames=378,
    d_model=64,
    nhead=4,
    num_spatial_layers=1,
    num_temporal_layers=1,
    num_classes=3,
    dim_feedforward=512
)

# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers.
old_state_dict = torch.load(
    "models/hierarchical transformer/hierarchical_transformer_weights_2025-06-20_small_1.pth",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(old_state_dict)

<All keys matched successfully>

In [45]:
# Extract the zero-padded keypoints for the test clip.
video = load_and_extract(video_to_test_path)
# Keep only the first three values along the last axis.
x_sample = video[:, :, :3]
# Last expression of the cell: display the resulting shape.
x_sample.shape

Processing data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4: 270x480, 95 frames
Extracted and normalized 95 frames from data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4


(200, 33, 3)

In [46]:
# Build the model input: convert x_sample to a float32 tensor and add a
# leading batch axis with unsqueeze(0).
x_tensor = torch.tensor(x_sample, dtype=torch.float32).unsqueeze(0)
# Last expression of the cell: display the tensor shape.
x_tensor.shape

torch.Size([1, 200, 33, 3])

In [47]:
# Evaluation mode, then a gradient-free forward pass through the prototype model.
model.eval()
with torch.no_grad():
    logits = model(x_tensor)

# Index of the largest logit for the single sample in the batch.
predicted_class = logits.argmax(dim=1).item()

# Class indices -- Squats: 0, Deadlifts: 1, Shoulder Press: 2
print("Old Model:")
print(f"\nVideo: {video_to_test_path}")
print("Predicted Class Index:", predicted_class)
print("Predicted Exercise:", labels_map[predicted_class])

Old Model:

Video: data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4
Predicted Class Index: 2
Predicted Exercise: Shoulder Press


# New Model

In [48]:
from hierarchical_transformer_model_v2 import HierarchicalTransformer as HTV2

In [49]:
# Instantiate the V2 architecture; weights are loaded in a later cell.
model_v2 = HTV2(
    # Input geometry
    num_joints=33,
    num_frames=200,
    # Transformer sizing
    d_model=64,
    nhead=4,
    dim_feedforward=512,
    num_spatial_layers=1,
    num_temporal_layers=1,
    dropout=0.1,
    # Output head
    num_classes=3,
)

In [50]:
# Load the trained V2 weights into model_v2.
# On failure, print a short hint and re-raise: calling exit() inside a notebook
# tears down the kernel session and hides the traceback.
try:
    # map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only machine;
    # weights_only=True restricts unpickling to tensors and plain containers.
    v2_state_dict = torch.load(
        "models/hierarchical transformer/hierarchical_transformer_v2_weights_2025-06-21.pth",
        map_location="cpu",
        weights_only=True,
    )
    model_v2.load_state_dict(v2_state_dict)
    print("Model weights loaded successfully.")
except FileNotFoundError:
    print("Error: Model weights file not found. Please check the path and ensure it exists.")
    raise
except Exception as e:
    print(f"An unexpected error occurred while loading model weights: {e}")
    raise

Model weights loaded successfully.


In [51]:
# Load the clip and build the model input together with its attention mask.
# max_frames=200 is the same value passed as num_frames when model_v2 was constructed.
x_np, mask_np = load_and_extract_for_inference(video_to_test_path, max_frames=200)

Processing data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4: 270x480, 95 frames
Extracted and normalized 95 frames from data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4


In [52]:
# Convert to PyTorch tensors and add a batch dimension:
# x_np is (max_frames, num_keypoints, coordinates); the model is fed
# (batch_size, max_frames, num_keypoints, coordinates).
x_tensor = torch.from_numpy(x_np).float().unsqueeze(0)
mask_tensor = torch.from_numpy(mask_np).float().unsqueeze(0)

# Put the model and both tensors on the same device (GPU when available, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_v2.to(device)
x_tensor = x_tensor.to(device)
mask_tensor = mask_tensor.to(device)

# Switch to evaluation mode before predicting. model_v2 was built with
# dropout=0.1, and a freshly constructed module is in training mode, so without
# this call dropout stays active and predictions vary from run to run.
model_v2.eval()

with torch.no_grad():  # Disable gradient calculation for inference
    # NOTE(review): the mask is 1.0 for real frames and 0.0 for padding --
    # confirm this is the polarity HTV2 expects for temporal_mask.
    logits = model_v2(x_tensor, temporal_mask=mask_tensor)

    # Predicted class = index of the highest logit.
    predicted_class_idx = torch.argmax(logits, dim=1).item()

    # Per-class probabilities; [0] selects the single sample in the batch.
    probabilities = torch.softmax(logits, dim=1)[0]

# Print results
print("New Model (V2):")
print(f"\nVideo: {video_to_test_path}")
print(f"Predicted Class Index: {predicted_class_idx}")
print(f"Predicted Exercise: {labels_map.get(predicted_class_idx, 'Unknown')}")

New Model (V2):

Video: data/raw/deadlifts/22a1eea2-80f4-41e8-a6d6-976ee610d8d4_V1-0024.mp4
Predicted Class Index: 1
Predicted Exercise: Deadlifts
