# Test the trained prototype and final models

In [1]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# register it on sys.path (once), so that the `core.*` packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [2]:
import torch
import numpy as np
from core.utils import process_sample

# Old Model
from core.models.hierarchical_transformer_prototype import HierarchicalTransformer as HierarchicalTransformerPrototype
# Final Model
from core.models.hierarchical_transformer import HierarchicalTransformer


## Create a simple pipeline
This will include:
- Using raw video data.
- Processing the raw video data.
- Passing the processed video data into the model.
- Displaying the results.

In [3]:

from core.keypoint_extractor import KeypointExtractorV2

In [4]:
def load_and_extract(path: str, max_frames: int = 200) -> np.ndarray:
    """Extract keypoints from a video and zero-pad them to at least ``max_frames`` frames.

    Args:
        path: Location of the video file, passed to ``KeypointExtractorV2.extract``.
        max_frames: Minimum length of the first (frame) axis of the result.
            Defaults to 200, the value that used to be hard-coded here.

    Returns:
        The extracted keypoints as an array. Clips shorter than ``max_frames``
        get all-zero frames appended at the end; clips that are already long
        enough are returned as extracted (they are NOT truncated, despite the
        parameter name).
    """
    extractor = KeypointExtractorV2(r"../../models/mediapipe/pose_landmarker_full.task")
    # asarray avoids an extra copy when the extractor already returns an ndarray.
    keypoints = np.asarray(extractor.extract(path))

    pad_len = max_frames - len(keypoints)
    if pad_len <= 0:
        return keypoints

    # Pad with the input's own dtype and trailing dimensions so the result has
    # the same dtype whether or not padding was needed.
    pad = np.zeros((pad_len, *keypoints.shape[1:]), dtype=keypoints.dtype)
    return np.concatenate((keypoints, pad), axis=0)
        

In [5]:
def load_and_extract_for_inference(video_path: str, max_frames: int = 200) -> tuple[np.ndarray, np.ndarray]:
    """Extract keypoints from a video and run them through ``process_sample``.

    Args:
        video_path: Location of the video file, passed to ``KeypointExtractorV2.extract``.
        max_frames: Forwarded unchanged as the second argument of ``process_sample``.

    Returns:
        The first two of the three values returned by ``process_sample``
        (treated by this notebook as the padded sample and its attention mask);
        the third value is discarded.
    """
    model_asset = r"../../models/mediapipe/pose_landmarker_full.task"
    raw_keypoints = KeypointExtractorV2(model_asset).extract(video_path)
    sample, mask, _unused = process_sample(raw_keypoints, max_frames)
    return sample, mask

In [6]:
# Class index -> exercise name; the position in the tuple is the class index.
# Presumably this order matches the label encoding used at training time — verify.
labels_map = dict(enumerate(("Squats", "Deadlifts", "Shoulder Press")))

In [7]:
# Prototype ("old") model. The hyperparameters must describe the same
# architecture as the checkpoint below, otherwise load_state_dict raises.
# NOTE(review): num_frames=378 here, while load_and_extract pads clips to 200
# frames by default — confirm the prototype is meant to receive shorter sequences.
model_prototype = HierarchicalTransformerPrototype(
    num_joints=33,
    num_frames=378,
    d_model=64,
    nhead=4,
    num_spatial_layers=1,
    num_temporal_layers=1,
    num_classes=3,
    dim_feedforward=512
)
# map_location="cpu" makes the checkpoint loadable on machines without a GPU
# even if it was saved from CUDA tensors; the weights are copied into the
# (CPU-resident) model either way.
# NOTE(security): torch.load unpickles the file — only load trusted checkpoints
# (or pass weights_only=True on PyTorch versions that support it).
model_prototype.load_state_dict(
    torch.load(
        "../../models/hierarchical transformer/hierarchical_transformer_weights_2025-06-20_small_1.pth",
        map_location="cpu",
    )
)


# Final ("new") model. The hyperparameters must describe the same architecture
# as the checkpoint below, otherwise load_state_dict raises.
model_final = HierarchicalTransformer(
    num_joints=33,
    num_frames=200,
    d_model=128,
    nhead=4,
    num_spatial_layers=1,
    num_temporal_layers=1,
    num_classes=3,
    dim_feedforward=1024,
    dropout=0.35
)
# map_location="cpu" makes the checkpoint loadable on machines without a GPU
# even if it was saved from CUDA tensors; the model is moved to the inference
# device later, when it is used.
# NOTE(security): torch.load unpickles the file — only load trusted checkpoints
# (or pass weights_only=True on PyTorch versions that support it).
# Kept as the cell's last expression so the key-matching report is displayed.
model_final.load_state_dict(
    torch.load(
        "../../models/hierarchical_transformer/hierarchical_transformer_f200_d128_h4_s1_t1_do0.35_20250625_1739.pth",
        map_location="cpu",
    )
)

<All keys matched successfully>

In [8]:
def prototype_inference(video_to_test_path):
    """Classify one video with the prototype model and print the prediction.

    Args:
        video_to_test_path: Path of the video, passed to ``load_and_extract``.

    Returns:
        None. The predicted class index and its ``labels_map`` name are printed.
    """
    video = load_and_extract(video_to_test_path)
    # Keep only the first three values per keypoint (assumed to be x, y, z).
    x_sample = video[:, :, :3]

    # Add a leading batch dimension of size 1.
    x_tensor = torch.tensor(x_sample, dtype=torch.float32).unsqueeze(0)
    model_prototype.eval()  # inference mode for layers such as dropout
    with torch.no_grad():
        logits = model_prototype(x_tensor)
        predicted_class = torch.argmax(logits, dim=1).item()

    # Squats: 0 , Deadlifts: 1, Shoulder Press: 2
    print("Old Model:")
    print(f"\nVideo: {video_to_test_path}")
    print("Predicted Class Index:", predicted_class)
    print("Predicted Exercise:", labels_map[predicted_class])

In [9]:
def final_model_inference(video_to_test_path):
    """Classify one video with the final model and print the prediction.

    Args:
        video_to_test_path: Path of the video, passed to
            ``load_and_extract_for_inference`` with ``max_frames=200``.

    Returns:
        None. The predicted class index and its ``labels_map`` name
        ('Unknown' if the index has no entry) are printed.
    """
    x_np, mask_np = load_and_extract_for_inference(video_to_test_path, max_frames=200)

    # Keep only the first three values per keypoint (x, y, z).
    xyz_np = x_np[:, :, :3]
    # Add a leading batch dimension of size 1 to both the sample and its mask.
    x_tensor = torch.from_numpy(xyz_np).float().unsqueeze(0)
    mask_tensor = torch.from_numpy(mask_np).float().unsqueeze(0)

    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_final.to(device)
    # Modules start in training mode; without eval() the dropout layers stay
    # active and the prediction for the same video can change between calls.
    model_final.eval()
    x_tensor = x_tensor.to(device)
    mask_tensor = mask_tensor.to(device)

    with torch.no_grad():  # no gradients needed for inference
        # The model receives both the input and its temporal mask.
        logits = model_final(x_tensor, temporal_mask=mask_tensor)
        # Predicted class = index of the highest logit.
        predicted_class_idx = torch.argmax(logits, dim=1).item()

    # Print results
    print("New Model (V2):")
    print(f"\nVideo: {video_to_test_path}")
    print(f"Predicted Class Index: {predicted_class_idx}")
    print(f"Predicted Exercise: {labels_map.get(predicted_class_idx, 'Unknown')}")

In [13]:

# Run both models on the same unseen clip so their predictions can be compared:
# the prototype first, a divider line, then the final model.
video_to_test_path = "../../data/unseen/deadlift_other.mp4"
_divider = "--------------------------------------------------------------------------------------"

prototype_inference(video_to_test_path)
print(_divider)
final_model_inference(video_to_test_path)

Processing ../../data/unseen/deadlift_other.mp4: 816x1080, 121 frames
Extracted and normalized 120 frames from ../../data/unseen/deadlift_other.mp4
Old Model:

Video: ../../data/unseen/deadlift_other.mp4
Predicted Class Index: 2
Predicted Exercise: Shoulder Press
--------------------------------------------------------------------------------------
Processing ../../data/unseen/deadlift_other.mp4: 816x1080, 121 frames
Extracted and normalized 120 frames from ../../data/unseen/deadlift_other.mp4
New Model (V2):

Video: ../../data/unseen/deadlift_other.mp4
Predicted Class Index: 1
Predicted Exercise: Deadlifts
