In [1]:
import sys
import os

# Resolve the project root, two directory levels above the current working
# directory (the notebook is expected to run from inside training/), and make
# it importable unless it is already listed on sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [2]:
from core.models.hierarchical_transformer import HierarchicalTransformer
from core.models.base_transformer_model import SimpleTransformerEncoder as StandardTransformer
import torch

In [3]:
# Checkpoint files (relative to the notebook's working directory) that are
# loaded below into the hierarchical and the standard transformer respectively.
# NOTE(review): the file names look like they encode the training
# hyperparameters (frames, d_model, heads, dropout) - confirm against training.
htformer_weights, stformer_weights = (
    "../../models/final/hierarchical_transformer_f201_d64_h2_s1_t1_do0.1_20250701_2251.pth",
    "../../models/final/base_hierarchical_transformer_f201_d64_h2_do0.1_20250702_0105.pth",
)

In [4]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Instantiate the hierarchical transformer and move it to the selected device.
# The architecture arguments have to produce the same parameter names and
# shapes as the checkpoint, otherwise load_state_dict (strict by default)
# raises instead of silently loading a partial model.
htformer = HierarchicalTransformer(
    num_classes=3,
    num_frames=201,
    d_model=64,
    nhead=2,
    dim_feedforward=2048,
    dropout=0.1,
    num_joints=33,
    num_spatial_layers=1,
    num_temporal_layers=1
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or untrusted .pth file cannot execute arbitrary code on load.
# map_location remaps tensors saved on another device onto `device`.
# Kept as the last expression so the cell still displays the key-matching result.
htformer.load_state_dict(
    torch.load(htformer_weights, map_location=device, weights_only=True)
)


<All keys matched successfully>

In [6]:
# Total number of trainable scalars in htformer: sum the element counts of
# every parameter tensor that has requires_grad set.
htformer_trainable_sizes = [
    tensor.numel() for tensor in htformer.parameters() if tensor.requires_grad
]
print(sum(htformer_trainable_sizes))

562755


In [7]:
# Instantiate the baseline (non-hierarchical) transformer encoder and move it
# to the selected device. The architecture arguments have to produce the same
# parameter names and shapes as the checkpoint, otherwise load_state_dict
# (strict by default) raises instead of silently loading a partial model.
stformer = StandardTransformer(
    num_classes=3,
    num_frames=201,
    d_model=64,
    nhead=2,
    dim_feedforward=2048,
    dropout=0.1,
    num_joints=33,
    num_layers=2,
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or untrusted .pth file cannot execute arbitrary code on load.
# map_location remaps tensors saved on another device onto `device`.
# Kept as the last expression so the cell still displays the key-matching result.
stformer.load_state_dict(
    torch.load(stformer_weights, map_location=device, weights_only=True)
)
    

<All keys matched successfully>

In [8]:
# Total number of trainable scalars in stformer: sum the element counts of
# every parameter tensor that has requires_grad set.
stformer_trainable_sizes = [
    tensor.numel() for tensor in stformer.parameters() if tensor.requires_grad
]
print(sum(stformer_trainable_sizes))

562755


In [9]:
# Per-tensor breakdown for htformer: one line per trainable parameter with its
# qualified name and element count. Frozen tensors are skipped.
for name, param in htformer.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name}: {param.numel()} parameters")

embedding.weight: 192 parameters
embedding.bias: 64 parameters
spatial_encoder.transformer.layers.0.self_attn.in_proj_weight: 12288 parameters
spatial_encoder.transformer.layers.0.self_attn.in_proj_bias: 192 parameters
spatial_encoder.transformer.layers.0.self_attn.out_proj.weight: 4096 parameters
spatial_encoder.transformer.layers.0.self_attn.out_proj.bias: 64 parameters
spatial_encoder.transformer.layers.0.linear1.weight: 131072 parameters
spatial_encoder.transformer.layers.0.linear1.bias: 2048 parameters
spatial_encoder.transformer.layers.0.linear2.weight: 131072 parameters
spatial_encoder.transformer.layers.0.linear2.bias: 64 parameters
spatial_encoder.transformer.layers.0.norm1.weight: 64 parameters
spatial_encoder.transformer.layers.0.norm1.bias: 64 parameters
spatial_encoder.transformer.layers.0.norm2.weight: 64 parameters
spatial_encoder.transformer.layers.0.norm2.bias: 64 parameters
temporal_encoder.transformer.layers.0.self_attn.in_proj_weight: 12288 parameters
temporal_encod

In [10]:
# Per-tensor breakdown for stformer: one line per trainable parameter with its
# qualified name and element count. Frozen tensors are skipped.
for name, param in stformer.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name}: {param.numel()} parameters")

embedding.weight: 192 parameters
embedding.bias: 64 parameters
transformer.layers.0.self_attn.in_proj_weight: 12288 parameters
transformer.layers.0.self_attn.in_proj_bias: 192 parameters
transformer.layers.0.self_attn.out_proj.weight: 4096 parameters
transformer.layers.0.self_attn.out_proj.bias: 64 parameters
transformer.layers.0.linear1.weight: 131072 parameters
transformer.layers.0.linear1.bias: 2048 parameters
transformer.layers.0.linear2.weight: 131072 parameters
transformer.layers.0.linear2.bias: 64 parameters
transformer.layers.0.norm1.weight: 64 parameters
transformer.layers.0.norm1.bias: 64 parameters
transformer.layers.0.norm2.weight: 64 parameters
transformer.layers.0.norm2.bias: 64 parameters
transformer.layers.1.self_attn.in_proj_weight: 12288 parameters
transformer.layers.1.self_attn.in_proj_bias: 192 parameters
transformer.layers.1.self_attn.out_proj.weight: 4096 parameters
transformer.layers.1.self_attn.out_proj.bias: 64 parameters
transformer.layers.1.linear1.weight: 13