In [1]:
# Minimal imports for SMPL-X -> vertices -> Renderer (headless-friendly)
import os
# change base folder
os.chdir('../')
import numpy as np
import torch
import cv2
import pickle
import smplx
from renderer.renderer import Renderer
from renderer.util import batch_orth_proj

In [2]:
base_path = "/mnt/GUAVA/assets/example/tracked_video/6gvP8f5WQyo__056/"

In [3]:

#shape_path = base_path + "id_share_params.pkl"

#with open(shape_path, "rb") as f:
#    data = pickle.load(f)


#print("Available keys:", list(data.keys()))

#for key, value in data.items():
#    if isinstance(value, np.ndarray):
#        print(f"{key}: shape={value.shape}, dtype={value.dtype}")
#    else:
#        print(f"{key}: type={type(value)} -> {value}")

In [4]:
tracking_path = base_path + "optim_tracking_ehm.pkl"

with open(tracking_path, "rb") as f:
    data = pickle.load(f)

# Print the keys of the dictionary to see what it contains
print(data.keys())
print(data['frame_000000'].keys())

dict_keys(['frame_000000', 'frame_000001', 'frame_000002', 'frame_000003', 'frame_000004', 'frame_000005', 'frame_000006', 'frame_000007', 'frame_000008', 'frame_000009', 'frame_000010', 'frame_000011', 'frame_000012', 'frame_000013', 'frame_000014', 'frame_000015', 'frame_000016', 'frame_000017', 'frame_000018', 'frame_000019', 'frame_000020', 'frame_000021', 'frame_000022', 'frame_000023', 'frame_000024', 'frame_000025', 'frame_000026', 'frame_000027', 'frame_000028', 'frame_000029', 'frame_000030', 'frame_000031', 'frame_000032', 'frame_000033', 'frame_000034', 'frame_000035', 'frame_000036', 'frame_000037', 'frame_000038', 'frame_000039', 'frame_000040', 'frame_000041', 'frame_000042', 'frame_000043', 'frame_000044', 'frame_000045', 'frame_000046', 'frame_000047', 'frame_000048', 'frame_000049', 'frame_000050', 'frame_000051', 'frame_000052', 'frame_000053', 'frame_000054', 'frame_000055', 'frame_000056', 'frame_000057', 'frame_000058', 'frame_000059', 'frame_000060', 'frame_000061

In [5]:
# Check what's actually in the nested structure
sample_frame = data['frame_000000']
print("Keys in frame:", sample_frame.keys())
print("\nKeys in smplx_coeffs:", sample_frame['smplx_coeffs'].keys())
print("\n=== Detailed structure of smplx_coeffs ===")
for key, value in sample_frame['smplx_coeffs'].items():
    if isinstance(value, np.ndarray):
        print(f"{key}: shape={value.shape}, dtype={value.dtype}, first few values: {value.flatten()[:5]}")
    else:
        print(f"{key}: type={type(value)} -> {value}")

Keys in frame: dict_keys(['body_crop', 'dwpose_raw', 'dwpose_rlt', 'smplx_coeffs', 'head_crop', 'head_lmk_203', 'head_lmk_70', 'head_lmk_mp', 'flame_coeffs', 'left_mano_coeffs', 'left_hand_crop', 'right_mano_coeffs', 'right_hand_crop'])

Keys in smplx_coeffs: dict_keys(['exp', 'global_pose', 'body_pose', 'body_cam', 'camera_RT_params', 'left_hand_pose', 'right_hand_pose'])

=== Detailed structure of smplx_coeffs ===
exp: shape=(50,), dtype=float32, first few values: [1.2100561  0.5304717  0.11870743 0.19686127 0.08652326]
global_pose: shape=(3,), dtype=float32, first few values: [ 2.9868624   0.06696548 -0.25063005]
body_pose: shape=(21, 3), dtype=float32, first few values: [-0.05537486  0.08022741  0.00862682 -0.0110795  -0.00271257]
body_cam: shape=(3,), dtype=float32, first few values: [1.9841318  0.05343538 0.83356845]
camera_RT_params: shape=(3, 4), dtype=float32, first few values: [-0.9999938  -0.00333516  0.00106661 -0.04895249  0.00326747]
left_hand_pose: shape=(15, 3), dtype=f

In [6]:
# Build SMPL-X model, parse coeffs, and prep camera/render context (no PCA hands, keep 45D)
from pathlib import Path

# Files
shape_path = base_path + 'id_share_params.pkl'
tracking_path = base_path + 'optim_tracking_ehm.pkl'

# Load shape data if available, otherwise use defaults
if Path(shape_path).exists():
    with open(shape_path, 'rb') as f:
        shape_data = pickle.load(f)
    betas_np = np.asarray(shape_data.get('smplx_shape'))  # often (1, 200)
    betas_np = betas_np.reshape(1, -1) if betas_np.ndim == 1 else betas_np
    print('Loaded betas from file')
else:
    # Use default neutral shape (zeros)
    betas_np = np.zeros((1, 10), dtype=np.float32)
    print('Shape file not found, using default neutral betas')

betas_full = torch.from_numpy(betas_np).float()
print('Betas provided:', betas_full.shape[1])

# Load tracking data
with open(tracking_path, 'rb') as f:
    tracking = pickle.load(f)

frame_keys = sorted([k for k in tracking.keys() if k.startswith('frame_')])
assert len(frame_keys) > 0, 'No frames found in tracking PKL'
print('Frames:', len(frame_keys))

# Inspect first frame to determine expression dim
def get_inner_coeffs(fd):
    return fd['smplx_coeffs'] if isinstance(fd, dict) and 'smplx_coeffs' in fd else fd

sample = get_inner_coeffs(tracking[frame_keys[0]])

def get_len(d, keys, default=None):
    for k in keys:
        if k in d and d[k] is not None:
            v = np.asarray(d[k]).reshape(-1)
            return v.shape[0]
    return default

expr_dim_in = get_len(sample, ['expression','expr'], 50)
body_dim_in = get_len(sample, ['body_pose','body','pose_body'], 63)
print(f'Found dims -> expr:{expr_dim_in} body:{body_dim_in}')

# Create SMPL-X model (always axis-angle hands 45D, keep it simple)
smplx_model_dir = '/mnt/fasttalk_upperbody/'  # user-specified
assert Path(smplx_model_dir).exists(), f'Missing SMPLX assets at {smplx_model_dir}'
model = smplx.create(
    smplx_model_dir,
    model_type='smplx', gender='neutral', ext='npz',
    num_betas=10,             # SMPL-X default
    num_expression_coeffs=10, # SMPL-X default
    use_pca=False,            # no PCA hands
    flat_hand_mean=True       # use flat (open) hand mean for natural opening
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device).eval()

# Model-accepted sizes
n_betas_model = model.num_betas
n_exp_model   = model.num_expression_coeffs
print(f'Model expects -> betas:{n_betas_model} expr:{n_exp_model} hands:45 body:63')

# Conform betas to model
betas_used = betas_full[:, :n_betas_model] if betas_full.shape[1] >= n_betas_model else torch.cat([betas_full, torch.zeros(1, n_betas_model-betas_full.shape[1])], dim=1)

# Faces (triangles)
faces = np.load('/mnt/fasttalk_upperbody/smplx/smplx_faces.npy').astype(np.int64)  # user-specified
faces_t = torch.from_numpy(faces)[None, ...].to(device)

def to_row_tensor(arr):
    if arr is None:
        return None
    t = torch.from_numpy(np.asarray(arr).reshape(-1)).float()
    return t[None, :]

def fit_dim_row(t, target):
    if t is None:
        return torch.zeros(1, target).float()
    if t.shape[1] > target:
        return t[:, :target]
    if t.shape[1] < target:
        return torch.cat([t, torch.zeros(1, target - t.shape[1], dtype=t.dtype)], dim=1)
    return t

def first_val(d, keys):
    for k in keys:
        if k in d and d[k] is not None:
            return d[k]
    return None

def parse_frame(fd):
    # Align with project data_loader: use only smplx_coeffs entries, no MANO fallbacks
    d = get_inner_coeffs(fd)

    # Global/body/face
    go  = to_row_tensor(first_val(d, ['global_pose','global_orient','root_orient','orient']))
    bp  = to_row_tensor(first_val(d, ['body_pose','body','pose_body']))
    jp  = to_row_tensor(first_val(d, ['jaw_pose','jaw']))
    lep = to_row_tensor(first_val(d, ['leye_pose','left_eye_pose']))
    rep = to_row_tensor(first_val(d, ['reye_pose','right_eye_pose']))

    # Hands: strictly use SMPL-X hand pose from smplx_coeffs (axis-angle, 45D)
    lhp = to_row_tensor(d.get('left_hand_pose'))
    rhp = to_row_tensor(d.get('right_hand_pose'))

    exp = to_row_tensor(first_val(d, ['exp','expression','expr']))
    trn = to_row_tensor(first_val(d, ['transl','translation','trans']))

    # Fit dims (keep as flat row tensors, SMPL-X expects this)
    go  = fit_dim_row(go, 3)                         # (1,3)
    bp  = fit_dim_row(bp, 63)                        # (1,63)
    jp  = fit_dim_row(jp, 3)                         # (1,3)
    lep = fit_dim_row(lep, 3)                        # (1,3)
    rep = fit_dim_row(rep, 3)                        # (1,3)
    lhp = fit_dim_row(lhp, 45)                       # (1,45)
    rhp = fit_dim_row(rhp, 45)                       # (1,45)
    exp = fit_dim_row(exp, n_exp_model)              # (1,n_exp)
    trn = fit_dim_row(trn, 3)                        # (1,3)
    return {
        'global_orient': go,
        'body_pose': bp,
        'jaw_pose': jp,
        'leye_pose': lep,
        'reye_pose': rep,
        'left_hand_pose': lhp,
        'right_hand_pose': rhp,
        'expression': exp,
        'transl': trn,
    }

# First frame: vertices + camera frame (orthographic)
first = parse_frame(tracking[frame_keys[0]])
with torch.no_grad():
    out = model(
        betas=betas_used.to(device),
        global_orient=first['global_orient'].to(device),
        body_pose=first['body_pose'].to(device),
        jaw_pose=first['jaw_pose'].to(device),
        leye_pose=first['leye_pose'].to(device),
        reye_pose=first['reye_pose'].to(device),
        left_hand_pose=first['left_hand_pose'].to(device),
        right_hand_pose=first['right_hand_pose'].to(device),
        expression=first['expression'].to(device),
        transl=first['transl'].to(device),
    )
verts0_t = out.vertices  # [1, V, 3]

# Center and set a fixed scale for the whole clip
center_t = verts0_t.mean(dim=1, keepdim=True)
verts0_centered = verts0_t - center_t
scale = float(2.1 / (verts0_centered.abs().max().item() + 1e-6))
cam_params = torch.tensor([[scale, 0.0, 0.0]], dtype=torch.float32, device=device)
print('Init ready: verts0', tuple(verts0_t.shape), 'scale', scale)

Shape file not found, using default neutral betas
Betas provided: 10
Frames: 425
Found dims -> expr:50 body:63
Model expects -> betas:10 expr:10 hands:45 body:63
Model expects -> betas:10 expr:10 hands:45 body:63
Init ready: verts0 (1, 10475, 3) scale 1.6801521579389072
Init ready: verts0 (1, 10475, 3) scale 1.6801521579389072


In [7]:
# Debug: print model internal dimensions
print('=== Model internal dimensions ===')
print(f'shapedirs: {model.shapedirs.shape}')
print(f'expr_dirs: {model.expr_dirs.shape}')
print(f'num_betas: {model.num_betas}')
print(f'num_expression_coeffs: {model.num_expression_coeffs}')
print('='*40)

=== Model internal dimensions ===
shapedirs: torch.Size([10475, 3, 10])
expr_dirs: torch.Size([10475, 3, 10])
num_betas: 10
num_expression_coeffs: 10


In [8]:
verts0_t.shape

torch.Size([1, 10475, 3])

In [9]:
# Sanity check hands after parser simplification
fp = parse_frame(tracking[frame_keys[0]])
print('Left hand pose (len):', fp['left_hand_pose'].shape[1], 'first 6:', fp['left_hand_pose'][0, :6])
print('Right hand pose (len):', fp['right_hand_pose'].shape[1], 'first 6:', fp['right_hand_pose'][0, :6])

Left hand pose (len): 45 first 6: tensor([-0.0851,  0.1201, -1.3283,  0.2095, -0.2069, -0.6650])
Right hand pose (len): 45 first 6: tensor([ 0.0041, -0.3212,  0.8929,  0.4129,  0.2101,  1.1837])


In [10]:
# Setup FLAME model and FLAME->SMPL-X head fusion mapping
from flame_model.FLAME import FLAMEModel

# Detect FLAME dims from the first frame
flame_sample = tracking[frame_keys[0]].get('flame_coeffs', {})
def _get_len_np(d, keys, default=None):
    for k in keys:
        if k in d and d[k] is not None:
            v = np.asarray(d[k]).reshape(-1)
            return v.shape[0]
    return default

flame_n_shape = _get_len_np(flame_sample, ['shape_params','shape','betas'], 100)
flame_n_exp   = _get_len_np(flame_sample, ['expression','exp','expression_params'], 50)
print(f'FLAME dims -> shape:{flame_n_shape} exp:{flame_n_exp}')

# Create FLAME model (no landmarks needed for rendering)
flame = FLAMEModel(n_shape=flame_n_shape, n_exp=flame_n_exp, no_lmks=True).to(device).eval()

# Load mapping from FLAME vertices to SMPL-X vertices
fusion_enabled = True
map_path = '/mnt/fasttalk_upperbody/smplx/SMPL-X__FLAME_vertex_ids.npy'
try:
    mapping_raw = np.load(map_path, allow_pickle=True)
    smplx_idx = None
    flame_idx = None
    # Accept a variety of formats
    if isinstance(mapping_raw, np.ndarray) and mapping_raw.dtype != object:
        if mapping_raw.ndim == 2 and mapping_raw.shape[1] == 2:
            smplx_idx = mapping_raw[:, 0].astype(np.int64)
            flame_idx = mapping_raw[:, 1].astype(np.int64)
    elif isinstance(mapping_raw, np.ndarray) and mapping_raw.dtype == object:
        obj = mapping_raw.item() if mapping_raw.size == 1 else mapping_raw
        # Try common keys
        for k_smpl, k_fla in [('smplx', 'flame'), ('smplx_idx','flame_idx'), ('smplx_ids','flame_ids')]:
            if isinstance(obj, dict) and (k_smpl in obj) and (k_fla in obj):
                smplx_idx = np.asarray(obj[k_smpl]).astype(np.int64)
                flame_idx = np.asarray(obj[k_fla]).astype(np.int64)
                break
    if smplx_idx is None or flame_idx is None or len(smplx_idx) != len(flame_idx):
        print('WARN: Could not parse SMPL-X__FLAME mapping; disabling fusion.')
        fusion_enabled = False
    else:
        smplx_idx_t = torch.from_numpy(smplx_idx).long().to(device)
        flame_idx_t = torch.from_numpy(flame_idx).long().to(device)
        print(f'FLAME head fusion mapping loaded: {len(smplx_idx)} vertices')
except Exception as e:
    print('WARN: Failed to load mapping file:', e)
    fusion_enabled = False

def to_row_tensor_np(arr):
    if arr is None: return None
    t = torch.from_numpy(np.asarray(arr).reshape(-1)).float()
    return t[None, :]

def fit_dim_row_like(t, target):
    if t is None:
        return torch.zeros(1, target).float()
    if t.shape[1] > target:
        return t[:, :target]
    if t.shape[1] < target:
        return torch.cat([t, torch.zeros(1, target - t.shape[1], dtype=t.dtype)], dim=1)
    return t

def first_val_dict(d, keys):
    for k in keys:
        if k in d and d[k] is not None:
            return d[k]
    return None

def parse_flame(fd):
    d_smplx = get_inner_coeffs(fd)
    d_flame = fd.get('flame_coeffs', {}) if isinstance(fd, dict) else {}
    
    shp = to_row_tensor_np(first_val_dict(d_flame, ['shape_params','shape','betas']))
    exp = to_row_tensor_np(first_val_dict(d_flame, ['expression','exp','expression_params']))
    jaw = to_row_tensor_np(first_val_dict(d_flame, ['jaw_pose','jaw']))
    # Eyes: prefer FLAME if present, else fallback to SMPL-X
    lep = to_row_tensor_np(first_val_dict(d_flame, ['leye_pose','left_eye_pose']))
    rep = to_row_tensor_np(first_val_dict(d_flame, ['reye_pose','right_eye_pose']))
    if lep is None: lep = to_row_tensor_np(first_val(d_smplx, ['leye_pose','left_eye_pose']))
    if rep is None: rep = to_row_tensor_np(first_val(d_smplx, ['reye_pose','right_eye_pose']))
    # Global orient: reuse body global from SMPL-X to stay consistent
    go  = to_row_tensor_np(first_val(d_smplx, ['global_pose','global_orient','root_orient','orient']))
    
    # Fit dims
    shp = fit_dim_row_like(shp, flame_n_shape)
    exp = fit_dim_row_like(exp, flame_n_exp)
    jaw = fit_dim_row_like(jaw, 3)
    lep = fit_dim_row_like(lep, 3)
    rep = fit_dim_row_like(rep, 3)
    go  = fit_dim_row_like(go, 3)
    
    pose = torch.cat([go, jaw], dim=1)   # (1,6)
    eye  = torch.cat([lep, rep], dim=1)  # (1,6)
    return {
        'shape_params': shp,
        'expression_params': exp,
        'pose_params': pose,
        'eye_pose_params': eye,
    }

FLAME dims -> shape:100 exp:50
WARN: Could not parse SMPL-X__FLAME mapping; disabling fusion.


In [11]:
# Render full sequence to MP4 using the in-repo Renderer class + optional FLAME head fusion
import os, sys, subprocess, shutil
class SMPLXRenderer:
    def __init__(self, image_size=800):
        from renderer.renderer import Renderer
        self.rasterize_fn = Renderer.rasterize
        self.add_directionlight_fn = Renderer.add_directionlight
        self.image_size = image_size
    def render_smplx(self, vertices, cam_params, faces, device):
        from renderer.util import vertex_normals, face_vertices
        batch_size = vertices.shape[0]
        transformed_vertices = batch_orth_proj(vertices, cam_params)
        light_positions = torch.tensor([
            [-1, -1, -1], [1, -1, -1], [-1, +1, -1], [1, +1, -1], [0, 0, -1]
        ])[None, :, :].expand(batch_size, -1, -1).float()
        light_intensities = torch.ones_like(light_positions).float() * 1.7
        lights = torch.cat((light_positions, light_intensities), 2).to(device)
        transformed_vertices = transformed_vertices.clone()
        transformed_vertices[:, :, 2] = transformed_vertices[:, :, 2] + 10
        normals = vertex_normals(vertices, faces)
        face_normals = face_vertices(normals, faces)
        colors = torch.tensor([12, 156, 91])[None, None, :].repeat(1, vertices.shape[1], 1).float() / 255.0
        colors = colors.to(device)
        face_colors = face_vertices(colors, faces[0:1] if faces.shape[0] == 1 else faces)
        face_colors = face_colors.expand(batch_size, -1, -1, -1)
        attributes = torch.cat([face_colors, face_normals], -1)
        rendering = self.rasterize_fn(self, transformed_vertices, faces, attributes)
        albedo_images = rendering[:, :3, :, :]
        normal_images = rendering[:, 3:6, :, :]
        shading = self.add_directionlight_fn(self, normal_images.permute(0, 2, 3, 1).reshape([batch_size, -1, 3]), lights)
        shading_images = shading.reshape([batch_size, albedo_images.shape[2], albedo_images.shape[3], 3]).permute(0, 3, 1, 2).contiguous()
        shaded_images = albedo_images * shading_images
        return shaded_images

# Guard: make FLAME fusion optional if setup cell wasn't run
fusion_enabled = bool(globals().get('fusion_enabled', False))
smplx_idx_t = globals().get('smplx_idx_t', None)
flame_idx_t = globals().get('flame_idx_t', None)
if fusion_enabled and (smplx_idx_t is None or flame_idx_t is None):
    print('WARN: FLAME mapping tensors not found in session; disabling fusion.')
    fusion_enabled = False

renderer = SMPLXRenderer(image_size=800)
demo_path = '/mnt/fasttalk_upperbody/demo'
os.makedirs(demo_path, exist_ok=True)
out_video_renderer = os.path.join(demo_path, 'smplx_flame_fused.mp4')
fps = 25
res = 800
use_ffmpeg = shutil.which('ffmpeg') is not None
ffmpeg_proc = None
if use_ffmpeg:
    print('Using ffmpeg (libx264) for H.264 encoding')
    ffmpeg_cmd = ['ffmpeg','-y','-f','rawvideo','-vcodec','rawvideo','-pix_fmt','rgb24','-s',f'{res}x{res}','-r',str(fps),'-i','-','-an','-vcodec','libx264','-pix_fmt','yuv420p','-preset','veryfast','-crf','18',out_video_renderer]
    ffmpeg_proc = subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE)
else:
    print('ffmpeg not found; falling back to OpenCV (mp4v)')
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(out_video_renderer, fourcc, fps, (res, res), True)
    if not writer.isOpened(): raise RuntimeError('OpenCV VideoWriter init failed')
print(f'Rendering {len(frame_keys)} frames (FLAME fusion={fusion_enabled})...')
with torch.no_grad():
    for idx, fk in enumerate(frame_keys):  # slice e.g., frame_keys[:10] for a quick preview
        coeffs = parse_frame(tracking[fk])
        flame_coeffs = parse_flame(tracking[fk]) if fusion_enabled else None
        # SMPL-X full body
        smplx_out = model(betas=betas_used.to(device),
                          global_orient=coeffs['global_orient'].to(device),
                          body_pose=coeffs['body_pose'].to(device),
                          jaw_pose=coeffs['jaw_pose'].to(device),
                          leye_pose=coeffs['leye_pose'].to(device),
                          reye_pose=coeffs['reye_pose'].to(device),
                          left_hand_pose=coeffs['left_hand_pose'].to(device),
                          right_hand_pose=coeffs['right_hand_pose'].to(device),
                          expression=coeffs['expression'].to(device),
                          transl=coeffs['transl'].to(device))
        verts_body = smplx_out.vertices  # [1, Vb, 3]
        if fusion_enabled and flame_coeffs is not None and (smplx_idx_t is not None) and (flame_idx_t is not None):
            f_out = flame(shape_params=flame_coeffs['shape_params'].to(device),
                          expression_params=flame_coeffs['expression_params'].to(device),
                          pose_params=flame_coeffs['pose_params'].to(device),
                          eye_pose_params=flame_coeffs['eye_pose_params'].to(device))
            verts_flame = f_out[0] if isinstance(f_out, tuple) else f_out  # [1, Vf, 3] or [Vf,3]
            if verts_flame.ndim == 2: verts_flame = verts_flame.unsqueeze(0)
            # Replace SMPL-X head vertices with FLAME ones via mapping
            try:
                verts_fused = verts_body.clone()
                verts_fused[:, smplx_idx_t, :] = verts_flame[:, flame_idx_t, :]
            except Exception as e:
                if idx == 0: print('Fusion error, disabling fusion:', e)
                fusion_enabled = False
                verts_fused = verts_body
        else:
            verts_fused = verts_body
        # Center relative to initial frame
        img_t = renderer.render_smplx(verts_fused - center_t, cam_params, faces_t, device)
        img_rgb = (img_t[0].detach().cpu().permute(1, 2, 0).numpy().clip(0, 1) * 255).astype(np.uint8)
        if use_ffmpeg: ffmpeg_proc.stdin.write(img_rgb.tobytes())
        else: writer.write(img_rgb[:, :, ::-1])
        if (idx + 1) % 5 == 0 or (idx + 1) == len(frame_keys):
            print(f'  Rendered {idx + 1}/{len(frame_keys)} frames')
if use_ffmpeg:
    ffmpeg_proc.stdin.close(); ffmpeg_proc.wait()
else:
    writer.release()
print(f'✓ Saved video to {out_video_renderer}')

Using ffmpeg (libx264) for H.264 encoding
Rendering 425 frames (FLAME fusion=False)...


ffmpeg version 9c33b2f Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/root/miniconda3/envs/fasttalk --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-libx264 --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/pkg-config
  libavutil      56. 51.100 / 56. 51.100
  libavcodec     58. 91.100 / 58. 91.100
  libavformat    58. 45.100 / 58. 45.100
  libavdevice    58. 10.100 / 58. 10.100
  libavfilter     7. 85.100 /  7. 85.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  7.100 /  5.  7.100
  libswresample   3.  7.100 / 

  Rendered 5/425 frames
  Rendered 10/425 frames
  Rendered 15/425 frames
  Rendered 20/425 frames
  Rendered 15/425 frames
  Rendered 20/425 frames


frame=   16 fps=0.0 q=0.0 size=       0kB time=00:00:00.00 bitrate=N/A speed=   0x    

  Rendered 25/425 frames
  Rendered 30/425 frames


frame=   32 fps= 31 q=0.0 size=       0kB time=00:00:00.00 bitrate=N/A speed=   0x    

  Rendered 35/425 frames
  Rendered 40/425 frames
  Rendered 45/425 frames
  Rendered 50/425 frames
  Rendered 45/425 frames
  Rendered 50/425 frames


frame=   49 fps= 31 q=23.0 size=       0kB time=00:00:00.28 bitrate=   1.4kbits/s speed=0.179x    

  Rendered 55/425 frames
  Rendered 60/425 frames
  Rendered 65/425 frames
  Rendered 70/425 frames
  Rendered 65/425 frames
  Rendered 70/425 frames


frame=   65 fps= 31 q=23.0 size=       0kB time=00:00:00.92 bitrate=   0.4kbits/s speed=0.442x    

  Rendered 75/425 frames
  Rendered 80/425 frames


frame=   81 fps= 31 q=23.0 size=       0kB time=00:00:01.56 bitrate=   0.2kbits/s speed=0.602x    

  Rendered 85/425 frames
  Rendered 90/425 frames
  Rendered 95/425 frames
  Rendered 100/425 frames
  Rendered 95/425 frames
  Rendered 100/425 frames


frame=   97 fps= 31 q=23.0 size=       0kB time=00:00:02.20 bitrate=   0.2kbits/s speed=0.711x    

  Rendered 105/425 frames
  Rendered 110/425 frames


frame=  114 fps= 32 q=23.0 size=       0kB time=00:00:02.88 bitrate=   0.1kbits/s speed=0.799x    

  Rendered 115/425 frames
  Rendered 120/425 frames
  Rendered 125/425 frames
  Rendered 130/425 frames
  Rendered 125/425 frames
  Rendered 130/425 frames


frame=  131 fps= 32 q=23.0 size=     256kB time=00:00:03.56 bitrate= 589.2kbits/s speed=0.865x    

  Rendered 135/425 frames
  Rendered 140/425 frames
  Rendered 145/425 frames
  Rendered 150/425 frames
  Rendered 145/425 frames
  Rendered 150/425 frames


frame=  148 fps= 32 q=23.0 size=     256kB time=00:00:04.24 bitrate= 494.7kbits/s speed=0.913x    

  Rendered 155/425 frames
  Rendered 160/425 frames


frame=  164 fps= 32 q=23.0 size=     256kB time=00:00:04.88 bitrate= 429.8kbits/s speed=0.946x    

  Rendered 165/425 frames
  Rendered 170/425 frames
  Rendered 175/425 frames
  Rendered 180/425 frames
  Rendered 175/425 frames
  Rendered 180/425 frames


frame=  181 fps= 32 q=23.0 size=     256kB time=00:00:05.56 bitrate= 377.2kbits/s speed=0.979x    

  Rendered 185/425 frames
  Rendered 190/425 frames
  Rendered 195/425 frames
  Rendered 200/425 frames
  Rendered 195/425 frames
  Rendered 200/425 frames


frame=  199 fps= 32 q=23.0 size=     512kB time=00:00:06.28 bitrate= 667.9kbits/s speed=1.01x    

  Rendered 205/425 frames
  Rendered 210/425 frames
  Rendered 215/425 frames
  Rendered 220/425 frames
  Rendered 215/425 frames
  Rendered 220/425 frames


frame=  218 fps= 33 q=23.0 size=     512kB time=00:00:07.04 bitrate= 595.8kbits/s speed=1.05x    

  Rendered 225/425 frames
  Rendered 230/425 frames
  Rendered 235/425 frames
  Rendered 240/425 frames
  Rendered 235/425 frames
  Rendered 240/425 frames


frame=  238 fps= 33 q=23.0 size=     512kB time=00:00:07.84 bitrate= 535.0kbits/s speed=1.09x    

  Rendered 245/425 frames
  Rendered 250/425 frames
  Rendered 255/425 frames
  Rendered 260/425 frames
  Rendered 255/425 frames
  Rendered 260/425 frames


frame=  258 fps= 33 q=23.0 size=     512kB time=00:00:08.64 bitrate= 485.5kbits/s speed=1.12x    

  Rendered 265/425 frames
  Rendered 270/425 frames
  Rendered 275/425 frames
  Rendered 280/425 frames
  Rendered 275/425 frames
  Rendered 280/425 frames


frame=  278 fps= 34 q=23.0 size=     768kB time=00:00:09.44 bitrate= 666.5kbits/s speed=1.15x    

  Rendered 285/425 frames
  Rendered 290/425 frames
  Rendered 295/425 frames
  Rendered 300/425 frames
  Rendered 295/425 frames
  Rendered 300/425 frames


frame=  299 fps= 34 q=23.0 size=     768kB time=00:00:10.28 bitrate= 612.0kbits/s speed=1.18x    

  Rendered 305/425 frames
  Rendered 310/425 frames
  Rendered 315/425 frames
  Rendered 320/425 frames
  Rendered 315/425 frames
  Rendered 320/425 frames


frame=  320 fps= 35 q=23.0 size=     768kB time=00:00:11.12 bitrate= 565.8kbits/s speed= 1.2x    

  Rendered 325/425 frames
  Rendered 330/425 frames
  Rendered 335/425 frames
  Rendered 340/425 frames
  Rendered 335/425 frames
  Rendered 340/425 frames


frame=  340 fps= 35 q=23.0 size=     768kB time=00:00:11.92 bitrate= 527.8kbits/s speed=1.22x    

  Rendered 345/425 frames
  Rendered 350/425 frames
  Rendered 355/425 frames
  Rendered 360/425 frames
  Rendered 355/425 frames
  Rendered 360/425 frames


frame=  361 fps= 35 q=23.0 size=    1024kB time=00:00:12.76 bitrate= 657.4kbits/s speed=1.24x    

  Rendered 365/425 frames
  Rendered 370/425 frames
  Rendered 375/425 frames
  Rendered 380/425 frames
  Rendered 375/425 frames
  Rendered 380/425 frames


frame=  378 fps= 35 q=23.0 size=    1024kB time=00:00:13.44 bitrate= 624.2kbits/s speed=1.24x    

  Rendered 385/425 frames
  Rendered 390/425 frames
  Rendered 395/425 frames
  Rendered 400/425 frames
  Rendered 395/425 frames
  Rendered 400/425 frames


frame=  397 fps= 35 q=23.0 size=    1024kB time=00:00:14.20 bitrate= 590.8kbits/s speed=1.25x    

  Rendered 405/425 frames
  Rendered 410/425 frames


frame=  414 fps= 35 q=23.0 size=    1024kB time=00:00:14.88 bitrate= 563.8kbits/s speed=1.26x    

  Rendered 415/425 frames
  Rendered 420/425 frames
  Rendered 425/425 frames
✓ Saved video to /mnt/fasttalk_upperbody/demo/smplx_flame_fused.mp4
  Rendered 425/425 frames
✓ Saved video to /mnt/fasttalk_upperbody/demo/smplx_flame_fused.mp4


frame=  425 fps= 35 q=-1.0 Lsize=    1364kB time=00:00:16.88 bitrate= 661.9kbits/s speed=1.37x    
video:1358kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.427863%
[libx264 @ 0x55d85bc5e1c0] frame I:2     Avg QP: 6.37  size: 13450
[libx264 @ 0x55d85bc5e1c0] frame P:107   Avg QP:16.17  size:  5975
[libx264 @ 0x55d85bc5e1c0] frame B:316   Avg QP:19.37  size:  2290
[libx264 @ 0x55d85bc5e1c0] consecutive B-frames:  0.7%  0.5%  0.0% 98.8%
[libx264 @ 0x55d85bc5e1c0] mb I  I16..4: 75.7% 12.3% 12.0%
[libx264 @ 0x55d85bc5e1c0] mb P  I16..4:  3.2%  4.0%  1.9%  P16..4:  6.8%  5.6%  3.0%  0.0%  0.0%    skip:75.5%
[libx264 @ 0x55d85bc5e1c0] mb B  I16..4:  0.5%  0.2%  0.1%  B16..8:  8.1%  4.3%  0.8%  direct: 2.7%  skip:83.3%  L0:46.3% L1:42.3% BI:11.4%
[libx264 @ 0x55d85bc5e1c0] 8x8 transform intra:36.8% inter:7.3%
[libx264 @ 0x55d85bc5e1c0] coded y,uvDC,uvAC intra: 39.4% 59.8% 34.3% inter: 2.2% 4.6% 0.9%
[libx264 @ 0x55d85bc5e1c0] i16 v,h,dc,p: 74%  8%  6% 12%
[li

In [12]:
# Inspect hand-related coefficients across a few frames to diagnose parsing issues
frames_to_check = frame_keys[:8]
report = []
for fk in frames_to_check:
    fr = tracking[fk]
    smplx_part = fr.get('smplx_coeffs', {})
    l_mano = fr.get('left_mano_coeffs', {})
    r_mano = fr.get('right_mano_coeffs', {})
    def shape_of(x):
        try:
            arr = np.asarray(x)
            return tuple(arr.shape), arr.flatten()[:6].tolist()
        except Exception:
            return None, None
    entry = {
        'frame': fk,
        'smplx_left_hand_pose_shape': shape_of(smplx_part.get('left_hand_pose'))[0],
        'smplx_right_hand_pose_shape': shape_of(smplx_part.get('right_hand_pose'))[0],
        'mano_left_keys': list(l_mano.keys()) if isinstance(l_mano, dict) else None,
        'mano_right_keys': list(r_mano.keys()) if isinstance(r_mano, dict) else None,
        'mano_left_axisang_shape': shape_of(l_mano.get('hand_pose_axisang'))[0],
        'mano_right_axisang_shape': shape_of(r_mano.get('hand_pose_axisang'))[0],
        'mano_left_generic_shape': shape_of(l_mano.get('hand_pose'))[0],
        'mano_right_generic_shape': shape_of(r_mano.get('hand_pose'))[0],
    }
    report.append(entry)

for r in report:
    print(r)

# Also print a single example of the raw left/right hand arrays for first frame for deeper look
first_frame = tracking[frame_keys[0]]
lf_raw = first_frame.get('smplx_coeffs', {}).get('left_hand_pose')
rf_raw = first_frame.get('smplx_coeffs', {}).get('right_hand_pose')
print('\nFirst frame left_hand_pose length:', None if lf_raw is None else len(np.asarray(lf_raw).reshape(-1)))
print('First frame right_hand_pose length:', None if rf_raw is None else len(np.asarray(rf_raw).reshape(-1)))
print('First frame left_hand_pose first 9 vals:', None if lf_raw is None else np.asarray(lf_raw).reshape(-1)[:9])
print('First frame right_hand_pose first 9 vals:', None if rf_raw is None else np.asarray(rf_raw).reshape(-1)[:9])

{'frame': 'frame_000000', 'smplx_left_hand_pose_shape': (15, 3), 'smplx_right_hand_pose_shape': (15, 3), 'mano_left_keys': ['pred_cam', 'global_orient', 'hand_pose', 'pred_cam_t', 'focal_length', 'camera_RT_params'], 'mano_right_keys': ['pred_cam', 'global_orient', 'hand_pose', 'pred_cam_t', 'focal_length', 'camera_RT_params'], 'mano_left_axisang_shape': (), 'mano_right_axisang_shape': (), 'mano_left_generic_shape': (1, 15, 3, 3), 'mano_right_generic_shape': (1, 15, 3, 3)}
{'frame': 'frame_000001', 'smplx_left_hand_pose_shape': (15, 3), 'smplx_right_hand_pose_shape': (15, 3), 'mano_left_keys': ['pred_cam', 'global_orient', 'hand_pose', 'pred_cam_t', 'focal_length', 'camera_RT_params'], 'mano_right_keys': ['pred_cam', 'global_orient', 'hand_pose', 'pred_cam_t', 'focal_length', 'camera_RT_params'], 'mano_left_axisang_shape': (), 'mano_right_axisang_shape': (), 'mano_left_generic_shape': (1, 15, 3, 3), 'mano_right_generic_shape': (1, 15, 3, 3)}
{'frame': 'frame_000002', 'smplx_left_hand_p