In [1]:
# imports
# from modeling_pretrain import PretrainVisionTransformerMultiOutout
from run_class_finetuning import get_args as args_parser
from run_videomae_vis_v2 import DataAugmentationForVideoMAEInference, get_model, load_frames, save_video
import yaml
import os.path as osp
import os
from types import SimpleNamespace as Namespace
from typing import List, Union
from timm.models import create_model
import torch
from utils import load_state_dict, time_function_decorator
from torch import sigmoid as logit
from mpigroup.const import LABELS as LABELS_MAP
import pandas as pd
from PIL import Image, ImageDraw, ImageFont

  def vit_small_patch16_224(pretrained=False, **kwargs):
  def vit_base_patch16_224(pretrained=False, **kwargs):
  def vit_base_patch16_384(pretrained=False, **kwargs):
  def vit_large_patch16_224(pretrained=False, **kwargs):
  def vit_large_patch16_384(pretrained=False, **kwargs):


In [2]:
# Show the class-index -> label-name mapping; its values are used as the column labels in run_inference.
LABELS_MAP

{0: 'Adjusting_clothing',
 1: 'Fold_arms',
 2: 'Fumble',
 3: 'Gesture',
 4: 'Groom',
 5: 'Hand_face',
 6: 'Hand_mouth',
 7: 'Leg_movement',
 8: 'Legs_crossed',
 9: 'Scratch',
 10: 'Settle',
 11: 'Shrug',
 12: 'Smearing_hands',
 13: 'Stretching'}

In [3]:
# Paths and constants.
# Location of the YAML inference config, expressed relative to the current
# working directory; it is parsed by get_args() further down.
config_path = osp.join('..','model_configs','mpigroup_multiclass_inference_debug.yaml')
# Echo the joined path (the separator style depends on the host OS).
config_path

'..\\model_configs\\mpigroup_multiclass_inference_debug.yaml'

In [4]:
def pars_path(p: Union[str, List[Union[List, str]]]):
    """Collapse a (possibly nested) list of path components into one path string.

    Args:
        p: Either a ready-made path string, or a list whose items are path
            components (strings) or further nested lists of components.

    Returns:
        ``p`` unchanged when it is a string, ``''`` when it is an empty list,
        otherwise all components joined with ``os.path.join``.

    Raises:
        TypeError: If ``p`` is neither a list nor a string, or if a list
            contains an item that is neither a list nor a string.
    """
    # Explicit raise instead of `assert`: the check stays active under
    # `python -O` and callers get the documented TypeError.
    if not isinstance(p, (list, str)):
        raise TypeError("p must be a List or a str")

    # A plain string is already a path.
    if isinstance(p, str):
        return p

    # An empty list maps to an empty path.
    if len(p) == 0:
        return ''

    components = []
    for item in p:
        if isinstance(item, list):
            # Nested lists are flattened recursively into a sub-path.
            components.append(pars_path(item))
        elif isinstance(item, str):
            components.append(item)
        else:
            # Report the offending type in the error itself rather than printing it.
            raise TypeError("Invalid type in nested list: %s" % type(item).__name__)

    return os.path.join(*components)
def get_args(yaml_path):
    """Load the ``finetuning_params`` section of a YAML config as a namespace.

    List-valued entries are treated as nested path components and collapsed
    into a single path string with ``pars_path``; lists whose first element
    is a float are kept as lists. An empty list becomes ``''`` (the value
    ``pars_path`` returns for an empty list).

    Args:
        yaml_path: Path to the YAML file; it must contain a top-level
            ``finetuning_params`` mapping.

    Returns:
        Namespace exposing every key of ``finetuning_params`` as an attribute.

    Raises:
        KeyError: If the file has no ``finetuning_params`` key.
        TypeError: If a list value contains items ``pars_path`` cannot join.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(yaml_path, 'r') as config_file:
        loaded_config = yaml.safe_load(config_file)
    finetuning_params = loaded_config['finetuning_params']

    # Build a fresh dict instead of rewriting the one being iterated over.
    parsed_params = {}
    for k, v in finetuning_params.items():
        if isinstance(v, list):
            # Guard the v[0] lookup so an empty list cannot raise IndexError.
            is_float_list = len(v) > 0 and isinstance(v[0], float)
            if not is_float_list:
                v = pars_path(v)
        parsed_params[k] = v

    return Namespace(**parsed_params)
# Parse the YAML config once; `args` is the attribute-style namespace the later cells read from.
args = get_args(config_path)

In [5]:
# load model
# Build the architecture named in the config through timm's create_model factory.
# pretrained=False: the weights come from the fine-tuned checkpoint loaded in the next cell.
model = create_model(
    args.model,
    pretrained=False,
    num_classes=args.nb_classes,
    # Product of the configured num_frames and num_segments.
    all_frames=args.num_frames * args.num_segments,
    tubelet_size=args.tubelet_size,
    fc_drop_rate=args.fc_drop_rate,
    drop_rate=args.drop,
    drop_path_rate=args.drop_path,
    attn_drop_rate=args.attn_drop_rate,
    drop_block_rate=None,
    use_checkpoint=args.use_checkpoint,
    use_mean_pooling=args.use_mean_pooling,
    init_scale=args.init_scale,
    )

In [6]:
# load checkpoint
# NOTE(review): `device` is created here but no visible cell moves the model to it;
# the inference call below passes device='cpu' explicitly.
device = torch.device(args.device)
# p = "D:\\Project-mpg microgesture\\human_micro_gesture_classifier\\scripts\\MPIIGroupInteraction\\videomae_vit_base_patch16_224_kinetic_400_densepose_dual\\outputs\\checkpoint-best\\mp_rank_00_model_states.pt"
# SECURITY: torch.load unpickles the file, so only open checkpoints from a trusted source.
# map_location='cpu' remaps every stored tensor onto the CPU while loading.
checkpoint = torch.load(args.finetune, map_location='cpu')
# checkpoint = torch.load(p, map_location='cpu')
# The model weights are read from the 'module' entry of the checkpoint.
checkpoint_model = checkpoint['module']
load_state_dict(model, checkpoint_model)
# model_gpu = model.to(device)
# model_cpu = model.to('cpu')
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()
# model_gpu.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=False)
        (attn_drop): Dropout(p=0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn

In [7]:
# Spatial patch size (two entries) read from the model's patch-embedding layer.
patch_size = model.patch_embed.patch_size
print("Patch size = %s" % str(patch_size))
# Token grid as (time, height, width). Each temporal token spans `tubelet_size`
# frames, so the divisor comes from the config instead of a hard-coded 2.
args.window_size = (
    args.num_frames // args.tubelet_size,
    args.input_size // patch_size[0],
    args.input_size // patch_size[1],
)
args.patch_size = patch_size

Patch size = (16, 16)


In [8]:
# load video

# path_to_video = "D:\\Project-mpg microgesture\\human_micro_gesture_classifier\\video_samples_results\\MPIG_densepose_dual_2\\checkpoint-99\\MPIIGroupInteraction\clips_train\\00000-video\\videos\\ori_vid.mp4"

# NOTE(review): absolute, machine-specific path; consider moving it into the YAML config.
path_to_video = "D:\\Project-mpg microgesture\\imigue_rgb_phase1\\iMiGUE_RGB_Phase1\\imigue_rgb_train\\0012\\0012.mp4"

# Inference-time preprocessing pipeline, configured from `args`.
transforms = DataAugmentationForVideoMAEInference(args)
# vid = load_frames(img_path=path_to_video,num_frames=16, transformations=transforms, frame_id_list=range(16))[0]
# Only the first element of load_frames' return value is kept.
# NOTE(review): num_frames=16 is hard-coded; presumably it should match args.num_frames - confirm.
vid = load_frames(img_path=path_to_video,num_frames=16, transformations=transforms, frame_id_list=None)[0]


In [9]:
def transform_video(video_data, transformations):
    """Convert a raw frame array into a batched clip tensor.

    Args:
        video_data: Array of frames indexed as ``video_data[frame, :, :, :]``;
            each frame must be accepted by ``Image.fromarray``
            (presumably H x W x 3 uint8 - TODO confirm against the caller).
        transformations: Callable taking ``(list_of_frames, None)`` and
            returning ``(tensor, _)`` with the tensor laid out as (T*C, H, W).

    Returns:
        Tensor of shape (1, 3, T, H, W), where T is ``video_data.shape[0]``.
    """
    n_frames = video_data.shape[0]
    # Walk the frame indices with range(); n_frames is an int, so it cannot
    # be passed to enumerate().
    frames = [
        Image.fromarray(video_data[frame_idx, :, :, :]).convert('RGB')
        for frame_idx in range(n_frames)
    ]
    # Perform transformations on the frames - resizing, normalization, reshape
    clip, _ = transformations((frames, None))  # T*C,H,W
    # T*C,H,W -> T,C,H,W -> C,T,H,W
    clip = clip.view((n_frames, 3) + clip.size()[-2:]).transpose(0, 1)
    # Prepend the batch dimension: C,T,H,W -> 1,C,T,H,W
    return clip.unsqueeze(0)

In [10]:
# run inference
@time_function_decorator
def run_inference(model, vid, device='cpu'):
    """Run one forward pass and return the sigmoid scores per label.

    Args:
        model: Callable invoked as ``model(vid)``.
            NOTE(review): only the input is moved to ``device`` here; the
            model is assumed to already live on that device - confirm.
        vid: Input tensor for the model.
        device: Device the input is moved to before the forward pass.

    Returns:
        DataFrame whose rows are the labels in ``LABELS_MAP`` and whose
        columns are the items of the batch, holding sigmoid-activated outputs.
    """
    vid = vid.to(device)
    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        out = model(vid)
    # `logit` is this notebook's alias for torch.sigmoid, so these values are
    # sigmoid outputs in [0, 1], not raw logits.
    scores = logit(out).detach().cpu().tolist()
    df = pd.DataFrame(scores, columns=list(LABELS_MAP.values())).transpose()
    return df

In [11]:

# Single forward pass with the input on the CPU; the decorator on run_inference prints the elapsed time.
df = run_inference(model=model, vid=vid, device='cpu')
# Display the scores, one row per label.
df



Function run_inference took: 2.3477602005004883 seconds to run


Unnamed: 0,0
Adjusting_clothing,0.836331
Fold_arms,0.0161
Fumble,0.024937
Gesture,0.1294
Groom,0.178861
Hand_face,0.017553
Hand_mouth,0.012834
Leg_movement,0.499612
Legs_crossed,0.647438
Scratch,0.019225


In [6]:
# visualize results