In [75]:
import av
import torch
from torch.nn import functional as F
import numpy as np
from easydict import EasyDict as edict

from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from transformers import CLIPProcessor, CLIPTokenizerFast
from clipvip.CLIP_VIP import CLIPModel, clip_loss

In [63]:
# Extra options attached to the CLIP config as `vision_additional_config`
# for the CLIP-ViP model code. `temporal_size` is 12, the same value used for
# `clip_len` when frames are sampled further down.
extraCfg = edict({
    "type": "ViP",
    "temporal_size": 12,
    "if_use_temporal_embed": 1,
    "logit_scale_init_value": 4.60,
    "add_cls_num": 3
})

clipconfig = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")
clipconfig.vision_additional_config = extraCfg

# map_location="cpu" keeps the load working on machines that lack the device
# the checkpoint tensors were saved from.
checkpoint = torch.load(
    "/Users/teli/www/test/视频向量提取/XPretrain-视频获取/CLIP-ViP/pretrain_clipvip_base_32.pt",
    map_location="cpu",
)
# Drop the "clipmodel." substring so the checkpoint keys line up with the
# parameter names of a bare CLIPModel.
cleanDict = {key.replace("clipmodel.", ""): value for key, value in checkpoint.items()}
model = CLIPModel(config=clipconfig)
# This notebook only runs inference, so switch off training-only behaviour.
model.eval()
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(cleanDict)

<All keys matched successfully>

In [67]:
import os

# `!export ...` runs in a throw-away subshell, so the variables never reach
# this kernel process. Setting them on os.environ makes them visible to
# code running in this kernel and to child processes it starts.
os.environ.update({
    "https_proxy": "http://127.0.0.1:7890",
    "http_proxy": "http://127.0.0.1:7890",
    "all_proxy": "socks5://127.0.0.1:7890",
})

In [81]:
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
tokens = tokenizer(["in the forest"], padding=True, return_tensors="pt")
# Inference only: skip autograd bookkeeping, as the video-feature cell does,
# so the embedding (and anything derived from it) carries no grad_fn.
with torch.no_grad():
    textOutput = model.get_text_features(**tokens)
print(textOutput.shape)

torch.Size([1, 512])


In [9]:
def read_video_pyav(container, indices):
    '''
    Decode the frames at the requested positions from a video container.
    Args:
        container: Object exposing `seek()` and `decode(video=0)`; each decoded
            frame must provide `to_ndarray(format=...)`.
        indices: Ordered sequence of frame positions to keep; its first and last
            entries bound the range that is scanned.
    Returns:
        `np.ndarray`: The kept frames converted to "rgb24" arrays and stacked
        along a new leading axis.
    '''
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]
    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested position is needed.
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        selected.append(frame)
    return np.stack([frame.to_ndarray(format="rgb24") for frame in selected])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position and `clip_len` evenly spaced indices are taken from it. When the
    window is as long as the video or longer, no random placement is possible,
    so the indices are spread over the whole video instead of raising.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`np.ndarray` of `np.int64`): Sampled frame indices, in
        non-decreasing order. Values may repeat when the sampled span holds
        fewer than `clip_len` distinct frames.
    Raises:
        ValueError: If the window does not fit and `seg_len` is not positive.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    if converted_len < seg_len:
        end_idx = np.random.randint(converted_len, seg_len)
        start_idx = end_idx - converted_len
    else:
        # np.random.randint(low, high) needs low < high; this branch covers the
        # case where the window spans the whole video (e.g. the frame count is
        # an exact multiple of clip_len) or the video is shorter than the window.
        if seg_len <= 0:
            raise ValueError(f"seg_len must be positive, got {seg_len}")
        start_idx, end_idx = 0, seg_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


In [17]:
from transformers import AutoProcessor
# Video frames are preprocessed with the processor shipped with the X-CLIP
# checkpoint, while the model used for the embeddings is CLIP-ViP.
# NOTE(review): confirm that this processor's resize/crop/normalisation
# matches what the CLIP-ViP checkpoint expects.
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch16")


In [18]:
clip_len = 12

# The file only has to stay open while frames are decoded; the context
# manager closes it afterwards instead of leaking the handle.
with av.open("/Volumes/Samsung_T3/_download/test_cut/py_053.mp4") as container:
    fcount = container.streams.video[0].frames
    # sample clip_len (12) frames, with the stride chosen so the sampling
    # window covers roughly the whole video
    indices = sample_frame_indices(clip_len=clip_len, frame_sample_rate=fcount//clip_len, seg_len=fcount)
    # read_video_pyav converts the frames to arrays before returning, so
    # nothing below needs the open container.
    video = read_video_pyav(container, indices)
pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values

print("video.shape", pixel_values.shape)


No accelerated colorspace conversion found from yuv420p to rgb24.
 (repeated 11 more times)
No accelerated colorspace conversion found from yuv420p to rgb24.


video.shape torch.Size([1, 12, 3, 224, 224])


  return torch.tensor(value)


In [19]:
# Merge the batch and frame axes so every frame becomes its own image entry.
B, N, C, H, W = pixel_values.shape
vv = pixel_values.reshape(B * N, C, H, W)
print(vv.shape)

torch.Size([12, 3, 224, 224])


In [74]:
# Encode the sampled frames into a video embedding; `if_norm=True` is passed
# through to the model's get_image_features.
inputs = dict(if_norm=True, pixel_values=pixel_values)
with torch.no_grad():
    video_features = model.get_image_features(**inputs)
print(video_features)

tensor([[ 4.2968e-02,  4.5826e-03, -2.0510e-02,  2.1235e-02,  2.9924e-02,
         -2.1660e-02,  1.0331e-02, -5.7496e-03,  1.9632e-03, -1.6750e-02,
          2.7891e-02,  1.9469e-03,  1.7285e-02,  3.7726e-02, -2.0460e-03,
         -2.4150e-02, -1.2243e-01,  2.5370e-02, -3.1692e-02, -2.2621e-02,
         -5.9742e-02,  5.1138e-02, -1.8714e-02, -3.2980e-02, -1.4633e-02,
         -6.1462e-02, -4.4542e-03, -7.4782e-03,  8.8724e-03,  1.6593e-02,
         -5.3571e-02, -1.2733e-02, -7.3391e-03,  5.5302e-03,  8.7937e-02,
         -1.3672e-02, -4.0799e-02, -2.3102e-02,  1.0312e-02,  2.3676e-02,
         -5.8805e-03,  2.3927e-02,  1.3316e-02, -1.1890e-02,  1.8357e-02,
          7.2390e-02, -8.1439e-02, -3.8800e-04,  5.2957e-02,  1.6394e-02,
          2.0144e-02, -4.0233e-02, -2.2974e-02, -2.2099e-02,  6.2945e-04,
         -1.3704e-02,  8.3659e-02, -1.9435e-02, -4.7705e-03, -5.0799e-02,
          6.5837e-02, -5.7450e-03, -4.7846e-02,  1.4080e-02,  5.2817e-03,
         -7.3937e-03,  4.3109e-02, -2.

In [82]:
# Cosine similarity between the text embedding and the video embedding,
# computed along the feature axis (dim=1), giving one score per row.
sim = F.cosine_similarity(textOutput, video_features, dim=1)
print(sim)

tensor([0.1142], grad_fn=<SumBackward1>)


<All keys matched successfully>

<All keys matched successfully>

tensor([[ 4.2968e-02,  4.5826e-03, -2.0510e-02,  2.1235e-02,  2.9924e-02,
         -2.1660e-02,  1.0331e-02, -5.7496e-03,  1.9632e-03, -1.6750e-02,
          2.7891e-02,  1.9469e-03,  1.7285e-02,  3.7726e-02, -2.0460e-03,
         -2.4150e-02, -1.2243e-01,  2.5370e-02, -3.1692e-02, -2.2621e-02,
         -5.9742e-02,  5.1138e-02, -1.8714e-02, -3.2980e-02, -1.4633e-02,
         -6.1462e-02, -4.4542e-03, -7.4782e-03,  8.8724e-03,  1.6593e-02,
         -5.3571e-02, -1.2733e-02, -7.3391e-03,  5.5302e-03,  8.7937e-02,
         -1.3672e-02, -4.0799e-02, -2.3102e-02,  1.0312e-02,  2.3676e-02,
         -5.8805e-03,  2.3927e-02,  1.3316e-02, -1.1890e-02,  1.8357e-02,
          7.2390e-02, -8.1439e-02, -3.8800e-04,  5.2957e-02,  1.6394e-02,
          2.0144e-02, -4.0233e-02, -2.2974e-02, -2.2099e-02,  6.2945e-04,
         -1.3704e-02,  8.3659e-02, -1.9435e-02, -4.7705e-03, -5.0799e-02,
          6.5837e-02, -5.7450e-03, -4.7846e-02,  1.4080e-02,  5.2817e-03,
         -7.3937e-03,  4.3109e-02, -2.

Parameter containing:
tensor([[ 0.0205,  0.0028, -0.0089,  ...,  0.0097,  0.0270,  0.0024],
        [ 0.0115,  0.0075, -0.0120,  ...,  0.0050,  0.0048, -0.0432],
        [ 0.0141,  0.0140,  0.0021,  ...,  0.0164,  0.0107, -0.0082],
        ...,
        [-0.0023,  0.0027,  0.0039,  ..., -0.0287,  0.0108,  0.0049],
        [ 0.0486, -0.0194, -0.0194,  ..., -0.0122,  0.0148,  0.0281],
        [-0.0008,  0.0048, -0.0132,  ...,  0.0080,  0.0052, -0.0132]],
       requires_grad=True)