In [20]:
import torch
import clip
import numpy as np
from PIL import Image
from model.tp import TemporalPooling
import cv2
import mmcv
import pandas as pd

In [80]:
# Choose the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 model together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)
# Print NumPy floats in fixed-point notation with six digits of precision.
np.set_printoptions(precision=6, suppress=True)

In [None]:
# Open one TBAD-9 sample clip with mmcv's VideoReader.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tbad_sample_path = "E:\\DATASETS\\TBAD-9\\bowing to students\\1.mp4"
video_infos = mmcv.VideoReader(tbad_sample_path)

In [None]:
# Display the reader's entry at index 0 (presumably the first decoded frame — confirm against the mmcv docs).
video_infos[0]

In [6]:
# Open the sample picture, run CLIP's preprocessing on it, then add a leading
# dimension with unsqueeze(0) and move the result to the selected device.
sample_picture = Image.open("teacher_blackboard.jpg")
sample_tensor = preprocess(sample_picture)
image = sample_tensor.unsqueeze(0).to(device)

In [7]:
# Encode the preprocessed image with CLIP's image encoder; gradient tracking is
# disabled because this is inference only.
with torch.no_grad():
    image_features = model.encode_image(image)

In [8]:
# Inspect the shape of the encoded image features.
image_features.shape

torch.Size([1, 512])

In [16]:
def get_video_data(path, num_frames=8):
    """Sample evenly spaced frames from a video file.

    Frame positions are chosen with ``np.linspace`` over
    ``[0, total_frames - 1]``, where ``total_frames`` is the frame count
    reported by OpenCV. The video is then decoded sequentially and only the
    frames at the chosen positions are kept.

    Args:
        path: Path of the video file, passed straight to ``cv2.VideoCapture``.
        num_frames: Number of positions to sample. Defaults to 8, the value
            that was previously hard-coded.

    Returns:
        A list of frames, exactly as returned by ``VideoCapture.read``, in
        playback order. The list may hold fewer than ``num_frames`` entries:
        it is empty when no frames are reported (for example, the file could
        not be opened), duplicate positions (fewer frames than
        ``num_frames``) are kept once, and decoding stops at the first
        failed read.
    """
    video_capture = cv2.VideoCapture(path)
    try:
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0 or num_frames <= 0:
            return []

        # The builtin ``int`` replaces the ``np.int`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24. A set makes the
        # per-frame membership test O(1) instead of scanning an ndarray.
        frame_ids = set(
            np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist()
        )

        frames = []
        for i in range(total_frames):
            ret, frame = video_capture.read()
            if not ret:
                # The reported frame count can exceed what is decodable.
                break
            if i in frame_ids:
                frames.append(frame)
        return frames
    finally:
        # Release the capture even if decoding raises.
        video_capture.release()

In [82]:
# Sample frames from one HMDB51 "kiss" clip.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
hmdb_clip_path = 'E:/DATASETS/hmdb51_org/kiss/5GreatHollywoodKisses_kiss_h_cm_np2_ri_goo_1.avi'
image_input_lists = get_video_data(hmdb_clip_path)

# For each sampled frame: convert OpenCV's BGR channel order to RGB, wrap it as
# a PIL image, apply CLIP preprocessing, add a leading dimension and move it to
# the selected device.
image_inputs = []
for bgr_frame in image_input_lists:
    rgb_picture = Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
    image_inputs.append(preprocess(rgb_picture).unsqueeze(0).to(device))

# NOTE(review): this prints the raw pixel arrays of every sampled frame.
print(image_input_lists)

[array([[[ 22,  27,  27],
        [ 21,  26,  26],
        [ 20,  25,  25],
        ...,
        [ 65,  91,  89],
        [ 61,  85,  82],
        [ 58,  82,  79]],

       [[ 22,  27,  27],
        [ 21,  26,  26],
        [ 20,  25,  25],
        ...,
        [ 63,  89,  87],
        [ 60,  84,  81],
        [ 57,  81,  78]],

       [[ 22,  27,  27],
        [ 21,  26,  26],
        [ 20,  25,  25],
        ...,
        [ 62,  82,  84],
        [ 53,  82,  78],
        [ 51,  80,  76]],

       ...,

       [[123, 133, 132],
        [123, 133, 132],
        [123, 133, 132],
        ...,
        [ 27,  26,  13],
        [ 25,  26,  13],
        [ 22,  23,  10]],

       [[123, 133, 132],
        [123, 133, 132],
        [123, 133, 132],
        ...,
        [ 27,  26,  13],
        [ 24,  25,  12],
        [ 22,  23,  10]],

       [[122, 132, 131],
        [122, 132, 131],
        [122, 132, 131],
        ...,
        [ 27,  26,  13],
        [ 24,  25,  12],
        [ 22,  23,  10]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  frame_ids = np.linspace(0, total_frames - 1, 8, dtype=np.int)


In [58]:
# Encode each sampled frame on its own, then stack the results along a new
# axis 1 so the frames sit next to each other in a single tensor.
with torch.no_grad():
    per_frame_features = []
    for frame_tensor in image_inputs:
        per_frame_features.append(model.encode_image(frame_tensor))
    image_features = torch.stack(per_frame_features, dim=1)  # shape: [1, N, 512]
    print(image_features.shape)

torch.Size([1, 7, 512])


In [42]:
# Display the current contents of image_features.
image_features

tensor([[[ 0.1261, -0.1764, -0.2549,  ...,  0.6479, -0.3528, -0.0178],
         [ 0.2045, -0.0872, -0.3354,  ...,  0.4807, -0.2795, -0.1611],
         [ 0.1016,  0.0012, -0.2661,  ...,  0.5181, -0.1754, -0.2091],
         ...,
         [ 0.1428, -0.0314, -0.1385,  ...,  0.7026, -0.0976, -0.1097],
         [ 0.0763, -0.0279, -0.1854,  ...,  0.8115, -0.2115, -0.0922],
         [ 0.0765,  0.1412, -0.2203,  ...,  0.5293, -0.1965, -0.2507]]],
       device='cuda:0', dtype=torch.float16)

In [73]:
from model.tp import TemporalPooling

# Create the temporal pooling model, sized to the last dimension of the frame features.
frame_feature_dim = image_features.shape[-1]
temporal_pooling = TemporalPooling(feature_dim=frame_feature_dim)
temporal_pooling = temporal_pooling.to(device)

# Apply temporal pooling to the image features and display the result.
video_feature = temporal_pooling(image_features)
video_feature

tensor([[[ 1.1542e-01, -4.2908e-02, -2.3340e-01, -4.9902e-01, -6.0120e-02,
          -3.3423e-01,  2.5586e-01, -5.5225e-01, -7.3389e-01, -1.3611e-01,
           2.8296e-01, -1.3745e-01,  4.0918e-01,  5.6213e-02, -1.9104e-01,
          -2.2363e-01, -4.3066e-01,  2.4744e-01,  1.5942e-01, -3.4424e-01,
          -2.8336e-02,  3.5858e-02, -3.4375e-01, -7.1045e-02, -4.6094e-01,
           6.3184e-01, -3.8989e-01, -4.3091e-01,  1.5271e-01, -5.9521e-01,
          -3.0225e-01,  4.2920e-01, -1.1652e-01, -2.6474e-02,  2.6855e-01,
           4.3018e-01, -4.1650e-01,  1.5088e-01,  1.9482e-01,  1.2031e+00,
          -3.0327e-03, -5.1172e-01, -3.5718e-01,  2.8857e-01,  1.3538e-01,
           1.1328e+00, -7.0419e-03, -2.2388e-01, -3.5278e-01,  3.1592e-01,
           6.4636e-02,  5.4150e-01,  7.0312e-02, -5.7227e-01,  3.1226e-01,
           7.8308e-02, -1.4603e-02,  4.1431e-01,  2.0401e-02,  9.3933e-02,
           8.2275e-02, -2.7515e-01,  4.5837e-02, -2.2168e-01, -9.1064e-02,
           1.4551e-01,  3

In [72]:
# Read the label table; every row is unpacked as a (first column, class name) pair.
classes_all = pd.read_csv('labels/hmdb51_org_base_labels.csv')
label_rows = classes_all.values.tolist()
classnames = [label_name for _row_id, label_name in label_rows]

# Tokenize one prompt per class and concatenate the tokens into a single batch.
prompt_tokens = []
for c in classnames:
    prompt_tokens.append(clip.tokenize(f"a photo of {c}"))
text_inputs = torch.cat(prompt_tokens).to(device)

# Encode the prompts without gradient tracking and cast the result to float16.
with torch.no_grad():
    encoded_prompts = model.encode_text(text_inputs)
    text_features = encoded_prompts.to(torch.float16)
print(text_features)

tensor([[-0.2037, -0.1061, -0.4888,  ..., -0.0934, -0.2981,  0.3403],
        [ 0.0922, -0.1044,  0.3354,  ..., -0.2666, -0.0920,  0.1075],
        [ 0.3103,  0.0479, -0.0540,  ..., -0.5229, -0.2001,  0.3022],
        ...,
        [-0.3782,  0.4531, -0.1771,  ..., -0.6079, -0.1566,  0.0392],
        [-0.2598, -0.0426,  0.0418,  ..., -0.5962,  0.0851, -0.0486],
        [-0.2390,  0.2720,  0.0621,  ..., -0.4124, -0.0188,  0.3442]],
       device='cuda:0', dtype=torch.float16)


In [74]:
# Normalise both embeddings in place by their norm along the last dimension.
video_feature.div_(video_feature.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# Scale the video feature by 100, multiply it with the transposed text
# features, then apply a softmax over the last dimension.
scaled_video_feature = 100.0 * video_feature
class_logits = scaled_video_feature @ text_features.T
similarity = class_logits.softmax(dim=-1)

print(text_features)
print(video_feature)

tensor([[-0.0203, -0.0106, -0.0488,  ..., -0.0093, -0.0298,  0.0340],
        [ 0.0099, -0.0112,  0.0359,  ..., -0.0285, -0.0098,  0.0115],
        [ 0.0299,  0.0046, -0.0052,  ..., -0.0504, -0.0193,  0.0291],
        ...,
        [-0.0361,  0.0432, -0.0169,  ..., -0.0580, -0.0149,  0.0037],
        [-0.0247, -0.0041,  0.0040,  ..., -0.0567,  0.0081, -0.0046],
        [-0.0227,  0.0258,  0.0059,  ..., -0.0391, -0.0018,  0.0327]],
       device='cuda:0', dtype=torch.float16)
tensor([[[ 1.1429e-02, -4.2458e-03, -2.3102e-02, -4.9408e-02, -5.9509e-03,
          -3.3081e-02,  2.5330e-02, -5.4657e-02, -7.2632e-02, -1.3474e-02,
           2.8015e-02, -1.3603e-02,  4.0497e-02,  5.5656e-03, -1.8906e-02,
          -2.2141e-02, -4.2633e-02,  2.4490e-02,  1.5778e-02, -3.4088e-02,
          -2.8057e-03,  3.5496e-03, -3.4027e-02, -7.0343e-03, -4.5624e-02,
           6.2561e-02, -3.8605e-02, -4.2664e-02,  1.5114e-02, -5.8929e-02,
          -2.9922e-02,  4.2480e-02, -1.1536e-02, -2.6207e-03,  2.6581e-

In [75]:
# Take the single highest-scoring entry of the first item in `similarity`.
values, indices = torch.topk(similarity[0], 1)
predicted_index = indices.item()
# 22 is presumably the row index of this clip's expected label in the labels CSV — TODO confirm.
if predicted_index == 22:
    print("Yes")

Yes
