In [25]:
import torch
import torchvision
import numpy as np

import clip
from PIL import Image

import time
import csv

from mmpt.models import MMPTModel

device = 'cuda'

In [26]:
!nvidia-smi

Tue Dec 13 02:51:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:06:00.0 Off |                    0 |
| N/A   58C    P0    29W /  70W |   2956MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [27]:
split_input = []
original_input = []

with open("prompts.txt", "r") as f:
    for i, line in enumerate(f.readlines()):
        if i % 2 == 0:
            line = line.strip('\n').lower()
            text = line.strip('.').split("&")[1].strip(' ')
            original_input.append(text)
            
        else:
            line = line.strip('\n').lower()
            str_list = line.split("“")
            processed_str_list = []
            for j, text in enumerate(str_list):
                if j < 1 or text.startswith(","):
                    continue
                processed_str_list.append(text.strip('”, ').strip('.”'))
            split_input.append(processed_str_list)

In [28]:
prompts = []

for str_list in split_input:
    prompt_list = []
    prompt_list.append("a video of {}".format(str_list[0]))
    prompt_list.append("the object in video {}".format(str_list[1]))
    if len(str_list) >= 3:
        prompt_list.append("the event happens {}".format(str_list[2]))
    prompts.append(prompt_list)

In [29]:
video_arrays = np.load(f'video_arrays.npz',"r")
cog_arr = video_arrays['cog_arr']
diff_arr = video_arrays['diff_arr']
aph_arr = video_arrays['aph_arr']

In [30]:
def preprocessing(video):
    # B, T, FPS, H, W, C
    # 1, 4, 8, 480, 480, 3
    
    temp = torch.zeros(1,4,30,224,224,3).to(device)
    for t in range(video.size(0)):
        r = video[t,:,:,:,:].permute(0,3,1,2)
        r = torchvision.transforms.functional.resize(r, 224)
        r = r.permute(0,2,3,1)
        for f in range(8):
            frame = r[f,:,:,:]
            if f < 7:
                temp[:,t,f*4:f*4+4,:,:,:]=frame.repeat(4,1,1,1)
            elif f==7:
                temp[:,t,f*4:f*4+2,:,:,:]=frame.repeat(2,1,1,1)
            else:
                raise ValueError()
    return temp

def CLIP_preprocessing(video):
    # T, FPS, H, W, C
    # 4, 8, 480, 480, 3
    temp = torch.zeros(4,8,3,224,224).to(device)
    for t in range(video.size(0)):
        r = video[t,:,:,:,:].permute(0,3,1,2)
        r = torchvision.transforms.functional.resize(r, 224)
        r = torchvision.transforms.functional.normalize(r, mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
        
        temp[t] = r
        
    return temp

# print(cog_arr[0].shape)
# preprocessing(torch.from_numpy(cog_arr[0]).to(device))

In [31]:
model, tokenizer, aligner = MMPTModel.from_pretrained(
    "projects/retri/videoclip/how2.yaml")

model = model.to(device)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing MMBertForEncoder: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.key.weight', 'bert.encoder.layer.6.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.value.weight', 'bert.encoder.layer.6.attention.self.value.bias', 'bert.encoder.layer.6.attention.output.dense.weight', 'bert.encoder.layer.6.attention.output.dense.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.6.output.dense.weight', 'bert.encoder.layer.6.output.dense.bias', 'bert.enc

MMPTModel(
  (video_encoder): S3D(
    (conv1): STConv3D(
      (relu): ReLU(inplace=True)
      (conv1): Conv3d(24, 64, kernel_size=(2, 4, 4), stride=(1, 1, 1), padding=(1, 2, 2), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_2b): STConv3D(
      (relu): ReLU(inplace=True)
      (conv1): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_2c): STConv3D(
      (relu): ReLU(inplace=True)
      (conv1): Conv3d(64, 192, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
      (bn1): BatchNorm3d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv3d(192, 192, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (bn2): BatchNorm3d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (gati

In [32]:
def score(video_frames, input_sentence, str_list):
    
    # B, T, FPS, H, W, C (VideoCLIP is trained on 30 fps of s3d)
    T = video_frames.size(1)
    total_frame_num = T * 3
    
    #overall score
    
    caps, cmasks = aligner._build_text_seq(
        tokenizer(input_sentence, add_special_tokens=False)["input_ids"]
    )

    caps, cmasks = caps[None, :].to(device), cmasks[None, :].to(device)  # bsz=1
    
    with torch.no_grad():
        output = model(video_frames, caps, cmasks, return_score=True)
        normalization = output["pooled_video"].squeeze(0).norm() * output["pooled_text"].squeeze(0).norm()
        overall_score = output["score"] / normalization
    
    # precision and recall
    Vi = [video_frames[:,t,:,:,:,:].unsqueeze(1) for t in range(T)]
    Ti = str_list
    
#     for idv, v in enumerate(Vi):
#         for idt, text in enumerate(Ti):
#             caps, cmasks = aligner._build_text_seq(
#                 tokenizer(text, add_special_tokens=False)["input_ids"]
#             )

#             caps, cmasks = caps[None, :].to(device), cmasks[None, :].to(device)  # bsz=1
            
#             with torch.no_grad():
#                 score = model(v, caps, cmasks, return_score=True)["score"]/total_frame_num
            
#             print("Video {}s, {}: {}".format(idv, text, score.item()))
    
    R = 0
    for t, v in enumerate(Vi):
        with torch.no_grad():
#             score = model(v, caps, cmasks, return_score=True)["score"]/total_frame_num
            output = model(v, caps, cmasks, return_score=True)
            normalization = output["pooled_video"].squeeze(0).norm() * output["pooled_text"].squeeze(0).norm()
            score = output["score"] / normalization
        
        R += score/len(Vi)
        print("Video {}s: {}".format(t, score.item()))
    
    P = 0
    for t, text in enumerate(Ti):
        caps, cmasks = aligner._build_text_seq(
            tokenizer(text, add_special_tokens=False)["input_ids"]
        )

        caps, cmasks = caps[None, :].to(device), cmasks[None, :].to(device)  # bsz=1
        
        with torch.no_grad():
#             score = model(video_frames, caps, cmasks, return_score=True)["score"]/total_frame_num
            output = model(video_frames, caps, cmasks, return_score=True)
            normalization = output["pooled_video"].squeeze(0).norm() * output["pooled_text"].squeeze(0).norm()
            score = output["score"] / normalization
        
        P += score/len(Ti)
        print("Sentence {}: {}".format(text, score.item()))
    
    F1 = 2*P*R/(P+R)
    print("overall score: {}, P: {}, R: {}, F1: {}".format(overall_score, P, R, F1))
    return [overall_score[0].item(),
        P[0].item(),
        R[0].item(),
        F1[0].item(),
        overall_score[0].item() + F1[0].item()]

In [9]:

# # B, T, FPS, H, W, C (VideoCLIP is trained on 30 fps of s3d)
# start = time.time()
# video_frames = torch.randn(1, 4, 30, 224, 224, 3).to(device)

# str_list = ['A video of a horse', 'The object in video is basking in the sun', 'The event happen on a beach couch']
# input_sentence = "a horse is basking in the sun on a beach couch"

# score(video_frames, input_sentence, str_list)

# print("spend time: ", time.time()-start)

In [33]:
csv_path = 'Ours_score_flat_f1.csv'
header = ['idx', 'cog', 'diff', 'aph']
csv_writer = csv.writer(open(csv_path, 'a', 1))
csv_writer.writerow(header)

for t in range(101):
    cog = cog_arr[t]/255.0
    diff = diff_arr[t]/255.0
    aph = aph_arr[t]/255.0
    
    cog_video = preprocessing(torch.from_numpy(cog).to(device))
    diff_video = preprocessing(torch.from_numpy(diff).to(device))
    aph_video = preprocessing(torch.from_numpy(aph).to(device))
    
    str_list = prompts[t]
    input_sentence = original_input[t]
    print("idx: ", t)
    print(str_list)
    print(input_sentence)
    
    cog_result = score(cog_video, input_sentence, str_list)
    diff_result = score(diff_video, input_sentence, str_list)
    aph_result = score(aph_video, input_sentence, str_list)
    
    csv_writer.writerow([t, cog_result[-2], diff_result[-2], aph_result[-2]])

idx:  0
['a video of a boy', 'the object in video is running', 'the event happens on the lawn']
a boy is running on the lawn
Video 0s: 0.25353655219078064
Video 1s: 0.23153233528137207
Video 2s: 0.23612771928310394
Video 3s: 0.23785191774368286
Sentence a video of a boy: 0.253278911113739
Sentence the object in video is running: 0.198406383395195
Sentence the event happens on the lawn: 0.1938672810792923
overall score: tensor([0.2037], device='cuda:0'), P: tensor([0.2152], device='cuda:0'), R: tensor([0.2398], device='cuda:0'), F1: tensor([0.2268], device='cuda:0')
Video 0s: 0.19911085069179535
Video 1s: 0.24622014164924622
Video 2s: 0.2483539581298828
Video 3s: 0.22472333908081055
Sentence a video of a boy: 0.2201022207736969
Sentence the object in video is running: 0.23524844646453857
Sentence the event happens on the lawn: 0.21246981620788574
overall score: tensor([0.2349], device='cuda:0'), P: tensor([0.2226], device='cuda:0'), R: tensor([0.2296], device='cuda:0'), F1: tensor([0.22

Sentence the event happens in the living room: 0.1831032633781433
overall score: tensor([0.2087], device='cuda:0'), P: tensor([0.2140], device='cuda:0'), R: tensor([0.2204], device='cuda:0'), F1: tensor([0.2171], device='cuda:0')
Video 0s: 0.1859389692544937
Video 1s: 0.14995422959327698
Video 2s: 0.15411277115345
Video 3s: 0.16033293306827545
Sentence a video of a girl: 0.1779768168926239
Sentence the object in video is singing: 0.2254544347524643
Sentence the event happens in the living room: 0.14212343096733093
overall score: tensor([0.1294], device='cuda:0'), P: tensor([0.1819], device='cuda:0'), R: tensor([0.1626], device='cuda:0'), F1: tensor([0.1717], device='cuda:0')
idx:  6
['a video of a man', 'the object in video is running', 'the event happens on the lawn']
a man is running on the lawn
Video 0s: 0.20139379799365997
Video 1s: 0.2267548143863678
Video 2s: 0.22802630066871643
Video 3s: 0.22012074291706085
Sentence a video of a man: 0.19761943817138672
Sentence the object in vi

Video 0s: 0.2322797030210495
Video 1s: 0.2314835488796234
Video 2s: 0.2148737609386444
Video 3s: 0.23351173102855682
Sentence a video of a woman: 0.1890614926815033
Sentence the object in video is singing: 0.22024203836917877
Sentence the event happens in the living room: 0.18930929899215698
overall score: tensor([0.2056], device='cuda:0'), P: tensor([0.1995], device='cuda:0'), R: tensor([0.2280], device='cuda:0'), F1: tensor([0.2128], device='cuda:0')
Video 0s: 0.22099877893924713
Video 1s: 0.21125216782093048
Video 2s: 0.2038588970899582
Video 3s: 0.24206224083900452
Sentence a video of a woman: 0.22095365822315216
Sentence the object in video is singing: 0.20371025800704956
Sentence the event happens in the living room: 0.1632719188928604
overall score: tensor([0.1940], device='cuda:0'), P: tensor([0.1960], device='cuda:0'), R: tensor([0.2195], device='cuda:0'), F1: tensor([0.2071], device='cuda:0')
Video 0s: 0.2027217447757721
Video 1s: 0.1904713213443756
Video 2s: 0.18317921459674

Video 0s: 0.27987006306648254
Video 1s: 0.26341819763183594
Video 2s: 0.2871803045272827
Video 3s: 0.28656014800071716
Sentence a video of a couple: 0.18456776440143585
Sentence the object in video are hugging: 0.18880228698253632
overall score: tensor([0.2530], device='cuda:0'), P: tensor([0.1867], device='cuda:0'), R: tensor([0.2793], device='cuda:0'), F1: tensor([0.2238], device='cuda:0')
Video 0s: 0.24764682352542877
Video 1s: 0.2518536448478699
Video 2s: 0.24504949152469635
Video 3s: 0.2563188970088959
Sentence a video of a couple: 0.18813052773475647
Sentence the object in video are hugging: 0.18204079568386078
overall score: tensor([0.2004], device='cuda:0'), P: tensor([0.1851], device='cuda:0'), R: tensor([0.2502], device='cuda:0'), F1: tensor([0.2128], device='cuda:0')
idx:  18
['a video of a couple', 'the object in video are kissing']
a couple are kissing
Video 0s: 0.3240289092063904
Video 1s: 0.33157649636268616
Video 2s: 0.33748167753219604
Video 3s: 0.3173408508300781
Sent

Video 0s: 0.16830018162727356
Video 1s: 0.2042442262172699
Video 2s: 0.19843874871730804
Video 3s: 0.19470584392547607
Sentence a video of a man and a woman: 0.17311549186706543
Sentence the object in video are quarreling: 0.16750147938728333
overall score: tensor([0.1509], device='cuda:0'), P: tensor([0.1703], device='cuda:0'), R: tensor([0.1914], device='cuda:0'), F1: tensor([0.1802], device='cuda:0')
idx:  24
['a video of a man and a woman', 'the object in video are having a meal']
a man and a woman are having a meal
Video 0s: 0.25660693645477295
Video 1s: 0.22598965466022491
Video 2s: 0.24758201837539673
Video 3s: 0.2190762758255005
Sentence a video of a man and a woman: 0.18012306094169617
Sentence the object in video are having a meal: 0.22398333251476288
overall score: tensor([0.2168], device='cuda:0'), P: tensor([0.2021], device='cuda:0'), R: tensor([0.2373], device='cuda:0'), F1: tensor([0.2183], device='cuda:0')
Video 0s: 0.19837215542793274
Video 1s: 0.20748284459114075
Vide

idx:  30
['a video of two men', 'the object in video are hugging']
two men are hugging
Video 0s: 0.23448143899440765
Video 1s: 0.23835766315460205
Video 2s: 0.22099782526493073
Video 3s: 0.20273280143737793
Sentence a video of two men: 0.20855164527893066
Sentence the object in video are hugging: 0.19976840913295746
overall score: tensor([0.1871], device='cuda:0'), P: tensor([0.2042], device='cuda:0'), R: tensor([0.2241], device='cuda:0'), F1: tensor([0.2137], device='cuda:0')
Video 0s: 0.21317841112613678
Video 1s: 0.21243305504322052
Video 2s: 0.22031345963478088
Video 3s: 0.22448328137397766
Sentence a video of two men: 0.20777465403079987
Sentence the object in video are hugging: 0.1929505467414856
overall score: tensor([0.1994], device='cuda:0'), P: tensor([0.2004], device='cuda:0'), R: tensor([0.2176], device='cuda:0'), F1: tensor([0.2086], device='cuda:0')
Video 0s: 0.16515277326107025
Video 1s: 0.168087899684906
Video 2s: 0.17582954466342926
Video 3s: 0.18652816116809845
Senten

Sentence a video of a family: 0.22817084193229675
Sentence the object in video is quarreling: 0.1465565711259842
overall score: tensor([0.1986], device='cuda:0'), P: tensor([0.1874], device='cuda:0'), R: tensor([0.2359], device='cuda:0'), F1: tensor([0.2088], device='cuda:0')
Video 0s: 0.2185835987329483
Video 1s: 0.16805940866470337
Video 2s: 0.21162082254886627
Video 3s: 0.1968729943037033
Sentence a video of a family: 0.17429232597351074
Sentence the object in video is quarreling: 0.17224137485027313
overall score: tensor([0.1482], device='cuda:0'), P: tensor([0.1733], device='cuda:0'), R: tensor([0.1988], device='cuda:0'), F1: tensor([0.1852], device='cuda:0')
idx:  37
['a video of a man', 'the object in video is taking a walk', 'the event happens in the sunshine']
a man is taking a walk in the sunshine
Video 0s: 0.2525133192539215
Video 1s: 0.26642560958862305
Video 2s: 0.261792927980423
Video 3s: 0.2614518404006958
Sentence a video of a man: 0.20359265804290771
Sentence the objec

idx:  42
['a video of a man', 'the object in video is taking a walk', 'the event happens in summer']
a man is taking a walk in summer
Video 0s: 0.26808100938796997
Video 1s: 0.2738494277000427
Video 2s: 0.27115845680236816
Video 3s: 0.27238085865974426
Sentence a video of a man: 0.2181079089641571
Sentence the object in video is taking a walk: 0.2338683307170868
Sentence the event happens in summer: 0.23031537234783173
overall score: tensor([0.2331], device='cuda:0'), P: tensor([0.2274], device='cuda:0'), R: tensor([0.2714], device='cuda:0'), F1: tensor([0.2475], device='cuda:0')
Video 0s: 0.2450743466615677
Video 1s: 0.22721485793590546
Video 2s: 0.2532658576965332
Video 3s: 0.2712423503398895
Sentence a video of a man: 0.23396266996860504
Sentence the object in video is taking a walk: 0.2801622152328491
Sentence the event happens in summer: 0.23595964908599854
overall score: tensor([0.2581], device='cuda:0'), P: tensor([0.2500], device='cuda:0'), R: tensor([0.2492], device='cuda:0'),

Sentence the event happens on a cloudy day: 0.23554649949073792
overall score: tensor([0.2423], device='cuda:0'), P: tensor([0.2217], device='cuda:0'), R: tensor([0.2408], device='cuda:0'), F1: tensor([0.2308], device='cuda:0')
Video 0s: 0.23825815320014954
Video 1s: 0.2263544201850891
Video 2s: 0.23507048189640045
Video 3s: 0.22152771055698395
Sentence a video of a man: 0.18090154230594635
Sentence the object in video is running: 0.1847023069858551
Sentence the event happens on a cloudy day: 0.18278901278972626
overall score: tensor([0.1908], device='cuda:0'), P: tensor([0.1828], device='cuda:0'), R: tensor([0.2303], device='cuda:0'), F1: tensor([0.2038], device='cuda:0')
idx:  48
['a video of a man', 'the object in video is running', 'the event happens in the rain']
a man is running in the rain
Video 0s: 0.27010563015937805
Video 1s: 0.28223735094070435
Video 2s: 0.2820775806903839
Video 3s: 0.2863463759422302
Sentence a video of a man: 0.21343016624450684
Sentence the object in vide

Sentence a video of a man: 0.21965068578720093
Sentence the object in video is riding a bicycle: 0.14943775534629822
Sentence the event happens in the sunshine: 0.24563159048557281
overall score: tensor([0.1557], device='cuda:0'), P: tensor([0.2049], device='cuda:0'), R: tensor([0.1831], device='cuda:0'), F1: tensor([0.1934], device='cuda:0')
Video 0s: 0.16285352408885956
Video 1s: 0.1519053429365158
Video 2s: 0.21174824237823486
Video 3s: 0.20580491423606873
Sentence a video of a man: 0.2076341062784195
Sentence the object in video is riding a bicycle: 0.19364643096923828
Sentence the event happens in the sunshine: 0.21810470521450043
overall score: tensor([0.2208], device='cuda:0'), P: tensor([0.2065], device='cuda:0'), R: tensor([0.1831], device='cuda:0'), F1: tensor([0.1941], device='cuda:0')
Video 0s: 0.14135588705539703
Video 1s: 0.12408178299665451
Video 2s: 0.13955199718475342
Video 3s: 0.12325077503919601
Sentence a video of a man: 0.18309690058231354
Sentence the object in vi

Video 0s: 0.13178390264511108
Video 1s: 0.14167936146259308
Video 2s: 0.1420716941356659
Video 3s: 0.1492297649383545
Sentence a video of a man: 0.18722793459892273
Sentence the object in video is riding a bicycle: 0.08464303612709045
Sentence the event happens in summer: 0.21580663323402405
overall score: tensor([0.1068], device='cuda:0'), P: tensor([0.1626], device='cuda:0'), R: tensor([0.1412], device='cuda:0'), F1: tensor([0.1511], device='cuda:0')
idx:  59
['a video of a man', 'the object in video is riding a bicycle', 'the event happens in autumn']
a man is riding a bicycle in autumn
Video 0s: 0.19875819981098175
Video 1s: 0.20193402469158173
Video 2s: 0.19681718945503235
Video 3s: 0.20288994908332825
Sentence a video of a man: 0.20867465436458588
Sentence the object in video is riding a bicycle: 0.21109890937805176
Sentence the event happens in autumn: 0.18055416643619537
overall score: tensor([0.1801], device='cuda:0'), P: tensor([0.2001], device='cuda:0'), R: tensor([0.2001], 

Sentence a video of a man: 0.23588325083255768
Sentence the object in video is skateboarding: 0.19266171753406525
Sentence the event happens in the rain: 0.22933605313301086
overall score: tensor([0.2102], device='cuda:0'), P: tensor([0.2193], device='cuda:0'), R: tensor([0.2402], device='cuda:0'), F1: tensor([0.2293], device='cuda:0')
Video 0s: 0.18702225387096405
Video 1s: 0.2027198076248169
Video 2s: 0.2600474953651428
Video 3s: 0.21672290563583374
Sentence a video of a man: 0.22218622267246246
Sentence the object in video is skateboarding: 0.1682250201702118
Sentence the event happens in the rain: 0.2327626645565033
overall score: tensor([0.2155], device='cuda:0'), P: tensor([0.2077], device='cuda:0'), R: tensor([0.2166], device='cuda:0'), F1: tensor([0.2121], device='cuda:0')
Video 0s: 0.13081824779510498
Video 1s: 0.16283018887043
Video 2s: 0.15331201255321503
Video 3s: 0.14589977264404297
Sentence a video of a man: 0.15819820761680603
Sentence the object in video is skateboardin

Video 0s: 0.17478178441524506
Video 1s: 0.17789022624492645
Video 2s: 0.19200605154037476
Video 3s: 0.1934078335762024
Sentence a video of a woman: 0.17578817903995514
Sentence the object in video is taking a walk: 0.18101826310157776
Sentence the event happens in the sunshine: 0.22137653827667236
overall score: tensor([0.1543], device='cuda:0'), P: tensor([0.1927], device='cuda:0'), R: tensor([0.1845], device='cuda:0'), F1: tensor([0.1885], device='cuda:0')
idx:  70
['a video of a woman', 'the object in video is taking a walk', 'the event happens in the snow']
a woman is taking a walk in the snow
Video 0s: 0.22478309273719788
Video 1s: 0.23761776089668274
Video 2s: 0.230272576212883
Video 3s: 0.2488868236541748
Sentence a video of a woman: 0.21217262744903564
Sentence the object in video is taking a walk: 0.2079598754644394
Sentence the event happens in the snow: 0.24154482781887054
overall score: tensor([0.2104], device='cuda:0'), P: tensor([0.2206], device='cuda:0'), R: tensor([0.23

idx:  75
['a video of a young woman', 'the object in video is eating bananas', 'the event happens on a sofa']
a young woman is eating bananas on a sofa
Video 0s: 0.2476339340209961
Video 1s: 0.23128867149353027
Video 2s: 0.22478269040584564
Video 3s: 0.23292791843414307
Sentence a video of a young woman: 0.16676510870456696
Sentence the object in video is eating bananas: 0.2525305151939392
Sentence the event happens on a sofa: 0.10527029633522034
overall score: tensor([0.2023], device='cuda:0'), P: tensor([0.1749], device='cuda:0'), R: tensor([0.2342], device='cuda:0'), F1: tensor([0.2002], device='cuda:0')
Video 0s: 0.10850998014211655
Video 1s: 0.12314000725746155
Video 2s: 0.12264527380466461
Video 3s: 0.17898264527320862
Sentence a video of a young woman: 0.14993084967136383
Sentence the object in video is eating bananas: 0.22988355159759521
Sentence the event happens on a sofa: 0.14090223610401154
overall score: tensor([0.1813], device='cuda:0'), P: tensor([0.1736], device='cuda:0

Video 0s: 0.13858969509601593
Video 1s: 0.18548613786697388
Video 2s: 0.29860004782676697
Video 3s: 0.2902677655220032
Sentence a video of a dig: 0.14050129055976868
Sentence the object in video playing the piano: 0.17519795894622803
overall score: tensor([0.2469], device='cuda:0'), P: tensor([0.1578], device='cuda:0'), R: tensor([0.2282], device='cuda:0'), F1: tensor([0.1866], device='cuda:0')
Video 0s: 0.1736677587032318
Video 1s: 0.1593465507030487
Video 2s: 0.16180607676506042
Video 3s: 0.16445401310920715
Sentence a video of a dig: 0.19383502006530762
Sentence the object in video playing the piano: 0.10844134539365768
overall score: tensor([0.1395], device='cuda:0'), P: tensor([0.1511], device='cuda:0'), R: tensor([0.1648], device='cuda:0'), F1: tensor([0.1577], device='cuda:0')
idx:  81
['a video of a lion', 'the object in video is typing', 'the event happens in front of the computer']
a lion is typing in front of the computer
Video 0s: 0.16527657210826874
Video 1s: 0.17242357134

idx:  86
['a video of a doctor', 'the object in video is cutting a cake', 'the event happens in a classroom']
a doctor is cutting a cake in a classroom
Video 0s: 0.27303755283355713
Video 1s: 0.2674582600593567
Video 2s: 0.25720465183258057
Video 3s: 0.2693936228752136
Sentence a video of a doctor: 0.14537782967090607
Sentence the object in video is cutting a cake: 0.2161332219839096
Sentence the event happens in a classroom: 0.15383753180503845
overall score: tensor([0.2520], device='cuda:0'), P: tensor([0.1718], device='cuda:0'), R: tensor([0.2668], device='cuda:0'), F1: tensor([0.2090], device='cuda:0')
Video 0s: 0.16621102392673492
Video 1s: 0.1702287197113037
Video 2s: 0.18850889801979065
Video 3s: 0.19934707880020142
Sentence a video of a doctor: 0.2067912071943283
Sentence the object in video is cutting a cake: 0.12942558526992798
Sentence the event happens in a classroom: 0.17772448062896729
overall score: tensor([0.1817], device='cuda:0'), P: tensor([0.1713], device='cuda:0'),

Video 0s: 0.16653168201446533
Video 1s: 0.16710254549980164
Video 2s: 0.2521359622478485
Video 3s: 0.24547553062438965
Sentence a video of a woman: 0.22607482969760895
Sentence the object in video is skateboarding: 0.1816379278898239
Sentence the event happens in the rain: 0.1852950155735016
overall score: tensor([0.2339], device='cuda:0'), P: tensor([0.1977], device='cuda:0'), R: tensor([0.2078], device='cuda:0'), F1: tensor([0.2026], device='cuda:0')
Video 0s: 0.16005074977874756
Video 1s: 0.13742253184318542
Video 2s: 0.1635560840368271
Video 3s: 0.1638106405735016
Sentence a video of a woman: 0.1424846649169922
Sentence the object in video is skateboarding: 0.09372712671756744
Sentence the event happens in the rain: 0.20788834989070892
overall score: tensor([0.1128], device='cuda:0'), P: tensor([0.1480], device='cuda:0'), R: tensor([0.1562], device='cuda:0'), F1: tensor([0.1520], device='cuda:0')
idx:  92
['a video of a woman wearing a red dress', 'the object in video is smiling']


Sentence a video of a man wearing glasses: 0.21434450149536133
Sentence the object in video is angry: 0.21927447617053986
overall score: tensor([0.2113], device='cuda:0'), P: tensor([0.2168], device='cuda:0'), R: tensor([0.2354], device='cuda:0'), F1: tensor([0.2257], device='cuda:0')
Video 0s: 0.13390527665615082
Video 1s: 0.11831960827112198
Video 2s: 0.11751401424407959
Video 3s: 0.10545966774225235
Sentence a video of a man wearing glasses: 0.056048158556222916
Sentence the object in video is angry: 0.20441541075706482
overall score: tensor([0.0745], device='cuda:0'), P: tensor([0.1302], device='cuda:0'), R: tensor([0.1188], device='cuda:0'), F1: tensor([0.1243], device='cuda:0')
idx:  98
['a video of a woman', 'the object in video is riding a bicycle', 'the event happens in winter']
a woman is riding a bicycle in winter
Video 0s: 0.24565905332565308
Video 1s: 0.25036704540252686
Video 2s: 0.23319993913173676
Video 3s: 0.23024410009384155
Sentence a video of a woman: 0.195007681846

In [17]:
t = 2

cog = cog_arr[t]/255.0
diff = diff_arr[t]/255.0
aph = aph_arr[t]/255.0

cog_video = preprocessing(torch.from_numpy(cog).to(device))
diff_video = preprocessing(torch.from_numpy(diff).to(device))
aph_video = preprocessing(torch.from_numpy(aph).to(device))

str_list = prompts[t]
input_sentence = original_input[t]
print("idx: ", t)
print(str_list)
print(input_sentence)

print("cog")
cog_result = score(cog_video, input_sentence, str_list)
print("diff")
diff_result = score(diff_video, input_sentence, str_list)
print("aph")
aph_result = score(aph_video, input_sentence, str_list)



idx:  2
['a video of a man', 'the object in video is dancing', 'the event happens on the beach']
a man is dancing on the beach
cog
Video 0s: 0.24466434121131897
Video 1s: 0.24100665748119354
Video 2s: 0.23127445578575134
Video 3s: 0.21229515969753265
Sentence a video of a man: 0.17090624570846558
Sentence the object in video is dancing: 0.20595897734165192
Sentence the event happens on the beach: 0.19206292927265167
overall score: tensor([0.1919], device='cuda:0'), P: tensor([0.1896], device='cuda:0'), R: tensor([0.2323], device='cuda:0'), F1: tensor([0.2088], device='cuda:0')
diff
Video 0s: 0.17029929161071777
Video 1s: 0.11891451478004456
Video 2s: 0.18814058601856232
Video 3s: 0.21601328253746033
Sentence a video of a man: 0.2156868278980255
Sentence the object in video is dancing: 0.19468101859092712
Sentence the event happens on the beach: 0.12519288063049316
overall score: tensor([0.1507], device='cuda:0'), P: tensor([0.1785], device='cuda:0'), R: tensor([0.1733], device='cuda:0'

In [38]:
model, preprocess = clip.load("ViT-B/32", device=device)

In [63]:
def CLIP_score(video_frames, input_sentence):
    
    
    text = clip.tokenize([input_sentence]).to(device)
    
    score = 0
    
    for t in range(video_frames.size(0)):
        with torch.no_grad():
            logits_per_image, logits_per_text = model(video_frames[t], text)

        score += logits_per_image.sum().item()
    
    return score

# cog = cog_arr[0]/255.0
# input_sentence = original_input[0]

# cog_video = CLIP_preprocessing(torch.from_numpy(cog).to(device)).squeeze(0)
# CLIP_score(cog_video, input_sentence)


csv_path = 'CLIP_scores_flat.csv'
header = ['idx', 'cog', 'diff', 'aph']
csv_writer = csv.writer(open(csv_path, 'a', 1))
csv_writer.writerow(header)

for t in range(101):
    cog = cog_arr[t]/255.0
    diff = diff_arr[t]/255.0
    aph = aph_arr[t]/255.0
    
    cog_video = CLIP_preprocessing(torch.from_numpy(cog).to(device))
    diff_video = CLIP_preprocessing(torch.from_numpy(diff).to(device))
    aph_video = CLIP_preprocessing(torch.from_numpy(aph).to(device))
    
    input_sentence = original_input[t]
    print("idx: ", t)
    print(str_list)
    print(input_sentence)
    
    cog_result = CLIP_score(cog_video, input_sentence)
    diff_result = CLIP_score(diff_video, input_sentence)
    aph_result = CLIP_score(aph_video, input_sentence)
    print(cog_result, diff_result, aph_result)
    
#     csv_writer.writerow([t, 'cog', cog_result])
#     csv_writer.writerow([t, 'diff', diff_result])
#     csv_writer.writerow([t, 'aph', aph_result])
    csv_writer.writerow([t, cog_result, diff_result, aph_result])

idx:  0
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a boy is running on the lawn
938.0 731.375 1082.0
idx:  1
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a girl is taking a walk on the sea
1024.5 692.75 1088.75
idx:  2
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is dancing on the beach
1031.0 661.75 1049.75
idx:  3
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a woman is playing tai chi by the lake
1100.5 774.125 1155.75
idx:  4
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a boy is boxing in the sky
927.5 789.5 1134.75
idx:  5
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a girl is singing in the living room
924.25 724.0 1005.5
idx:  6
['a video of a woman', 'the object in video is skateboa

idx:  54
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle in the snow
898.375 642.0 1120.25
idx:  55
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle on a cloudy day
898.25 743.625 1132.25
idx:  56
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle in the rain
940.125 706.0 1146.0
idx:  57
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle in spring
891.625 792.5 1098.5
idx:  58
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle in summer
944.0 741.0 1052.25
idx:  59
['a video of a woman', 'the object in video is skateboarding', 'the event happens in autumn']
a man is riding a bicycle in autumn
964.5 760.75 1113.5
idx:  60
['a video of a woman

In [34]:
human_arr = np.load('human_score.npz')["human_score"]
print(human_arr)

[[1 3 2]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 3 2]
 [2 3 1]
 [1 3 2]
 [2 3 1]
 [2 3 1]
 [1 3 2]
 [1 3 2]
 [2 1 3]
 [1 2 3]
 [1 3 2]
 [2 3 1]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [2 3 1]
 [2 3 1]
 [3 1 2]
 [1 3 2]
 [2 3 1]
 [1 3 2]
 [3 2 1]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [2 3 1]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 3 1]
 [1 3 2]
 [2 3 1]
 [1 3 2]
 [2 3 1]
 [2 3 1]
 [1 3 2]
 [1 3 2]
 [1 3 2]
 [1 3 2]
 [1 3 2]
 [1 2 3]
 [1 3 2]
 [2 3 1]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [2 3 1]
 [1 3 2]
 [2 3 1]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 3 2]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 3 2]
 [1 3 2]
 [1 2 3]
 [1 3 2]
 [1 3 2]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 3 2]
 [1 3 2]]


In [39]:
file = 'Ours_score_flat_f1.csv'
fields = []
rows = []

with open(file, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

ours_arr = []

for idx, row in enumerate(rows):
    this_row = []
    this_row.append(float(row[1]))
    this_row.append(float(row[2]))
    this_row.append(float(row[3]))
    this_row = np.array(this_row)
    order = (-this_row).argsort()
    ranks = order.argsort()
    ours_arr.append(ranks+1)
ours_arr = np.array(ours_arr)
print(ours_arr)


[[1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [2 3 1]
 [1 2 3]
 [2 1 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [2 3 1]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [3 1 2]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [3 2 1]
 [1 3 2]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [2 1 3]
 [1 2 3]
 [2 1 3]
 [3 1 2]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [2 1 3]
 [2 1 3]
 [3 1 2]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [2 1 3]
 [3 1 2]
 [1 2 3]
 [2 1 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [1 3 2]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [2 1 3]
 [3 1 2]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]
 [1 2 3]
 [1 2 3]
 [1 2 3]
 [2 1 3]]


In [38]:
from scipy import stats

rho, pval = stats.spearmanr(human_arr, ours_arr, axis=None)
print(rho)
print(pval)

0.2555977457512623
6.61109668575468e-06


In [21]:
file = 'CLIP_scores_flat.csv'
fields = []
rows = []

with open(file, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

clips_arr = []

for idx, row in enumerate(rows):
    this_row = []
    this_row.append(float(row[1]))
    this_row.append(float(row[2]))
    this_row.append(float(row[3]))
#     print(this_row)
#     this_row = np.array(this_row)
#     order = (-this_row).argsort()
#     ranks = order.argsort()
#     clips_arr.append(ranks+1)
    clips_arr.append(this_row)

clips_arr = np.array(clips_arr)
print(clips_arr)

[938.0, 731.375, 1082.0]
[1024.5, 692.75, 1088.75]
[1031.0, 661.75, 1049.75]
[1100.5, 774.125, 1155.75]
[927.5, 789.5, 1134.75]
[924.25, 724.0, 1005.5]
[886.125, 677.875, 1079.5]
[946.75, 686.875, 1117.25]
[1065.25, 728.75, 1143.0]
[1005.875, 805.625, 1131.0]
[972.0, 700.125, 1046.5]
[826.375, 736.5, 974.75]
[887.875, 908.375, 1054.5]
[823.75, 871.5, 971.375]
[970.25, 865.375, 996.0]
[761.25, 774.125, 916.0]
[921.25, 702.625, 996.5]
[816.75, 746.375, 950.125]
[897.125, 794.625, 1004.125]
[769.375, 796.5, 886.25]
[905.25, 668.75, 1015.5]
[948.75, 914.125, 1047.5]
[927.875, 887.5, 995.0]
[806.625, 788.0, 918.125]
[991.125, 799.5, 1027.75]
[917.875, 947.75, 1102.25]
[977.625, 929.5, 1031.75]
[879.375, 776.125, 934.125]
[927.5, 860.125, 1035.875]
[967.25, 889.25, 1015.875]
[1015.125, 805.0, 1024.375]
[903.875, 769.75, 1025.75]
[938.25, 714.0, 1048.25]
[942.75, 813.375, 1002.375]
[896.125, 782.625, 972.125]
[903.0, 772.25, 948.0]
[795.625, 759.5, 910.0]
[865.25, 773.625, 968.5]
[979.125, 66

In [37]:
from scipy import stats

rho, pval = stats.spearmanr(human_arr, clips_arr, axis=None)
print(rho)
print(pval)

0.18316831683168316
0.0013631192939210061
