In [1]:
import argparse
import json
import torch
import json
import os

In [2]:
def save_arguments(args, filepath):
    """Write the attributes of ``args`` to ``filepath`` as a JSON object.

    Args:
        args: Object accepted by ``vars`` (e.g. an ``argparse.Namespace``)
            whose attribute values are JSON-serializable.
        filepath: Destination file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(filepath, mode='w') as out_file:
        arg_values = vars(args)
        json.dump(arg_values, out_file)

def load_arguments(filepath):
    """Read the JSON document stored at ``filepath`` and return it decoded.

    Args:
        filepath: Path of a JSON file (for example one written by
            ``save_arguments``).

    Returns:
        The decoded JSON value; a dict when the file holds a JSON object.
    """
    with open(filepath, mode='r') as in_file:
        return json.load(in_file)

# Optionally rebuild an argparse namespace from a dict of saved arguments
def repopulate_arguments(args_dict):
    """Turn a dict of argument values into an ``argparse`` namespace.

    One ``--<key>`` option is registered per entry, with the stored value as
    its default and the value's Python type as the option type.

    Args:
        args_dict: Mapping from argument name to its stored value.

    Returns:
        The namespace obtained by parsing an empty argument list, i.e. every
        option carries its registered default.
    """
    rebuilt_parser = argparse.ArgumentParser(description="Example script")
    for name in args_dict:
        stored = args_dict[name]
        rebuilt_parser.add_argument('--' + name, default=stored, type=type(stored))
    # An empty argv means nothing is read from the command line.
    return rebuilt_parser.parse_args([])

In [8]:
# Directory of the pretraining run; args.json and the checkpoint are read from here.
path = '../vqa_checkpoint/checkpoint_pretrain/llama2_7b_acc4_br5e3_correct_vnips'

# Restore the saved arguments and turn them back into a namespace.
loaded_args = load_arguments(f'{path}/args.json')
args = repopulate_arguments(loaded_args)

# Prefix the stored model path with '.'.
# NOTE(review): presumably this turns a './...' path recorded at the repo root
# into '../...' for this notebook's directory - confirm against args.json.
args.llama_model_path = '.' + args.llama_model_path
args.resume = path + '/checkpoint_19.pth'

In [4]:
# path = '../vqa_checkpoint/checkpoint_pretrain/llama2_13b_acc8_br8e3_bs4_vnips'

# loaded_args = load_arguments(path+'/args.json')

# args = repopulate_arguments(loaded_args)
# args.llama_model_path = '.' +args.llama_model_path
# args.resume='../vqa_checkpoint/checkpoint_pretrain/llama2_13b_acc8_br8e3_bs4_vnips/checkpoint_18.pth'

In [5]:
import sys
sys.path.append('../')
from llama import Tokenizer
from llama_vqa import LLaMA_VQA
from dataloader import load_data

  from .autonotebook import tqdm as notebook_tqdm
  from pandas.core import (


In [6]:
# Build the LLaMA_VQA model from the restored args with autograd disabled
# for the duration of the constructor.
with torch.no_grad():
    model = LLaMA_VQA(args)

Using model: 7B
loading from ../pretrained/llama2/7B/consolidated.00.pth


  _C._set_default_tensor_type(t)


In [9]:
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(args.resume, map_location='cpu')

# strict=False tolerates key mismatches between the checkpoint and the model.
# Checkpoint entries that matched no model parameter are reported, because
# they would otherwise be dropped silently.
load_result = model.load_state_dict(checkpoint['model'], strict=False)
if load_result.unexpected_keys:
    print(f"Warning: {len(load_result.unexpected_keys)} checkpoint keys were not used, "
          f"e.g. {load_result.unexpected_keys[:5]}")

# Join the path components instead of concatenating strings, so the tokenizer
# is found whether or not llama_model_path ends with a path separator.
tokenizer = Tokenizer(model_path=os.path.join(args.llama_model_path, 'tokenizer.model'))

In [11]:
# Load the CLIP "ViT-L/14" model together with its image preprocessing transform.
import clip
clip_model, preprocess = clip.load("ViT-L/14")
# Switch CLIP to eval mode and move it to the GPU.
clip_model.eval()
clip_model = clip_model.cuda()

In [12]:
import cv2
import numpy as np
from PIL import Image
def sample_images_from_video(video_path, num_samples=10):
    """Sample frames at a fixed stride from a video and return them as RGB PIL images.

    Frame ``i`` of the sample is read from position
    ``i * (total_frames // num_samples)``. When the video has fewer frames
    than ``num_samples`` the stride is 0, so every sample is the first frame.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        num_samples: Number of frames to request.

    Returns:
        A ``(sampled_images, total_frames)`` tuple. ``sampled_images`` holds one
        PIL image per frame that could be read (unreadable positions are
        reported and skipped). If the video cannot be opened the result is
        ``([], 0)``; if ``num_samples`` is not positive it is
        ``([], total_frames)``.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # Check the capture before querying it: an unopened capture reports
        # zero frames and zero FPS.
        if not cap.isOpened():
            print("Error opening video file.")
            return [], 0

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_samples <= 0:
            return [], total_frames

        # Stride between sampled frame positions.
        interval = total_frames // num_samples

        sampled_images = []
        for i in range(num_samples):
            frame_id = i * interval
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)

            ret, frame = cap.read()
            if ret:
                # OpenCV yields BGR; convert to RGB before wrapping in PIL.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                sampled_images.append(Image.fromarray(frame))
            else:
                print(f"Error reading frame at position {frame_id}")

        return sampled_images, total_frames
    finally:
        # Release the capture on every exit path, including the early returns.
        cap.release()


In [21]:
@torch.no_grad()
def decoding(model, tokenizer, prompt1, prompt2, video=None, max_new_tokens=15, use_memory=False):
    """Greedily generate tokens for ``prompt1 + <video features> + prompt2``.

    The input sequence is the embedded ``prompt1`` tokens (prefixed with BOS),
    the projected video features, then the embedded ``prompt2`` tokens. At
    every step the whole sequence is run through the last
    ``model.adapter_layer`` layers with a causal mask, the arg-max token at the
    final position is appended, and its embedding extends the sequence.

    Args:
        model: Model exposing the attributes used below (``adapter_query``,
            ``tok_embeddings``, ``visual_proj``, ``temporal_emb``, ``layers``,
            ``norm``, ``output``, ...).
        tokenizer: Tokenizer exposing ``bos_id`` and ``sp_model.encode``.
        prompt1: Text placed before the video features.
        prompt2: Text placed after the video features.
        video: Tensor of per-frame features; required. It is L2-normalized
            over the last dimension here. NOTE(review): its frame count must
            be addable to ``model.temporal_emb.weight`` - confirm the expected shape.
        max_new_tokens: Number of tokens to generate (no early stop on EOS).
        use_memory: If true, replace each feature by a softmax-weighted
            (scale 100) combination of ``model.memory`` rows before projection.

    Returns:
        List of token ids: BOS + ``prompt1`` tokens + ``prompt2`` tokens +
        generated tokens. The video positions have no entries in this list.

    Raises:
        ValueError: If ``video`` is None.
    """
    if video is None:
        raise ValueError("decoding() requires `video` features; got None")

    # One adapter slice per layer: (-1, adapter_len, dim) with a singleton dim at index 1.
    adapter = model.adapter_query.weight.reshape(-1, model.adapter_len, model.params.dim).unsqueeze(1)
    freqs = model.freqs_cis.cuda()

    tokens = [tokenizer.bos_id] + tokenizer.sp_model.encode(prompt1)
    query = torch.tensor(tokens, dtype=torch.int64).cuda()
    input_embedding = model.tok_embeddings(query)

    tokens_2 = tokenizer.sp_model.encode(prompt2)
    query_2 = torch.tensor(tokens_2, dtype=torch.int64).cuda()
    input_embedding_2 = model.tok_embeddings(query_2)
    tokens.extend(tokens_2)

    video = video.cuda().float()
    # Out-of-place division: `.cuda().float()` may return the caller's own
    # tensor, which an in-place `/=` would silently modify.
    video = video / video.norm(dim=-1, keepdim=True)
    if use_memory:
        sim = video @ model.memory.T
        sim = (sim * 100).softmax(dim=-1)
        video = sim @ model.memory
        video = video / video.norm(dim=-1, keepdim=True)

    video_feature = model.visual_proj(video)
    video_feature = (video_feature + model.temporal_emb.weight[:, :]).type(model.llamatype)
    # Index in the sequence at which the video features begin.
    vqa_video_start = input_embedding.shape[0]
    input_embedding = torch.cat([input_embedding, video_feature, input_embedding_2])

    # No key/value cache is used: every step re-runs the full sequence from position 0.
    start_pos = 0
    for _ in range(max_new_tokens):
        vqa_h = input_embedding.unsqueeze(0)
        seqlen = vqa_h.shape[-2]
        freqs_cis = freqs[:seqlen]
        # Causal mask: -inf strictly above the diagonal.
        mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=vqa_h.device)
        mask = torch.triu(mask, diagonal=1).type_as(vqa_h)

        for i, layer in enumerate(model.layers[-1 * model.adapter_layer:]):
            vqa_h = layer(vqa_h, start_pos, freqs_cis, mask, adapter[i].type(model.llamatype), vqa_video_start)
        vqa_h = model.norm(vqa_h)
        vqa_output = model.output(vqa_h)
        vqa_output = vqa_output.reshape(-1, model.vocab_size)

        # Greedy choice at the last position.
        next_token = vqa_output[-1, :].argmax()
        tokens.append(next_token.item())
        token_emb = model.tok_embeddings(next_token.unsqueeze(0))
        input_embedding = torch.cat([input_embedding, token_emb], dim=0)
    return tokens

In [14]:
# Location of the VATEX data and of its validation annotation file.
dataset_path = '../../data/videos/vatex/'
test_file = os.path.join(dataset_path,'vatex_validation_v1.0.json')

# Parse the annotation JSON; later cells read 'videoID' and 'enCap' from each record.
with open(test_file,'r') as f:
    test_info = json.load(f)

In [15]:
# Convert the VATEX records into a COCO-caption style ground-truth dict:
# one "image" per video and one annotation per English caption.
annotations = []
for v in test_info:
    for caption in v['enCap']:
        ann = {
            'image_id': v['videoID'],
            # Annotation ids must be unique: pycocotools indexes annotations by
            # 'id', so reusing the video id would keep one caption per video there.
            'id': len(annotations),
            'caption': caption,
        }
        annotations.append(ann)

annotations_gt = {
    'images': [{'id': v['videoID'], 'videoID': v['videoID']} for v in test_info],
    'annotations': annotations,
}

In [16]:

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# NOTE(review): overriding json.encoder.FLOAT_REPR looks like a legacy idiom for
# limiting float precision in JSON output - verify it has any effect on this
# Python version.
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.3f')
import sys

# Write the ground-truth dict to disk and load it back through the COCO API.
annotation_file = os.path.join(dataset_path,'annotation_file')
with open(annotation_file,'w') as f:
    json.dump(annotations_gt,f)
coco = COCO(annotation_file)
# Ids of all "images" (videos) registered in the ground truth.
video_ids = coco.getImgIds()

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [17]:
# Earlier prompt pair, kept for reference (the trailing number is the author's note).
# prompt = "Instruction: Predict the answer based on the video and question.\nVideo:"
# prompt2 = "\nQuestion: Summarize the video.\nAnswer: It is a video showing"  #26.7

# Active prompt pair: `prompt` precedes the video features and `prompt2` follows
# them when both are passed to decoding().
prompt = "Instruction: Generate a dense description for the video.\nVideo:"
prompt2 = "\nVideo Caption: The video shows"

In [22]:
from tqdm import tqdm

# One {'image_id', 'caption'} record per processed video.
results = []

# Inference only, so a single no_grad context covers the whole loop.
# NOTE(review): only the first 10 videos are captioned here; widen the slice
# (or pad the rest as in the commented lines at the end) before a full evaluation.
with torch.no_grad():
    for video_id in tqdm(video_ids[:10]):
        try:
            video_path = os.path.join(dataset_path, 'val_all', str(video_id) + '.mp4')
            sampled_images, _ = sample_images_from_video(video_path)

            # Encode the sampled frames with CLIP and L2-normalize the features.
            image_features = [preprocess(image) for image in sampled_images]
            image_features = torch.stack(image_features, dim=0).cuda()
            image_features = clip_model.encode_image(image_features)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            tokens = decoding(model, tokenizer, prompt, prompt2, image_features)
            generate_text = tokenizer.decode(tokens[:])
            # Keep the text after the caption marker, drop the prompt's lead-in,
            # and cut at the first period.
            generate_text = generate_text.split('Video Caption: ')[1].replace("The video shows", '').strip().split('.')[0]
            results.append({'image_id': video_id, 'caption': generate_text})
        except Exception as err:
            # Best effort: report the failure instead of hiding it and fall back
            # to a placeholder caption. KeyboardInterrupt/SystemExit are not
            # caught, so the loop can still be interrupted.
            tqdm.write(f"[{video_id}] captioning failed ({type(err).__name__}: {err}); using fallback caption")
            results.append({'image_id': video_id, 'caption': 'A video'})
# for video_id in video_ids[10:]:
#     results.append({'image_id':video_id,'caption': 'a video'})

100%|██████████| 10/10 [00:17<00:00,  1.73s/it]


In [23]:
# Display the generated caption records for inspection.
results

[{'image_id': 'G9zN5TTuGO4_000179_000189',
  'caption': 'a man named Igor Kuzmich climbing a snowy mountain'},
 {'image_id': 'CQzUU7-cVck_000006_000016',
  'caption': 'a man jumping from a tree to a wooden obstacle course'},
 {'image_id': 'OMK0OJ4f_TI_000000_000010',
  'caption': 'a man wearing a black hood and a black mask, and he'},
 {'image_id': 'xnyOA58A07Q_000127_000137',
  'caption': 'Beth and Jim Acosta, two climbers, rappelling down a cl'},
 {'image_id': 'n6lUXDwL4Y0_000022_000032',
  'caption': 'a man demonstrating the importance of being flexible and having a positive attitude when'},
 {'image_id': 'cU1qVk7HXfE_000304_000314',
  'caption': 'a man in a Jesus Christ costume performing a striptease'},
 {'image_id': '7CN3ENwfMBE_000096_000106',
  'caption': 'a man performing a one-man show about the character Ostroff from'},
 {'image_id': 'rW0KwHhQZTE_000013_000023',
  'caption': 'a group of students performing an act of mischief'},
 {'image_id': 'KKm1bM51CZs_000070_000080',
  'c

In [23]:
# Wrap the generated captions as a COCO result set tied to the ground truth.
coco_result = coco.loadRes(results)

# Run the caption metrics.
coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()

# Print every evaluation score and collect it under its metric name.
scores = {}
for metric in coco_eval.eval:
    score = coco_eval.eval[metric]
    print(f"{metric}: {score:.3f}")
    scores[metric] = score

# Persist the scores.
with open('vatex_7B_results.json', 'w') as f:
    json.dump(scores, f)

Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 486507 tokens at 1215912.69 tokens per second.
PTBTokenizer tokenized 28554 tokens at 393917.56 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 25107, 'reflen': 32132, 'guess': [25107, 22107, 19107, 16107], 'correct': [17572, 6785, 2294, 735]}
ratio: 0.7813705962902782
Bleu_1: 0.529
Bleu_2: 0.350
Bleu_3: 0.223
Bleu_4: 0.140
computing METEOR score...
METEOR: 0.166
computing Rouge score...
ROUGE_L: 0.371
computing CIDEr score...
CIDEr: 0.310
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.7 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 15.56 s
SPICE: 0.074
Bleu_1: 0.529
Bleu_2: 0.350
Bleu_3: 0.223
Bleu_4: 0.140
METEOR: 0.166
ROUGE_L: 0.371
CIDEr: 0.310
SPICE: 0.074


In [21]:
# Wrap the generated captions as a COCO result set tied to the ground truth.
coco_result = coco.loadRes(results)

coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()

# Print every evaluation score and store it under its own metric name.
# The key must be the loop variable: a literal 'metric' key would leave only
# the last score in the saved file.
scores = {}
for metric, score in coco_eval.eval.items():
    print(f"{metric}: {score:.3f}")
    scores[metric] = score
with open('msrvtt_results.json', 'w') as f:
    json.dump(scores, f)

Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 617049 tokens at 947671.93 tokens per second.
PTBTokenizer tokenized 23044 tokens at 341563.64 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 20039, 'reflen': 21562, 'guess': [20039, 17049, 14060, 11076], 'correct': [14340, 6147, 1952, 667]}
ratio: 0.9293664780632163
Bleu_1: 0.663
Bleu_2: 0.471
Bleu_3: 0.306
Bleu_4: 0.200
computing METEOR score...
METEOR: 0.214
computing Rouge score...
ROUGE_L: 0.481
computing CIDEr score...
CIDEr: 0.307
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 7.328 s
SPICE: 0.053
Bleu_1: 0.663
Bleu_2: 0.471
Bleu_3: 0.306
Bleu_4: 0.200
METEOR: 0.214
ROUGE_L: 0.481
CIDEr: 0.307
SPICE: 0.053
