In [1]:
import json, os, glob, sys, re
from collections import defaultdict

In [2]:
# input video
video_filename = "a_corgi_taking_a_selfie.mp4"

# pre-generated questions
# (read inside a context manager so the file handle is closed right away
# instead of lingering for the lifetime of the kernel)
with open("sample_questions.json") as questions_file:
    tifa_questions = json.load(questions_file)

# working directory to save all the frames and corresponding answers
working_folder = "sample_results"

In [3]:
# Split the input video into frames stored in the working folder.
# NOTE(review): n_div=12 presumably has to match the n_div used when scoring
# (process_directory_tifa also defaults to 12) -- confirm against utils.extract_frames.
from utils import extract_frames

frames = extract_frames(
    video_filename,
    working_folder,
    n_div=12,
)

In [6]:
# Run VQA on every extracted frame for all questions.
# To use a different VLM, replace this import with another module that provides get_vqa_answers.
from gpt4o import get_vqa_answers

def get_vqa_answers_for_dir(image_directory, tifa_questions, vqa_fn=None):
    """Ask the VQA model every question about every PNG frame and save the answers.

    The result is written to ``<image_directory>/vqa_answers.json`` with the keys
    ``questions``, ``answers`` (question id -> {frame file name -> answer}),
    ``scene_graph`` and ``dependencies``.

    Args:
        image_directory: Folder that contains the ``*.png`` frames; the JSON
            file is written into the same folder.
        tifa_questions: Dict with the keys ``questions`` (a dict keyed by
            question id), ``scene_graph`` and ``question_dependencies``.
        vqa_fn: Optional callable ``vqa_fn(image_path, questions)`` returning a
            ``{question_id: answer}`` dict. Defaults to the notebook-level
            ``get_vqa_answers``, so existing callers are unaffected.

    Returns:
        None. The answers are only persisted to disk.
    """
    if vqa_fn is None:
        vqa_fn = get_vqa_answers

    # glob returns files in arbitrary order; sort so runs are reproducible.
    images = sorted(glob.glob(os.path.join(image_directory, "*.png")))

    this_prompt_questions = tifa_questions["questions"]

    this_prompt_answers = {
        "questions": this_prompt_questions,
        "answers": defaultdict(dict),
        "scene_graph": tifa_questions["scene_graph"],
        "dependencies": tifa_questions["question_dependencies"],
    }

    for image in images:
        this_image = os.path.basename(image)

        # Normalise question ids to str. JSON object keys are always strings,
        # so mixing int and str ids for the same question would produce
        # duplicate keys in the file (and the "N/A" filler below could then
        # overwrite a real answer when the file is loaded again).
        answers = {
            str(qid): answer
            for qid, answer in vqa_fn(image, this_prompt_questions).items()
        }

        for qid, answer in answers.items():
            this_prompt_answers["answers"][qid][this_image] = answer

        # Questions the model did not answer for this frame are marked "N/A".
        for qid in this_prompt_questions:
            qid = str(qid)
            if qid not in answers:
                this_prompt_answers["answers"][qid][this_image] = "N/A"

    with open(os.path.join(image_directory, "vqa_answers.json"), "w") as f:
        json.dump(this_prompt_answers, f, indent=4)

In [7]:
# Query the VQA model for every frame in the working folder (this can be slow)
get_vqa_answers_for_dir(image_directory=working_folder, tifa_questions=tifa_questions)

In [22]:
# Compute the alignment score for the model. Also, get the score for each question and record it.

def process_directory_tifa(directory, json_name="vqa_answers.json", n_div=12):
    """Compute the alignment score from the saved VQA answers and record it.

    A question scores 1 when its answer contains "yes" (case-insensitive) on
    three consecutive frames ``{i-1}.png, {i}.png, {i+1}.png``; the frame
    sequence is treated as circular, so the last frame neighbours the first.
    Otherwise it scores 0. A question also scores 0 when any of its
    dependencies that has already been scored got less than 0.5.

    The per-question scores (``alignment_scores_per_question``) and their mean
    (``alignment_score``) are written back into the same JSON file.

    Args:
        directory: Folder containing the answers file.
        json_name: Name of the JSON file with the keys ``answers`` and
            ``dependencies``.
        n_div: Number of frames; answers are looked up as ``0.png`` ...
            ``{n_div - 1}.png``. Frames without an answer count as "N/A".

    Returns:
        The mean per-question score, or 0.0 when there are no questions.
    """
    json_path = os.path.join(directory, json_name)
    with open(json_path) as f:
        tifa_score_dict = json.load(f)

    question_dependencies = tifa_score_dict["dependencies"]
    per_question_score_dict = {}

    for q_id, frame_answers in tifa_score_dict["answers"].items():
        # First, check dependencies. Only dependencies that were scored earlier
        # in this loop are considered; ids without a score (e.g. a placeholder
        # id that is not a real question) are ignored. A question without any
        # dependency entry is treated as having no prerequisites.
        dependency_failed = any(
            per_question_score_dict.get(str(dep_id), 1) < 0.5
            for dep_id in question_dependencies.get(q_id, [])
        )

        # if a prerequisite question got the wrong answer, this one is wrong too
        if dependency_failed:
            per_question_score_dict[q_id] = 0
            continue

        # 1 for every frame whose answer contains "yes", 0 otherwise
        this_scores = [
            1 if "yes" in frame_answers.get(f"{i}.png", "N/A").strip().lower() else 0
            for i in range(n_div)
        ]

        # Pooling: the question passes if three consecutive frames say yes.
        # Index i - 1 wraps to the last frame for i == 0 and (i + 1) % n_div
        # wraps to the first frame for the last i (circular neighbourhood).
        three_in_a_row = any(
            this_scores[i - 1] == 1
            and this_scores[i] == 1
            and this_scores[(i + 1) % n_div] == 1
            for i in range(n_div)
        )
        per_question_score_dict[q_id] = 1 if three_in_a_row else 0

    # record per_question_scores
    tifa_score_dict["alignment_scores_per_question"] = per_question_score_dict

    # average score; guard against an answers file without any question
    if per_question_score_dict:
        avg_score = sum(per_question_score_dict.values()) / len(per_question_score_dict)
    else:
        avg_score = 0.0
    tifa_score_dict["alignment_score"] = avg_score

    with open(json_path, "w") as f:
        json.dump(tifa_score_dict, f, indent=2)

    return avg_score

In [23]:
# Final alignment score for the video; kept as the last expression so it is shown as the cell output.

process_directory_tifa(directory=working_folder)

0.5