In [95]:
import cv2
import numpy as np
from PIL import Image
import io

def process_video_into_frames(video_link):
    video = cv2.VideoCapture(video_link)
    
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(video.get(cv2.CAP_PROP_FPS))  # Frames per second
    duration = total_frames / fps  # Video duration in seconds

    print(f"Total number of frames: {total_frames}")
    print(f"Video duration (seconds): {duration}")

    frames = []
    
    # Ensure we get 20 frames evenly spaced
    if total_frames < 100:
        print("Not enough frames in the video.")
        return frames
    
    # Get 20 evenly spaced indices across the video frames
    sample_intervals = np.linspace(0, total_frames - 1, 100, dtype=int)  # Get 20 evenly spaced indices
    
    for idx in sample_intervals:
        video.set(cv2.CAP_PROP_POS_FRAMES, idx)  # Jump to specific frame
        ret, frame = video.read()
        if not ret:
            continue
        
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(frame)

        # Save the frame as bytes
        img_bytes = io.BytesIO()
        img.save(img_bytes, format="JPEG")
        img_bytes = img_bytes.getvalue()

        frames.append(img_bytes)
    
    video.release()
    return frames

In [111]:
def query_frame(frames, prompt):
    
    client = ollama.Client()
    
    model = "ZimaBlueAI/Qwen2.5-VL-7B-Instruct"
    
    # Send image to the LLM
    response = client.generate(
        model=model,
        prompt=prompt,
        images=frames 
    )
    
    return response['response']

In [70]:
def query(prompt):
    
    client = ollama.Client()
    
    model = "minicpm-v"
    
    # Send image to the LLM
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response']

In [52]:
def ask(video_link, final_prompt):

    context_dict = []

    ''' Add sampling and reasoning model and vector db? '''
    frames = process_video_into_frames(video_link)
    
    for i in range(0, len(frames), 5):  
            
        frame_context = frames[i:i+5]  
    
        prompt = "Tell me what does this frames do? It is part of a video. Keep it short"

        if i+6 > len(frames):

            prompt = f"Answer this question {final_prompt}, context is given from previous frames: " + " ".join(context_dict)

        result = query_frame(frame_context, prompt)
        context_dict.append(result)
        
    print("ENGINEERED PROMPT HERE:" + prompt)
    
    return result

# Pipeline

In [72]:
import pandas as pd

df = pd.read_parquet(r"C:\Users\leege\Downloads\test-00000-of-00001.parquet")
df.head()

Unnamed: 0,qid,video_id,question_type,capability,question,duration,question_prompt,answer,youtube_url
0,0008-0,sj81PWrerDk,Primary Open-ended Question,Plot Attribute (Montage),What is the difference between the action of t...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
1,0008-1,sj81PWrerDk,Paraphrased Open-ended Question,Plot Attribute (Montage),Can you describe how the actions of the last p...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
2,0008-2,sj81PWrerDk,Correctly-led Open-ended Question,Plot Attribute (Montage),Did the last person open the bottle without us...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
3,0008-3,sj81PWrerDk,Wrongly-led Open-ended Question,Plot Attribute (Montage),Did the last person in the video open the bott...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
4,0008-7,sj81PWrerDk,Multiple-choice Question with a Single Correct...,Plot Attribute (Montage),How does the last person in the video open the...,8.85,E. None of the above\nSelect one best answer t...,,https://www.youtube.com/shorts/sj81PWrerDk


In [53]:
import pandas as pd
import os

folder_path = r"C:\Users\leege\Downloads\Benchmark-AllVideos-HQ-Encoded-challenge\Benchmark-AllVideos-HQ-Encoded-challenge"

video_ids = df['video_id'].unique().tolist()

result_data = []

for videos in video_ids:

    filtered = df[df['video_id'] == videos]

    videos = videos+".mp4"
    video_path = os.path.join(folder_path, videos)

    context_dict = []
    
    for index, row in filtered.iterrows():
        question = row['question']
        prompt = row['question_prompt']

        question_prompt = question + prompt + 'please choose to the best of your ability the right answer'

        ''' Add the question phrasing part here '''
        result = ask(video_path, question_prompt) 

        print(question_prompt)
        print(result)
        
        result_data.append({
            'video_id': video_id,
            'video_path': video_path,
            'question': question,
            'question_prompt': prompt,
            'result': result
        })
        break

        
    break
result_df = pd.DataFrame(result_data)

Total number of frames: 263
Video duration (seconds): 9.068965517241379
One cycle done
One cycle done
One cycle done
One cycle done
ENGINEERED PROMPT HERE:Answer this question What is the difference between the action of the last person in the video and the actions of the first two people?Please state your answer with a brief explanation.please choose to the best of your ability the right answer, context is given from previous frames: The frames are from a split-screen music video where two individuals, one in each half, perform actions with drinks and make expressive gestures to the camera while singing or rapping lyrics. The first individual appears outside on a balcony holding a bottle of beer, opening it, taking a sip, and gesturing confidently as if performing for an audience not visible within this frame. Meanwhile, another person is seen indoors at what looks like a kitchen counter also drinking from a similar-looking beverage container with enthusiasm or emphasis in their gestu

# Random 

In [116]:
frames = process_video_into_frames(r"C:\Users\leege\Downloads\Benchmark-AllVideos-HQ-Encoded-challenge\Benchmark-AllVideos-HQ-Encoded-challenge\sj81PWrerDk.mp4")

Total number of frames: 263
Video duration (seconds): 9.068965517241379


In [122]:
len(frames)

100

In [125]:
prompt = f"""This are snippets sampled every few frames from a video, please pay attention to 
            1) How many people are there
            2) What are they doing with a bottle
            and give me a summary of what is going on. 
            The context should be linked to: {question}"""

result1 = query_frame(frames[:5], prompt)
display(result1)

"The images appear to show a sequence where an individual is interacting with a bottle in different settings or actions, possibly involving opening it or examining its contents. However, there are no clear visual cues indicating whether any of these actions were performed without using a knife.\n\n1. In image-0: The person seems to be holding something up close to their mouth.\n2. Image-1 and 2 show the individual in different poses with a bottle nearby but not directly interacting with it.\n3. Image-3 shows the same individual standing next to or interacting with another object, which does not clearly indicate any use of a knife.\n4. In image-4, there is no clear evidence that suggests the person used a knife; instead, they might be holding or presenting something in their hands.\n\nGiven these observations:\n\n- The setting and context are too limited for definitive conclusions about whether anyone is opening a bottle without using a knife. \n- The images do not provide enough inform

In [127]:
prompt = f"""This are snippets sampled every few frames from a video, please pay attention to 
            1) How many people are there
            2) What are they doing with a bottle
            and give me a summary of what is going on. Please take into account the context from the previous frames of the same video, which is:
            {result1} 
            The eventual question I am trying to answer is: {question}"""

result2 = query_frame(frames[5:10], prompt)
display(result2)

KeyboardInterrupt: 

In [None]:
prompt = f"""This are snippets sampled every few frames from a video, please pay attention to 
            1) How many people are there
            2) What are they doing with a bottle
            and give me a summary of what is going on. Please take into account the context from the previous frames of the same video, which is:
            {result2}
            The eventual question I am trying to answer is: {question}"""

result3 = query_frame(frames[10:15], prompt)
display(result3)

In [None]:
prompt = f"""This are snippets sampled every few frames from a video, please pay attention to 
            1) How many people are there
            2) What are they doing with a bottle
            and give me a summary of what is going on. Please take into account the context from the previous frames of the same video, which is:
            {result3}
            The eventual question I am trying to answer is: {question}"""

result4 = query_frame(frames[15:20], prompt)
display(result4)

In [None]:
prompt = f""" I have split up a video and these are the descriptions of each part. 
                Part 1: {result1} END
                Part 2: {result2} END
                Part 3: {result3} END 
                Part 4: {result4} END
                Please answer the question {question}
"""

final = query(prompt)
display(final)

In [1]:
from IPython.display import display
from PIL import Image
import io

for img_bytes in frames:
    img = Image.open(io.BytesIO(img_bytes))
    display(img)