Ego4D download videos

In [None]:
# Step 1: Install the Ego4D CLI directly from its GitHub repository.
# No tag or commit is given in the URL, so pip installs whatever the repository's
# default branch currently contains. The `ego4d` command it provides is what the
# download cell further down invokes.
!pip install git+https://github.com/facebookresearch/Ego4d.git#egg=ego4d

In [None]:
!pip install awscli
import os
os.environ["PATH"] += ":/root/.local/bin"
!aws --version

In [None]:
import os

# Credentials come from the standard AWS environment variables so that secrets do not
# have to be typed into (and saved with) the notebook. Set them before running this
# cell, e.g. `os.environ["AWS_ACCESS_KEY_ID"] = "..."` in a cell you do not commit.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
aws_region = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")  # Or your preferred region

# Stop before touching ~/.aws when a value is missing: writing blanks would replace
# any working credentials file with an unusable one while still reporting success.
if not aws_access_key or not aws_secret_key:
    raise ValueError(
        "AWS credentials are empty. Set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY "
        "before running this cell."
    )

# Create AWS config and credentials paths
aws_dir = os.path.expanduser("~/.aws")
os.makedirs(aws_dir, exist_ok=True)

# Write to ~/.aws/credentials.
# The file is created owner-read/write only; the chmod afterwards also tightens a
# file that already existed with broader permissions.
credentials_path = os.path.join(aws_dir, "credentials")
credentials_fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(credentials_fd, "w") as f:
    f.write(f"""[default]
aws_access_key_id = {aws_access_key}
aws_secret_access_key = {aws_secret_key}
""")
os.chmod(credentials_path, 0o600)

# Write to ~/.aws/config
with open(os.path.join(aws_dir, "config"), "w") as f:
    f.write(f"""[default]
region = {aws_region}
output = json
""")

print("✅ AWS default profile created!")


In [None]:
import json

# Load the annotation file
with open('egotempo_openQA.json', 'r') as f:
    data = json.load(f)

# Sets, so that every ID is written only once
video_uids = set()
video_clip_uids = set()

# Walk over the annotations and collect the clip ID and its parent video ID
for annotation in data['annotations']:
    clip_id = annotation['clip_id']

    # Drop a trailing ".mp4" so IDs are stored without a file extension
    if clip_id.endswith('.mp4'):
        clip_id = clip_id[:-4]

    # A clip ID has the form "<video_id>_<start>_<end>". Splitting from the right
    # keeps video IDs that themselves contain underscores intact, which matches how
    # the trimming cell takes the same IDs apart.
    video_id = clip_id.rsplit('_', 2)[0]
    video_uids.add(video_id)

    # Full clip_id (with _start_end timestamps) goes into video_clip_uids
    video_clip_uids.add(clip_id)

# Save video IDs (only the base video names), one per line
with open('video_uids.txt', 'w') as f:
    for video_id in sorted(video_uids):
        f.write(video_id + '\n')

# Save clip IDs (full clip names without .mp4), one per line
with open('video_clip_uids.txt', 'w') as f:
    for clip_id in sorted(video_clip_uids):
        f.write(clip_id + '\n')

print(f"Extracted {len(video_uids)} unique video IDs to 'video_uids.txt'.")
print(f"Extracted {len(video_clip_uids)} unique clip IDs to 'video_clip_uids.txt'.")


In [None]:
# Read the video UIDs into a list, skipping blank lines.
# Note: this rebinds `video_uids`, which the ID-extraction cell used for a set.
with open('video_uids.txt', 'r') as f:
    video_uids = [line.strip() for line in f if line.strip()]

# Join them into a space-separated string so they can be spliced into the command line
video_uids_str = ' '.join(video_uids)

# Run the Ego4D CLI for the listed UIDs, writing into ./ego4d_clips.
# The UIDs are interpolated into the shell command unquoted, so the text file
# must contain plain IDs only.
!ego4d --video_uids {video_uids_str} \
       -o ./ego4d_clips --datasets video_540ss --yes


Trim videos

In [None]:
import os
import subprocess

# === CONFIG ===
# Clip list to read, folder holding the raw downloads, and folder for the trimmed clips.
input_txt = "video_clip_uids.txt"
raw_download_dir = "./ego4d_clips/v2/video_540ss"
output_dir = "./trimmed_clips"

# Create both folders up front; folders that already exist are left untouched.
for _folder in (output_dir, raw_download_dir):
    os.makedirs(_folder, exist_ok=True)

# === Function to download and trim ===
def download_and_trim(line):
    """Cut one clip out of an already-downloaded raw video with ffmpeg.

    Despite the name, nothing is downloaded here: the raw video is expected at
    ``<raw_download_dir>/<video_id>.mp4`` and the clip is written to
    ``<output_dir>/<clip id>.mp4`` (both folders are module-level settings).

    Args:
        line: One clip ID in the form ``videoid_start_end``, where ``start`` and
            ``end`` are numbers. Surrounding whitespace is ignored and blank lines
            are skipped. The video ID itself may contain underscores.

    Returns:
        None. Any failure (bad format, missing raw video, ffmpeg error) is printed
        and swallowed so that one bad entry does not stop a batch run.
    """
    line = line.strip()
    if not line:
        return  # Skip empty lines

    try:
        parts = line.split('_')
        if len(parts) < 3:
            raise ValueError("Line doesn't match expected format: videoid_start_end")

        # Everything before the last two fields is the video ID, so IDs with
        # underscores survive.
        video_id = '_'.join(parts[:-2])
        start = float(parts[-2])
        end = float(parts[-1])
        if end <= start:
            raise ValueError(f"End time {end} is not after start time {start}")

        # Name the file after the clip ID exactly as written. Rebuilding the name
        # from the parsed floats would turn "abc_10_20" into "abc_10.0_20.0", and
        # later steps that look clips up by their ID would no longer find them.
        clip_name = f"{line}.mp4"
        output_path = os.path.join(output_dir, clip_name)
        raw_video_path = os.path.join(raw_download_dir, f"{video_id}.mp4")

        # Fail with a clear message instead of a long ffmpeg error dump.
        if not os.path.isfile(raw_video_path):
            raise FileNotFoundError(f"Raw video not found: {raw_video_path}")

        # Trim with ffmpeg (stream copy, no re-encoding)
        print(f"Trimming {video_id} from {start} to {end}...")
        subprocess.run([
            "ffmpeg", "-y", "-i", raw_video_path,
            "-ss", str(start), "-to", str(end),
            "-c", "copy", output_path
        ], check=True)

        print(f"Saved: {output_path}")

    except Exception as e:
        print(f"Error processing '{line}': {e}")

# === Run for all lines ===
# Read the whole clip list first, then process the entries one after another.
with open(input_txt, 'r') as f:
    lines = list(f)

for line in lines:
    download_and_trim(line)


Extract frames from clips

In [None]:
import cv2
import numpy as np
import os

def extract_uniform_frames(video_path, base_output_dir, num_frames=32):
    """Save evenly spaced frames of a video as JPEG files.

    Frames are written to ``<base_output_dir>/<video file name without extension>/``
    as ``frame_000001.jpg``, ``frame_000002.jpg``, ... in playback order.

    Args:
        video_path: Path of the video file to read.
        base_output_dir: Folder under which the per-video frame folder is created.
        num_frames: Number of frames to sample. If the video reports fewer frames
            than this, every frame is saved.

    Returns:
        None. A missing or unreadable video is reported with a message and skipped
        without creating an output folder.
    """
    if not os.path.exists(video_path):
        print(f"Video path does not exist: {video_path}")
        return

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Without this check an unreadable file would be reported as "Done" with
        # zero frames and leave an empty folder behind.
        cap.release()
        print(f"Could not open video: {video_path}")
        return

    try:
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        output_dir = os.path.join(base_output_dir, video_name)
        os.makedirs(output_dir, exist_ok=True)

        print(f"Extracting frames from: {video_path}")
        print(f"Saving to: {output_dir}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames < num_frames:
            print(f"Video has only {total_frames} frames. Extracting all of them.")
            frame_indices = np.arange(max(total_frames, 0), dtype=int)
        else:
            frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

        print(f"Extracting frames at indices: {frame_indices}")

        # A set gives constant-time lookups; testing against the array would scan
        # it once for every decoded frame.
        wanted = set(frame_indices.tolist())

        saved = 0
        for i in range(total_frames):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is actually decodable.
                break
            if i in wanted:
                frame_name = f"frame_{saved + 1:06d}.jpg"
                cv2.imwrite(os.path.join(output_dir, frame_name), frame)
                saved += 1
    finally:
        # Release the capture even if reading or writing fails part-way.
        cap.release()

    print(f"✅ Done! Extracted {saved} frames to {output_dir}\n")


# === Apply to all videos ===
trimmed_dir = "/content/trimmed_clips"
output_base_dir = "/content/extracted_frames"

# Only .mp4 files are processed; anything else in the folder is ignored.
mp4_names = (name for name in os.listdir(trimmed_dir) if name.endswith(".mp4"))
for filename in mp4_names:
    video_path = os.path.join(trimmed_dir, filename)
    extract_uniform_frames(video_path, output_base_dir)

Evaluate using Gemini

In [None]:
import os
import json
import time
import random
import ast
import base64
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai


class Config:
    """Settings for the Gemini inference run."""

    # Folder that receives the result file (created on demand by main()).
    OUTPUT_DIR = './results/'
    # Annotation file holding the questions and ground-truth answers.
    DATA_FILE = './egotempo_openQA.json'
    # Taken from the environment so the key does not have to be saved in the notebook.
    # Falls back to an empty string, as before, when the variable is not set.
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')
    MODEL_NAME = 'gemini-1.5-flash'
    TEMPERATURE = 0.0
    # Number of worker threads used for inference.
    MAX_WORKERS = 1
    # NOTE(review): BATCH_SIZE and SHUFFLE_DATA are not referenced by the code
    # visible in this notebook - confirm whether they are still needed.
    BATCH_SIZE = 1
    SHUFFLE_DATA = False
    # File name of the result file inside OUTPUT_DIR.
    RESULT_FILE_TEMPLATE = 'results.json'


class QADataset:
    """Index-addressable view over the "annotations" list of a QA JSON file."""

    def __init__(self, data_file):
        """Read ``data_file`` and keep its "annotations" list in memory."""
        with open(data_file, 'r') as handle:
            self.annotations = json.load(handle)["annotations"]

    def __len__(self):
        """Return the number of annotations."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return one annotation as a dict, together with the prompt built from its question."""
        entry = self.annotations[index]
        clip = entry['clip_id']
        asked = entry['question']
        kind = entry['question_type']
        truth = entry['answer']

        # The prompt is four fixed sentences separated by single spaces; only the
        # second one carries the question text.
        prompt = " ".join((
            "These are frames from a video that I want to upload.",
            f"Use the visual cues to answer the question: {asked}.",
            "You need to answer the question in any case and not demand additional context information.",
            "Note: All actions mentioned refer to the person recording the video.",
        ))

        return {
            'video_id': clip,
            'question_answer': prompt,
            'question': asked,
            'answer': truth,
            'category': kind
        }


def load_images_as_base64(frames_dir, max_frames=1):
    """Load frame images from a folder as base64-encoded inline-data dicts.

    Args:
        frames_dir: Folder containing the frame images. Only files ending in
            ``.jpg``, ``.jpeg`` or ``.png`` are considered, in sorted name order.
        max_frames: Maximum number of frames to return, counted from the start of
            the sorted list. Defaults to 1, the limit this function always applied;
            pass ``None`` to return every frame.

    Returns:
        A list of ``{"mime_type": ..., "data": ...}`` dicts, where ``data`` is the
        base64 text of the file contents.
    """
    image_names = sorted(
        name for name in os.listdir(frames_dir)
        if name.endswith(('.jpg', '.jpeg', '.png'))
    )

    # Apply the limit before reading, so frames that would be thrown away are
    # never opened or encoded.
    if max_frames is not None:
        image_names = image_names[:max_frames]

    images_b64 = []
    for name in image_names:
        # Label PNG files correctly instead of calling everything JPEG.
        mime_type = "image/png" if name.endswith('.png') else "image/jpeg"
        with open(os.path.join(frames_dir, name), 'rb') as img_file:
            b64_image = base64.b64encode(img_file.read()).decode('utf-8')
        images_b64.append({
            "mime_type": mime_type,
            "data": b64_image
        })
    return images_b64


def call_gemini_model(model, images, text_prompt):
    """Send the images followed by the text prompt as one user turn and return the reply text.

    ``model`` must provide ``generate_content(contents=...)`` returning an object
    with a ``text`` attribute. ``images`` is a list of part dicts and is not modified.
    """
    text_part = {"text": text_prompt}
    user_turn = {"role": "user", "parts": images + [text_part]}
    reply = model.generate_content(contents=[user_turn])
    return reply.text


def process_qa_item(batch, model, existing_entries, frames_root='/content/extracted_frames'):
    """Answer one question with the model, using the frames extracted for its clip.

    Args:
        batch: Dict with the keys ``video_id``, ``question``, ``question_answer``
            (the full prompt), ``category`` and ``answer``.
        model: Model object handed through to ``call_gemini_model``.
        existing_entries: Collection of ``(video_id, question)`` pairs that already
            have a result; matching items are skipped.
        frames_root: Folder that holds one sub-folder of frames per clip. Defaults
            to the location this function always used.

    Returns:
        A result dict with the keys ``V`` (video id), ``Q`` (question), ``QA``
        (prompt), ``A`` (model answer), ``C`` (ground-truth answer) and ``M``
        (category), or ``None`` when the item was skipped, its frames are missing,
        or the model call failed.
    """
    uid = batch['video_id']
    question = batch['question']
    question_answer = batch['question_answer']
    category = batch['category']
    answer = batch['answer']

    if (uid, question) in existing_entries:
        return None

    # Frame folders are named after the clip file without its extension, while the
    # annotation's clip ID may still carry ".mp4". Strip it for the lookup only;
    # the stored "V" value keeps the ID exactly as annotated.
    clip_name = uid[:-4] if uid.endswith('.mp4') else uid
    frames_dir = os.path.join(frames_root, clip_name)
    if not os.path.exists(frames_dir):
        print(f"Frames directory not found: {frames_dir}")
        return None

    try:
        images = load_images_as_base64(frames_dir)
        output_text = call_gemini_model(model, images, question_answer)
        print(output_text)
    except Exception as e:
        # One failing request must not abort the whole run; report and move on.
        print(f"Error processing {uid}: {e}")
        return None

    return {
        "V": uid,
        "Q": question,
        "QA": question_answer,
        "A": output_text,
        "C": answer,
        "M": category
    }


def perform_bulk_inference(dataset, model, output_file_path):
    """Run every dataset item through ``process_qa_item`` and store the results as JSON.

    Results already present in ``output_file_path`` with a non-empty "A" field are
    kept, and their (video, question) pairs are passed on so they are not asked
    again. The file is rewritten whenever the number of results reaches a multiple
    of 50 and once more at the end.
    """
    def _write(records):
        # Overwrite the result file with the current list of records.
        with open(output_file_path, 'w') as out:
            json.dump(records, out)

    results, existing_entries = [], set()

    if os.path.exists(output_file_path):
        with open(output_file_path, 'r') as previous_file:
            try:
                previous = json.load(previous_file)
                results = [record for record in previous if record["A"] != ""]
                existing_entries = {(record["V"], record["Q"]) for record in results}
            except Exception as e:
                print("Error loading previous results:", e)

    with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as pool:
        # Map each submitted future back to the dataset position it came from.
        pending = {}
        for position in range(len(dataset)):
            job = pool.submit(process_qa_item, dataset[position], model, existing_entries)
            pending[job] = position

        progress = tqdm(as_completed(pending), total=len(pending), desc="Running Inference")
        for finished in progress:
            outcome = finished.result()
            if not outcome:
                continue
            results.append(outcome)
            if len(results) % 50 == 0:
                _write(results)

    _write(results)
    print(f"Saved {len(results)} results to {output_file_path}")


def main():
    """Configure the Gemini client, load the QA dataset and run inference over it.

    Results are written to ``Config.OUTPUT_DIR / Config.RESULT_FILE_TEMPLATE``.
    """
    genai.configure(api_key=Config.GEMINI_API_KEY)
    # Pass the configured temperature to the model; previously Config.TEMPERATURE
    # was defined but never applied, so the service default was used instead.
    model = genai.GenerativeModel(
        Config.MODEL_NAME,
        generation_config={"temperature": Config.TEMPERATURE},
    )

    os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

    dataset = QADataset(Config.DATA_FILE)
    output_file_path = os.path.join(Config.OUTPUT_DIR, Config.RESULT_FILE_TEMPLATE)
    perform_bulk_inference(dataset, model, output_file_path)


if __name__ == "__main__":
    main()


Evaluate the results

In [None]:
import json
import os
import re
import ast
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai


# Taken from the environment so the key does not have to be saved in the notebook.
# Falls back to an empty string, as before, when the variable is not set.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')
MODEL_NAME = 'gemini-1.5-flash'
# Root folder for the evaluation output.
OUTPUT_EVAL_DIR = './eval_results/'
# Number of evaluation requests issued in parallel.
MAX_WORKERS = 5


# Module-level client and model, shared by the evaluation functions below.
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)


def create_prompt(q, a, pred):
    """Build the grading prompt for one question, its ground-truth answer and the prediction.

    The prompt has a fixed "system" section with the grading instructions followed
    by a "user" section that carries the three values and the required reply format.
    """
    system_section = """role: "system",
content: "You are an intelligent chatbot designed for evaluating the correctness of AI assistant predictions for question-answer pairs.
Your task is to compare the predicted answer with the ground-truth answer and determine if the predicted answer is correct or not. Here's how you can accomplish the task:
-----##INSTRUCTIONS:
- Focus on the correctness and accuracy of the predicted answer with the ground-truth.
- Consider uncertain predictions, such as 'it is impossible to answer the question from the video', as incorrect, unless the ground truth answer also says that."
"""
    user_section = f"""role: "user",
content: "Please evaluate the following video-based question-answer pair:
Question: {q}
Ground truth correct Answer: {a}
Predicted Answer: {pred}
Provide your evaluation as a correct/incorrect prediction along with the score where the score is an integer value between 0 (fully wrong) and 5 (fully correct). The middle score provides the percentage of correctness.
Please generate the response in the form of a Python dictionary string with keys 'pred', 'score' and 'reason', where value of 'pred' is a string of 'correct' or 'incorrect',
value of 'score' is in INTEGER, not STRING and value of 'reason' should provide the reason behind the decision."
"""
    return system_section + user_section


def _parse_eval_dict(text):
    """Pull the verdict dictionary out of free-form model output, or return None."""
    # Try the widest brace-delimited span first so that a "}" inside the reason
    # text does not cut the dictionary short; fall back to the shortest span for
    # replies that contain extra braces after the dictionary.
    for pattern in (r'\{.*\}', r'\{.*?\}'):
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            continue
        snippet = match.group(0)
        # The prompt asks for a Python dict literal, but replies written as JSON
        # (true/false/null) are accepted as well.
        for parser in (ast.literal_eval, json.loads):
            try:
                parsed = parser(snippet)
            except (ValueError, SyntaxError, TypeError):
                continue
            if isinstance(parsed, dict):
                return parsed
    return None


def evaluate_with_gemini(qa_item):
    """Ask the judge model to grade one prediction.

    Args:
        qa_item: Result dict with the keys ``Q`` (question), ``C`` (ground-truth
            answer) and ``A`` (predicted answer).

    Returns:
        ``{"pred": ..., "score": int, "reason": ...}`` taken from the model's reply,
        or ``None`` when the request fails or no usable dictionary is found.
        Failures are printed rather than raised.
    """
    question = qa_item['Q']
    answer = qa_item['C']
    pred = qa_item['A']

    prompt = create_prompt(question, answer, pred)

    try:
        response = model.generate_content(prompt)
        eval_dict = _parse_eval_dict(response.text)
        if eval_dict is not None:
            return {
                "pred": eval_dict.get("pred", ""),
                "score": int(eval_dict.get("score", 0)),
                "reason": eval_dict.get("reason", "")
            }
        print("Error evaluating: no dictionary found in model response")
    except Exception as e:
        print(f"Error evaluating: {e}")
    return None


def evaluate_predictions(input_path, output_path):
    """Grade every prediction in a results file and save the verdicts as JSON.

    Args:
        input_path: JSON file containing a list of result dicts (keys ``Q``, ``C``,
            ``A``, ...), as consumed by ``evaluate_with_gemini``.
        output_path: Destination JSON file. Its parent folder is created if needed;
            a bare file name writes into the current directory.

    The output is a list of ``[evaluation, original_item]`` pairs in input order.
    Items whose evaluation failed are left out, and their number is printed.
    """
    with open(input_path, "r") as f:
        raw_data = json.load(f)

    print(f"Loaded {len(raw_data)} QA pairs from {input_path}")

    # executor.map yields results in input order, so they line up with raw_data.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        evaluations = list(tqdm(executor.map(evaluate_with_gemini, raw_data), total=len(raw_data)))

    results = [
        [evaluation, item]
        for evaluation, item in zip(evaluations, raw_data)
        if evaluation
    ]

    skipped = len(raw_data) - len(results)
    if skipped:
        print(f"Skipped {skipped} QA pairs whose evaluation failed")

    # os.makedirs('') raises, so only create a folder when the path names one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

    print(f"Saved evaluated results to {output_path}")


def main():
    """Evaluate every ``.json`` results file found in ``./results/``.

    For a file ``<name>.json`` the verdicts are written to
    ``<OUTPUT_EVAL_DIR>/<name>/eval_results.json``.
    """
    input_dir = './results/'
    output_dir = OUTPUT_EVAL_DIR
    suffix = ".json"

    # Sorted so that files are processed in the same order on every run.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(suffix):
            continue
        input_path = os.path.join(input_dir, file_name)
        # Remove only the trailing extension; str.replace would also strip any
        # ".json" appearing in the middle of the file name.
        run_name = file_name[:-len(suffix)]
        output_subdir = os.path.join(output_dir, run_name)
        output_path = os.path.join(output_subdir, "eval_results.json")
        evaluate_predictions(input_path, output_path)


if __name__ == "__main__":
    main()
