In [None]:
# Mount Google Drive at /content/drive so the dataset stored under MyDrive is
# reachable from this runtime (google.colab is only importable on Colab).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Sanity check: list the dataset directory on the mounted Drive.
!ls "/content/drive/MyDrive/Animal_Kingdom/action_recognition/dataset"

In [None]:
# Unpack the video archive from Drive into /content/videos on the runtime's disk.
# The next cell reads the clips from /content/videos/video.
!mkdir -p /content/videos
!tar -xvzf "/content/drive/MyDrive/Animal_Kingdom/action_recognition/dataset/video.tar.gz" -C /content/videos

In [None]:
import os

video_dir = "/content/videos/video"
# os.listdir returns entries in arbitrary order, so sort to make the preview
# (and any later slicing) reproducible. Extensions are matched
# case-insensitively so files such as "CLIP.MP4" are not silently skipped.
video_files = sorted(
    f for f in os.listdir(video_dir)
    if f.lower().endswith(('.mp4', '.avi', '.mov'))
)
print(f"Found {len(video_files)} video files.")
print(video_files[:5])  # show a few

In [None]:
import pandas as pd

# Load the per-video annotation spreadsheet and preview its first rows.
# NOTE(review): openpyxl is only pip-installed in the following cell; on a fresh
# runtime this read may need that install to run first -- confirm.
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
df = pd.read_excel(metadata_path)
print(df.head())

In [None]:
# Hugging Face Transformers: provides the BLIP captioning classes imported below
!pip install transformers

# PyTorch runtime used for model loading and inference
!pip install torch torchvision torchaudio

# pandas plus openpyxl for reading the .xlsx metadata file
!pip install openpyxl pandas

# OpenCV for frame extraction, Pillow for loading frames as images
!pip install opencv-python pillow

Pipeline execution

In [None]:
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP base captioning checkpoint and its matching pre-processor;
# the model is moved to `device` once here so caption_image can reuse it.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def caption_image(image_path):
    """Return the BLIP caption for the image stored at ``image_path``.

    The image is opened with PIL, converted to RGB, pre-processed by the
    module-level ``processor``, moved to ``device`` and passed to
    ``blip_model.generate``; the first generated sequence is decoded to text.
    """
    picture = Image.open(image_path)
    picture = picture.convert('RGB')
    batch = processor(picture, return_tensors="pt")
    batch = batch.to(device)
    generated = blip_model.generate(**batch)
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)


In [None]:
import cv2
import os

def extract_frames_1fps(video_path, output_folder):
    """Save roughly one frame per second of ``video_path`` as JPEG files.

    Frames are written to ``output_folder`` as ``frame_0.jpg``, ``frame_1.jpg``,
    ... and a one-line summary is printed. Nothing is returned.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for the extracted frames; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Drop frames left by an earlier extraction into this folder. Otherwise a
    # shorter video, or a run that used another naming scheme or sampling rate,
    # leaves stale images that the captioning step would pick up.
    for leftover in os.listdir(output_folder):
        leftover_path = os.path.join(output_folder, leftover)
        if leftover.startswith("frame_") and leftover.endswith(".jpg") and os.path.isfile(leftover_path):
            os.remove(leftover_path)

    vidcap = cv2.VideoCapture(video_path)
    count, saved = 0, 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)  # Get video FPS
        # Keep every int(fps)-th frame. Clamp to 1 so an FPS below 1 (including
        # a reported 0) cannot make the modulo below divide by zero.
        interval = int(fps) if fps >= 1 else 1

        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{saved}.jpg")
                cv2.imwrite(frame_path, image)
                saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        vidcap.release()
    print(f"Extracted {saved} frames (1 fps) from {video_path}")


In [None]:
def generate_video_captions(frames_folder):
    """Caption every ``.jpg`` frame in ``frames_folder`` in temporal order.

    Args:
        frames_folder: Directory holding the extracted ``frame_<n>.jpg`` files.

    Returns:
        A list with one caption string per frame, ordered by frame number.
    """
    def frame_order(name):
        # Order by the number at the end of the file stem. Plain string sorting
        # would put "frame_10.jpg" before "frame_2.jpg" and scramble the timeline.
        stem = name[:-len(".jpg")]
        digits = stem[len(stem.rstrip("0123456789")):]
        return (int(digits) if digits else -1, name)

    frame_names = [name for name in os.listdir(frames_folder) if name.endswith(".jpg")]
    return [
        caption_image(os.path.join(frames_folder, name))
        for name in sorted(frame_names, key=frame_order)
    ]


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Same device selection as the BLIP cell; repeated so this cell runs on its own.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the BART summarization checkpoint (facebook/bart-large-cnn) and its
# tokenizer; both are used by the summarization function defined next.
tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)


In [None]:
def summarize_captions_bart_single(captions):
    """Condense per-frame captions into a single sentence with BART.

    Args:
        captions: Iterable of caption strings, in frame order.

    Returns:
        The first sentence of the generated summary, terminated by ".".
    """
    # facebook/bart-large-cnn is a summarization checkpoint, not an
    # instruction-following model. An instruction prefix is treated as part of
    # the text to summarize and tends to be echoed into the output, so only the
    # captions themselves are fed in.
    text_input = " ".join(captions)

    inputs = tokenizer_bart([text_input], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model_bart.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # pass the mask explicitly alongside the ids
        max_length=25,   # restrict length
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    result = tokenizer_bart.decode(summary_ids[0], skip_special_tokens=True)

    # Ensure single sentence
    return result.split(".")[0].strip() + "."

In [None]:
# End-to-end run on one clip: frames -> per-frame BLIP captions -> one BART sentence.
video_path = "/content/videos/video/LKBDONQN.mp4"
# NOTE(review): frames go straight into /content/frames here, while later cells
# use one sub-folder per video under the same root.
frames_folder = "/content/frames"

# Step 1: Extract frames
extract_frames_1fps(video_path, frames_folder)

# Step 2: Generate captions
frame_captions = generate_video_captions(frames_folder)
print("Frame Captions:", frame_captions)

# Step 3: Summarize with BART (single sentence)
final_caption = summarize_captions_bart_single(frame_captions)
print("Final Video Caption:", final_caption)


In [None]:
import pandas as pd

# Load metadata Excel
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
df = pd.read_excel(metadata_path)
# Index the rows by video_id for direct lookup; each value is the full row.
# If a video_id occurs more than once, the last row wins.
metadata_dict = {row['video_id']: row for _, row in df.iterrows()}


In [None]:
# Example final caption from your BLIP + BART pipeline
final_caption = "Generate one concise descriptive sentence: a small bird perched on a branch of a tree"
video_id = "LKBDONQN"

# Fetch metadata for this video
meta = metadata_dict.get(video_id, {})


def _parse_list_cell(value):
    """Return a metadata cell as a list.

    A cell may already be a list/tuple or may hold a stringified Python literal
    such as "['bird']" (presumably how the spreadsheet stores lists -- verify).
    Missing cells (NaN/None) give []; a string that is not a literal is kept as
    a single item.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in the spreadsheet.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return [value.strip()] if value.strip() else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# Extract animals and actions keywords. The animal cell is parsed as well:
# iterating a raw string would yield single characters, not animal names.
animals = [str(a) for a in _parse_list_cell(meta.get("list_animal", []))]

# Parse actions if stored as a string representation of list of tuples
actions_raw = meta.get("list_animal_action", "")
actions = [
    str(pair[1])
    for pair in _parse_list_cell(actions_raw)
    if isinstance(pair, (list, tuple)) and len(pair) == 2  # (animal, action) pairs only
]

keywords = [a.lower() for a in animals] + [a.lower() for a in actions]

# Check semantic correctness
def semantic_correctness_score(caption, keywords):
    """Score a caption by the fraction of keywords it contains.

    Matching is a case-insensitive substring test against the caption.

    Returns:
        (score, matched, missing): score is matched/total, or 0 when there are
        no keywords; the two lists keep the keywords in their input order.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for kw in keywords:
        bucket = matched if kw.lower() in haystack else missing
        bucket.append(kw)
    if len(keywords) > 0:
        score = len(matched) / len(keywords)
    else:
        score = 0
    return score, matched, missing

# Score the example caption against the metadata keywords and print a report.
score, matched, missing = semantic_correctness_score(final_caption, keywords)
percentage = score * 100  # score is a 0-1 fraction

print(f"Video ID: {video_id}")
print(f"Final Caption: {final_caption}")
print(f"Keywords: {keywords}")
print(f"Matched Keywords: {matched}")
print(f"Missing Keywords: {missing}")
print(f"Semantic Correctness Score: {score:.2f} ({percentage:.1f}%)")


In [None]:
# Install the packages for the consolidated pipeline below.
# NOTE(review): timm, fairscale and accelerate are not imported in this cell -- confirm they are needed.
!pip install transformers timm fairscale accelerate opencv-python pandas

import os
import cv2
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import pandas as pd

# Use the GPU when available; every model below is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP Setup
# Same checkpoint as the earlier cell, but the processor is bound to the name
# `blip_processor` here (the earlier cell used `processor`).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def caption_image(image_path):
    """Caption a single image file with BLIP and return the caption text.

    Uses the module-level ``blip_processor``, ``blip_model`` and ``device``.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    encoded = blip_processor(rgb_image, return_tensors="pt")
    encoded = encoded.to(device)
    token_ids = blip_model.generate(**encoded)
    # Only the first generated sequence is decoded.
    return blip_processor.decode(token_ids[0], skip_special_tokens=True)

# BART Setup
# facebook/bart-large-cnn checkpoint and tokenizer, moved to `device`;
# used by summarize_captions_bart below.
tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

def summarize_captions_bart(captions):
    """Summarize per-frame captions into one sentence with BART.

    Args:
        captions: Iterable of caption strings, in frame order.

    Returns:
        The first sentence of the generated summary, terminated by ".".
    """
    # bart-large-cnn is a summarizer, not an instruction-following model. An
    # instruction prefix becomes part of the text being summarized and tends to
    # be echoed into the result, so the captions are passed on their own.
    text_input = " ".join(captions)
    inputs = tokenizer_bart([text_input], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model_bart.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # pass the mask explicitly alongside the ids
        max_length=25,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer_bart.decode(summary_ids[0], skip_special_tokens=True)
    # Keep only the first sentence.
    return summary.split(".")[0].strip() + "."


In [None]:
def extract_frames_1fps(video_path, output_folder):
    """Save roughly one frame per second of ``video_path`` as JPEG files.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for ``frame_<n>.jpg`` files; created if missing.

    Returns:
        The number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Clear frames from a previous extraction so the folder only ever holds the
    # frames of this run (stale files would otherwise be captioned too).
    for leftover in os.listdir(output_folder):
        leftover_path = os.path.join(output_folder, leftover)
        if leftover.startswith("frame_") and leftover.endswith(".jpg") and os.path.isfile(leftover_path):
            os.remove(leftover_path)

    vidcap = cv2.VideoCapture(video_path)
    count, saved = 0, 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # 1 frame per second. `fps > 0` alone is not enough: int(0.5) == 0 would
        # still make the modulo below divide by zero, so require fps >= 1.
        interval = int(fps) if fps >= 1 else 1
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{saved}.jpg")
                cv2.imwrite(frame_path, image)
                saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        vidcap.release()  # release even if reading or writing raised
    return saved


In [None]:
def generate_video_captions(frames_folder):
    """Return BLIP captions for every ``.jpg`` in ``frames_folder``, in frame order.

    Args:
        frames_folder: Directory holding the extracted ``frame_<n>.jpg`` files.

    Returns:
        A list of caption strings ordered by the frame number in each file name.
    """
    def numeric_suffix(name):
        # Frame files are unpadded ("frame_2.jpg", "frame_10.jpg"); sort on the
        # trailing number so the captions follow the video timeline instead of
        # lexicographic order.
        stem = name[:-len(".jpg")]
        digits = stem[len(stem.rstrip("0123456789")):]
        return (int(digits) if digits else -1, name)

    captions = []
    jpg_names = [name for name in os.listdir(frames_folder) if name.endswith(".jpg")]
    for frame in sorted(jpg_names, key=numeric_suffix):
        captions.append(caption_image(os.path.join(frames_folder, frame)))
    return captions


In [None]:
def semantic_correctness_score(caption, keywords):
    """Return (score, matched, missing) for ``keywords`` against ``caption``.

    A keyword counts as matched when its lower-cased form is a substring of the
    lower-cased caption. ``score`` is the matched fraction, or 0 when no
    keywords are given.
    """
    text = caption.lower()
    hits = [kw.lower() in text for kw in keywords]
    matched = [kw for kw, hit in zip(keywords, hits) if hit]
    missing = [kw for kw, hit in zip(keywords, hits) if not hit]
    total = len(keywords)
    score = len(matched) / total if total > 0 else 0
    return score, matched, missing


BLIP+BART

In [None]:
# ================================
# 1. Install dependencies
# ================================
# NOTE(review): same install line as the earlier pipeline cell; timm, fairscale
# and accelerate are not imported in this cell -- confirm they are needed.
!pip install transformers timm fairscale accelerate opencv-python pandas

# ================================
# 2. Import Libraries
# ================================
import os
import cv2
import json
import torch
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BartTokenizer, BartForConditionalGeneration

# ================================
# 3. Device Setup
# ================================
# Prefer the GPU; the choice is printed so the run log shows where inference ran.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ================================
# 4. BLIP Setup
# ================================
# Re-loads the BLIP base captioning checkpoint (an earlier cell already did this),
# rebinding `blip_processor` and `blip_model`.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def caption_image(image_path):
    """Generate a BLIP caption for the image at ``image_path``.

    Relies on the module-level ``blip_processor``, ``blip_model`` and ``device``.
    Returns the decoded text of the first generated sequence.
    """
    source = Image.open(image_path)
    model_inputs = blip_processor(source.convert('RGB'), return_tensors="pt").to(device)
    sequences = blip_model.generate(**model_inputs)
    caption_text = blip_processor.decode(sequences[0], skip_special_tokens=True)
    return caption_text

# ================================
# 5. BART Setup
# ================================
# Summarization model and tokenizer (facebook/bart-large-cnn), placed on `device`.
tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

def summarize_captions_bart(captions):
    """Collapse per-frame captions into one sentence using BART.

    Args:
        captions: Iterable of caption strings, in frame order.

    Returns:
        The first sentence of the generated summary, terminated by ".".
    """
    # No instruction prefix: bart-large-cnn is a summarizer rather than an
    # instruction-following model, so a prefix is summarized along with the
    # captions and tends to leak into the output.
    text_input = " ".join(captions)
    inputs = tokenizer_bart([text_input], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model_bart.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # pass the mask explicitly alongside the ids
        max_length=25,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    decoded = tokenizer_bart.decode(summary_ids[0], skip_special_tokens=True)
    # Only the first sentence is kept.
    return decoded.split(".")[0].strip() + "."

# ================================
# 6. Frame Extraction (1 FPS)
# ================================
def extract_frames_1fps(video_path, output_folder):
    """Write about one frame per second of ``video_path`` into ``output_folder``.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for ``frame_<n>.jpg`` files; created if missing.

    Returns:
        The number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Start from a clean folder: frames from an earlier run (other video length,
    # naming scheme or sampling rate) would otherwise be captioned as well.
    for old_name in os.listdir(output_folder):
        old_path = os.path.join(output_folder, old_name)
        if old_name.startswith("frame_") and old_name.endswith(".jpg") and os.path.isfile(old_path):
            os.remove(old_path)

    vidcap = cv2.VideoCapture(video_path)
    count, saved = 0, 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Require fps >= 1: with 0 < fps < 1 the old `fps > 0` guard still
        # produced int(fps) == 0 and a ZeroDivisionError in the modulo below.
        interval = int(fps) if fps >= 1 else 1
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{saved}.jpg")
                cv2.imwrite(frame_path, image)
                saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        vidcap.release()  # always free the capture handle
    return saved

# ================================
# 7. Generate Captions for Frames
# ================================
def generate_video_captions(frames_folder):
    """Caption each extracted frame of one video, in temporal order.

    Args:
        frames_folder: Directory holding the extracted ``frame_<n>.jpg`` files.

    Returns:
        A list of caption strings, one per ``.jpg`` file, ordered by frame number.
    """
    def by_frame_number(name):
        # The frame index is the run of digits before ".jpg". Sorting on it
        # numerically keeps "frame_10.jpg" after "frame_9.jpg", which a plain
        # string sort of these unpadded names does not.
        stem = name[:-len(".jpg")]
        digits = stem[len(stem.rstrip("0123456789")):]
        return (int(digits) if digits else -1, name)

    ordered = sorted(
        (name for name in os.listdir(frames_folder) if name.endswith(".jpg")),
        key=by_frame_number,
    )
    return [caption_image(os.path.join(frames_folder, name)) for name in ordered]

# ================================
# 8. Semantic Correctness
# ================================
def semantic_correctness_score(caption, keywords):
    """Measure how many ``keywords`` appear in ``caption``.

    Comparison is a case-insensitive substring check.

    Returns:
        A ``(score, matched, missing)`` tuple; ``score`` is 0 for an empty
        keyword list, otherwise the matched fraction.
    """
    lowered = caption.lower()
    if len(keywords) == 0:
        return 0, [], []
    matched = []
    missing = []
    for kw in keywords:
        if kw.lower() in lowered:
            matched.append(kw)
        else:
            missing.append(kw)
    return len(matched) / len(keywords), matched, missing

# ================================
# 9. Load Metadata
# ================================
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
df = pd.read_excel(metadata_path)
# One entry per video_id; each value is the full spreadsheet row.
metadata_dict = {row['video_id']: row for _, row in df.iterrows()}


def _parse_list_cell(value):
    """Return a metadata cell as a list.

    A cell may already be a list/tuple or may hold a stringified Python literal
    such as "['bird']" (presumably how the spreadsheet stores lists -- verify).
    Missing cells (NaN/None) give []; a string that is not a literal is kept as
    a single item.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in the spreadsheet.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return [value.strip()] if value.strip() else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# ================================
# 10. Process Batch of 20 Videos
# ================================
video_folder = "/content/videos/video"
video_list = sorted(os.listdir(video_folder))[:20]

results = []

for video_name in video_list:
    video_id = os.path.splitext(video_name)[0]
    video_path = os.path.join(video_folder, video_name)
    frames_folder = f"/content/frames/{video_id}"

    print(f"\nProcessing video: {video_name}")

    # Step 1: Extract frames
    extracted = extract_frames_1fps(video_path, frames_folder)
    print(f"Extracted {extracted} frames (1 fps) from {video_name}")

    # Step 2: Generate BLIP captions
    frame_captions = generate_video_captions(frames_folder)
    print("Frame Captions:", frame_captions)

    # Step 3: Summarize with BART
    final_caption = summarize_captions_bart(frame_captions)
    print("Final Video Caption:", final_caption)

    # Step 4: Semantic correctness
    meta = metadata_dict.get(video_id, {})
    # Parse the animal cell too: iterating a raw string would turn every
    # character into a "keyword".
    animals = [str(a) for a in _parse_list_cell(meta.get("list_animal", []))]
    actions_raw = meta.get("list_animal_action", "")
    actions = [
        str(pair[1])
        for pair in _parse_list_cell(actions_raw)
        if isinstance(pair, (list, tuple)) and len(pair) == 2  # (animal, action) pairs only
    ]

    keywords = [a.lower() for a in animals] + [a.lower() for a in actions]
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 11. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/batch20_results.csv", index=False)
print("\nBatch processing complete! Results saved to /content/batch20_results.csv")


In [None]:
import pandas as pd
import json

# Load previously generated batch results
# (this CSV is written by the batch-processing cell above)
batch_results_path = "/content/batch20_results.csv"
batch_df = pd.read_csv(batch_results_path)

# Load metadata
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
metadata_df = pd.read_excel(metadata_path)
# Map video_id -> full metadata row for direct lookup in the loop below.
metadata_dict = {row['video_id']: row for _, row in metadata_df.iterrows()}

# Semantic correctness function (keyword-based)
def semantic_correctness_score(caption, keywords):
    """Score a caption by the fraction of keywords it contains.

    Matching is a case-insensitive substring test.

    Args:
        caption: Caption text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as an empty caption.
        keywords: List of keyword strings.

    Returns:
        (score, matched, missing): score is matched/total, or 0 when there are
        no keywords.
    """
    # Captions here come from a CSV round-trip; an empty cell is read back as a
    # float NaN, which has no .lower() and used to crash the evaluation loop.
    caption = caption.lower() if isinstance(caption, str) else ""
    matched = [kw for kw in keywords if kw.lower() in caption]
    missing = [kw for kw in keywords if kw.lower() not in caption]
    score = len(matched) / len(keywords) if len(keywords) > 0 else 0
    return score, matched, missing

def _parse_list_cell(value):
    """Return a metadata cell as a list.

    A cell may already be a list/tuple or may hold a stringified Python literal
    such as "['bird']" (presumably how the spreadsheet stores lists -- verify).
    Missing cells (NaN/None) give []; a string that is not a literal is kept as
    a single item.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in the spreadsheet.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return [value.strip()] if value.strip() else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# Process each video
semantic_results = []

for idx, row in batch_df.iterrows():
    video_id = row["video_id"]
    final_caption = row["final_caption"]

    # Get metadata for this video
    meta = metadata_dict.get(video_id, {})
    # Parse the animal cell too: iterating a raw string would turn every
    # character into a "keyword".
    animals = [str(a) for a in _parse_list_cell(meta.get("list_animal", []))]
    actions_raw = meta.get("list_animal_action", "")
    actions = [
        str(pair[1])
        for pair in _parse_list_cell(actions_raw)
        if isinstance(pair, (list, tuple)) and len(pair) == 2  # (animal, action) pairs only
    ]

    keywords = [a.lower() for a in animals] + [a.lower() for a in actions]

    # Compute correctness
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    semantic_results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# Convert to DataFrame and save
semantic_df = pd.DataFrame(semantic_results)
semantic_df.to_csv("/content/semantic_correctness_batch20.csv", index=False)

print("Semantic correctness evaluation complete!")
print(semantic_df[["video_id", "semantic_correctness_percent", "matched_keywords", "missing_keywords"]])


BLIP + T5

In [None]:
# Core installations for BLIP, image processing, and LLM summarization
# NOTE(review): the cells below load BLIP through `transformers`; whether the
# Salesforce BLIP repository install is still needed should be confirmed.
!pip install git+https://github.com/salesforce/BLIP.git
!pip install transformers
!pip install timm
!pip install opencv-python
!pip install pillow
!pip install nltk


In [None]:
import cv2
import os

def extract_frames_1fps(video_path, output_folder):
    """Save roughly one frame per second of ``video_path`` as JPEG files.

    Frames are written as zero-padded ``frame_000.jpg``, ``frame_001.jpg``, ...
    and a one-line summary is printed. Nothing is returned.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for the extracted frames; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Earlier cells write unpadded "frame_<n>.jpg" files into per-video folders
    # under the same root. Remove any leftover frames so each one is not
    # captioned twice by the listing that follows.
    for leftover in os.listdir(output_folder):
        leftover_path = os.path.join(output_folder, leftover)
        if leftover.startswith("frame_") and leftover.endswith(".jpg") and os.path.isfile(leftover_path):
            os.remove(leftover_path)

    vidcap = cv2.VideoCapture(video_path)
    count, saved = 0, 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # 1 frame per second; clamp to 1 so an FPS below 1 (including a
        # reported 0) cannot cause a division by zero in the modulo below.
        interval = int(fps) if fps >= 1 else 1

        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{saved:03d}.jpg")
                cv2.imwrite(frame_path, image)
                saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        vidcap.release()  # release even if reading or writing raised

    print(f" Extracted {saved} frames (1 fps) from: {video_path}")

# Video path
# Single-clip run; frames go to a per-video folder under /content/frames.
video_path = "/content/videos/video/LKBDONQN.mp4"
frames_folder = "/content/frames/LKBDONQN"
extract_frames_1fps(video_path, frames_folder)


In [None]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Use the GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP base captioning checkpoint, bound to the generic names `processor` and
# `model`. NOTE: a later cell rebinds `model` to FLAN-T5.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def generate_caption(image_path):
    """Return a BLIP caption for the image at ``image_path``.

    Reads the module-level ``processor``, ``model`` and ``device`` at call time,
    so it must be called while ``model`` still refers to the BLIP captioner.
    """
    rgb = Image.open(image_path).convert('RGB')
    encoded = processor(rgb, return_tensors="pt")
    encoded = encoded.to(device)
    generated_ids = model.generate(**encoded)
    return processor.decode(generated_ids[0], skip_special_tokens=True)


In [None]:
import os

# Caption every extracted frame of the example clip.
frames_folder = "/content/frames/LKBDONQN"
captions = []  # filled with (frame file name, caption) tuples

# List frames in sorted order
# (string sort; the zero-padded names written by the extraction cell above
# sort correctly for three-digit frame numbers)
frame_files = sorted(f for f in os.listdir(frames_folder) if f.endswith(".jpg"))

for fname in frame_files:
    path = os.path.join(frames_folder, fname)
    caption = generate_caption(path)
    captions.append((fname, caption))
    print(f"{fname}: {caption}")


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load FLAN-T5
# NOTE: this rebinds the global `model`, which generate_caption (defined in an
# earlier cell) reads at call time. After this cell runs, re-running the
# captioning cell would send image inputs to FLAN-T5 instead of BLIP.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)

def summarize_captions(captions):
    """Merge per-frame captions into one video-level caption with FLAN-T5.

    Args:
        captions: Sequence of ``(frame_name, caption)`` pairs; only the caption
            text of each pair is used.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Join the caption strings, dropping the frame names.
    combined_text = " ".join(text for _, text in captions)

    # Instruction asking the model to keep details from every frame.
    instruction = (
        "From the following frame captions, write one caption that combines all the information. "
        "Do not lose any specific detail such as color, position, or object type. Be precise and explicit: "
    )
    prompt = instruction + combined_text

    # The prompt is truncated to 512 tokens before generation.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    generation_args = dict(max_length=60, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    generated = model.generate(**encoded, **generation_args)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage (assuming 'captions' is already defined)
# `captions` must be the list of (frame name, caption) tuples built by the
# frame-captioning cell above.
final_caption = summarize_captions(captions)
print("\n Final Video-Level Caption:\n", final_caption)


In [None]:
import os, cv2, torch
from PIL import Image
import pandas as pd
from transformers import Blip2Processor, Blip2ForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

# Device for FLAN-T5; BLIP-2 placement is delegated to device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# === Load BLIP-2 model (OPT 2.7B) ===
# Loaded with float16 weights; generate_caption casts its inputs to
# torch.float16 to match.
blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=torch.float16
)

# === Load FLAN-T5 ===
# Bound to t5_* names, so the BLIP-2 objects above are not overwritten.
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)

# === Frame Extraction ===
def extract_frames_1fps(video_path, output_folder):
    """Save roughly one frame per second of ``video_path`` as zero-padded JPEGs.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for ``frame_000.jpg``, ``frame_001.jpg``, ...;
            created if missing.

    Returns:
        The number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The per-video folders are shared with earlier cells that wrote unpadded
    # "frame_<n>.jpg" files. Remove old frames so the captioning loop does not
    # process each frame twice.
    for leftover in os.listdir(output_folder):
        leftover_path = os.path.join(output_folder, leftover)
        if leftover.startswith("frame_") and leftover.endswith(".jpg") and os.path.isfile(leftover_path):
            os.remove(leftover_path)

    vidcap = cv2.VideoCapture(video_path)
    count = saved = 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Clamp to 1: an FPS below 1 (including a reported 0) would otherwise
        # give interval 0 and a ZeroDivisionError in the modulo below.
        interval = int(fps) if fps >= 1 else 1
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                fpath = os.path.join(output_folder, f"frame_{saved:03d}.jpg")
                cv2.imwrite(fpath, image)
                saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        vidcap.release()  # release even if reading or writing raised
    return saved

# === Caption with BLIP-2 ===
def generate_caption(image_path):
    """Caption one image file with BLIP-2 and return the caption text.

    Inputs are moved to ``blip_model.device`` and cast to ``torch.float16``;
    generation is capped at 50 new tokens.
    """
    rgb = Image.open(image_path).convert('RGB')
    encoded = blip_processor(images=rgb, return_tensors="pt")
    encoded = encoded.to(blip_model.device, torch.float16)
    generated = blip_model.generate(**encoded, max_new_tokens=50)
    decoded = blip_processor.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

# === Summarize with FLAN-T5 ===
def summarize_captions(captions_list):
    """Fuse frame captions into one descriptive caption with FLAN-T5.

    Args:
        captions_list: List of caption strings, in frame order.

    Returns:
        The decoded text of the first generated sequence (10 to 30 tokens).
    """
    combined = " ".join(captions_list)
    prompt = (
        "Generate a single, highly descriptive and detailed caption from the following frame captions. "
        f"Retain unique information like color, size, and position: {combined}"
    )
    encoded = t5_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)
    generated = t5_model.generate(
        **encoded,
        max_length=30,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# === MAIN PROCESSING ===
# Caption the first 20 .mp4 clips (alphabetical order) frame by frame, then
# summarize each clip's frame captions into one video-level caption.
video_dir = "/content/videos/video/"
frames_root = "/content/frames/"
frame_caption_data = []  # rows of [video_id, frame file name, frame caption]
video_caption_data = []  # rows of [video_id, video-level caption]

video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    # One frame folder per video, under the same root the earlier cells used.
    frame_folder = os.path.join(frames_root, video_id)

    print(f"\n Processing: {video_file}")
    n_frames = extract_frames_1fps(video_path, frame_folder)
    print(f"  → Extracted {n_frames} frames.")

    frame_captions = []
    # Every .jpg in the folder is captioned, including files left by earlier runs.
    for fname in sorted(os.listdir(frame_folder)):
        if fname.endswith(".jpg"):
            fpath = os.path.join(frame_folder, fname)
            caption = generate_caption(fpath)
            frame_captions.append(caption)
            frame_caption_data.append([video_id, fname, caption])
            print(f"    {fname}: {caption}")

    final_caption = summarize_captions(frame_captions)
    video_caption_data.append([video_id, final_caption])
    print(f" Final Caption: {final_caption}")

# === SAVE OUTPUT ===
# Two CSVs: one row per frame, and one row per video.
frame_df = pd.DataFrame(frame_caption_data, columns=["video_id", "frame", "frame_caption"])
frame_df.to_csv("/content/frame_level_captions.csv", index=False)

video_df = pd.DataFrame(video_caption_data, columns=["video_id", "video_caption"])
video_df.to_csv("/content/video_level_captions.csv", index=False)

print("\n Saved:")
print(" - /content/frame_level_captions.csv")
print(" - /content/video_level_captions.csv")


In [None]:
from google.colab import files

# Download both caption CSVs written by the previous cell through the
# google.colab `files` helper.
files.download("/content/frame_level_captions.csv")
files.download("/content/video_level_captions.csv")


In [None]:
import pandas as pd

# === Load Outputs ===
# Video-level captions written by the BLIP-2 + FLAN-T5 cell above.
video_df = pd.read_csv("/content/video_level_captions.csv")

# === Load Metadata ===
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
metadata_df = pd.read_excel(metadata_path)

# Convert metadata to dictionary for quick lookup
# (video_id -> full row; the last row wins if a video_id repeats)
metadata_dict = {row['video_id']: row for _, row in metadata_df.iterrows()}

# === Semantic Matching Function ===
def semantic_correctness_score(caption, keywords):
    """Return (score, matched, missing) for ``keywords`` against ``caption``.

    A keyword matches when its lower-cased form is a substring of the
    lower-cased caption.

    Args:
        caption: Caption text. Non-string values (such as the NaN pandas yields
            for an empty CSV cell) are treated as an empty caption.
        keywords: List of keyword strings.

    Returns:
        score is the matched fraction, or 0 when ``keywords`` is empty.
    """
    # An empty generated caption is read back from CSV as a float NaN; calling
    # .lower() on it raised AttributeError and aborted the whole evaluation.
    caption = caption.lower() if isinstance(caption, str) else ""
    matched, missing = [], []
    for kw in keywords:
        (matched if kw.lower() in caption else missing).append(kw)
    score = len(matched) / len(keywords) if len(keywords) > 0 else 0
    return score, matched, missing

def _parse_list_cell(value):
    """Return a metadata cell as a list.

    A cell may already be a list/tuple or may hold a stringified Python literal
    such as "['bird']" (presumably how the spreadsheet stores lists -- verify).
    Missing cells (NaN/None) give []; a string that is not a literal is kept as
    a single item.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in the spreadsheet.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return [value.strip()] if value.strip() else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# === Evaluate Each Video ===
semantic_results = []

for idx, row in video_df.iterrows():
    video_id = row["video_id"]
    final_caption = row["video_caption"]

    # Get metadata
    meta = metadata_dict.get(video_id, {})
    # Parse the animal cell too: iterating a raw string would turn every
    # character into a "keyword".
    animals = [str(a) for a in _parse_list_cell(meta.get("list_animal", []))]
    actions_raw = meta.get("list_animal_action", "")

    # Parse list_animal_action (a stringified list of tuples)
    actions = [
        str(pair[1])
        for pair in _parse_list_cell(actions_raw)
        if isinstance(pair, (list, tuple)) and len(pair) == 2  # (animal, action) pairs only
    ]

    # Combine animal and action keywords
    keywords = [a.lower() for a in animals] + [a.lower() for a in actions]

    # Score the caption
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    semantic_results.append({
        "video_id": video_id,
        "video_caption": final_caption,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# === Save and View Results ===
semantic_df = pd.DataFrame(semantic_results)
semantic_df.to_csv("/content/semantic_correctness_blip_flan.csv", index=False)

print("✅ Semantic correctness evaluation complete!\n")
print(semantic_df[["video_id", "semantic_correctness_percent", "matched_keywords", "missing_keywords"]])


BLIP2+BART

In [None]:
# ================================
# 1. Install Dependencies
# ================================
# !pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Device used for BART below; BLIP-2 placement is handled by device_map="auto".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# float16 weights; generate_caption casts its inputs to torch.float16 to match.
blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=torch.float16
)

def generate_caption(image_path):
    """Return the BLIP-2 caption for the image at ``image_path``.

    The processed inputs are placed on ``blip_model.device`` as float16 and at
    most 50 new tokens are generated; the first decoded string is returned.
    """
    frame = Image.open(image_path)
    frame = frame.convert('RGB')
    model_inputs = blip_processor(images=frame, return_tensors="pt").to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    texts = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return texts[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# facebook/bart-large-cnn summarizer and tokenizer, bound to bart_* names
# (earlier cells used tokenizer_bart / model_bart) and moved to `device`.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

def summarize_captions_bart(captions):
    """Summarize BLIP-2 frame captions into one sentence with BART.

    Args:
        captions: Iterable of caption strings, in frame order.

    Returns:
        The first sentence of the generated summary, terminated by ".".
    """
    # bart-large-cnn is a summarization checkpoint, not an instruction-following
    # model. An instruction prefix is summarized along with the captions and
    # tends to be copied into the output, so only the captions are passed.
    text_input = " ".join(captions)
    inputs = bart_tokenizer([text_input], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly alongside the ids
        max_length=25,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Keep only the first sentence.
    return summary.split(".")[0].strip() + "."

# ================================
# 6. Frame Extraction
# ================================
def extract_frames_1fps(video_path, output_folder):
    """Write about one frame per second of ``video_path`` as zero-padded JPEGs.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for ``frame_000.jpg``, ``frame_001.jpg``, ...;
            created if missing.

    Returns:
        The number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Remove frames left in this folder by earlier runs (unpadded names, other
    # sampling rates, longer videos); the caller captions every .jpg it finds.
    for leftover in os.listdir(output_folder):
        leftover_path = os.path.join(output_folder, leftover)
        if leftover.startswith("frame_") and leftover.endswith(".jpg") and os.path.isfile(leftover_path):
            os.remove(leftover_path)

    cap = cv2.VideoCapture(video_path)
    count = saved = 0
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Require fps >= 1: `fps > 0` still allowed int(fps) == 0 for
        # fractional rates, which made the modulo below divide by zero.
        interval = int(fps) if fps >= 1 else 1
        success, image = cap.read()
        while success:
            if count % interval == 0:
                fpath = os.path.join(output_folder, f"frame_{saved:03d}.jpg")
                cv2.imwrite(fpath, image)
                saved += 1
            success, image = cap.read()
            count += 1
    finally:
        cap.release()  # release even if reading or writing raised
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Compare a caption against metadata keywords.

    Each keyword is looked up as a case-insensitive substring of the caption.

    Returns:
        ``(score, matched, missing)`` where ``score`` is the matched fraction,
        or 0 when ``keywords`` is empty.
    """
    lowered = caption.lower()

    def present(kw):
        return kw.lower() in lowered

    matched = list(filter(present, keywords))
    missing = [kw for kw in keywords if not present(kw)]
    if keywords:
        score = len(matched) / len(keywords)
    else:
        score = 0
    return score, matched, missing

# ================================
# 8. Load Metadata
# ================================
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
# One entry per video_id; each value is the full spreadsheet row.
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}


def _parse_list_cell(value):
    """Return a metadata cell as a list.

    A cell may already be a list/tuple or may hold a stringified Python literal
    such as "['bird']" (presumably how the spreadsheet stores lists -- verify).
    Missing cells (NaN/None) give []; a string that is not a literal is kept as
    a single item.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        # literal_eval only accepts Python literals; unlike eval() it cannot
        # execute code embedded in the spreadsheet.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return [value.strip()] if value.strip() else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_1fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching
    meta = metadata_dict.get(video_id, {})
    # Parse the animal cell too: iterating a raw string would turn every
    # character into a "keyword".
    animals = [str(a) for a in _parse_list_cell(meta.get("list_animal", []))]
    actions_raw = meta.get("list_animal_action", "")
    actions = [
        str(pair[1])
        for pair in _parse_list_cell(actions_raw)
        if isinstance(pair, (list, tuple)) and len(pair) == 2  # (animal, action) pairs only
    ]

    keywords = [a.lower() for a in animals] + [a.lower() for a in actions]
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv", index=False)

print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv")


In [None]:
import pandas as pd

# Load results
results_df = pd.read_csv("/content/blip2_bart_results.csv")

# Display key columns
print(results_df[["video_id", "semantic_correctness_percent", "matched_keywords", "missing_keywords"]])


4fps(BLIP2+BART)

In [None]:
# ================================
# 1. Install Dependencies
# ================================
# !pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Device used for BART below; BLIP-2 placement is handled by device_map="auto".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# Same BLIP-2 checkpoint and float16 settings as the 1 fps BLIP2+BART cell;
# loading it again rebinds `blip_processor` and `blip_model`.
blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=torch.float16
)

def generate_caption(image_path):
    """Caption one image file with BLIP-2 and return the decoded text."""
    with Image.open(image_path) as img:
        rgb = img.convert('RGB')
    model_inputs = blip_processor(images=rgb, return_tensors="pt")
    model_inputs = model_inputs.to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    decoded = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# Tokenizer and summarization model share one checkpoint name.
BART_CHECKPOINT = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_model = bart_model.to(device)

def summarize_captions_bart(captions):
    """Condense per-frame captions into one sentence with BART.

    Args:
        captions: list of caption strings, one per extracted frame.

    Returns:
        The first sentence of the generated summary, terminated with ".",
        or "" when there is no caption text to summarize.
    """
    # Without this guard a video that yielded no usable captions would make
    # BART summarize the instruction text alone and return an invented caption.
    if not any(c and c.strip() for c in captions):
        return ""
    text_input = " ".join(captions)
    prompt = f"Generate one concise descriptive sentence: {text_input}"
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=25,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Keep only the first sentence and restore its full stop.
    return summary.split(".")[0].strip() + "."

# ================================
# 6. Frame Extraction
# ================================
def extract_frames_4fps(video_path, output_folder):
    """Save roughly four frames per second of a video as JPEG files.

    Frames are written into output_folder as frame_000.jpg, frame_001.jpg, ...
    Returns the number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    # Keep every `step`-th frame; when the reported rate is below 4 keep all.
    if fps >= 4:
        step = int(fps / 4)
    else:
        step = 1
    saved = 0
    frame_index = 0
    while True:
        success, image = vidcap.read()
        if not success:
            break
        if frame_index % step == 0:
            cv2.imwrite(os.path.join(output_folder, f"frame_{saved:03d}.jpg"), image)
            saved += 1
        frame_index += 1
    vidcap.release()
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Score a caption by the share of keywords found in it.

    Matching is a case-insensitive substring test. Returns
    (score, matched, missing); score is 0 when keywords is empty.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for kw in keywords:
        bucket = matched if kw.lower() in haystack else missing
        bucket.append(kw)
    if not keywords:
        return 0, matched, missing
    return len(matched) / len(keywords), matched, missing

# ================================
# 8. Load Metadata
# ================================
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}


def _literal_list(raw):
    """Return the list encoded by a metadata cell, or None if it is not a list literal."""
    import ast  # local import: only this helper needs it

    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        return None  # e.g. a missing cell, which is not text
    try:
        # literal_eval accepts Python literals only, so unlike eval() a
        # malformed cell cannot execute code.
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


def _metadata_keywords(meta):
    """Build the lower-cased animal and action keywords for one metadata row.

    Iterating a text cell directly would yield single characters, so both
    columns are parsed into lists first.
    NOTE(review): assumes list_animal holds a list literal such as
    "['Bird']" or comma-separated names - confirm against the spreadsheet.
    """
    animals_raw = meta.get("list_animal", [])
    animals = _literal_list(animals_raw)
    if animals is None:
        # Plain text cell: treat it as comma-separated names.
        if isinstance(animals_raw, str):
            animals = [part.strip() for part in animals_raw.split(",") if part.strip()]
        else:
            animals = []
    # list_animal_action holds (animal, action) pairs; keep the action part.
    pairs = _literal_list(meta.get("list_animal_action", "")) or []
    actions = [pair[1] for pair in pairs if isinstance(pair, (list, tuple)) and len(pair) >= 2]
    return [str(a).lower() for a in animals] + [str(a).lower() for a in actions]


# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_4fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching (videos without a metadata row get no keywords)
    meta = metadata_dict.get(video_id, {})
    keywords = _metadata_keywords(meta)
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv", index=False)

print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv")


In [None]:
import pandas as pd

# Reload the evaluation table that the pipeline cell wrote to disk.
results_df = pd.read_csv("/content/blip2_bart_results.csv")

# Print only the per-video score and the keyword match breakdown.
key_columns = ["video_id", "semantic_correctness_percent", "matched_keywords", "missing_keywords"]
print(results_df.loc[:, key_columns])


4 fps (BLIP-2 + BART) with better prompting

In [None]:
# ================================
# 1. Install Dependencies
# ================================
!pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# Processor and model come from the same checkpoint; weights are requested in
# float16 with device_map="auto".
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
blip_processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_caption(image_path):
    """Caption one image file with BLIP-2 and return the decoded text."""
    with Image.open(image_path) as img:
        rgb = img.convert('RGB')
    model_inputs = blip_processor(images=rgb, return_tensors="pt")
    model_inputs = model_inputs.to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    decoded = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# Tokenizer and summarization model share one checkpoint name.
BART_CHECKPOINT = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_model = bart_model.to(device)

def summarize_captions_bart(captions):
    """Summarize per-frame captions into one description with BART.

    Args:
        captions: list of caption strings, one per extracted frame.

    Returns:
        The stripped generated summary, or "" when there is no caption text
        to summarize.
    """
    # Without this guard a video that yielded no usable captions would make
    # BART summarize the instruction text alone and return an invented caption.
    if not any(c and c.strip() for c in captions):
        return ""
    text_input = " ".join(captions)
    prompt = (
        "Summarize the following wildlife video scenes with high detail and precision. "
        "Retain unique animal behaviors, actions, and surroundings: " + text_input
    )
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=60,
        min_length=15,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()


# ================================
# 6. Frame Extraction
# ================================
def extract_frames_4fps(video_path, output_folder):
    """Save roughly four frames per second of a video as JPEG files.

    Frames are written into output_folder as frame_000.jpg, frame_001.jpg, ...
    Returns the number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    # Keep every `step`-th frame; when the reported rate is below 4 keep all.
    if fps >= 4:
        step = int(fps / 4)
    else:
        step = 1
    saved = 0
    frame_index = 0
    while True:
        success, image = vidcap.read()
        if not success:
            break
        if frame_index % step == 0:
            cv2.imwrite(os.path.join(output_folder, f"frame_{saved:03d}.jpg"), image)
            saved += 1
        frame_index += 1
    vidcap.release()
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Score a caption by the share of keywords found in it.

    Matching is a case-insensitive substring test. Returns
    (score, matched, missing); score is 0 when keywords is empty.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for kw in keywords:
        bucket = matched if kw.lower() in haystack else missing
        bucket.append(kw)
    if not keywords:
        return 0, matched, missing
    return len(matched) / len(keywords), matched, missing

# ================================
# 8. Load Metadata
# ================================
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}


def _literal_list(raw):
    """Return the list encoded by a metadata cell, or None if it is not a list literal."""
    import ast  # local import: only this helper needs it

    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        return None  # e.g. a missing cell, which is not text
    try:
        # literal_eval accepts Python literals only, so unlike eval() a
        # malformed cell cannot execute code.
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


def _metadata_keywords(meta):
    """Build the lower-cased animal and action keywords for one metadata row.

    Iterating a text cell directly would yield single characters, so both
    columns are parsed into lists first.
    NOTE(review): assumes list_animal holds a list literal such as
    "['Bird']" or comma-separated names - confirm against the spreadsheet.
    """
    animals_raw = meta.get("list_animal", [])
    animals = _literal_list(animals_raw)
    if animals is None:
        # Plain text cell: treat it as comma-separated names.
        if isinstance(animals_raw, str):
            animals = [part.strip() for part in animals_raw.split(",") if part.strip()]
        else:
            animals = []
    # list_animal_action holds (animal, action) pairs; keep the action part.
    pairs = _literal_list(meta.get("list_animal_action", "")) or []
    actions = [pair[1] for pair in pairs if isinstance(pair, (list, tuple)) and len(pair) >= 2]
    return [str(a).lower() for a in animals] + [str(a).lower() for a in actions]


# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_4fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching (videos without a metadata row get no keywords)
    meta = metadata_dict.get(video_id, {})
    keywords = _metadata_keywords(meta)
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv", index=False)

print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv")


In [None]:
import pandas as pd

# Reload the evaluation table that the pipeline cell wrote to disk.
results_df = pd.read_csv("/content/blip2_bart_results.csv")

# Print only the video id and its score.
key_columns = ["video_id", "semantic_correctness_percent"]
print(results_df.loc[:, key_columns])


A set of 1000 videos

In [None]:
# ================================
# 0) (Optional) Installs (Colab/Notebook)
# ================================
# If you already have these, you can skip this cell.
!pip install transformers timm accelerate opencv-python pandas pillow -q

# ================================
# 1) Imports
# ================================
import os, random, json, math, sys, gc, shutil, time
from pathlib import Path
import cv2
import torch
import pandas as pd
from PIL import Image
from typing import List, Tuple

# Detect Colab by importing its file-download helper; IN_COLAB records
# whether the import succeeded.
try:
    from google.colab import files as colab_files
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 2) Config
# ================================
# Input/output locations.
VIDEO_DIR = "/content/videos/video"   # folder scanned for *.mp4 files
FRAMES_ROOT = "/content/frames_4fps"  # one sub-folder of extracted frames per video
METADATA_XLSX = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"  # source of the scoring keywords

TOTAL_VIDEOS = 1000                  # upper bound on how many videos are sampled
BATCH_SIZE = 100                     # a batch CSV is saved (and downloaded) every N videos
MAX_NEW_TOKENS_CAPTION = 50          # max_new_tokens passed to BLIP-2 generate()
SUMMARY_MAX_LEN = 60                 # max_length passed to BART generate()
SUMMARY_MIN_LEN = 15                 # min_length passed to BART generate()
SEED = 42                            # seeds random and torch before sampling

# When False, each video's frame folder is deleted once the video is processed.
# Set to True to keep the extracted frames for debugging.
KEEP_FRAMES = False

# ================================
# 3) Reproducibility
# ================================
# Seed Python's and PyTorch's RNGs so the shuffled video sample is repeatable.
random.seed(SEED)
torch.manual_seed(SEED)

# ================================
# 4) Device
# ================================
_has_cuda = torch.cuda.is_available()
device = "cuda" if _has_cuda else "cpu"
print(f"Using device: {device}")

# ================================
# 5) Load Models
# ================================
# BLIP-2 produces the per-frame captions.
blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# float16 on CUDA, float32 on CPU.
torch_dtype = torch.float16 if _has_cuda else torch.float32
if _has_cuda:
    blip_model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        device_map="auto",
        torch_dtype=torch_dtype,
    )
else:
    # No device map on CPU; move the model explicitly.
    blip_model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        device_map=None,
        torch_dtype=torch_dtype,
    ).to(device)

# BART condenses the frame captions into one summary.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
bart_model = bart_model.to(device)

# ================================
# 6) Helper Functions
# ================================
def safe_makedirs(path: str):
    """Create `path` and any missing parent directories; do nothing if it already exists."""
    os.makedirs(path, exist_ok=True)

def extract_frames_4fps(video_path: str, out_dir: str) -> int:
    """Write about four frames per second of a video into out_dir as JPEGs.

    Files are named frame_0000.jpg, frame_0001.jpg, ... The return value is
    the number of frames written.
    """
    safe_makedirs(out_dir)
    capture = cv2.VideoCapture(video_path)
    fps = capture.get(cv2.CAP_PROP_FPS)
    if not fps or math.isnan(fps) or fps <= 0:
        # The container reported no usable frame rate; assume 25 fps.
        fps = 25.0
    step = max(int(round(fps / 4.0)), 1)
    written = 0
    position = 0
    while True:
        ok, frame = capture.read()
        if not ok:
            break
        if position % step == 0:
            cv2.imwrite(os.path.join(out_dir, f"frame_{written:04d}.jpg"), frame)
            written += 1
        position += 1
    capture.release()
    return written

def generate_caption(image_path: str) -> str:
    """Caption a single frame image with BLIP-2 and return the stripped text."""
    with Image.open(image_path) as img:
        rgb = img.convert("RGB")
    batch = blip_processor(images=rgb, return_tensors="pt")
    batch = batch.to(blip_model.device, dtype=torch_dtype)
    with torch.no_grad():
        generated = blip_model.generate(**batch, max_new_tokens=MAX_NEW_TOKENS_CAPTION)
    texts = blip_processor.batch_decode(generated, skip_special_tokens=True)
    return texts[0].strip()

def summarize_captions_bart(captions: List[str]) -> str:
    """Summarize frame-level captions into one concise description.

    Args:
        captions: caption strings, one per frame; entries may be blank.

    Returns:
        The stripped BART summary, or "" when captions is empty or contains
        only blank strings.
    """
    # A list of blank captions (every frame failed) would otherwise make BART
    # summarize the instruction text alone and return an invented caption.
    if not any(c and c.strip() for c in captions):
        return ""
    text_input = " ".join(captions)
    prompt = (
        "Summarize the following wildlife video scenes with high detail and precision. "
        "Retain unique animal behaviors, actions, and surroundings: "
        + text_input
    )
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            max_length=SUMMARY_MAX_LEN,
            min_length=SUMMARY_MIN_LEN,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            repetition_penalty=2.0
        )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

def semantic_correctness_score(caption: str, keywords: List[str]) -> float:
    """Fraction of non-empty keywords present in caption (case-insensitive substring match).

    Args:
        caption: text to search.
        keywords: candidate keywords; empty or None entries are ignored.

    Returns:
        hits / number of non-empty keywords, or 0.0 when there are none.
    """
    # Filter first so a list holding only empty entries cannot divide by zero.
    valid = [k for k in keywords if k] if keywords else []
    if not valid:
        return 0.0
    cap = caption.lower()
    hits = sum(1 for k in valid if k.lower() in cap)
    return hits / len(valid)

def _split_commas(text: str) -> List[str]:
    """Split comma-separated text into stripped, non-empty parts."""
    return [part.strip() for part in text.split(",") if part.strip()]


def _actions_from_items(items) -> List[str]:
    """Take the action from (animal, action) pairs; keep bare strings as they are."""
    actions = []
    for item in items:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            actions.append(str(item[1]))
        elif isinstance(item, str):
            actions.append(item)
    return actions


def parse_metadata_row(row: pd.Series) -> Tuple[List[str], List[str]]:
    """
    Returns (animals, actions) for a given metadata row.
    - animals: row['list_animal'], can be a list, a list literal stored as
      text (e.g. "['Bird', 'Fish']") or a comma-separated str
    - actions: row['list_animal_action'], can be "[(animal, action), ...]" or
      a list of such pairs or comma-separated actions
    Any other cell type yields an empty list.
    """
    import ast  # local import: literal parsing is only needed here

    # literal_eval accepts Python literals only, so unlike eval() a malformed
    # spreadsheet cell cannot execute code.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    animals_raw = row.get("list_animal", [])
    actions_raw = row.get("list_animal_action", "")

    # Animals
    if isinstance(animals_raw, list):
        animals = [str(a) for a in animals_raw]
    elif isinstance(animals_raw, str):
        try:
            parsed = ast.literal_eval(animals_raw.strip())
        except literal_errors:
            parsed = None
        if isinstance(parsed, (list, tuple)):
            # A list literal: comma-splitting it would leave brackets and
            # quotes attached to the names.
            animals = [str(a).strip() for a in parsed if str(a).strip()]
        else:
            animals = _split_commas(animals_raw)
    else:
        animals = []

    # Actions
    actions = []
    if isinstance(actions_raw, list):
        actions = _actions_from_items(actions_raw)
    elif isinstance(actions_raw, str) and actions_raw.strip():
        try:
            parsed = ast.literal_eval(actions_raw.strip())
        except literal_errors:
            # Not a Python literal: fall back to comma-separated action names.
            actions = _split_commas(actions_raw)
        else:
            if isinstance(parsed, list):
                actions = _actions_from_items(parsed)
    return animals, actions

def maybe_download(filepath: str):
    """Start a browser download of filepath in Colab; elsewhere print where it was saved."""
    if not IN_COLAB:
        print(f"[INFO] Saved: {filepath}")
        return
    try:
        colab_files.download(filepath)
    except Exception as e:
        # Best effort: a failed download is reported, never raised.
        print(f"[WARN] Auto-download failed for {filepath}: {e}")

# ================================
# 7) Load Metadata
# ================================
# Raise instead of assert: asserts are stripped under `python -O`, and the
# video check in the next section also raises FileNotFoundError.
if not os.path.exists(METADATA_XLSX):
    raise FileNotFoundError(f"Metadata not found: {METADATA_XLSX}")
meta_df = pd.read_excel(METADATA_XLSX)
# Build a map from video_id -> (lower-cased animals, lower-cased actions)
metadata_map = {}
for _, r in meta_df.iterrows():
    raw_id = r.get("video_id", "")
    # A missing id would otherwise be stored under the string "nan".
    if not isinstance(raw_id, str) and pd.isna(raw_id):
        continue
    vid = str(raw_id).strip()
    if not vid:
        continue
    animals, actions = parse_metadata_row(r)
    metadata_map[vid] = (
        [str(a).lower() for a in animals],
        [str(a).lower() for a in actions]
    )

# ================================
# 8) Collect & Sample Videos
# ================================
# Gather every .mp4 path in a stable order, then shuffle with the seeded RNG
# and keep the first sample_n of them.
all_videos = [str(mp4) for mp4 in Path(VIDEO_DIR).glob("*.mp4")]
all_videos.sort()
if len(all_videos) == 0:
    raise FileNotFoundError(f"No .mp4 files found under {VIDEO_DIR}")

sample_n = min(TOTAL_VIDEOS, len(all_videos))
random.shuffle(all_videos)
video_subset = all_videos[0:sample_n]
print(f"Found {len(all_videos)} videos. Sampling {sample_n} for processing.")

# ================================
# 9) Main Loop (process in batches of 100)
# ================================
safe_makedirs(FRAMES_ROOT)
results_master = []  # every row from every batch, for the combined CSV

num_batches = math.ceil(sample_n / BATCH_SIZE)
for b in range(num_batches):
    start = b * BATCH_SIZE
    end = min((b + 1) * BATCH_SIZE, sample_n)
    batch_videos = video_subset[start:end]
    batch_rows = []

    print(f"\n=== Processing batch {b+1}/{num_batches}: videos {start+1}–{end} ===")

    for vpath in batch_videos:
        video_id = Path(vpath).stem
        frame_dir = os.path.join(FRAMES_ROOT, video_id)
        try:
            # 1) Extract frames
            extract_frames_4fps(vpath, frame_dir)

            # 2) Caption frames
            frame_files = sorted([str(p) for p in Path(frame_dir).glob("*.jpg")])
            frame_captions = []
            for f in frame_files:
                try:
                    cap = generate_caption(f)
                except Exception as e:
                    # Best effort: report the faulty frame and move on rather
                    # than feeding a blank caption to the summarizer.
                    print(f"[WARN] Captioning failed for {f}: {e}")
                    continue
                if cap:
                    frame_captions.append(cap)

            # 3) Summarize
            final_caption = summarize_captions_bart(frame_captions)

            # 4) Semantic correctness (videos without metadata have no keywords)
            animals, actions = metadata_map.get(video_id, ([], []))
            keywords = animals + actions
            score = semantic_correctness_score(final_caption, keywords)
            percent = round(score * 100.0, 1)

            row = {
                "video_id": video_id,
                "summarized_caption": final_caption,
                "semantic_correctness_percent": f"{percent:.1f}%"
            }
            batch_rows.append(row)
            results_master.append(row)

        except Exception as e:
            # Log an error row so the video is still tracked in the output.
            err_row = {
                "video_id": video_id,
                "summarized_caption": "",
                "semantic_correctness_percent": "0.0%",
                "error": str(e)
            }
            batch_rows.append(err_row)
            results_master.append(err_row)
        finally:
            # Remove this video's frames unless they are kept for debugging;
            # ignore_errors=True already makes the removal best-effort.
            if not KEEP_FRAMES:
                shutil.rmtree(frame_dir, ignore_errors=True)
            gc.collect()

    # === Save and download this batch ===
    batch_df = pd.DataFrame(batch_rows)
    batch_csv = f"/content/semantic_batch_{b+1:02d}_{start+1:05d}-{end:05d}.csv"
    batch_df.to_csv(batch_csv, index=False)
    print(f"[SAVE] Batch {b+1} saved to: {batch_csv}")
    maybe_download(batch_csv)

# ================================
# 10) Done
# ================================
print(f"\n Finished processing {sample_n} videos.")
# Report against the configured target rather than a hard-coded 1000, so the
# message stays true when TOTAL_VIDEOS is changed.
if sample_n >= TOTAL_VIDEOS:
    print(f" {TOTAL_VIDEOS} videos are done generating summarized captions and semantic correctness percentage.")
else:
    print(f"ℹ Processed fewer than {TOTAL_VIDEOS} videos because the folder had fewer files.")

# (Optional) Also save a single combined CSV:
combined_csv = "/content/semantic_results_all.csv"
pd.DataFrame(results_master).to_csv(combined_csv, index=False)
print(f"[SAVE] Combined results saved to: {combined_csv}")
maybe_download(combined_csv)


Hyperparameter tuning of the BART generation settings:
num_beams, early_stopping=True, no_repeat_ngram_size, length_penalty, repetition_penalty

In [None]:
# ================================
# 1. Install Dependencies
# ================================
!pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# Processor and model come from the same checkpoint; weights are requested in
# float16 with device_map="auto".
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
blip_processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_caption(image_path):
    """Caption one image file with BLIP-2 and return the decoded text."""
    with Image.open(image_path) as img:
        rgb = img.convert('RGB')
    model_inputs = blip_processor(images=rgb, return_tensors="pt")
    model_inputs = model_inputs.to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    decoded = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# Tokenizer and summarization model share one checkpoint name.
BART_CHECKPOINT = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_model = bart_model.to(device)

def summarize_captions_bart(captions):
    """Summarize per-frame captions into one description with BART.

    Args:
        captions: list of caption strings, one per extracted frame.

    Returns:
        The stripped generated summary, or "" when there is no caption text
        to summarize.
    """
    # Without this guard a video that yielded no usable captions would make
    # BART summarize the instruction text alone and return an invented caption.
    if not any(c and c.strip() for c in captions):
        return ""
    text_input = " ".join(captions)
    prompt = (
        "Summarize the following wildlife video scenes with high detail and precision. "
        "Retain unique animal behaviors, actions, and surroundings: " + text_input
    )
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=50,
        min_length=15,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
        length_penalty=2.0,
        repetition_penalty=1.1
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()


# ================================
# 6. Frame Extraction
# ================================
def extract_frames_4fps(video_path, output_folder):
    """Save roughly four frames per second of a video as JPEG files.

    Frames are written into output_folder as frame_000.jpg, frame_001.jpg, ...
    Returns the number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    # Keep every `step`-th frame; when the reported rate is below 4 keep all.
    if fps >= 4:
        step = int(fps / 4)
    else:
        step = 1
    saved = 0
    frame_index = 0
    while True:
        success, image = vidcap.read()
        if not success:
            break
        if frame_index % step == 0:
            cv2.imwrite(os.path.join(output_folder, f"frame_{saved:03d}.jpg"), image)
            saved += 1
        frame_index += 1
    vidcap.release()
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Score a caption by the share of keywords found in it.

    Matching is a case-insensitive substring test. Returns
    (score, matched, missing); score is 0 when keywords is empty.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for kw in keywords:
        bucket = matched if kw.lower() in haystack else missing
        bucket.append(kw)
    if not keywords:
        return 0, matched, missing
    return len(matched) / len(keywords), matched, missing

# ================================
# 8. Load Metadata
# ================================
metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}


def _literal_list(raw):
    """Return the list encoded by a metadata cell, or None if it is not a list literal."""
    import ast  # local import: only this helper needs it

    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        return None  # e.g. a missing cell, which is not text
    try:
        # literal_eval accepts Python literals only, so unlike eval() a
        # malformed cell cannot execute code.
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


def _metadata_keywords(meta):
    """Build the lower-cased animal and action keywords for one metadata row.

    Iterating a text cell directly would yield single characters, so both
    columns are parsed into lists first.
    NOTE(review): assumes list_animal holds a list literal such as
    "['Bird']" or comma-separated names - confirm against the spreadsheet.
    """
    animals_raw = meta.get("list_animal", [])
    animals = _literal_list(animals_raw)
    if animals is None:
        # Plain text cell: treat it as comma-separated names.
        if isinstance(animals_raw, str):
            animals = [part.strip() for part in animals_raw.split(",") if part.strip()]
        else:
            animals = []
    # list_animal_action holds (animal, action) pairs; keep the action part.
    pairs = _literal_list(meta.get("list_animal_action", "")) or []
    actions = [pair[1] for pair in pairs if isinstance(pair, (list, tuple)) and len(pair) >= 2]
    return [str(a).lower() for a in animals] + [str(a).lower() for a in actions]


# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_4fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching (videos without a metadata row get no keywords)
    meta = metadata_dict.get(video_id, {})
    keywords = _metadata_keywords(meta)
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv_1", index=False)

# Report the path that was actually written (the file name ends in ".csv_1").
print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv_1")


In [None]:
import pandas as pd

# Reload the evaluation table written by the tuning run above.
results_df = pd.read_csv("/content/blip2_bart_results.csv_1")

# Print only the video id and its score.
key_columns = ["video_id", "semantic_correctness_percent"]
print(results_df.loc[:, key_columns])


In [None]:
# ================================
# 1. Install Dependencies
# ================================
# !pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# Processor and model come from the same checkpoint; weights are requested in
# float16 with device_map="auto".
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
blip_processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_caption(image_path):
    """Caption one image file with BLIP-2 and return the decoded text."""
    with Image.open(image_path) as img:
        rgb = img.convert('RGB')
    model_inputs = blip_processor(images=rgb, return_tensors="pt")
    model_inputs = model_inputs.to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    decoded = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# Tokenizer and summarization model share one checkpoint name.
BART_CHECKPOINT = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
bart_model = bart_model.to(device)

def summarize_captions_bart(captions):
    """Summarize per-frame captions into one description with BART.

    Args:
        captions: list of caption strings, one per extracted frame.

    Returns:
        The stripped generated summary, or "" when there is no caption text
        to summarize.
    """
    # Without this guard a video that yielded no usable captions would make
    # BART summarize the instruction text alone and return an invented caption.
    if not any(c and c.strip() for c in captions):
        return ""
    text_input = " ".join(captions)
    prompt = (
        "Summarize the following wildlife video scenes with high detail and precision. "
        "Retain unique animal behaviors, actions, and surroundings: " + text_input
    )
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=50,
        min_length=15,
        num_beams=6,
        early_stopping=True,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        repetition_penalty=1.15
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()


# ================================
# 6. Frame Extraction
# ================================
def extract_frames_4fps(video_path, output_folder):
    """Save roughly four frames per second of a video as JPEG files.

    Frames are written into output_folder as frame_000.jpg, frame_001.jpg, ...
    Returns the number of frames written.
    """
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    # Keep every `step`-th frame; when the reported rate is below 4 keep all.
    if fps >= 4:
        step = int(fps / 4)
    else:
        step = 1
    saved = 0
    frame_index = 0
    while True:
        success, image = vidcap.read()
        if not success:
            break
        if frame_index % step == 0:
            cv2.imwrite(os.path.join(output_folder, f"frame_{saved:03d}.jpg"), image)
            saved += 1
        frame_index += 1
    vidcap.release()
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Score a caption by the fraction of keywords it contains.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it occurs inside a longer word.

    Args:
        caption: Caption text to search.
        keywords: Sequence of keyword strings expected in the caption.

    Returns:
        A ``(score, matched, missing)`` tuple. ``score`` is
        ``len(matched) / len(keywords)``, or 0 when ``keywords`` is empty;
        ``matched`` and ``missing`` hold the original keyword strings in
        input order.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for keyword in keywords:
        bucket = matched if keyword.lower() in haystack else missing
        bucket.append(keyword)
    if not keywords:
        return 0, matched, missing
    return len(matched) / len(keywords), matched, missing

# ================================
# 8. Load Metadata
# ================================
import ast  # stdlib; parses list-like metadata cells without executing them


def _cell_to_list(cell):
    """Convert one metadata cell to a list.

    A string holding a list/tuple literal is parsed with ast.literal_eval
    (which, unlike eval, cannot execute code). Any other non-blank string
    becomes a one-item list, and everything else (e.g. a missing/NaN cell)
    becomes [].
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if not isinstance(cell, str) or not cell.strip():
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [cell.strip()]


metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
# One metadata row per video_id; a later duplicate id overwrites an earlier one.
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}

# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
# Only the first 20 .mp4 files (sorted by name) are evaluated.
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_4fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching
    meta = metadata_dict.get(video_id, {})
    # Spreadsheet cells are scalars, so the animal list arrives as text and must
    # be parsed; iterating the raw string would yield single characters.
    animals = _cell_to_list(meta.get("list_animal", ""))
    # Presumably a list of (animal, action) pairs — TODO confirm against the
    # spreadsheet. Entries that are not 2-item pairs are ignored.
    actions = [
        pair[1]
        for pair in _cell_to_list(meta.get("list_animal_action", ""))
        if isinstance(pair, (list, tuple)) and len(pair) == 2
    ]

    keywords = [str(a).lower() for a in animals] + [str(a).lower() for a in actions]
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv_2", index=False)

print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv_2")


In [None]:
import pandas as pd

# Reload the saved evaluation table from disk.
results_path = "/content/blip2_bart_results.csv_2"
results_df = pd.read_csv(results_path)

# Show only the per-video score columns.
key_columns = ["video_id", "semantic_correctness_percent"]
print(results_df[key_columns])


In [None]:
# ================================
# 1. Install Dependencies
# ================================
# !pip install git+https://github.com/salesforce/BLIP.git transformers timm accelerate opencv-python pandas -q

# ================================
# 2. Import Libraries
# ================================
import os, cv2, torch
import pandas as pd
from PIL import Image
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)

# ================================
# 3. Device Setup
# ================================
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(" Using device:", device)

# ================================
# 4. BLIP2 Setup (Captioning)
# ================================
# Processor and model come from the same checkpoint; the model is requested in
# float16 with device_map="auto".
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"
blip_processor = Blip2Processor.from_pretrained(blip2_checkpoint)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    blip2_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_caption(image_path):
    """Caption a single image file with the BLIP-2 model.

    Args:
        image_path: Path of the image to caption.

    Returns:
        The first decoded caption string, with special tokens removed.
    """
    rgb_frame = Image.open(image_path).convert('RGB')
    encoded = blip_processor(images=rgb_frame, return_tensors="pt")
    # Move the inputs to the model's device and cast them to float16.
    encoded = encoded.to(blip_model.device, torch.float16)
    token_ids = blip_model.generate(**encoded, max_new_tokens=50)
    decoded = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ================================
# 5. BART Setup (Summarization)
# ================================
# Tokenizer and seq2seq model are loaded from the same checkpoint; the model is
# then moved onto the `device` selected earlier in this cell.
bart_checkpoint = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
bart_model = bart_model.to(device)

def summarize_captions_bart(captions):
    """Condense per-frame captions into one short video-level summary with BART.

    Args:
        captions: Iterable of caption strings, in frame order.

    Returns:
        The decoded summary with surrounding whitespace stripped, or "" when
        there is no caption text to summarize.
    """
    # Drop blank captions. With nothing left, return early: otherwise the model
    # would be asked to summarize the instruction text alone and the result
    # would be scored as if it described the video.
    usable = [c for c in captions if isinstance(c, str) and c.strip()]
    if not usable:
        return ""

    text_input = " ".join(usable)
    # NOTE(review): facebook/bart-large-cnn is a summarization checkpoint, so
    # this instruction prefix is presumably treated as text to summarize rather
    # than as an instruction — confirm that it actually helps.
    prompt = (
        "You are an expert wildlife describer. Summarize the following wildlife video text faithfully and with high precision. "
        "Write one concise paragraph that mentions habitat/surroundings and lists 3–6 distinct Subject–Action–Object events. "
        "Text: " + text_input
    )

    # Input beyond 1024 tokens is truncated, so captions of late frames may be cut.
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=60,
        min_length=15,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        repetition_penalty=2.0
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()


# ================================
# 6. Frame Extraction
# ================================
def extract_frames_4fps(video_path, output_folder):
    """Sample roughly four frames per second from a video and save them as JPEGs.

    Frames are written to ``output_folder`` as ``frame_000.jpg``,
    ``frame_001.jpg``, ... in the order they are read.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for the extracted frames; created if missing.

    Returns:
        Number of frames actually written (0 when no frame could be read).
    """
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    saved = 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Keep every `interval`-th frame. int() truncates, so the effective rate
        # can be slightly above 4 fps; any fps below 4 (including 0) keeps every frame.
        interval = int(fps / 4) if fps >= 4 else 1
        count = 0
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{saved:03d}.jpg")
                # imwrite reports failure through its return value; only count
                # frames that were really written so the result matches the disk.
                if cv2.imwrite(frame_path, image):
                    saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        vidcap.release()
    return saved

# ================================
# 7. Metadata Matching Function
# ================================
def semantic_correctness_score(caption, keywords):
    """Score a caption by the fraction of keywords it contains.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it occurs inside a longer word.

    Args:
        caption: Caption text to search.
        keywords: Sequence of keyword strings expected in the caption.

    Returns:
        A ``(score, matched, missing)`` tuple. ``score`` is
        ``len(matched) / len(keywords)``, or 0 when ``keywords`` is empty;
        ``matched`` and ``missing`` hold the original keyword strings in
        input order.
    """
    haystack = caption.lower()
    matched, missing = [], []
    for keyword in keywords:
        bucket = matched if keyword.lower() in haystack else missing
        bucket.append(keyword)
    if not keywords:
        return 0, matched, missing
    return len(matched) / len(keywords), matched, missing

# ================================
# 8. Load Metadata
# ================================
import ast  # stdlib; parses list-like metadata cells without executing them


def _cell_to_list(cell):
    """Convert one metadata cell to a list.

    A string holding a list/tuple literal is parsed with ast.literal_eval
    (which, unlike eval, cannot execute code). Any other non-blank string
    becomes a one-item list, and everything else (e.g. a missing/NaN cell)
    becomes [].
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if not isinstance(cell, str) or not cell.strip():
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [cell.strip()]


metadata_path = "/content/drive/MyDrive/Animal_Kingdom/action_recognition/AR_metadata.xlsx"
meta_df = pd.read_excel(metadata_path)
# One metadata row per video_id; a later duplicate id overwrites an earlier one.
metadata_dict = {row["video_id"]: row for _, row in meta_df.iterrows()}

# ================================
# 9. Process Videos
# ================================
video_dir = "/content/videos/video/"
# Only the first 20 .mp4 files (sorted by name) are evaluated.
video_files = sorted([f for f in os.listdir(video_dir) if f.endswith(".mp4")])[:20]

results = []

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_dir, video_file)
    frame_dir = f"/content/frames/{video_id}"

    print(f"\n Processing {video_id}")

    # Step 1: Extract frames
    extract_frames_4fps(video_path, frame_dir)

    # Step 2: Caption each frame
    frame_captions = []
    for fname in sorted(os.listdir(frame_dir)):
        if fname.endswith(".jpg"):
            path = os.path.join(frame_dir, fname)
            caption = generate_caption(path)
            frame_captions.append(caption)

    # Step 3: Summarize using BART
    final_caption = summarize_captions_bart(frame_captions)

    # Step 4: Metadata matching
    meta = metadata_dict.get(video_id, {})
    # Spreadsheet cells are scalars, so the animal list arrives as text and must
    # be parsed; iterating the raw string would yield single characters.
    animals = _cell_to_list(meta.get("list_animal", ""))
    # Presumably a list of (animal, action) pairs — TODO confirm against the
    # spreadsheet. Entries that are not 2-item pairs are ignored.
    actions = [
        pair[1]
        for pair in _cell_to_list(meta.get("list_animal_action", ""))
        if isinstance(pair, (list, tuple)) and len(pair) == 2
    ]

    keywords = [str(a).lower() for a in animals] + [str(a).lower() for a in actions]
    score, matched, missing = semantic_correctness_score(final_caption, keywords)

    results.append({
        "video_id": video_id,
        "final_caption": final_caption,
        "frame_captions": frame_captions,
        "keywords": keywords,
        "matched_keywords": matched,
        "missing_keywords": missing,
        "semantic_correctness_percent": f"{score*100:.1f}%"
    })

# ================================
# 10. Save Results
# ================================
results_df = pd.DataFrame(results)
results_df.to_csv("/content/blip2_bart_results.csv_5", index=False)

print("\n BLIP2 + BART Evaluation Complete! Results saved to:")
print(" /content/blip2_bart_results.csv_5")


In [None]:
import pandas as pd

# Load results.
# The preceding cell saves its table as blip2_bart_results.csv_5; read that same
# file so this cell shows the run that was just produced rather than depending
# on a file left over from some other run.
results_df = pd.read_csv("/content/blip2_bart_results.csv_5")

# Display key columns
print(results_df[["video_id", "semantic_correctness_percent"]])


InstructBLIP

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook. The token that used to
# be hard-coded here has been exposed and should be revoked in the Hugging Face
# account settings.
# Read the token from the HF_TOKEN environment variable, or prompt for it
# (input is not echoed) when the variable is unset.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [None]:
import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

# Choose the device at runtime, as the other cells of this notebook do; a
# hard-coded "cuda" fails on a runtime without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-flan-t5-xl").to(device)


In [None]:
import os
import cv2
import torch
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

# ========== 1. Setup ==========
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The single video handled by this cell, and the folder for its frames.
video_id = "LKBDONQN"
video_path = f"/content/videos/video/{video_id}.mp4"
frame_dir = f"/content/frames/{video_id}"
os.makedirs(frame_dir, exist_ok=True)

# ========== 2. Load InstructBLIP ==========
# Processor and model are loaded from the same checkpoint.
instructblip_checkpoint = "Salesforce/instructblip-flan-t5-xl"
processor = InstructBlipProcessor.from_pretrained(instructblip_checkpoint)
model = InstructBlipForConditionalGeneration.from_pretrained(instructblip_checkpoint)
model = model.to(device)

# ========== 3. Extract Frames (4 FPS) ==========
def extract_frames_4fps(video_path, output_folder):
    """Sample roughly four frames per second from a video and save them as JPEGs.

    Frames are written to ``output_folder`` as ``frame_000.jpg``,
    ``frame_001.jpg``, ... in the order they are read.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for the extracted frames; created if missing.

    Returns:
        Number of frames actually written (0 when no frame could be read).
    """
    # Create the folder here, as the other definitions of this function in the
    # notebook do, so the function does not depend on the caller having done it.
    os.makedirs(output_folder, exist_ok=True)
    vidcap = cv2.VideoCapture(video_path)
    saved = 0
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Keep every `interval`-th frame. int() truncates, so the effective rate
        # can be slightly above 4 fps; any fps below 4 (including 0) keeps every frame.
        interval = int(fps / 4) if fps >= 4 else 1
        count = 0
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                fpath = os.path.join(output_folder, f"frame_{saved:03d}.jpg")
                # imwrite reports failure through its return value; only count
                # frames that were really written so the result matches the disk.
                if cv2.imwrite(fpath, image):
                    saved += 1
            success, image = vidcap.read()
            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        vidcap.release()
    return saved

# Extract the frames of the selected video and report how many were saved.
print(f"Extracting frames from {video_path} ...")
num_frames = extract_frames_4fps(video_path=video_path, output_folder=frame_dir)
print(f"Extracted {num_frames} frames.")

# ========== 4. Caption Each Frame ==========
def generate_caption_instructblip(image_path, instruction="Describe the animal's behavior in this image."):
    """Caption a single image file with InstructBLIP, guided by a text instruction.

    Args:
        image_path: Path of the image to caption.
        instruction: Text prompt passed to the processor alongside the image.

    Returns:
        The first decoded output string, with special tokens removed.
    """
    rgb_frame = Image.open(image_path).convert("RGB")
    encoded = processor(images=rgb_frame, text=instruction, return_tensors="pt")
    encoded = encoded.to(device)
    token_ids = model.generate(**encoded, max_new_tokens=50)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

print("\nGenerating captions per frame...\n")
frame_captions = []
# Walk the frame folder in sorted filename order; anything that is not a .jpg
# is skipped.
for fname in sorted(os.listdir(frame_dir)):
    if not fname.endswith(".jpg"):
        continue
    fpath = os.path.join(frame_dir, fname)
    caption = generate_caption_instructblip(fpath)
    print(f"{fname}: {caption}")
    frame_captions.append(caption)


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load BART summarization model
# Tokenizer and seq2seq model are loaded from the same checkpoint; the model is
# then moved onto `device`, which is defined in an earlier cell.
bart_checkpoint = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
bart_model = bart_model.to(device)

def summarize_captions(captions, max_length=30, min_length=15):
    """Condense per-frame captions into one short video-level summary with BART.

    Args:
        captions: Iterable of caption strings, in frame order.
        max_length: Upper bound passed to ``generate`` for the summary length.
        min_length: Lower bound passed to ``generate`` for the summary length.

    Returns:
        The decoded summary with surrounding whitespace stripped, or "" when
        there is no caption text to summarize.
    """
    # Drop blank captions. With nothing left, return early: otherwise the model
    # would be asked to summarize the instruction text alone.
    usable = [c for c in captions if isinstance(c, str) and c.strip()]
    if not usable:
        return ""

    context = " ".join(usable)
    # NOTE(review): facebook/bart-large-cnn is a summarization checkpoint, so
    # this instruction prefix is presumably treated as text to summarize rather
    # than as an instruction — confirm that it actually helps.
    prompt = "Summarize the following wildlife observations:\n" + context

    # Input beyond 1024 tokens is truncated, so captions of late frames may be cut.
    inputs = bart_tokenizer([prompt], return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

# Collapse the per-frame captions into a single caption for the whole video,
# then print it.
video_summary = summarize_captions(captions=frame_captions)
print("\n Final Video Caption (Summary):", video_summary)
