In [None]:
# Environment setup: installs the packages imported by the cells below.
# NOTE(review): the conda line pins pytorch=1.7.1 / cudatoolkit=11.0 after torch was
# already installed with pip, while the saved output of the CUDA check cell reports
# CUDA 11.8 - confirm which install is actually in effect before relying on this cell.
!pip install torch
!pip install transformers
!pip install pillow
!conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
!pip install opencv-python
!pip install matplotlib
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

In [2]:
import torch

# Sanity-check the PyTorch / CUDA setup before loading any model.
print(f"CUDA available: {torch.cuda.is_available()}")
# Reports the CUDA version of the installed PyTorch build (the saved output of this
# cell shows 11.8); compare it with the toolkit/driver you expect to be using.
print(f"CUDA version: {torch.version.cuda}")
# get_device_name() raises when no CUDA device is visible, so only query it when
# CUDA is actually usable.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: none detected")

CUDA available: True
CUDA version: 11.8
GPU: NVIDIA RTX A4500


In [None]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch
import subprocess
import os
import shutil
from pathlib import Path

# Model Initialization
def initialize_models(model_id="llava-hf/llava-v1.6-mistral-7b-hf", device="cuda:0"):
    """Load the LLaVA-NeXT processor and model and prepare the model for inference.

    Args:
        model_id: Hugging Face checkpoint name passed to both ``from_pretrained``
            calls, so processor and model always come from the same checkpoint.
        device: Device the model is moved to after loading.

    Returns:
        tuple: ``(processor, model)``; the model is in half precision, on
        ``device`` and in eval mode.
    """
    processor = LlavaNextProcessor.from_pretrained(model_id, use_fast=True)

    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map=None,
    ).to(device).eval()

    # Use the EOS token for padding as well as for end-of-sequence during generation.
    model.generation_config.pad_token_id = model.config.eos_token_id
    model.generation_config.eos_token_id = model.config.eos_token_id

    print(f"Model loaded on {next(model.parameters()).device}")
    # Only report CUDA memory when the model was actually placed on a CUDA device.
    if torch.device(device).type == "cuda":
        print(f"VRAM usage: {torch.cuda.memory_allocated()/1e9:.2f}GB")

    return processor, model

# Extract scene-change frames from a video with FFmpeg
def _locate_ffmpeg():
    """Return the path of an FFmpeg executable, preferring the bundled copy.

    Raises:
        FileNotFoundError: if neither the bundled binary nor one on PATH exists.
    """
    try:
        project_root = Path(__file__).parent
    except NameError:
        # __file__ is not defined when this code runs inside a notebook cell.
        project_root = Path.cwd()

    bundled = project_root / "ffmpeg" / "bin" / "ffmpeg.exe"
    if bundled.exists():
        return str(bundled)

    # Fall back to whatever FFmpeg is installed on the system PATH.
    on_path = shutil.which("ffmpeg")
    if on_path:
        return on_path

    raise FileNotFoundError(f"FFmpeg not found at {bundled}")


def _frame_sort_key(frame_path):
    """Order frame files by the integer in their name; other names sort last."""
    suffix = frame_path.stem[len("frame_"):]
    if suffix.isascii() and suffix.isdigit():
        return (0, int(suffix), frame_path.name)
    return (1, 0, frame_path.name)


def extract_representative_frames(video_path, output_dir="segments"):
    """Write one PNG per detected scene change and return the frame paths.

    FFmpeg is run with the filter ``select='(gt(scene,0.35))'`` and
    ``-frame_pts 1``, so (per the FFmpeg documentation) frames whose scene-change
    score exceeds 0.35 are kept and the number in each file name is the frame's
    presentation timestamp rather than a running counter.

    Args:
        video_path: Path of the input video.
        output_dir: Directory that receives ``frame_<n>.png`` files; created if
            missing. It is not cleared, so frames left over from earlier runs
            are returned as well.

    Returns:
        list[str]: Paths of all ``frame_*.png`` files in ``output_dir``, ordered
        by the numeric part of the file name.

    Raises:
        FileNotFoundError: if no FFmpeg executable can be located.
        subprocess.CalledProcessError: if FFmpeg exits with a non-zero status.
    """
    os.makedirs(output_dir, exist_ok=True)

    ffmpeg_path = _locate_ffmpeg()

    subprocess.run([
        ffmpeg_path,
        "-i", str(video_path),
        "-vf", "select='(gt(scene,0.35))'",
        "-vsync", "0",
        "-frame_pts", "1",
        f"{output_dir}/frame_%04d.png"
    ], check=True)

    # Timestamps in the file names can have different widths, so a plain string
    # sort would put e.g. frame_10000 before frame_9999; sort numerically instead.
    frame_files = sorted(Path(output_dir).glob("frame_*.png"), key=_frame_sort_key)
    return [str(p) for p in frame_files]

# Process frames and summarize each segment
def describe_frames_group(frames, processor, model):
    """Generate a text description for each frame and join them into one string.

    Args:
        frames: Iterable of image file paths.
        processor: LLaVA-NeXT processor used to build model inputs and decode output.
        model: LLaVA-NeXT model used for generation.

    Returns:
        str | None: The per-frame descriptions joined by single spaces, or
        ``None`` when ``frames`` is empty or any frame cannot be opened.
    """
    # Nothing to describe: avoid calling the processor with empty batches.
    if not frames:
        return None

    images = []
    for frame_path in frames:
        try:
            # Close the file handle right away; convert() returns an independent copy.
            with Image.open(frame_path) as handle:
                images.append(handle.convert("RGB"))
        except Exception as e:
            print(f"Could not open {frame_path}: {e}")
            return None

    texts = ["[INST] <image>\nDescribe this image. [/INST]"] * len(images)
    # Send the inputs to wherever the model lives instead of assuming cuda:0.
    inputs = processor(images=images, text=texts, return_tensors="pt", padding=True).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs, max_new_tokens=100, do_sample=False
        )

    # generate() returns the prompt tokens followed by the new tokens; drop the
    # prompt so the instruction text does not end up inside every description.
    prompt_length = inputs["input_ids"].shape[1]
    descriptions = [
        processor.decode(sequence[prompt_length:], skip_special_tokens=True).strip()
        for sequence in outputs
    ]
    return " ".join(descriptions).strip()

# Save results
def save_results(results, output_file="minute_descriptions.txt"):
    """Write the collected descriptions to a UTF-8 text file.

    Args:
        results: Iterable of ``(minute, description)`` pairs.
        output_file: Destination path; an existing file is overwritten.

    Each pair becomes one line of the form ``[Minute <minute>] <description>``.
    """
    formatted_lines = (f"[Minute {minute}] {desc}\n" for minute, desc in results)
    with open(output_file, "w", encoding='utf-8') as sink:
        sink.writelines(formatted_lines)

# Main
def main():
    """Describe every extracted frame of the video and save the descriptions.

    Loads the models, extracts frames from ``30min_vid.mp4``, generates one
    description per frame and writes the successful ones with ``save_results``.

    NOTE(review): the frames come from scene-change detection, yet each frame
    index is reported and stored as a "minute" - confirm that this labelling is
    what the downstream evaluation expects.
    """
    torch.cuda.empty_cache()
    processor, model = initialize_models()

    video_path = "30min_vid.mp4"
    extracted_frames = extract_representative_frames(video_path)

    results = []
    for index, single_frame in enumerate(extracted_frames):
        print(f"Processing minute {index}...")
        text = describe_frames_group([single_frame], processor, model)
        if not text:
            # Frames that could not be described are left out of the results.
            continue
        results.append((index, text))

    save_results(results)
    print(f"Completed! Generated descriptions for {len(results)} minutes.")


if __name__ == "__main__":
    main()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded on cuda:0
VRAM usage: 15.14GB
Processing minute 0...
Processing minute 1...
Processing minute 2...
Processing minute 3...
Processing minute 4...
Processing minute 5...
Processing minute 6...
Processing minute 7...
Processing minute 8...
Processing minute 9...
Processing minute 10...
Processing minute 11...
Processing minute 12...
Processing minute 13...
Processing minute 14...
Processing minute 15...
Processing minute 16...
Processing minute 17...
Processing minute 18...
Processing minute 19...
Processing minute 20...
Processing minute 21...
Processing minute 22...
Processing minute 23...
Processing minute 24...
Processing minute 25...
Processing minute 26...
Processing minute 27...
Could not open segments/frame_1620_2.png: [Errno 2] No such file or directory: 'segments/frame_1620_2.png'
Completed! Generated descriptions for 27 minutes.


In [None]:
import cv2
import random
from PIL import Image
import torch
import clip
from torchvision import transforms
import matplotlib.pyplot as plt
import torch.nn.functional as torch_func
from numpy.linalg import norm
import numpy as np

# Index pairs of neighbouring rows judged similar; appended to by compare_vectors().
similar_segments = []

# Reference segments as [start, end] pairs, one inner list per video (indexed by
# the video's position in video_path). analyze_thresholds() compares them with
# windows built from multiples of 60, so the values are presumably seconds.
ground_truth = [
    [
        [0, 47],
        [48, 237],
        [238, 330],
        [331, 898],
        [899, 1204],
        [1205, 1499],
        [1500, 1662],
    ],
    [
        [0, 31],
        [32, 404],
        [405, 623],
        [624, 1045],
        [1046, 1130],
        [1131, 1551],
        [1552, 1781],
    ],
]

# One label per ground_truth entry, in the same order - presumably the chapter
# titles of each video (TODO confirm).
text_labels = [
    [
        "introduction",
        "recap: the surprise pi",
        "the game plan",
        "how to analyze the blocks",
        "the geometry puzzle",
        "small angle approximation",
        "the value of pure puzzles",
    ],
    [
        "introduction",
        "twirling ties",
        "tarski plank problem",
        "monge's theorem",
        "3d volume, 4d answer",
        "the hypercube stack",
        "the sadness of higher dimensions",
    ],
]

# Maps a rounded cosine threshold to the list of accuracies recorded for it by
# analyze_thresholds().
cosine_accuracy = {}

def seconds_to_mmss(seconds):
    """Format a duration in seconds as ``MM:SS``.

    The value is truncated to an integer first; minutes are not wrapped at 60,
    so 3600 seconds is rendered as ``60:00``.
    """
    whole_minutes, leftover = divmod(int(seconds), 60)
    return "{:02}:{:02}".format(whole_minutes, leftover)

def calculate_iou(seg1, seg2):
    """Return the intersection-over-union of two ``[start, end]`` segments.

    The denominator is the span from the earliest start to the latest end.
    Returns 0 when that span is not positive; non-overlapping segments score 0.
    """
    (first_start, first_end), (second_start, second_end) = seg1, seg2

    span = max(first_end, second_end) - min(first_start, second_start)
    if not span > 0:
        return 0

    overlap = min(first_end, second_end) - max(first_start, second_start)
    return max(0, overlap) / span

def compare_vectors(similarity_output):
    """Record neighbouring rows of ``similarity_output`` that are close to each other.

    For every pair of consecutive rows the cosine similarity and the Euclidean
    distance are computed; when the similarity is above 0.85 and the distance is
    below 0.2, the index pair ``[i, i + 1]`` is appended to the module-level
    ``similar_segments`` list. Nothing is returned.
    """
    for nxt in range(1, len(similarity_output)):
        prev = nxt - 1
        row_a = similarity_output[prev]
        row_b = similarity_output[nxt]

        angle_score = torch_func.cosine_similarity(
            row_a.unsqueeze(0), row_b.unsqueeze(0)
        ).item()
        gap = torch.dist(row_a, row_b).item()

        close_in_angle = angle_score > 0.85
        close_in_distance = gap < 0.2
        if close_in_angle and close_in_distance:
            similar_segments.append([prev, nxt])

def combine_segments(current_segments, combine_segments):
    """Merge the segments that are linked by the given index pairs.

    Args:
        current_segments: List of ``[start, end]`` segments; modified in place.
        combine_segments: Iterable of ``(i, j)`` index pairs that should end up
            in the same segment. Pairs that share an index are chained together.

    Returns:
        The same ``current_segments`` list. Each linked group is replaced by a
        single segment running from the start of its lowest-indexed member to
        the end of its highest-indexed member.
    """
    from collections import defaultdict

    # Undirected graph: every pair links two segment indices.
    neighbours = defaultdict(set)
    for left, right in combine_segments:
        neighbours[left].add(right)
        neighbours[right].add(left)

    # Collect connected components with an explicit depth-first walk.
    seen = set()
    components = []
    for root in sorted(neighbours):
        if root in seen:
            continue
        members = []
        pending = [root]
        while pending:
            node = pending.pop()
            if node in seen:
                continue
            seen.add(node)
            members.append(node)
            pending.extend(neighbours[node])
        if len(members) > 1:
            members.sort()
            components.append(members)

    # Stretch the first member of each component over the whole group and mark
    # the remaining members for deletion.
    obsolete = set()
    for members in components:
        first, last = members[0], members[-1]
        current_segments[first] = [current_segments[first][0], current_segments[last][1]]
        obsolete.update(members[1:])

    # Delete from the back so earlier indices stay valid.
    for position in sorted(obsolete, reverse=True):
        del current_segments[position]

    return current_segments

def analyze_thresholds(image_features, ground_truth, thresholds=np.linspace(0.7, 0.95, 20), segment_seconds=60):
    """Evaluate how merging neighbouring segments at several cosine thresholds matches the ground truth.

    For each threshold, consecutive feature rows whose cosine similarity exceeds
    the threshold are merged with ``combine_segments``. Every ground-truth
    segment is then scored by its best IoU against the merged segments, and the
    share of ground-truth segments with IoU above 0.5 is appended to the
    module-level ``cosine_accuracy`` dict under the threshold rounded to two
    decimals. The threshold, IoUs and accuracy are printed for each threshold.

    Args:
        image_features: Indexable collection of 1-D feature tensors, one per
            fixed-length segment.
        ground_truth: List of ``[start, end]`` reference segments.
        thresholds: Cosine-similarity thresholds to evaluate.
        segment_seconds: Length assigned to each feature row's segment.

    Returns:
        tuple: ``(segment_counts, segment_frequency)`` where ``segment_counts``
        holds the number of merged segments per threshold and
        ``segment_frequency`` counts, per segment, how often it was part of a
        similar pair across all thresholds. Both can be plotted by the caller.
    """
    num_segments = len(image_features)
    segment_counts = []
    segment_frequency = np.zeros(num_segments)  # Tracks how often each segment appears

    # The neighbour similarities do not depend on the threshold, so compute them
    # once instead of once per threshold.
    neighbour_similarity = [
        torch_func.cosine_similarity(
            image_features[i].unsqueeze(0), image_features[i + 1].unsqueeze(0)
        ).item()
        for i in range(num_segments - 1)
    ]

    for threshold in thresholds:
        temp_similar_segments = []
        for i, cos_sim in enumerate(neighbour_similarity):
            if cos_sim > threshold:
                temp_similar_segments.append([i, i + 1])
                segment_frequency[i] += 1
                segment_frequency[i + 1] += 1

        # Fresh fixed-length segments for every threshold; combine_segments
        # modifies the list it is given.
        temp_segments = [
            [i * segment_seconds, (i + 1) * segment_seconds] for i in range(num_segments)
        ]
        merged = combine_segments(temp_segments, temp_similar_segments)
        print(f"\nCosine threshold: {threshold:.2f}")
        segment_counts.append(len(merged))

        # Best IoU per ground-truth segment (0 when there is nothing to match).
        ious = [
            max([0, *(calculate_iou(gt, merged_seg) for merged_seg in merged)])
            for gt in ground_truth
        ]
        matches = sum(1 for best_iou in ious if best_iou > 0.5)

        # Guard against an empty ground truth instead of dividing by zero.
        accuracy = round(matches / len(ground_truth), 2) if len(ground_truth) else 0.0
        threshold_key = round(threshold, 2)
        cosine_accuracy.setdefault(threshold_key, []).append(accuracy)
        print(f"IoUs: {[f'{iou:.2f}' for iou in ious]}")
        print(f"Accuracy: {accuracy:.2f}")

    return segment_counts, segment_frequency

def load_clip_text_inputs_from_file(file_path="minute_descriptions.txt"):
    """Read the description part of every ``[...] description`` line in a file.

    Lines without a ``]`` are ignored. For the others, everything after the
    first ``]`` is stripped of surrounding whitespace and kept if non-empty.

    Returns:
        list[str]: The descriptions in file order.
    """
    collected = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if "]" not in raw_line:
                continue
            _, _, tail = raw_line.strip().partition("]")
            text = tail.strip()
            if text:
                collected.append(text)
    return collected
        
# Run CLIP on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load() yields the model and the image preprocessing callable that is
# applied to each sampled frame in run_evaluation_model().
model, preprocess = clip.load("ViT-B/32", device=device)

# Videos to evaluate; each path's position selects its entry in ground_truth.
video_path = ["30min_vid.mp4"]

def run_evaluation_model(video_path):
    """Sample one random frame per minute of each video and evaluate segment merging.

    For every video, one random frame is read from each full 60-second segment
    and encoded with CLIP together with the descriptions stored in
    ``minute_descriptions.txt``. The image/text similarity matrix is printed,
    ``analyze_thresholds`` is run on the image features, and the average
    accuracy per cosine threshold (from the module-level ``cosine_accuracy``)
    is printed.

    Args:
        video_path: Iterable of video file paths; the position of each path
            selects its entry in ``ground_truth``.

    Raises:
        RuntimeError: if a video cannot be opened or reports no frames / fps.
    """
    for video_idx, video in enumerate(video_path):
        print(f"Video idx: {video_idx}; video_path: {video}")
        cap = cv2.VideoCapture(video)

        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video: {video}")

            # calculate how many frames per 1m segment & how many segments
            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if fps <= 0 or total_frames <= 0:
                # Would otherwise surface as an unexplained ZeroDivisionError below.
                raise RuntimeError(
                    f"Video reports fps={fps} and {total_frames} frames: {video}"
                )
            duration_sec = total_frames / fps
            num_segments = int(duration_sec // 60)
            frames_per_segment = int(fps * 60)

            sampled_frames = []
            for segment_idx in range(num_segments):
                start_frame = segment_idx * frames_per_segment
                end_frame = start_frame + frames_per_segment

                # Random frame from segment, clamped to the last frame of the video
                random_frame_index = random.randint(
                    start_frame, min(end_frame - 1, total_frames - 1)
                )

                cap.set(cv2.CAP_PROP_POS_FRAMES, random_frame_index)
                ret, frame = cap.read()
                if ret:
                    # Convert BGR (OpenCV) to RGB (PIL)
                    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    sampled_frames.append(preprocess(img))
        finally:
            # Release the capture even when sampling fails part-way through.
            cap.release()

        if not sampled_frames:
            # torch.stack() cannot handle an empty list.
            print(f"No frames could be sampled from {video}; skipping.")
            continue

        # batch processing
        image_inputs = torch.stack(sampled_frames).to(device)

        descriptions = load_clip_text_inputs_from_file("minute_descriptions.txt")
        text_inputs = clip.tokenize(descriptions, truncate=True).to(device)

        # cosine similarity of text query and image feature/video frame
        with torch.no_grad():
            image_features = model.encode_image(image_inputs)
            text_features = model.encode_text(text_inputs)
            # Normalise to unit length so the dot product is a cosine similarity,
            # then scale by 100 before the softmax as in the CLIP usage example.
            image_unit = image_features / image_features.norm(dim=-1, keepdim=True)
            text_unit = text_features / text_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_unit @ text_unit.T).softmax(dim=-1)
            print(similarity)

        # NOTE(review): image_features does not change between these passes, so
        # all three produce the same accuracies. If the intent was to average
        # over different random frames, the sampling has to move inside this
        # loop - confirm.
        for _ in range(3):
            print(f"ground truth: {ground_truth[video_idx]}")
            analyze_thresholds(image_features, ground_truth=ground_truth[video_idx])

        for cosine_threshold in cosine_accuracy:
            average_accuracy = np.mean(cosine_accuracy[cosine_threshold])
            print(f"Cosine threshold: {cosine_threshold:.2f}; Average Accuracy: {average_accuracy:.2f}")

# Run the evaluation for every path listed in video_path.
run_evaluation_model(video_path)

# NOTE(review): the disabled lines below use `similarity` and `segments`, which are
# not assigned at module level in this cell; they would have to be made available
# here before these lines can be re-enabled.
# compare_vectors(similarity)
# print(combine_segments(segments, similar_segments))

# percentages = (similarity.to(torch.float32) * 100).to("cpu")