# In [None]:
import os
import cv2
import torch
import numpy as np
from tqdm import tqdm
from transformers import VideoMAEModel
from google.colab import drive

# Mount Google Drive at /content/drive; CONFIG['base_dir'] points inside this
# mount, so this has to run before any video or feature path is touched.
drive.mount('/content/drive')

# Settings shared by every function in this script.
CONFIG = dict(
    # Prefer the GPU when one is visible to torch, otherwise run on CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Root folder on the mounted Drive; the two sub-directories are joined to it.
    base_dir='/content/drive/MyDrive/VEA/',
    video_subdir='data16',
    output_subdir='features_visual',
    # Checkpoint name handed to VideoMAEModel.from_pretrained.
    model_name='MCG-NJU/videomae-base',
    # Each video is sampled as num_segments clips of frames_per_segment frames.
    num_segments=8,
    frames_per_segment=16,
    # Frames are resized to target_size x target_size pixels.
    target_size=224,
    # Number of segments sent through the model per forward pass.
    mini_batch_size=4,
)

def sample_video_frames(video_path):
    """Sample evenly spaced RGB frames from a video and group them into segments.

    ``num_segments * frames_per_segment`` frame indices are spread uniformly
    over the whole video. Each frame is converted BGR -> RGB, resized to
    ``target_size`` x ``target_size`` and scaled to the range [0, 1]. A frame
    that fails to decode is replaced by a black frame so the output shape
    stays fixed.

    Args:
        video_path: Path to the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor of shape ``(num_segments, frames_per_segment, 3,
        target_size, target_size)``, or ``None`` when the video cannot be
        opened, reports no frames, or not a single frame could be decoded.
    """
    size = CONFIG['target_size']
    total_frames = CONFIG['num_segments'] * CONFIG['frames_per_segment']

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None

        v_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if v_len <= 0:
            # With no known frame count, linspace(0, -1, ...) would produce
            # degenerate/negative indices and the result would be garbage.
            return None

        # Videos shorter than total_frames simply repeat some indices.
        indices = np.linspace(0, v_len - 1, total_frames).astype(int)

        frames = []
        decoded_any = False
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                decoded_any = True
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_LINEAR)
            else:
                # Keep the tensor shape fixed even if a single frame is unreadable.
                frame = np.zeros((size, size, 3), dtype=np.uint8)
            frames.append(frame)
    finally:
        # Always free the decoder handle, including when decoding raises.
        cap.release()

    if not decoded_any:
        # An all-black clip carries no information; let the caller skip it.
        return None

    # Convert to float32 before scaling to avoid a float64 copy of every frame.
    # NOTE(review): only a [0, 1] rescale is applied here; the checkpoint's own
    # image processor presumably also applies mean/std normalization -- confirm
    # whether that is intended before changing it (cached features depend on it).
    stacked = np.stack(frames).astype(np.float32) / 255.0
    clip = torch.from_numpy(stacked).permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
    return clip.reshape(
        CONFIG['num_segments'], CONFIG['frames_per_segment'], 3, size, size
    )

def extract_video_feature(model, video_tensor, device, mini_batch_size=None, pooling='first'):
    """Encode each segment of a video into one feature vector.

    The segments are pushed through ``model`` in mini-batches (to bound GPU
    memory) under ``torch.no_grad()``, and every segment's token sequence is
    reduced to a single vector.

    Args:
        model: Callable invoked as ``model(pixel_values=batch)`` whose result
            exposes ``last_hidden_state`` indexed as (batch, tokens, hidden).
        video_tensor: Tensor whose first dimension indexes the segments.
        device: Device each mini-batch is moved to before the forward pass.
        mini_batch_size: Segments per forward pass. Defaults to
            ``CONFIG['mini_batch_size']`` when omitted.
        pooling: ``'first'`` (default, the original behaviour) keeps the token
            at index 0; ``'mean'`` averages over all tokens.

    Returns:
        ``np.ndarray`` with one row per segment.

    Raises:
        ValueError: If ``pooling`` is unknown, ``mini_batch_size`` is smaller
            than 1, or ``video_tensor`` holds no segments (raised by
            ``np.concatenate``).
    """
    if pooling not in ('first', 'mean'):
        raise ValueError(f"pooling must be 'first' or 'mean', got {pooling!r}")
    if mini_batch_size is None:
        mini_batch_size = CONFIG['mini_batch_size']
    if mini_batch_size < 1:
        raise ValueError(f"mini_batch_size must be >= 1, got {mini_batch_size}")

    # No-op when the tensor is already contiguous.
    video_tensor = video_tensor.contiguous()
    num_segments = video_tensor.shape[0]

    features_list = []
    with torch.no_grad():
        for start in range(0, num_segments, mini_batch_size):
            batch_slice = video_tensor[start:start + mini_batch_size].to(device)
            hidden = model(pixel_values=batch_slice).last_hidden_state
            if pooling == 'mean':
                pooled = hidden.mean(dim=1)
            else:
                # NOTE(review): VideoMAE does not appear to prepend a [CLS]
                # token, so index 0 is presumably the first patch token rather
                # than a global summary -- confirm. pooling='mean' is offered as
                # the alternative; the default is unchanged so previously saved
                # features stay comparable.
                pooled = hidden[:, 0, :]
            features_list.append(pooled.cpu().numpy())

    return np.concatenate(features_list, axis=0)

def main():
    """Extract and save features for every video in the configured folder.

    Loads the pretrained model once, then for each video file in
    ``base_dir/video_subdir`` samples frames, encodes them and stores the
    result as ``<video name>.npy`` in ``base_dir/output_subdir``. Videos whose
    output file already exists are skipped, so an interrupted run can be
    resumed. A video that cannot be read or processed is reported and skipped
    rather than aborting the whole run.
    """
    device = torch.device(CONFIG['device'])

    video_dir = os.path.join(CONFIG['base_dir'], CONFIG['video_subdir'])
    output_dir = os.path.join(CONFIG['base_dir'], CONFIG['output_subdir'])
    os.makedirs(output_dir, exist_ok=True)

    model = VideoMAEModel.from_pretrained(CONFIG['model_name']).to(device)
    model.eval()

    # Match extensions case-insensitively (e.g. ".MP4") and sort so the
    # processing order is reproducible; os.listdir order is arbitrary.
    video_exts = ('.mp4', '.avi', '.mov', '.mkv')
    video_files = sorted(
        f for f in os.listdir(video_dir) if f.lower().endswith(video_exts)
    )

    failed = []
    for vf in tqdm(video_files):
        out_path = os.path.join(output_dir, os.path.splitext(vf)[0] + ".npy")
        if os.path.exists(out_path):
            continue

        video_path = os.path.join(video_dir, vf)
        try:
            video_tensor = sample_video_frames(video_path)
            if video_tensor is None:
                tqdm.write(f"[skip] {vf}: could not read any frames")
                failed.append(vf)
                continue

            feature = extract_video_feature(model, video_tensor, device)

            # Write to a temporary file and rename it into place, so an
            # interrupted run never leaves a truncated .npy that the
            # "already exists" check above would skip forever.
            tmp_path = out_path + ".tmp"
            with open(tmp_path, "wb") as fh:
                np.save(fh, feature)
            os.replace(tmp_path, out_path)
        except Exception as exc:
            # Best effort per video: report the failure and keep going.
            tqdm.write(f"[fail] {vf}: {exc!r}")
            failed.append(vf)
            if device.type == 'cuda':
                # Release cached GPU memory, e.g. after an out-of-memory error.
                torch.cuda.empty_cache()

    if failed:
        tqdm.write(f"{len(failed)} video(s) were not processed: {failed}")

# Start the extraction only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()