# CLS 토큰 only, fine-tuned 모델 사용

In [None]:
from pathlib import Path
import torch
import torch.nn as nn
import numpy as np
from decord import VideoReader
from torchvision import transforms
from torchvision.transforms import InterpolationMode
import json
from tqdm import tqdm
import sys
import random

# === 하이퍼파라미터 및 경로 통일 ===
ROOT = Path(r'D:/golfDataset/dataset')
FUSION_DIR = Path(r'D:/Jabez/golf/fusion')
PER_VIDEO_DIR = FUSION_DIR / 'embedding_data' / 'timesformer' / 'per_video'
PER_VIDEO_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = Path(r'D:\Jabez\golf\Timesformer_finetune\best_timesformer.pth')
PRETRAINED = Path(r"D:\timesformer\pretrained\TimeSformer_divST_8x32_224_K600.pyth")
NUM_FRAMES = 16
CLIPS_PER_VID = 8
IMG_SIZE = 224
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

sys.path.append(r'D:/timesformer')
from timesformer.models.vit import TimeSformer

# === finetune 코드와 동일한 전처리 정의 ===
def eval_clip(frames):
    """Apply the deterministic evaluation preprocessing to every frame of a clip.

    Each frame is converted to a PIL image, resized with size 256 using
    bicubic interpolation, center-cropped to ``IMG_SIZE``, converted back to
    a tensor and normalized with mean 0.45 / std 0.225 on all three channels.

    Args:
        frames: Iterable of frames accepted by ``to_pil_image``
            (presumably a uint8 (T, C, H, W) tensor -- confirm with the caller).

    Returns:
        The processed frames stacked along a new leading dimension.
    """
    F = transforms.functional
    mean = [0.45] * 3
    std = [0.225] * 3

    def _prepare(frame):
        # PIL round-trip: resize -> center crop -> tensor -> normalize.
        pil = F.to_pil_image(frame)
        pil = F.resize(pil, 256, interpolation=InterpolationMode.BICUBIC)
        cropped = F.center_crop(pil, IMG_SIZE)
        return F.normalize(F.to_tensor(cropped), mean, std)

    return torch.stack([_prepare(frame) for frame in frames])

# === finetune 코드와 동일한 샘플링 함수 ===
def uniform_sample(L, N):
    """Return ``N`` frame offsets spread over a span of ``L`` frames.

    Args:
        L: Number of frames available in the span.
        N: Number of offsets to return.

    Returns:
        An integer array of length ``N`` with values in ``[0, max(L - 1, 0)]``:
        evenly spaced offsets when ``L >= N``, otherwise ``0..L-1`` followed by
        repetitions of the last offset. An empty span (``L <= 0``) yields all
        zeros.
    """
    if L <= 0:
        # np.pad(..., mode='edge') cannot extend an empty array and raises
        # "can't extend empty axis 0 using modes other than 'constant' or
        # 'empty'". An empty span has no frame of its own, so fall back to
        # offset 0; callers that add a segment start then land on that start.
        return np.zeros(N, dtype=int)
    if L >= N:
        return np.linspace(0, L - 1, N).astype(int)
    # Too few frames: take them all and repeat the last one to reach N.
    return np.pad(np.arange(L), (0, N - L), mode='edge')

# === 수정된 load_clip 함수 ===
def load_clip(path: Path, num_clips: int):
    """Cut a video into ``num_clips`` temporal segments and build one clip per segment.

    Args:
        path: Video file to decode.
        num_clips: Number of consecutive segments the frame range is split into.

    Returns:
        A list with one tensor per segment. Each tensor holds ``NUM_FRAMES``
        frames passed through ``eval_clip`` and permuted from (T, C, H, W) to
        (C, T, H, W). An empty list is returned when the video has no frames.
    """
    vr = VideoReader(str(path))
    total = len(vr)
    # Nothing to sample from: warn and let the caller skip this video.
    if total == 0:
        print(f"[경고] 프레임이 0인 비디오: {path}")
        return []

    seg_edges = np.linspace(0, total, num_clips + 1, dtype=int)
    clips = []
    for s0, s1 in zip(seg_edges[:-1], seg_edges[1:]):
        start = int(s0)
        # When the video has fewer frames than num_clips, truncating the edges
        # to integers produces zero-width segments (s0 == s1). Sampling from an
        # empty range made np.pad raise "can't extend empty axis 0 ...", so
        # treat such a segment as the single frame at its start.
        seg_len = max(int(s1) - start, 1)
        idx = uniform_sample(seg_len, NUM_FRAMES) + start
        # Safety bound so no index can point past the last decodable frame.
        idx = np.minimum(idx, total - 1)
        arr = vr.get_batch(idx).asnumpy().astype(np.uint8)  # (T,H,W,3)
        clip = torch.from_numpy(arr).permute(0, 3, 1, 2).contiguous()  # (T,C,H,W)
        processed_clip = eval_clip(clip)  # (T,C,H,W)
        clips.append(processed_clip.permute(1, 0, 2, 3))  # (C,T,H,W)
    return clips

# Collect every crop_video/*.mp4 under balanced_true/ and false/ for both the
# train and test splits, tagging each file with its label, category and split.
mapping = {'balanced_true': 1, 'false': 0}
all_mp4s = []

for split in ('train', 'test'):
    for cat, lbl in mapping.items():
        vd = ROOT / split / cat / 'crop_video'
        # Missing category folders are simply skipped.
        if vd.exists():
            all_mp4s.extend((mp4, lbl, cat, split) for mp4 in vd.glob('*.mp4'))

print(f'총 {len(all_mp4s)}개 mp4 처리')

# TimeSformerWithDropout 클래스 추가
class TimeSformerWithDropout(nn.Module):
    """Wrap a backbone so dropout is applied to its features before the head.

    The head is taken from ``base_model.head`` when present, otherwise from
    ``base_model.model.head``; it is registered on this wrapper as ``head``.
    """

    def __init__(self, base_model, dropout_p):
        super().__init__()
        self.base = base_model
        self.dropout = nn.Dropout(dropout_p)
        # Resolve the classifier: on the backbone itself, else on its inner model.
        head = getattr(base_model, "head", None)
        if head is None and hasattr(base_model, "model"):
            head = base_model.model.head
        self.head = head

    def _extract_features(self, x):
        """Return backbone features via the first available feature path."""
        backbone = self.base
        if hasattr(backbone, "forward_features"):
            return backbone.forward_features(x)
        inner = getattr(backbone, "model", None)
        if hasattr(backbone, "model") and hasattr(inner, "forward_features"):
            return inner.forward_features(x)
        # No dedicated feature method: fall back to the plain forward pass.
        return backbone(x)

    def forward(self, x):
        """Compute features, apply dropout, then run the head."""
        features = self._extract_features(x)
        return self.head(self.dropout(features))

class TimeSformerEmbed(nn.Module):
    """Fine-tuned TimeSformer with its classification head removed.

    The backbone is built with the given geometry, the fine-tuned weights are
    loaded through a ``TimeSformerWithDropout`` wrapper, and every head that
    can be reached is then replaced by ``nn.Identity`` so that ``forward``
    returns features instead of class logits.

    Args:
        model_path: Fine-tuned checkpoint; must contain a ``"model"`` state dict.
        img_size: Input frame size passed to ``TimeSformer``.
        num_frames: Frames per clip passed to ``TimeSformer``.
        num_classes: Number of classes the checkpoint's head was trained with.
        pretrained_path: Pretrained weights handed to the ``TimeSformer`` constructor.
    """

    def __init__(self, model_path, img_size, num_frames, num_classes, pretrained_path):
        super().__init__()
        self.base = TimeSformer(
            img_size=img_size,
            num_frames=num_frames,
            num_classes=num_classes,
            attention_type='divided_space_time',
            pretrained_model=str(pretrained_path)
        )

        # NOTE(review): this allow-list is only consulted by weights_only=True
        # loads; the torch.load call below uses weights_only=False, so it does
        # not influence that call -- confirm whether it is still needed.
        torch.serialization.add_safe_globals([
            np._core.multiarray.scalar,
            np.dtype,
            np.dtypes.Float64DType
        ])

        # The state dict is loaded through the dropout wrapper, presumably
        # because the checkpoint was saved from that wrapper during fine-tuning
        # and its keys only line up with the wrapper's module names.
        model_with_dropout = TimeSformerWithDropout(self.base, dropout_p=0.2)

        # SECURITY: weights_only=False unpickles arbitrary objects. Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)

        model_with_dropout.load_state_dict(ckpt["model"])

        self.base = model_with_dropout.base
        self.base.head = nn.Identity()
        self.base.cls_head = nn.Identity()
        # The wrapper above falls back to `base.model.head`, so the real
        # classifier may live on the inner model. Replacing only the outer
        # attributes would leave it in place and forward() would still emit
        # logits; strip the inner head as well.
        inner = getattr(self.base, "model", None)
        if isinstance(inner, nn.Module) and hasattr(inner, "head"):
            inner.head = nn.Identity()

    def forward(self, x):
        """Run the head-less backbone on ``x`` and return its output."""
        return self.base(x)

# Build the embedding model from the fine-tuned checkpoint, move it to the
# selected device and switch it to evaluation mode.
embed_kwargs = {
    'model_path': MODEL_PATH,
    'img_size': IMG_SIZE,
    'num_frames': NUM_FRAMES,
    'num_classes': 2,
    'pretrained_path': PRETRAINED,
}
embed_model = TimeSformerEmbed(**embed_kwargs)
embed_model = embed_model.to(DEVICE)
embed_model.eval()

# Extract one embedding per video (mean over its clips) and save it next to a
# JSON file describing where the video came from.
for mp4, lbl, cat, split in tqdm(all_mp4s, desc='Extracting', ncols=80):
    vid = mp4.stem
    out_path = PER_VIDEO_DIR / f'{vid}.npy'
    meta_path = PER_VIDEO_DIR / f'{vid}.json'

    clips = load_clip(mp4, num_clips=CLIPS_PER_VID)
    if not clips:
        # Empty video: nothing to embed, so no files are written for it.
        continue

    feats = []
    with torch.no_grad():
        for clip in clips:
            batch = clip.unsqueeze(0).to(DEVICE)
            tokens = embed_model.base.model.forward_features(batch)
            # A 3-D result is treated as a token sequence and only its first
            # token is kept; any other shape is used as returned.
            cls_vec = tokens if tokens.ndim != 3 else tokens[:, 0, :]
            feats.append(cls_vec.squeeze(0).cpu().numpy())

    emb = np.stack(feats, 0).mean(0)
    np.save(out_path, emb)

    meta = {
        'video_id': vid,
        'label': lbl,
        'category': cat,
        'split': split,
        'mp4_path': str(mp4),
    }
    meta_path.write_text(
        json.dumps(meta, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

총 2117개 mp4 처리


Extracting:  48%|█████████████              | 1023/2117 [28:31<30:30,  1.67s/it]


ValueError: can't extend empty axis 0 using modes other than 'constant' or 'empty'