In [None]:
#!/usr/bin/env python3
import sys
sys.path.append(r"D:\timesformer")  # timesformer 모듈 경로 추가

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import InterpolationMode, functional as F
from pathlib import Path
import numpy as np
import random
from decord import VideoReader
from tqdm import tqdm
from timesformer.models.vit import TimeSformer

# ----------------- Hyperparameters ----------------------------
# Dataset root; SwingDataset searches <label>/crop_video/*.mp4 beneath it.
ROOT = Path(r"D:\golfDataset\dataset\train")
# Frames sampled per clip, and number of clips cut from each video.
NUM_FRAMES, CLIPS_PER_VID = 32, 5
# Side length of the center crop applied to every frame.
IMG_SIZE = 224
BATCH_SIZE = 1
# Fraction of the samples held out as the test split.
TEST_RATIO = 0.1
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ----------------- Reproducibility ----------------------------
SEED = 42
# Seed the torch, stdlib and NumPy generators, in that order.
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)
torch.backends.cudnn.deterministic = True

# ----------------- Preprocessing helpers ----------------------------
def preprocess_tensor(img_tensor):
    """Resize, center-crop and normalize one image tensor.

    The frame is resized with size=256 using bicubic interpolation, center
    cropped to IMG_SIZE, and then normalized with a mean of 0.45 and a
    standard deviation of 0.225 on each of the three channels.

    Args:
        img_tensor: Image tensor; load_clip in this file passes one float
            frame in channel-first layout scaled to [0, 1].

    Returns:
        The preprocessed image tensor.
    """
    channel_mean = [0.45] * 3
    channel_std = [0.225] * 3
    resized = F.resize(img_tensor, 256, interpolation=InterpolationMode.BICUBIC)
    cropped = F.center_crop(resized, IMG_SIZE)
    return F.normalize(cropped, channel_mean, channel_std)


def uniform_sample(length, num):
    """Pick `num` indices out of range(length).

    When there are at least `num` positions, the indices are evenly spaced
    from 0 to length - 1 (fractional positions are truncated to int).
    Otherwise every index 0..length-1 is used once and the last index is
    repeated until `num` entries exist.

    Args:
        length: Number of available positions.
        num: Number of indices to return.

    Returns:
        1-D integer NumPy array with `num` entries.
    """
    shortfall = num - length
    if shortfall > 0:
        # Too few positions: repeat the final index to fill the remainder.
        available = np.arange(length)
        return np.pad(available, (0, shortfall), mode='edge')
    positions = np.linspace(0, length - 1, num)
    return positions.astype(int)


def load_clip(path: Path):
    """Cut a video into CLIPS_PER_VID consecutive segments and sample one clip from each.

    The frame range [0, len(video)) is divided into CLIPS_PER_VID contiguous
    segments. From every segment, NUM_FRAMES frame indices are chosen with
    uniform_sample, decoded, scaled to [0, 1] and passed frame by frame
    through preprocess_tensor.

    Args:
        path: Location of the video file.

    Returns:
        A list with CLIPS_PER_VID float tensors, each laid out as
        (3, T, H, W) with T == NUM_FRAMES.

    Raises:
        ValueError: If the video is too short for every segment to contain
            at least one frame. Without this check the failure surfaces
            later inside np.pad with a message that does not name the file.
    """
    vr = VideoReader(str(path))
    total_frames = len(vr)
    seg_edges = np.linspace(0, total_frames, CLIPS_PER_VID + 1, dtype=int)

    # Validate up front so no frames are decoded for a video that cannot be used.
    if np.any(np.diff(seg_edges) <= 0):
        raise ValueError(
            f"{path}: {total_frames} frame(s) cannot be split into "
            f"{CLIPS_PER_VID} non-empty segments"
        )

    clips = []
    for start, end in zip(seg_edges[:-1], seg_edges[1:]):
        # Indices are relative to the segment, so shift them by its start.
        idx = uniform_sample(end - start, NUM_FRAMES) + start
        arr = vr.get_batch(idx).asnumpy().astype(np.uint8)         # (T, H, W, 3)
        clip = torch.from_numpy(arr).permute(0, 3, 1, 2).float() / 255.0
        clip = torch.stack([preprocess_tensor(f) for f in clip])   # (T, 3, H, W)
        clips.append(clip.permute(1, 0, 2, 3))                     # (3, T, H, W)
    return clips

class SwingDataset(Dataset):
    """Dataset of swing videos labelled by the folder they are stored in.

    Videos are collected from ``<root>/<label_dir>/crop_video/*.mp4`` where
    ``balanced_true`` maps to label 1 and ``false`` maps to label 0.
    """

    def __init__(self, root: Path):
        """Collect (video path, label) pairs below `root`.

        Paths inside each label folder are sorted: Path.glob yields entries
        in a filesystem-dependent order, and an unstable sample order would
        make a seeded random_split select different videos between runs.

        Args:
            root: Directory containing the label sub-folders.
        """
        mapping = {"balanced_true": 1, "false": 0}
        self.samples = [
            (video_path, label)
            for sub, label in mapping.items()
            for video_path in sorted((root / sub / "crop_video").glob("*.mp4"))
        ]
        print(f"✅ {len(self.samples)} samples found in {root}")

    def __len__(self) -> int:
        """Return the number of collected videos."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (stacked clips of the video at `idx`, its integer label)."""
        path, label = self.samples[idx]
        # load_clip returns a list of per-segment clips; stack them on a new leading axis.
        return torch.stack(load_clip(path)), label

# ----------------- Data loaders ----------------------------
ds_full = SwingDataset(ROOT)

# Hold out TEST_RATIO of the samples (rounded down); the remainder is the train split.
n_test = int(len(ds_full) * TEST_RATIO)
n_train = len(ds_full) - n_test
train_ds, test_ds = random_split(ds_full, [n_train, n_test])

# Both loaders share the same settings: fixed order, loading in the main process.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)
train_ld = DataLoader(train_ds, **_loader_kwargs)
test_ld = DataLoader(test_ds, **_loader_kwargs)

# ----------------- Load the model and replace its heads ----------------------------
PRETRAIN_PYTH = Path(r"D:\timesformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth")
model = TimeSformer(
    img_size=IMG_SIZE,
    num_frames=NUM_FRAMES,
    num_classes=2,
    attention_type='divided_space_time',
    pretrained_model=str(PRETRAIN_PYTH),
)
model = model.to(device)

# Replace the head(s) with Identity, first on the outer wrapper and then on
# the inner ViT (model.model).
for _net in (model, model.model):
    _net.head = nn.Identity()
    if hasattr(_net, 'cls_head'):
        _net.cls_head = nn.Identity()

model.eval()

# ----------------- Embedding extraction ----------------------------
def _extract_embeddings(loader, desc):
    """Compute one averaged embedding per video yielded by `loader`.

    Uses the module-level `model` and `device`. Each batch is expected to
    hold a single video (leading batch axis of size 1), which is squeezed
    away before the clips are passed to the inner ViT.

    Args:
        loader: DataLoader yielding (clips, label) pairs.
        desc: Text shown on the tqdm progress bar.

    Returns:
        (embeddings, labels): a list of 1-D NumPy arrays and a list of
        Python numbers, in loader order.
    """
    embeddings, labels = [], []
    with torch.no_grad():
        for clips, label in tqdm(loader, desc=desc, ncols=80):
            clips = clips.squeeze(0).to(device)           # (CLIPS_PER_VID, 3, T, H, W)
            # Feature extraction on the inner ViT.
            feats = model.model.forward_features(clips)
            # A 3-D result is treated as a token sequence and only token 0 is
            # kept; a 2-D result is used as-is.
            cls_embs = feats[:, 0, :] if feats.ndim == 3 else feats
            # Average over the clips of the video.
            embeddings.append(cls_embs.mean(dim=0).cpu().numpy())
            labels.append(label.item())
    return embeddings, labels


all_train_embs, all_train_labels = _extract_embeddings(train_ld, "Train Embeddings")
all_test_embs, all_test_labels = _extract_embeddings(test_ld, "Test Embeddings")

# ----------------- Save ----------------------------
# np.save does not create missing directories, so make sure the target exists
# first; otherwise the extracted embeddings would be lost to a
# FileNotFoundError. The directory name is kept exactly as before.
out_dir = Path("embbeding_data") / "timesformer"
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir / "train_embeddings.npy", np.stack(all_train_embs))
np.save(out_dir / "train_labels.npy", np.array(all_train_labels))
np.save(out_dir / "test_embeddings.npy", np.stack(all_test_embs))
np.save(out_dir / "test_labels.npy", np.array(all_test_labels))

print("✅ Embeddings and labels saved.")


✅ 436 samples found in D:\golfDataset\dataset\train


Train Embeddings: 100%|███████████████████████| 393/393 [10:55<00:00,  1.67s/it]
Test Embeddings: 100%|██████████████████████████| 43/43 [01:13<00:00,  1.70s/it]

✅ Embeddings and labels saved.





## 1개 데이터 테스트

In [9]:
import sys
sys.path.append(r"D:\timesformer")

import torch
import torch.nn as nn
from torch.utils.data import random_split
from pathlib import Path
import numpy as np
from decord import VideoReader
from timesformer.models.vit import TimeSformer

# (omitted) The SwingDataset, load_clip, preprocess_tensor, ... definitions
# from the earlier cell are reused unchanged.

# Hyperparameters
ROOT = Path(r"D:\golfDataset\dataset\train")
# Frames per clip, clips per video, and crop side length.
NUM_FRAMES, CLIPS_PER_VID, IMG_SIZE = 32, 5, 224
# Fraction of samples assigned to the test split.
TEST_RATIO = 0.1
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 1) Prepare the dataset
ds_full = SwingDataset(ROOT)
total_samples = len(ds_full)
n_test = int(total_samples * TEST_RATIO)
n_train = total_samples - n_test
# Only the training part of the split is used in this cell.
split_parts = random_split(ds_full, [n_train, n_test])
train_ds = split_parts[0]
_ = split_parts[1]

# 2) Load the model and replace its heads
PRE_PTH = Path(r"D:\timesformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth")
model = TimeSformer(
    img_size=IMG_SIZE, num_frames=NUM_FRAMES,
    num_classes=2, attention_type='divided_space_time',
    pretrained_model=str(PRE_PTH),
)
model = model.to(device)

# Heads are replaced on the outer wrapper first (harmless even if left in
# place) and then -- the important part -- on the inner ViT as well.
for _net in (model, model.model):
    _net.head = nn.Identity()
    if hasattr(_net, 'cls_head'):
        _net.cls_head = nn.Identity()

model.eval()

# 3) Take only the first sample
clips, label = train_ds[0]            # clips: (CLIPS_PER_VID, 3, T, H, W), label: int
# The sample is read directly from the dataset, so it has no batch axis.
# It must not be squeezed: squeeze(0) would remove the clip axis whenever
# CLIPS_PER_VID == 1. A single transfer to the device is enough.
clips = clips.to(device)

# 4) Compute the embedding
with torch.no_grad():
    # 4-1) Extract features with the inner ViT
    feats = model.model.forward_features(clips)

    # 4-2) A 3-D result is treated as a token sequence and only token 0 is
    #      kept; a 2-D result is used as-is
    cls_embs = feats[:, 0, :] if feats.ndim == 3 else feats

    # 4-3) Average over clips -> (D,)
    emb = cls_embs.mean(dim=0).cpu().numpy()

print(f"[CLS] 임베딩 차원 D: {emb.shape[0]}")


# Revised reporting
print(f"클립 개수 (CLIPS_PER_VID): {cls_embs.shape[0]}")
print(f"[CLS] 토큰 임베딩 차원 D: {cls_embs.shape[1]}")
print(f"최종 평균 임베딩 shape: {emb.shape}")
print(f"레이블: {label}")


✅ 436 samples found in D:\golfDataset\dataset\train
[CLS] 임베딩 차원 D: 768
클립 개수 (CLIPS_PER_VID): 5
[CLS] 토큰 임베딩 차원 D: 768
최종 평균 임베딩 shape: (768,)
레이블: 0
