### train·test 폴더를 합쳐 섞은 뒤 train/valid/test로 다시 분할하는 코드  
원래 test 폴더의 샘플이 train에도 섞여 들어가므로 test 결과로 일반화 성능을 검증하기는 어렵다. 대신 측정되는 성능 수치는 잘 나온다.

ST-GCN++ 체크포인트 경로 (아래 코드의 `MODEL`에는 첫 번째 경로가 설정되어 있음):  
'D:\\mmaction2\\checkpoints\\stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth'  
"D:\mmaction2\work_dirs\my_stgcnpp\best_acc_top1_epoch_5.pth"

In [1]:
#!/usr/bin/env python3
"""
unified_split_and_run.py

1) Collect every CSV ID found under
   D:/golfDataset/dataset/{train,test}/<class>/crop_keypoint.
2) Split the IDs into train/valid/test with a stratified (per-label) ratio,
   keeping only IDs that have a per-video JSON under
   embedding_data/timesformer/per_video.
3) Save each split's ID list as a txt file and build the skeleton PKLs.
4) Extract the TimeSformer and ST-GCN embeddings (including alignment).
"""

import random
import subprocess
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
import os
from collections import defaultdict
import sys

# ───────────────────────────────────────────────────────────────
# Configuration
# Dataset root; the train/ and test/ sub-folders are both scanned by main().
TRUE_ROOT   = Path(r"D:\golfDataset\dataset")
TRAIN_ROOT  = TRUE_ROOT / 'train'
TEST_ROOT   = TRUE_ROOT / 'test'
# Working directory: helper scripts and embedding outputs are resolved against it.
CUR_DIR     = Path(os.getcwd()).resolve()
# Directory that receives train_ids.txt / valid_ids.txt / test_ids.txt.
IDS_DIR     = CUR_DIR / 'ids_txt'
# Conda environment names handed to `conda run -n`.
TS_ENV      = 'timesformer'
STGCN_ENV   = 'mmaction'
# Checkpoint passed as --ckpt to the ST-GCN extraction script.
MODEL       = 'D:\\mmaction2\\checkpoints\\stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth'
SPLIT_RATIO = (0.8, 0.1, 0.1)  # train/valid/test
# Seed for random.seed() so the shuffle-based split can be repeated.
SEED        = 125125214


# Source-joint index (in the 25-joint array) for each of the 17 output joints:
# output joint i is taken from source joint MAPPING_BODY25_TO_COCO17[i].
# NOTE(review): presumably OpenPose BODY_25 indices listed in COCO-17 order — confirm.
MAPPING_BODY25_TO_COCO17 = [
    0,16,15,18,17,
    5,2,6,3,7,
    4,12,9,13,10,
    14,11
]

def run(cmd, env):
    """Run a command inside a conda environment and echo its output.

    The command is executed as ``conda run -n <env> <cmd...>``. Output is
    captured, so it is printed only after the child process has finished.

    Args:
        cmd: Iterable of command-line arguments (program first).
        env: Name of the conda environment to run the command in.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stdout/stderr are attached to the exception.
    """
    full = ['conda', 'run', '-n', env, *cmd]
    # str() keeps the echo working when an argument is a Path rather than a str.
    print("[RUN]", " ".join(str(part) for part in full))
    # errors='replace' prevents one undecodable byte in the child's output
    # (locale/UTF-8 mismatch) from aborting the whole pipeline with a
    # UnicodeDecodeError while the pipes are read.
    proc = subprocess.run(full, capture_output=True, text=True, errors='replace')
    if proc.stdout:
        print(proc.stdout)
    if proc.stderr:
        print(proc.stderr, file=sys.stderr)
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(
            proc.returncode, proc.args, output=proc.stdout, stderr=proc.stderr)

def load_and_process(csv_path: Path, img_shape=(1080,1920), threshold=0.1, norm='0to1'):
    """Load one keypoint CSV and build a 17-joint skeleton record.

    Every CSV row is read as one frame of 25 joints stored as consecutive
    ``(x, y, score)`` triplets. A joint is treated as missing — coordinates
    and score both set to 0 — when its score is below ``threshold`` or when
    its score or coordinates are NaN/Inf, so no non-finite value reaches the
    returned arrays.

    Args:
        csv_path: Path of the CSV file to read.
        img_shape: ``(height, width)`` used for normalisation and stored in
            the record.
        threshold: Minimum score for a joint to be kept.
        norm: ``'0to1'`` divides x by the width and y by the height; any
            other value leaves the coordinates unscaled.

    Returns:
        dict with ``total_frames``, ``img_shape``, ``original_shape``,
        ``keypoint`` of shape ``(1, T, 17, 2)`` and ``keypoint_score`` of
        shape ``(1, T, 17)``, both float32.

    Raises:
        ValueError: If the rows cannot be reshaped to 25 x 3 values.
    """
    df = pd.read_csv(csv_path)
    n_frames = df.shape[0]
    n_joints = 25

    # One vectorised reshape instead of a Python loop over the rows.
    triplets = df.to_numpy().reshape(n_frames, n_joints, 3).astype(np.float32)
    kp25 = triplets[np.newaxis, ..., :2].copy()
    score25 = triplets[np.newaxis, ..., 2].copy()

    # NaN compares False against the threshold, so non-finite entries must be
    # flagged explicitly or they would slip through the confidence mask.
    bad_xy = ~np.isfinite(kp25).all(axis=-1)
    score25[~np.isfinite(score25)] = 0
    mask = (score25 < threshold) | bad_xy
    kp25[mask] = 0
    score25[mask] = 0

    h, w = img_shape
    if norm == '0to1':
        kp25[..., 0] /= w
        kp25[..., 1] /= h

    kp17 = kp25[:, :, MAPPING_BODY25_TO_COCO17, :]
    score17 = score25[:, :, MAPPING_BODY25_TO_COCO17]
    return {
        'total_frames': n_frames,
        'img_shape': img_shape,
        'original_shape': img_shape,
        'keypoint': kp17,
        'keypoint_score': score17
    }

def make_pkl(id_list, out_path: Path):
    """Build a skeleton annotation pickle for the given video IDs.

    For every ID the keypoint CSV is looked up under TRAIN_ROOT first, then
    TEST_ROOT; inside each root the ``balanced_true`` folder (label 1) is
    probed before the ``false`` folder (label 0). IDs with no CSV, or whose
    CSV fails to load, are reported and skipped.

    Besides the pickle, ``<stem>_ids.npy`` is written next to it with the IDs
    that were actually stored, in annotation order.

    Args:
        id_list: Iterable of video IDs (CSV file stems).
        out_path: Destination of the pickle file.

    Returns:
        Number of annotations written.
    """
    # (folder, label) pairs in the order they are probed inside each root
    categories = (('balanced_true', 1), ('false', 0))
    records, kept = [], []

    for vid in id_list:
        hit = None
        for root in (TRAIN_ROOT, TEST_ROOT):
            for folder, lbl in categories:
                candidate = root / folder / 'crop_keypoint' / f"{vid}.csv"
                if candidate.exists():
                    hit = (candidate, lbl)
                    break
            if hit is not None:
                break

        if hit is None:
            print(f"[⚠️] {vid}.csv not found in both folders.")
            continue

        csv_file, lbl = hit
        try:
            entry = load_and_process(csv_file)
            entry.update(
                frame_dir=vid,
                label=lbl,
                img_shape=entry['img_shape'],
                original_shape=entry['original_shape'],
                metainfo={'frame_dir': vid, 'img_shape': entry['img_shape']},
            )
            records.append(entry)
            kept.append(vid)
        except Exception as err:
            # best effort: one unreadable CSV must not abort the whole split
            print(f"[❌] Failed to load {vid}: {err}")

    payload = {'annotations': records, 'split': {'xsub_val': kept}}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('wb') as fh:
        pickle.dump(payload, fh, protocol=4)
    ids_file = out_path.with_name(f"{out_path.stem}_ids.npy")
    np.save(ids_file, np.array(kept))
    return len(records)



def main():
    """Pool train/test IDs, split them per label, and run both extractors.

    Steps:
      1. Gather the IDs that have a per-video JSON in the TimeSformer
         ``per_video`` directory.
      2. Scan ``crop_keypoint`` under both dataset roots and keep the IDs that
         are in that set and have an mp4 in ``crop_video`` or ``video``.
      3. Shuffle and split each label by SPLIT_RATIO, then shuffle the splits.
      4. Write the ID txt files, build the three PKLs, and launch the
         TimeSformer assembly and ST-GCN extraction scripts.

    Raises:
        subprocess.CalledProcessError: If one of the launched scripts fails.
    """
    random.seed(SEED)

    per_video_dir = CUR_DIR / 'embedding_data' / 'timesformer' / 'per_video'
    ts_ids = {json_file.stem for json_file in per_video_dir.glob("*.json")}

    ids_by_label = defaultdict(list)
    # An ID found in more than one root/category is counted once, so the same
    # sample cannot land in two different splits.
    seen = set()
    for split_root in (TRAIN_ROOT, TEST_ROOT):
        for cat, label in (('balanced_true', 1), ('false', 0)):
            kp_dir = split_root / cat / 'crop_keypoint'
            if not kp_dir.exists():
                continue
            video_dirs = [split_root / cat / 'crop_video', split_root / cat / 'video']
            for csv_path in kp_dir.glob("*.csv"):
                vid = csv_path.stem
                if vid in seen or vid not in ts_ids:
                    continue
                if any((vd / f"{vid}.mp4").exists() for vd in video_dirs):
                    seen.add(vid)
                    ids_by_label[label].append(vid)

    # Stratified split
    train_ids, valid_ids, test_ids = [], [], []
    for label in (1, 0):
        # glob order is filesystem-dependent; sorting first makes the seeded
        # shuffle (and therefore the split) reproducible.
        ids = sorted(ids_by_label.get(label, []))
        random.shuffle(ids)
        n = len(ids)
        n_train = int(n * SPLIT_RATIO[0])
        n_valid = int(n * SPLIT_RATIO[1])
        train_ids += ids[:n_train]
        valid_ids += ids[n_train:n_train + n_valid]
        test_ids  += ids[n_train + n_valid:]

    splits = {'train': train_ids, 'valid': valid_ids, 'test': test_ids}
    for ids in splits.values():
        random.shuffle(ids)

    IDS_DIR.mkdir(parents=True, exist_ok=True)
    for name, ids in splits.items():
        (IDS_DIR / f'{name}_ids.txt').write_text('\n'.join(ids))

    base_pkl = TRUE_ROOT / 'crop_pkl'
    for name, ids in splits.items():
        make_pkl(ids, base_pkl / f'skeleton_dataset_{name}.pkl')

    # TimeSformer: assemble the per-video embeddings of each split
    for name in splits:
        run(['python', '-u', str(CUR_DIR/'assemble_timesformer_embeddings.py'),
             '--per-video-dir', str(per_video_dir),
             '--id-list', str(IDS_DIR / f'{name}_ids.txt'),
             '--out-dir', str(CUR_DIR / f'embedding_data/timesformer/{name}')], TS_ENV)

    # ST-GCN: one call handles all three PKLs
    run(['python', '-u', 'D:\\Jabez\\golf\\fusion\\extract_embedding_stgcn.py',
         '--cfg',      'D:\\mmaction2\\configs\\skeleton\\stgcnpp\\my_stgcnpp.py',
         '--ckpt',     MODEL,
         '--device',   'cuda:0',
         '--out-dir',  str(CUR_DIR/'embedding_data/stgcnpp'),
         '--train-pkl', str(base_pkl / 'skeleton_dataset_train.pkl'),
         '--valid-pkl', str(base_pkl / 'skeleton_dataset_valid.pkl'),
         '--test-pkl',  str(base_pkl / 'skeleton_dataset_test.pkl'),
         '--num-workers', '0'], STGCN_ENV)

    print("\n✅ 모든 작업 완료!")

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()


[RUN] conda run -n timesformer python -u D:\Jabez\golf\fusion\assemble_timesformer_embeddings.py --per-video-dir D:\Jabez\golf\fusion\embedding_data\timesformer\per_video --id-list D:\Jabez\golf\fusion\ids_txt\train_ids.txt --out-dir D:\Jabez\golf\fusion\embedding_data\timesformer\train
Saved: (1026, 768), (1026,), (1026,)


[RUN] conda run -n timesformer python -u D:\Jabez\golf\fusion\assemble_timesformer_embeddings.py --per-video-dir D:\Jabez\golf\fusion\embedding_data\timesformer\per_video --id-list D:\Jabez\golf\fusion\ids_txt\valid_ids.txt --out-dir D:\Jabez\golf\fusion\embedding_data\timesformer\valid
Saved: (127, 768), (127,), (127,)


[RUN] conda run -n timesformer python -u D:\Jabez\golf\fusion\assemble_timesformer_embeddings.py --per-video-dir D:\Jabez\golf\fusion\embedding_data\timesformer\per_video --id-list D:\Jabez\golf\fusion\ids_txt\test_ids.txt --out-dir D:\Jabez\golf\fusion\embedding_data\timesformer\test
Saved: (130, 768), (130,), (130,)


[RUN] conda run -n mmaction

# train 폴더와 test 폴더를 섞지 않고 원래대로 분리해 사용하는 코드 (valid는 train 폴더 안에서만 분할)

In [4]:
# 📦 Script that builds fully separated train/valid/test datasets and extracts the embeddings
import random
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
import os
import subprocess
import sys

# ─────────────────────────────────────────────
# User environment settings
# Dataset root holding the train/ and test/ folders.
ROOT         = Path(r"D:\golfDataset\dataset")
# Working directory: helper scripts and embedding outputs are resolved against it.
CUR_DIR      = Path(os.getcwd()).resolve()
# Directory that receives train_ids.txt / valid_ids.txt / test_ids.txt.
IDS_DIR      = CUR_DIR / 'ids_txt'
# Conda environment names handed to `conda run -n`.
TS_ENV       = 'timesformer'
STGCN_ENV    = 'mmaction'
# Checkpoint passed as --ckpt to the ST-GCN extraction script.
MODEL        = 'D:\\mmaction2\\checkpoints\\stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth'
SPLIT_RATIO  = (0.9, 0.1)    # train/valid (split only within the train folder)
# Seed for random.seed() so the shuffle-based split can be repeated.
SEED         = 124124
TRAIN_ROOT   = ROOT / "train"
TEST_ROOT    = ROOT / "test"

# Source-joint index (in the 25-joint array) for each of the 17 output joints:
# output joint i is taken from source joint MAPPING_BODY25_TO_COCO17[i].
# NOTE(review): presumably OpenPose BODY_25 indices listed in COCO-17 order — confirm.
MAPPING_BODY25_TO_COCO17 = [
    0,16,15,18,17,
    5,2,6,3,7,
    4,12,9,13,10,
    14,11
]

def run(cmd, env):
    """Run a command inside a conda environment and echo its output.

    The command is executed as ``conda run -n <env> <cmd...>``. Output is
    captured, so it is printed only after the child process has finished.

    Args:
        cmd: Iterable of command-line arguments (program first).
        env: Name of the conda environment to run the command in.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stdout/stderr are attached to the exception.
    """
    full = ['conda', 'run', '-n', env, *cmd]
    # str() keeps the echo working when an argument is a Path rather than a str.
    print("[RUN]", " ".join(str(part) for part in full))
    # errors='replace' prevents one undecodable byte in the child's output
    # (locale/UTF-8 mismatch) from aborting the whole pipeline with a
    # UnicodeDecodeError while the pipes are read.
    proc = subprocess.run(full, capture_output=True, text=True, errors='replace')
    if proc.stdout:
        print(proc.stdout)
    if proc.stderr:
        print(proc.stderr, file=sys.stderr)
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(
            proc.returncode, proc.args, output=proc.stdout, stderr=proc.stderr)

def load_and_process(csv_path: Path, img_shape=(1080,1920), threshold=0.1, norm='0to1'):
    """Read a keypoint CSV and return it as a 17-joint skeleton record.

    Each row is one frame: 25 joints laid out as consecutive (x, y, score)
    triplets. Joints scoring below ``threshold`` get coordinates and score
    set to 0. With ``norm == '0to1'`` x is divided by the image width and y
    by the image height. NaN values are not altered here.

    Args:
        csv_path: CSV file to read.
        img_shape: ``(height, width)`` used for scaling and stored in the record.
        threshold: Minimum score for a joint to be kept.
        norm: Normalisation mode; only ``'0to1'`` has an effect.

    Returns:
        dict with ``total_frames``, ``img_shape``, ``original_shape``,
        ``keypoint`` (1, T, 17, 2) and ``keypoint_score`` (1, T, 17).
    """
    frame_table = pd.read_csv(csv_path)
    num_frames = frame_table.shape[0]
    num_joints = 25

    # whole table at once: (T, 75) -> (T, 25, 3), then add the person axis
    triplets = frame_table.values.reshape(num_frames, num_joints, 3)
    xy = triplets[..., :2].astype(np.float32)[np.newaxis]
    conf = triplets[..., 2].astype(np.float32)[np.newaxis]

    low_conf = conf < threshold
    xy[low_conf] = 0
    conf[low_conf] = 0

    height, width = img_shape
    if norm == '0to1':
        xy[..., 0] /= width
        xy[..., 1] /= height

    record = {
        'total_frames': num_frames,
        'img_shape': img_shape,
        'original_shape': img_shape,
    }
    record['keypoint'] = xy[:, :, MAPPING_BODY25_TO_COCO17, :]
    record['keypoint_score'] = conf[:, :, MAPPING_BODY25_TO_COCO17]
    return record

def fix_nan_inf(kp):
    """Return a copy of ``kp`` with NaN/Inf entries filled in.

    ``kp`` is indexed as (person, frame, joint, coord). For the first person,
    every (joint, coord) track is repaired along the frame axis: non-finite
    samples are linearly interpolated from the finite ones (edge values are
    held at the ends), or set to 0 when the track has no finite sample.
    Anything still NaN afterwards is replaced by 0. The input is not modified.
    """
    cleaned = np.where(np.isfinite(kp), kp, np.nan)  # Inf -> NaN, new array
    frame_idx = np.arange(cleaned.shape[1])

    for joint, axis in np.ndindex(cleaned.shape[2], 2):
        track = cleaned[0, :, joint, axis]  # view: writes land in `cleaned`
        missing = np.isnan(track)
        if not missing.any():
            continue
        present = ~missing
        if present.any():
            track[missing] = np.interp(frame_idx[missing], frame_idx[present], track[present])
        else:
            track[missing] = 0

    # whatever is left (e.g. other persons) falls back to 0
    return np.nan_to_num(cleaned)

def make_pkl(id_list, out_path: Path, label_map, removed_ids=None):
    """Build a skeleton annotation pickle for the given video IDs.

    Each ID is loaded from the CSV recorded in ``label_map``. Non-finite
    keypoints are repaired with ``fix_nan_inf`` and non-finite scores are set
    to 0. A sample is dropped when its keypoints are all zero or when loading
    fails; every dropped ID is appended to ``removed_ids`` so the caller can
    keep its ID lists in step with the pickle.

    Besides the pickle, ``<stem>_ids.npy`` is written next to it with the IDs
    that were actually stored, in annotation order.

    Args:
        id_list: Iterable of video IDs to include.
        out_path: Destination of the pickle file.
        label_map: Maps each ID to ``{'csv': <path>, 'lbl': <label>}``.
        removed_ids: Optional list that receives the IDs left out.

    Returns:
        Number of annotations written.

    Raises:
        KeyError: If an ID is missing from ``label_map``.
    """
    annotations = []
    kept_ids = []
    for vid in id_list:
        entry = label_map[vid]
        csv_path = entry['csv']
        try:
            info = load_and_process(csv_path)
            kp = info['keypoint']
            # Repair NaN/Inf coordinates
            if not np.isfinite(kp).all():
                print(f"[⚠️] NaN/Inf in keypoint: {vid} → 보간/보강 처리")
                kp = fix_nan_inf(kp)
                info['keypoint'] = kp
            # Scores are stored alongside the keypoints, so they must be
            # finite too; an unknown confidence is recorded as 0.
            score = info['keypoint_score']
            if not np.isfinite(score).all():
                info['keypoint_score'] = np.nan_to_num(
                    score, nan=0.0, posinf=0.0, neginf=0.0)
            # Drop samples whose keypoints are all zero
            if (kp == 0).all():
                print(f"[❌] All-zero keypoint: {vid} → 제외")
                if removed_ids is not None:
                    removed_ids.append(vid)
                continue
            info.update({
                'frame_dir': vid,
                'label': entry['lbl'],
                'img_shape': info['img_shape'],
                'original_shape': info['original_shape'],
                'metainfo': {'frame_dir': vid, 'img_shape': info['img_shape']}
            })
            annotations.append(info)
            kept_ids.append(vid)
        except Exception as e:
            # Best effort: one unreadable CSV must not abort the whole split,
            # but it is reported as removed — otherwise the ID lists would
            # still name a sample that is missing from the pickle.
            print(f"[❌] Failed to load {vid}: {e}")
            if removed_ids is not None:
                removed_ids.append(vid)

    data = {'annotations': annotations, 'split': {'xsub_val': kept_ids}}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump(data, f, protocol=4)
    np.save(out_path.with_name(out_path.stem + '_ids.npy'), np.array(kept_ids))
    return len(annotations)

def collect_ids(kp_dir, label, video_dirnames=("crop_video",)):
    """Map every usable video ID in a keypoint folder to its CSV and label.

    A CSV is usable when ``<id>.mp4`` exists in at least one of the sibling
    folders named in ``video_dirnames`` (siblings of ``kp_dir``). CSVs are
    visited in sorted order so the returned dict has a reproducible key order
    regardless of how the filesystem lists the directory.

    Args:
        kp_dir: Folder containing the ``*.csv`` keypoint files.
        label: Label stored with every ID found in this folder.
        video_dirnames: Names of the sibling folders searched for the mp4.

    Returns:
        dict mapping ID (CSV stem) to ``{'csv': <Path>, 'lbl': label}``.
    """
    kp_dir = Path(kp_dir)
    video_dirs = [kp_dir.parent / name for name in video_dirnames]
    ids = {}
    for csv_path in sorted(kp_dir.glob("*.csv")):
        vid = csv_path.stem
        if any((vd / f"{vid}.mp4").exists() for vd in video_dirs):
            ids[vid] = {'csv': csv_path, 'lbl': label}
    return ids

def main():
    """Build train/valid from the train folder, test from the test folder, and run both extractors.

    Steps:
      1. Collect the IDs under TRAIN_ROOT, shuffle them, and cut them into
         train/valid at SPLIT_RATIO[0].
      2. Collect the IDs under TEST_ROOT as the test split.
      3. Build the three PKLs, then write the ID txt files without the IDs
         that make_pkl reported as removed.
      4. Launch the TimeSformer assembly and ST-GCN extraction scripts.

    Raises:
        subprocess.CalledProcessError: If one of the launched scripts fails.
    """
    random.seed(SEED)
    categories = (('balanced_true', 1), ('false', 0))

    # -------------------------------
    # [1] Train/valid pool (train folder)
    # -------------------------------
    train_ids_map = dict()
    for cat, lbl in categories:
        train_ids_map.update(collect_ids(TRAIN_ROOT / cat / 'crop_keypoint', lbl))
    # Directory listing order is filesystem-dependent; sorting first makes
    # the seeded shuffle (and therefore the split) reproducible.
    all_train_ids = sorted(train_ids_map)
    random.shuffle(all_train_ids)
    n_train = int(len(all_train_ids) * SPLIT_RATIO[0])
    train_ids = all_train_ids[:n_train]
    valid_ids = all_train_ids[n_train:]

    # -------------------------------
    # [2] Test pool (test folder)
    # -------------------------------
    test_ids_map = dict()
    for cat, lbl in categories:
        test_ids_map.update(collect_ids(TEST_ROOT / cat / 'crop_keypoint', lbl))
    test_ids = sorted(test_ids_map)

    print(f"[INFO] #Train={len(train_ids)} #Valid={len(valid_ids)} #Test={len(test_ids)}")
    # An empty split usually means the folder layout does not match what
    # collect_ids expects; say so loudly instead of silently producing
    # empty PKLs and embeddings.
    for name, ids in (('train', train_ids), ('valid', valid_ids), ('test', test_ids)):
        if not ids:
            print(f"[WARN] {name} split is empty - check the keypoint/video folders.",
                  file=sys.stderr)

    base_pkl = (ROOT / 'crop_pkl')
    base_pkl.mkdir(exist_ok=True, parents=True)

    # (split name, ids, label map) in processing order
    jobs = (
        ('train', train_ids, train_ids_map),
        ('valid', valid_ids, train_ids_map),
        ('test',  test_ids,  test_ids_map),
    )

    IDS_DIR.mkdir(parents=True, exist_ok=True)
    for name, ids, id_map in jobs:
        removed = []
        make_pkl(ids, base_pkl / f'skeleton_dataset_{name}.pkl', id_map, removed)
        # Write the ID list only once, already without the IDs make_pkl removed,
        # so the txt file matches the samples stored in the PKL.
        dropped = set(removed)
        kept = [i for i in ids if i not in dropped]
        (IDS_DIR / f'{name}_ids.txt').write_text('\n'.join(kept))

    # TimeSformer: assemble the per-video embeddings of each split from the ID lists
    per_video_dir = CUR_DIR / 'embedding_data' / 'timesformer' / 'per_video'
    for name, _, _ in jobs:
        run(['python', '-u', str(CUR_DIR/'assemble_timesformer_embeddings.py'),
             '--per-video-dir', str(per_video_dir),
             '--id-list', str(IDS_DIR / f'{name}_ids.txt'),
             '--out-dir', str(CUR_DIR / f'embedding_data/timesformer/{name}')], TS_ENV)

    # --- ST-GCN: train/valid/test ---
    run(['python', '-u', 'D:\\Jabez\\golf\\fusion\\extract_embedding_stgcn.py',
         '--cfg',      'D:\\Jabez\\golf\\fusion\\mmaction_files\\my_stgcnpp.py',
         '--ckpt',     MODEL,
         '--device',   'cuda:0',
         '--out-dir',  str(CUR_DIR/'embedding_data/stgcnpp'),
         '--train-pkl', str(base_pkl / 'skeleton_dataset_train.pkl'),
         '--valid-pkl', str(base_pkl / 'skeleton_dataset_valid.pkl'),
         '--test-pkl',  str(base_pkl / 'skeleton_dataset_test.pkl'),
         '--num-workers', '0'], STGCN_ENV)

    print("\n✅ 모든 작업 완료!")

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

[INFO] #Train=247 #Valid=28 #Test=0
[⚠️] NaN/Inf in keypoint: 20201116_General_002_DOS_A_F40_MM_053_crop → 보간/보강 처리
[⚠️] NaN/Inf in keypoint: 20201116_General_003_DOS_A_F30_MM_060_crop → 보간/보강 처리
[RUN] conda run -n timesformer python -u D:\Jabez\golf\fusion\assemble_timesformer_embeddings.py --per-video-dir D:\Jabez\golf\fusion\embedding_data\timesformer\per_video --id-list D:\Jabez\golf\fusion\ids_txt\train_ids.txt --out-dir D:\Jabez\golf\fusion\embedding_data\timesformer\train


KeyboardInterrupt: 