## Train, valid 데이터 로드 & 임베딩 추출코드 호출(in conda env)

In [1]:
#!/usr/bin/env python3
"""
parent_split_and_run.py

1) D:\golfDataset\dataset\train 내 CSV 이름으로 ID 수집 → 80:20 split (계층적 샘플링 적용)
2) train_ids.txt / valid_ids.txt 생성
3) train/valid 각각 PKL(annotations + split) 생성
4) extract_embedding_timesformer.py, extract_embedding_stgcn.py 호출하여 임베딩 추출

반드시 mmaction 환경에서 실행해야함,
timesformer는 txt를 통해 데이터를 받기에 numpy, pandas, pickle 모듈에 상관없지만,
mmaction은 pkl 파일을 직접 읽어야 하므로 numpy._core ↔ numpy.core 호환 패치가 필요하다.
"""

import random
import subprocess
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
import os
from collections import defaultdict # 새로 추가
import sys

# ───────────────────────────────────────────────────────────────
# Configuration
# Dataset root that is scanned for <category>/crop_keypoint/*.csv files.
ROOT       = Path(r"D:\golfDataset\dataset\train")
CUR_DIR    = Path(os.getcwd()).resolve()
# .resolve() turns the current working directory into an absolute path
IDS_DIR    = CUR_DIR / 'ids_txt'    # directory where the ID list files are written
TEST_RATIO = 0.2 # fraction of each label held out: 80% train, 20% held out
SEED       = 42  # seed passed to random.seed() before the per-label shuffles

# Names of the conda environments handed to `conda run -n <env>` by run().
TS_ENV     = 'timesformer'
STGCN_ENV  = 'mmaction'
# ───────────────────────────────────────────────────────────────

# Body25 → COCO17 index mapping.
# Position j in this list is the output (17-joint) index; the value stored there
# is the index of the 25-joint input that is copied into it.
MAPPING_BODY25_TO_COCO17 = [
    0,16,15,18,17,
    5,2,6,3,7,
    4,12,9,13,10,
    14,11
]


def run(cmd, env):
    """Run ``cmd`` inside the conda environment ``env`` through ``conda run``.

    The child's stdout and stderr are captured, so they are echoed only once
    the child process has finished.

    Args:
        cmd: Argument list (program first) to execute inside the environment.
        env: Name of the conda environment passed to ``conda run -n``.

    Raises:
        subprocess.CalledProcessError: If the child exits with a non-zero
            code; the captured stdout/stderr are attached to the exception.
    """
    prefix = ['conda', 'run', '-n', env]
    command = prefix + cmd
    print("[RUN]", " ".join(command))

    # Both streams are buffered in memory until the child exits.
    result = subprocess.run(command, capture_output=True, text=True)

    # Echo whatever was captured before deciding whether the run failed.
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print(result.stderr, file=sys.stderr)

    if result.returncode == 0:
        return

    # Any non-zero exit code is treated as an error.
    print(f"[ERROR] `{env}` 환경 실행 실패 (exit code {result.returncode})", file=sys.stderr)
    raise subprocess.CalledProcessError(
        result.returncode, result.args, output=result.stdout, stderr=result.stderr
    )


def load_and_process(csv_path: Path,
                     img_shape=(1080,1920),
                     confidence_threshold=0.1,
                     normalize_method='0to1') -> dict:
    """Load a 25-joint keypoint CSV and convert it to a 17-joint annotation.

    Each CSV row is read as one frame holding 25 ``(x, y, score)`` triplets
    laid out flat (75 columns). Joints whose score is below
    ``confidence_threshold`` have both their coordinates and their score set
    to zero. The joints are then reordered/selected through
    ``MAPPING_BODY25_TO_COCO17``.

    Args:
        csv_path: Path of the keypoint CSV file.
        img_shape: ``(height, width)`` used for normalisation and stored in
            the result as both ``img_shape`` and ``original_shape``.
        confidence_threshold: Scores strictly below this value are zeroed.
        normalize_method: ``'0to1'`` divides x by the width and y by the
            height. Any other value leaves the coordinates unscaled.

    Returns:
        A dict with keys ``total_frames``, ``img_shape``, ``original_shape``,
        ``keypoint`` (float32, shape ``(1, T, 17, 2)``) and
        ``keypoint_score`` (float32, shape ``(1, T, 17)``).
        The leading axis of length 1 is presumably the person axis
        (NOTE(review): confirm against the consumer of the PKL).

    Raises:
        ValueError: If the CSV does not have exactly 75 columns.
    """
    df = pd.read_csv(csv_path)
    T, cols = df.shape
    V25 = 25
    if cols != V25 * 3:
        # Fail with an explicit message instead of an opaque reshape error.
        raise ValueError(
            f"{csv_path}: expected {V25 * 3} columns "
            f"(x, y, score for {V25} joints), got {cols}"
        )

    kp25 = np.zeros((1, T, V25, 2), dtype=np.float32)
    score25 = np.zeros((1, T, V25), dtype=np.float32)

    # One reshape for the whole clip instead of a Python loop over frames.
    triplets = df.values.reshape(T, V25, 3)
    kp25[0] = triplets[..., :2]
    score25[0] = triplets[..., 2]

    # The (1, T, 25) mask zeroes both coordinates of each low-confidence joint.
    mask = score25 < confidence_threshold
    kp25[mask] = 0
    score25[mask] = 0

    h, w = img_shape
    if normalize_method == '0to1':
        kp25[..., 0] /= w
        kp25[..., 1] /= h

    kp17 = kp25[:, :, MAPPING_BODY25_TO_COCO17, :]
    score17 = score25[:, :, MAPPING_BODY25_TO_COCO17]
    return {
        'total_frames': T,
        'img_shape': img_shape,
        'original_shape': img_shape,
        'keypoint': kp17,
        'keypoint_score': score17
    }


def make_pkl(id_list, out_path: Path):
    """Build the annotation pickle for ``id_list`` and write it to ``out_path``.

    For every ID the keypoint CSV is looked up under
    ``ROOT/<category>/crop_keypoint`` — first ``balanced_true`` (label 1),
    then ``false`` (label 0). IDs without a CSV are reported and skipped.
    The pickle holds ``annotations`` (one dict per ID that was found) and
    ``split`` with the full ``id_list`` stored under ``xsub_val``.

    Args:
        id_list: Sample IDs (CSV file stems) to include.
        out_path: Destination file; missing parent directories are created.

    Returns:
        The number of annotations actually written.
    """
    category_labels = (('balanced_true', 1), ('false', 0))
    records = []

    for sample_id in id_list:
        # The first category containing the CSV decides the label.
        for category, category_label in category_labels:
            candidate = ROOT / category / 'crop_keypoint' / f"{sample_id}.csv"
            if candidate.exists():
                break
        else:
            print(f"[WARN] CSV not found for id={sample_id}")
            continue

        record = load_and_process(candidate)
        record['frame_dir'] = sample_id
        record['label'] = category_label
        record['metainfo'] = {'frame_dir': sample_id, 'img_shape': record['img_shape']}
        records.append(record)

    payload = {'annotations': records, 'split': {'xsub_val': id_list}}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('wb') as fh:
        pickle.dump(payload, fh, protocol=4)
    return len(records)


def main():
    """Split the dataset per label, write ID lists and PKLs, run both extractors.

    Steps:
        1. Collect IDs from ``ROOT/<category>/crop_keypoint/*.csv`` and keep
           those with a matching ``<id>*.mp4`` in ``crop_video`` or ``video``.
        2. Shuffle and split each label separately by ``TEST_RATIO``
           (stratified split), then shuffle the merged lists.
        3. Write ``train_ids.txt`` / ``valid_ids.txt`` into ``IDS_DIR``.
        4. Write the train / valid skeleton PKLs under ``ROOT/crop_pkl``.
        5. Run the TimeSformer and ST-GCN embedding scripts in their conda
           environments.

    Raises:
        RuntimeError: If no CSV with a matching video was found.
        subprocess.CalledProcessError: If an extraction script fails.
    """
    # 1) Collect IDs, grouped by integer label (1 = balanced_true, 0 = false).
    ids_by_label = defaultdict(list)

    for cat in ['balanced_true', 'false']:
        kp_dir = ROOT / cat / 'crop_keypoint'
        vid_dirs = [ROOT / cat / 'crop_video', ROOT / cat / 'video']
        current_label = 1 if cat == 'balanced_true' else 0

        if not kp_dir.exists():
            continue

        for csv_path in kp_dir.glob('*.csv'):
            vid_id = csv_path.stem
            # Path.glob returns a lazy iterator, which is always truthy, so
            # probe for a first match with next() instead of testing it directly.
            has_video = any(
                next(vd.glob(f"{vid_id}*.mp4"), None) is not None
                for vd in vid_dirs
            )
            if has_video:
                ids_by_label[current_label].append(vid_id)

    if not any(ids_by_label.values()):
        raise RuntimeError(f"No matching CSV↔MP4 under {ROOT}. Please check `ROOT` path and data existence.")

    train_ids = []
    valid_ids = []

    # 2) Stratified split: shuffle and cut each label on its own.
    # NOTE(review): the result also depends on the order in which glob()
    # returned the files, so the seed alone does not pin the split.
    random.seed(SEED)

    # Labels are visited in collection order (1, then 0). Looking them up
    # explicitly makes the warning below reachable when a class is missing.
    for label in (1, 0):
        ids_list = ids_by_label.get(label, [])
        if not ids_list:
            print(f"[WARN] No IDs found for label {label}. Skipping split for this label.")
            continue

        random.shuffle(ids_list)

        split_idx = int(len(ids_list) * (1 - TEST_RATIO))
        train_part = ids_list[:split_idx]
        valid_part = ids_list[split_idx:]
        train_ids.extend(train_part)
        valid_ids.extend(valid_part)

        print(f"  레이블 {label}: 훈련 {len(train_part)}개, 검증 {len(valid_part)}개 할당")

    # Shuffle once more so the samples are not grouped by label.
    random.shuffle(train_ids)
    random.shuffle(valid_ids)

    # 3) Save the ID lists.
    train_list = IDS_DIR / 'train_ids.txt'
    valid_list = IDS_DIR / 'valid_ids.txt'

    IDS_DIR.mkdir(parents=True, exist_ok=True)

    train_list.write_text('\n'.join(train_ids))
    valid_list.write_text('\n'.join(valid_ids))
    print(f"\n▶ 최종 ID 리스트 저장: {len(train_ids)} 훈련 ID / {len(valid_ids)} 검증 ID")
    print(f"  훈련 ID 파일: {train_list}")
    print(f"  검증 ID 파일: {valid_list}")

    # 4) Build the PKL files.
    base_pkl = ROOT / 'crop_pkl'
    train_pkl = base_pkl / 'skeleton_dataset_train.pkl'
    valid_pkl = base_pkl / 'skeleton_dataset_valid.pkl'
    tcnt = make_pkl(train_ids, train_pkl)
    vcnt = make_pkl(valid_ids, valid_pkl)
    print(f"▶ PKL 생성 완료: 훈련 PKL={tcnt}개, 검증 PKL={vcnt}개")

    # 5) Embedding extraction. The TimeSformer step receives the ID lists.
    print(str(train_pkl), str(valid_pkl))
    run([
        'python', '-u', 'extract_embedding_timesformer.py',
        '--root',         str(ROOT),
        '--train-list', str(train_list),
        '--valid-list', str(valid_list),
        '--num-frames', '32',
        '--clips-per-vid', '5',
        '--img-size', '224',
        '--batch-size', '1',
        '--num-workers', '0',
        '--pretrained', r"D:\TimeSformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth",
        '--output-dir', r'embbeding_data\timesformer'
    ], TS_ENV)

    # The ST-GCN step receives the PKL paths. Absolute paths are used because
    # the script changes paths internally (per the original author's note).
    run([
        'python', '-u', 'D:\\Jabez\\golf\\fusion\\extract_embedding_stgcn.py',
        '--cfg',          'D:\\mmaction2\\configs\\skeleton\\stgcnpp\\my_stgcnpp.py',
        # Raw string: the un-escaped backslashes here were invalid escape
        # sequences (\m, \w, \e). The resulting path is unchanged.
        '--ckpt',         r'D:\mmaction2\work_dirs\my_stgcnpp\epoch_5.pth',
        '--device',       'cuda:0',
        '--out-dir',      'D:\\Jabez\\golf\\fusion\\embbeding_data\\stgcnpp',
        '--train-pkl',    str(train_pkl),
        '--valid-pkl',    str(valid_pkl),
        '--num-workers', '0'
    ], STGCN_ENV)

    print("✅ 모든 작업이 완료되었습니다.")

# Run main() only when this code is executed directly, not when it is imported.
if __name__=='__main__':
    main()

  레이블 1: 훈련 480개, 검증 120개 할당
  레이블 0: 훈련 326개, 검증 82개 할당

▶ 최종 ID 리스트 저장: 806 훈련 ID / 202 검증 ID
  훈련 ID 파일: D:\Jabez\golf\fusion\ids_txt\train_ids.txt
  검증 ID 파일: D:\Jabez\golf\fusion\ids_txt\valid_ids.txt
▶ PKL 생성 완료: 훈련 PKL=806개, 검증 PKL=202개
D:\golfDataset\dataset\train\crop_pkl\skeleton_dataset_train.pkl D:\golfDataset\dataset\train\crop_pkl\skeleton_dataset_valid.pkl
[RUN] conda run -n timesformer python -u extract_embedding_timesformer.py --root D:\golfDataset\dataset\train --train-list D:\Jabez\golf\fusion\ids_txt\train_ids.txt --valid-list D:\Jabez\golf\fusion\ids_txt\valid_ids.txt --num-frames 32 --clips-per-vid 5 --img-size 224 --batch-size 1 --num-workers 0 --pretrained D:\TimeSformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth --output-dir embbeding_data\timesformer
✅ [None] samples loaded: 806
✅ train → 806 saved in embbeding_data\timesformer\train
✅ [train] samples loaded: 202
✅ valid → 202 saved in embbeding_data\timesformer\valid


[RUN] conda run -n mmaction python 


train Embedd:   0%|                                     | 0/806 [00:00<?, ?it/s]
train Embedd:   0%|                             | 1/806 [00:03<42:51,  3.19s/it]
train Embedd:   0%|                             | 2/806 [00:05<39:15,  2.93s/it]
train Embedd:   0%|                             | 3/806 [00:08<36:19,  2.71s/it]
train Embedd:   0%|▏                            | 4/806 [00:10<34:40,  2.59s/it]
train Embedd:   1%|▏                            | 5/806 [00:13<32:52,  2.46s/it]
train Embedd:   1%|▏                            | 6/806 [00:15<31:05,  2.33s/it]
train Embedd:   1%|▎                            | 7/806 [00:17<30:31,  2.29s/it]
train Embedd:   1%|▎                            | 8/806 [00:19<31:20,  2.36s/it]
train Embedd:   1%|▎                            | 9/806 [00:22<31:19,  2.36s/it]
train Embedd:   1%|▎                           | 10/806 [00:24<30:33,  2.30s/it]
train Embedd:   1%|▍                           | 11/806 [00:26<30:06,  2.27s/it]
train Embedd:   1%|▍       

07/27 12:42:03 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: win32
    Python: 3.8.20 (default, Oct  3 2024, 15:19:54) [MSC v.1929 64 bit (AMD64)]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 581934558
    GPU 0: NVIDIA GeForce RTX 4060 Laptop GPU
    CUDA_HOME: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6
    NVCC: Cuda compilation tools, release 12.6, V12.6.20
    MSVC: Microsoft (R) C/C++ 최적화 컴파일러 버전 19.41.34120(x64)
    GCC: n/a
    PyTorch: 2.1.2+cu121
    PyTorch compiling details: PyTorch built with:
  - C++ Version: 199711
  - MSVC 192930151
  - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 2019
  - LAPACK is enabled (usually provided by MKL)
  - CPU capability usage: AVX512
  - CUDA Runtime 12.1
  

## Test 데이터에 대해

In [None]:
#!/usr/bin/env python3
"""
parent_make_test_and_run.py

1) D:\golfDataset\dataset\test 내 CSV 이름으로 ID 수집 → 전부 test
2) test_ids.txt 생성
3) test PKL(annotations + split:xsub_val) 생성
4) extract_embedding_timesformer.py 호출하여 TimeSformer 임베딩
5) extract_embedding_stgcn.py 호출하여 ST-GCN 임베딩
"""

import subprocess
import pickle
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import os

# ───────────────────────────────────────────────────────────────
# Configuration
# Dataset root that is scanned for <category>/crop_keypoint/*.csv files.
ROOT        = Path(r"D:\golfDataset\dataset\test")
# ID list files are written below the current working directory.
IDS_DIR     = Path(os.getcwd()).resolve() / 'ids_txt'
# Names of the conda environments handed to `conda run -n <env>` by run().
TS_ENV      = 'timesformer'
STGCN_ENV   = 'mmaction'
TEST_LIST   = IDS_DIR / 'test_ids.txt'
# ───────────────────────────────────────────────────────────────

# Body25 → COCO17 index mapping.
# Position j in this list is the output (17-joint) index; the value stored there
# is the index of the 25-joint input that is copied into it.
MAPPING_BODY25_TO_COCO17 = [
    0,16,15,18,17,
    5,2,6,3,7,
    4,12,9,13,10,
    14,11
]

def run(cmd, env):
    """Run ``cmd`` inside the conda environment ``env`` through ``conda run``.

    The child's stdout and stderr are captured and echoed (without an extra
    trailing newline) once the child process has finished.

    Args:
        cmd: Argument list (program first) to execute inside the environment.
        env: Name of the conda environment passed to ``conda run -n``.

    Raises:
        subprocess.CalledProcessError: If the child exits with a non-zero
            code; the captured stdout/stderr are attached to the exception.
    """
    argv = ['conda', 'run', '-n', env] + cmd
    print("[RUN]", " ".join(argv))

    completed = subprocess.run(argv, capture_output=True, text=True)

    for text, stream in ((completed.stdout, sys.stdout), (completed.stderr, sys.stderr)):
        if text:
            print(text, file=stream, end='')

    # Raises CalledProcessError(returncode, args, stdout, stderr) on failure.
    completed.check_returncode()

def load_and_process(csv_path: Path,
                     img_shape=(1080,1920),
                     confidence_threshold=0.1,
                     normalize_method='0to1') -> dict:
    """Load a 25-joint keypoint CSV and convert it to a 17-joint annotation.

    Each CSV row is read as one frame holding 25 ``(x, y, score)`` triplets
    laid out flat. Joints whose score is below ``confidence_threshold`` have
    both their coordinates and their score set to zero. The joints are then
    reordered/selected through ``MAPPING_BODY25_TO_COCO17``.

    Args:
        csv_path: Path of the keypoint CSV file.
        img_shape: ``(height, width)`` used for normalisation and stored in
            the result as both ``img_shape`` and ``original_shape``.
        confidence_threshold: Scores strictly below this value are zeroed.
        normalize_method: ``'0to1'`` divides x by the width and y by the
            height. Any other value leaves the coordinates unscaled.

    Returns:
        A dict with keys ``total_frames``, ``img_shape``, ``original_shape``,
        ``keypoint`` (float32, shape ``(1, T, 17, 2)``) and
        ``keypoint_score`` (float32, shape ``(1, T, 17)``).
    """
    frames = pd.read_csv(csv_path)
    n_frames, _ = frames.shape
    n_joints = 25

    xy = np.zeros((1, n_frames, n_joints, 2), dtype=np.float32)
    conf = np.zeros((1, n_frames, n_joints), dtype=np.float32)

    if n_frames:
        # View the flat rows as (frame, joint, [x, y, score]) and copy in bulk.
        triplets = frames.values.reshape(n_frames, n_joints, 3)
        xy[0] = triplets[..., :2]
        conf[0] = triplets[..., 2]

    # The (1, T, 25) mask zeroes both coordinates of each low-confidence joint.
    unreliable = conf < confidence_threshold
    xy[unreliable] = 0
    conf[unreliable] = 0

    height, width = img_shape
    if normalize_method == '0to1':
        xy[..., 0] /= width
        xy[..., 1] /= height

    return {
        'total_frames':   n_frames,
        'img_shape':      img_shape,
        'original_shape': img_shape,
        'keypoint':       xy[:, :, MAPPING_BODY25_TO_COCO17, :],
        'keypoint_score': conf[:, :, MAPPING_BODY25_TO_COCO17],
    }

def make_pkl(id_list, out_path: Path):
    """Build the annotation pickle for ``id_list`` and write it to ``out_path``.

    For every ID the keypoint CSV is looked up under
    ``ROOT/<category>/crop_keypoint`` — first ``balanced_true`` (label 1),
    then ``false`` (label 0). IDs without a CSV are reported and skipped.
    The pickle holds ``annotations`` (one dict per ID that was found) and
    ``split`` with the full ``id_list`` stored under ``xsub_val``.

    Args:
        id_list: Sample IDs (CSV file stems) to include.
        out_path: Destination file; missing parent directories are created.

    Returns:
        The number of annotations actually written.
    """
    category_labels = {'balanced_true': 1, 'false': 0}
    entries = []

    for sample_id in id_list:
        candidates = (
            (ROOT / category / 'crop_keypoint' / f"{sample_id}.csv", value)
            for category, value in category_labels.items()
        )
        # Lazily take the first category whose CSV exists.
        hit = next(((path, value) for path, value in candidates if path.exists()), None)
        if hit is None:
            print(f"[WARN] CSV not found for id={sample_id}")
            continue

        source_csv, sample_label = hit
        entry = load_and_process(source_csv)
        entry['frame_dir'] = sample_id
        entry['label'] = sample_label
        entry['metainfo'] = {'frame_dir': sample_id, 'img_shape': entry['img_shape']}
        entries.append(entry)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('wb') as fh:
        pickle.dump(
            {'annotations': entries, 'split': {'xsub_val': id_list}},
            fh,
            protocol=4,
        )
    return len(entries)

def main():
    """Treat every sample under ``ROOT`` as test data and extract embeddings.

    Steps:
        1. Collect IDs from ``ROOT/<category>/crop_keypoint/*.csv`` and keep
           those with a matching ``<id>*.mp4`` in ``crop_video`` or ``video``.
        2. Write the IDs to ``TEST_LIST``.
        3. Write the test skeleton PKL under ``ROOT/crop_pkl``.
        4. Run the TimeSformer embedding script.
        5. Run the ST-GCN embedding script.

    Raises:
        RuntimeError: If no CSV with a matching video was found.
        subprocess.CalledProcessError: If an extraction script fails.
    """
    # 1) Collect IDs (everything is test data).
    ids = []
    for cat in ['balanced_true', 'false']:
        kp_dir   = ROOT / cat / 'crop_keypoint'
        vid_dirs = [ROOT / cat / 'crop_video', ROOT / cat / 'video']
        if not kp_dir.exists():
            continue
        for csv_path in kp_dir.glob('*.csv'):
            vid_id = csv_path.stem
            # Path.glob returns a generator object, which is truthy even when
            # it yields nothing. Testing it directly accepted every CSV, so
            # probe for a first match with next() instead.
            has_video = any(
                next(vd.glob(f"{vid_id}*.mp4"), None) is not None
                for vd in vid_dirs
            )
            if has_video:
                ids.append(vid_id)

    if not ids:
        raise RuntimeError(f"No matching CSV↔MP4 under {ROOT}")

    # 2) Save the test ID list.
    IDS_DIR.mkdir(parents=True, exist_ok=True)
    TEST_LIST.write_text('\n'.join(ids))
    print(f"▶ {len(ids)} test IDs saved to {TEST_LIST}")

    # 3) Build the test PKL.
    base_pkl = ROOT / 'crop_pkl'
    test_pkl = base_pkl / 'skeleton_dataset_test.pkl'
    cnt = make_pkl(ids, test_pkl)
    print(f"▶ PKL created: test={cnt} entries at {test_pkl}")

    # 4) TimeSformer embeddings.
    run([
        'python', '-u', 'extract_embedding_timesformer.py',
        '--root',        str(ROOT),
        '--test-list',   str(TEST_LIST),
        '--num-frames',  '32',
        '--clips-per-vid', '5',
        '--img-size',    '224',
        '--batch-size',  '1',
        '--num-workers', '0',
        '--pretrained',  r"D:\TimeSformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth",
        '--output-dir',  'embbeding_data\\timesformer'
    ], TS_ENV)

    # 5) ST-GCN embeddings.
    # NOTE(review): this checkpoint differs from the one the train/valid cell
    # passes (work_dirs\my_stgcnpp\epoch_5.pth). Confirm that test embeddings
    # are meant to come from a different model than train/valid embeddings.
    run([
        'python', '-u', 'D:\\Jabez\\golf\\fusion\\extract_embedding_stgcn.py',
        '--cfg',         'D:\\mmaction2\\configs\\skeleton\\stgcnpp\\my_stgcnpp.py',
        '--ckpt',        'D:\\mmaction2\\checkpoints\\stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221228-86e1e77a.pth',
        '--device',      'cuda:0',
        '--out-dir',     'D:\\Jabez\\golf\\fusion\\embbeding_data\\stgcnpp',
        '--test-pkl',    str(test_pkl),
        '--num-workers', '0'
    ], STGCN_ENV)

    print("✅ All done.")

# Run main() only when this code is executed directly, not when it is imported.
if __name__ == '__main__':
    main()


▶ 285 test IDs saved to D:\Jabez\golf\fusion\ids_txt\test_ids.txt
▶ PKL created: test=285 entries at D:\golfDataset\dataset\test\crop_pkl\skeleton_dataset_test.pkl
[RUN] conda run -n timesformer python -u extract_embedding_timesformer.py --root D:\golfDataset\dataset\test --test-list D:\Jabez\golf\fusion\ids_txt\test_ids.txt --num-frames 32 --clips-per-vid 5 --img-size 224 --batch-size 1 --num-workers 0 --pretrained D:\TimeSformer\pretrained\TimeSformer_divST_96x4_224_K600.pyth --output-dir embbeding_data\timesformer
✅ [None] samples loaded: 285
✅ test → 285 saved in embbeding_data\timesformer\test



NameError: name 'sys' is not defined