In [None]:
import os
import json
import shutil
import random
import subprocess
import pickle
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np

# ---------------- Configuration ----------------
# Input root where the AIHub data (male/, female/ folders) is stored.
INPUT_ROOT = Path(r"D:/golfDataset/스포츠 사람 동작 영상(골프)")
# Output root of the training split; every pipeline step reads and writes below it.
DATASET_ROOT = Path(r"D:/golfDataset/dataset/train")
# Path to the OpenPose demo executable invoked by run_openpose().
OPENPOSE_BIN = r"C:/openpose/openpose/bin/OpenPoseDemo.exe"
# Scratch directory for intermediate OpenPose JSON output (lives inside DATASET_ROOT).
TMP_JSON_DIR = DATASET_ROOT / "_tmp_json"
# Frame rate passed to ffmpeg when encoding JPG sequences to MP4.
FPS = 30
# Evaluation labels mapped to the 'true' / 'false' classes (compared lower-cased).
TRUE_EVALS = {"best", "good", "normal"}
FALSE_EVALS = {"bad", "worst"}
# Relative enlargement applied to the crop box size (0.1 = 10% larger).
MARGIN = 0.1

# ---------------- Step 1: Classification ----------------
def classify_actions(input_root: Path, output_root: Path):
    """Sort annotation JSONs (and their frame JPGs) into true/false folders.

    Every ``*.json`` found recursively below each sub-directory of
    *input_root* is parsed and its ``evaluation`` field is compared
    (case-insensitively) against ``TRUE_EVALS`` / ``FALSE_EVALS``. A matching
    file is copied to ``output_root/<label>/json``; a JPG with the same stem
    next to it, if present, is copied to ``output_root/<label>/jpg``.

    Files that cannot be read or parsed are counted and reported; files whose
    evaluation is missing, not a string, or unrecognised are skipped.

    Args:
        input_root: Directory whose sub-directories hold the raw data.
        output_root: Directory that receives the ``true`` / ``false`` trees.
    """
    prepared_labels = set()  # labels whose output dirs already exist
    unreadable = 0
    for gender_folder in input_root.iterdir():
        if not gender_folder.is_dir():
            continue
        for js in gender_folder.rglob('*.json'):
            try:
                # 'utf-8-sig' reads plain UTF-8 and UTF-8 with a BOM alike;
                # with plain 'utf-8' a BOM makes json.loads fail.
                data = json.loads(js.read_text(encoding='utf-8-sig'))
            except (OSError, ValueError):
                # ValueError covers JSONDecodeError and UnicodeDecodeError.
                unreadable += 1
                continue
            evaluation = data.get('evaluation') if isinstance(data, dict) else None
            if not isinstance(evaluation, str):
                continue  # no usable evaluation field
            eval_label = evaluation.lower()
            if eval_label in TRUE_EVALS:
                label = 'true'
            elif eval_label in FALSE_EVALS:
                label = 'false'
            else:
                continue  # skip unrecognized
            # create the destination dirs once per label, on first use
            if label not in prepared_labels:
                for ext in ['json', 'jpg']:
                    (output_root / label / ext).mkdir(parents=True, exist_ok=True)
                prepared_labels.add(label)
            # copy json
            shutil.copy2(js, output_root / label / 'json' / js.name)
            # copy the frame image that sits next to the annotation, if any
            jpg = js.with_suffix('.jpg')
            if jpg.exists():
                shutil.copy2(jpg, output_root / label / 'jpg' / jpg.name)
    if unreadable:
        print(f"[Step 1] Skipped {unreadable} unreadable or malformed JSON file(s).")
    print("[Step 1] Classification complete.")

# ---------------- Step 2: JPG → MP4 ----------------
def images_to_video(img_dir: Path, video_dir: Path, fps: int = FPS):
    """Encode every ``<prefix>_<frame>.jpg`` sequence in *img_dir* to an MP4.

    JPGs are grouped by the part of the stem before the last underscore and
    each group is encoded by ffmpeg to ``video_dir/<prefix>.mp4`` using the
    ``<prefix>_%04d.jpg`` pattern. Once all groups have been encoded, the
    JPGs that belonged to a group are deleted. JPGs without a numeric frame
    suffix cannot match the pattern; they are reported and left in place.

    Args:
        img_dir: Directory containing the frame images.
        video_dir: Directory that receives the MP4 files (created if needed).
        fps: Frame rate passed to ffmpeg.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status; no JPG has been deleted at that point.
    """
    video_dir.mkdir(parents=True, exist_ok=True)
    # prefix -> [(frame number, path), ...]
    groups = {}
    for jp in img_dir.glob('*.jpg'):
        prefix, sep, frame = jp.stem.rpartition('_')
        if not sep or not (frame.isascii() and frame.isdigit()):
            print(f"[Step 2] Skipping {jp.name}: no numeric frame suffix")
            continue
        groups.setdefault(prefix, []).append((int(frame), jp))

    encoded = []
    for prefix, frames in groups.items():
        out_mp4 = video_dir / f"{prefix}.mp4"
        # '%' introduces a sequence pattern for ffmpeg's image2 demuxer, so
        # literal '%' characters in the path are doubled.
        pattern = str(img_dir / f"{prefix}_").replace('%', '%%') + '%04d.jpg'
        # Tell ffmpeg where the sequence starts; by default it only probes
        # a small range of numbers for the first frame.
        first_frame = min(number for number, _ in frames)
        cmd = ['ffmpeg', '-y', '-r', str(fps), '-start_number', str(first_frame),
               '-i', pattern, '-c:v', 'libx264', str(out_mp4)]
        subprocess.run(cmd, check=True)
        encoded.extend(path for _, path in frames)

    # delete only the frames that were handed to ffmpeg
    for jp in encoded:
        jp.unlink()

def convert_all_jpgs(output_root: Path):
    """Convert the ``jpg`` folder of each label into MP4s in its ``video`` folder.

    Handles the ``true`` and ``false`` labels below *output_root*; a label
    without a ``jpg`` directory is silently skipped.
    """
    for label in ('true', 'false'):
        label_root = output_root / label
        frames_dir = label_root / 'jpg'
        if not frames_dir.exists():
            continue
        print(f"[Step 2] Converting {label}/jpg to MP4...")
        images_to_video(frames_dir, label_root / 'video')
    print("[Step 2] JPG→MP4 conversion complete.")

# ---------------- Step 3: Balanced True Extraction ----------------
def extract_balanced_true(output_root: Path, seed=None):
    """Copy a random subset of 'true' videos, as many as there are 'false' ones.

    The subset is copied to ``output_root/balanced_true/video``; the sibling
    ``crop_video``, ``crop_keypoint`` and ``crop_pkl`` directories are created
    as well. If there are fewer 'true' than 'false' videos, all 'true' videos
    are copied. Nothing is created when no 'false' video exists.

    Args:
        output_root: Dataset root containing ``true/video`` and ``false/video``.
        seed: Optional seed for a private random generator. With a seed the
            same videos are picked on every run; with ``None`` (default) the
            module-level ``random`` state is used, as before.
    """
    num_false = sum(1 for _ in (output_root / 'false' / 'video').glob('*.mp4'))
    if num_false == 0:
        print("No false videos found; skipping balanced extraction.")
        return
    # Sort first: glob order depends on the filesystem, so without it even a
    # fixed seed would not give a reproducible selection.
    true_vids = sorted((output_root / 'true' / 'video').glob('*.mp4'))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(true_vids)
    selected = true_vids[:num_false]
    bt_root = output_root / 'balanced_true'
    for sub in ['video', 'crop_video', 'crop_keypoint', 'crop_pkl']:
        (bt_root / sub).mkdir(parents=True, exist_ok=True)
    for vid in selected:
        shutil.copy2(vid, bt_root / 'video' / vid.name)
    print(f"[Step 3] Extracted {len(selected)} balanced_true videos.")

# ---------------- Step 4: OpenPose Crop & CSV ----------------
def run_openpose(video_path: Path, json_out: Path):
    """Run the OpenPose executable on *video_path*, writing JSON into *json_out*.

    The output directory is created if needed. Display and pose rendering
    are switched off; tracking is set to 1 and at most two people are
    requested.

    Raises:
        subprocess.CalledProcessError: If OpenPose exits with a non-zero status.
    """
    json_out.mkdir(parents=True, exist_ok=True)
    # insertion order of the dict is the order of the flags on the command line
    options = {
        '--video': str(video_path),
        '--write_json': str(json_out),
        '--display': '0',
        '--render_pose': '0',
        '--tracking': '1',
        '--number_people_max': '2',
    }
    command = [OPENPOSE_BIN]
    for flag, value in options.items():
        command += [flag, value]
    subprocess.run(command, check=True)


def get_torso_bbox(json_dir: Path):
    """Derive one crop box for a video from the torso keypoints of all frames.

    Reads every ``*.json`` in *json_dir* (sorted by name), takes the first
    person of each frame and collects the keypoints at ``torso_idxs``.
    Keypoints with zero confidence are ignored: OpenPose writes undetected
    keypoints as ``0, 0, 0``, which would otherwise drag the statistics
    towards the image origin. Frames without people, or with fewer keypoints
    than the torso indices require, are skipped.

    Returns:
        ``(x, y, w, h)`` as ints, or ``None`` when no usable torso keypoint
        was found in any frame.
    """
    # presumably BODY_25 neck, mid-hip, shoulders and hips - confirm against
    # the pose model actually used by OpenPose
    torso_idxs = [1, 8, 2, 5, 9, 12]
    coords = []
    for js in sorted(json_dir.glob('*.json')):
        data = json.loads(js.read_text(encoding='utf-8'))
        people = data.get('people', [])
        if not people:
            continue
        # rows of (x, y, confidence)
        pts = np.array(people[0]['pose_keypoints_2d'], dtype=float).reshape(-1, 3)
        if pts.shape[0] <= max(torso_idxs):
            continue  # too few keypoints to index the torso
        torso = pts[torso_idxs]
        detected = torso[torso[:, 2] > 0, :2]
        if detected.size:
            coords.append(detected)
    if not coords:
        return None
    all_pts = np.vstack(coords)
    cx, cy = np.median(all_pts, axis=0)
    # NOTE(review): w and h are the 90th-percentile x / y *coordinates*, not
    # a coordinate range, so the box size depends on where the subject stands
    # in the frame. Kept unchanged because every downstream crop depends on
    # it - confirm the intended box size. The box is also not clamped to the
    # frame dimensions, which are unknown here.
    w, h = np.percentile(all_pts, 90, axis=0)
    w *= (1 + MARGIN)
    h *= (1 + MARGIN)
    x1 = max(0, int(cx - w / 2))
    y1 = max(0, int(cy - h / 2))
    return x1, y1, int(w), int(h)


def crop_video(input_vid: Path, output_vid: Path, bbox: tuple):
    """Crop *input_vid* to *bbox* with ffmpeg and write the result to *output_vid*.

    Args:
        input_vid: Source video.
        output_vid: Destination file; its parent directory is created if needed.
        bbox: ``(x, y, w, h)`` of the crop rectangle.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    left, top, width, height = bbox
    output_vid.parent.mkdir(parents=True, exist_ok=True)
    crop_filter = f"crop={width}:{height}:{left}:{top}"
    subprocess.run(
        ['ffmpeg', '-y', '-i', str(input_vid), '-filter:v', crop_filter, str(output_vid)],
        check=True,
    )


def json_dir_to_csv(json_dir: Path, csv_path: Path):
    """Flatten the keypoint JSONs in *json_dir* into one header-less CSV.

    Each ``*.json`` (processed in sorted name order) becomes one CSV row
    holding the ``pose_keypoints_2d`` list of the first person; a file
    without people contributes a row of 75 zeros. The parent directory of
    *csv_path* is created if needed.
    """
    def first_person_keypoints(frame_file: Path):
        detections = json.loads(frame_file.read_text(encoding='utf-8')).get('people', [])
        # 75 values: presumably 25 keypoints x (x, y, confidence) - confirm
        return detections[0]['pose_keypoints_2d'] if detections else [0] * 75

    table = pd.DataFrame(
        [first_person_keypoints(frame_file) for frame_file in sorted(json_dir.glob('*.json'))]
    )
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(csv_path, index=False, header=False)


def preprocess_videos(output_root: Path):
    """Crop every video around the subject and export keypoints of the crop.

    For each label (``true``, ``false``, ``balanced_true``) and each MP4 in
    ``<label>/video``:

    1. run OpenPose on the full video,
    2. derive a crop box from the resulting keypoints,
    3. write the cropped video to ``<label>/crop_video``,
    4. run OpenPose on the crop and store its keypoints as
       ``<label>/crop_keypoint/<name>.csv``.

    Videos without any usable keypoints are skipped. The temporary OpenPose
    JSON directories of a video are removed whether it was processed,
    skipped, or failed, so no intermediate files pile up in ``TMP_JSON_DIR``.
    """
    for label in ['true', 'false', 'balanced_true']:
        in_vid_dir = output_root / label / 'video'
        out_crop_vid = output_root / label / 'crop_video'
        out_kp = output_root / label / 'crop_keypoint'
        if not in_vid_dir.exists():
            continue
        for vid in tqdm(list(in_vid_dir.glob('*.mp4')), desc=f"[Step 4] {label}"):
            name = vid.stem
            raw_json = TMP_JSON_DIR / f"raw_{name}"
            crop_json = TMP_JSON_DIR / f"crop_{name}"
            # Start from empty directories: JSON left behind by an aborted
            # run would otherwise be mixed into this video's frames.
            shutil.rmtree(raw_json, ignore_errors=True)
            shutil.rmtree(crop_json, ignore_errors=True)
            try:
                run_openpose(vid, raw_json)
                bbox = get_torso_bbox(raw_json)
                if bbox is None:
                    print(f"Skipping {name}: no keypoints")
                    continue
                crop_mp4 = out_crop_vid / f"{name}.mp4"
                crop_video(vid, crop_mp4, bbox)
                run_openpose(crop_mp4, crop_json)
                csv_file = out_kp / f"{name}.csv"
                json_dir_to_csv(crop_json, csv_file)
            finally:
                # also runs on `continue` and on errors
                shutil.rmtree(raw_json, ignore_errors=True)
                shutil.rmtree(crop_json, ignore_errors=True)
    print("[Step 4] Video crop & CSV extraction complete.")

# ---------------- Step 5: CSV → PKL ----------------
def convert_csv_to_pkl(output_root: Path):
    """Convert each keypoint CSV into a pickle holding ``{'keypoints': rows}``.

    For every label (``true``, ``false``, ``balanced_true``) that has a
    ``crop_keypoint`` directory, each ``*.csv`` in it is read without a
    header and written to ``<label>/crop_pkl/<stem>.pkl`` as a dict whose
    ``'keypoints'`` entry is the list of rows. A CSV with no content is
    reported and skipped instead of aborting the whole step.
    """
    for label in ['true', 'false', 'balanced_true']:
        csv_dir = output_root / label / 'crop_keypoint'
        pkl_dir = output_root / label / 'crop_pkl'
        if not csv_dir.exists():
            continue
        pkl_dir.mkdir(parents=True, exist_ok=True)
        for csv_f in sorted(csv_dir.glob('*.csv')):
            try:
                frame_table = pd.read_csv(csv_f, header=None)
            except pd.errors.EmptyDataError:
                # a zero-row CSV is written when there were no frames
                print(f"[Step 5] Skipping {csv_f.name}: empty CSV")
                continue
            record = {'keypoints': frame_table.values.tolist()}
            out_pkl = pkl_dir / f"{csv_f.stem}.pkl"
            with open(out_pkl, 'wb') as f:
                pickle.dump(record, f)
    print("[Step 5] CSV to PKL conversion complete.")

# ---------------- Main Pipeline ----------------
def main():
    """Run the five preprocessing steps in order on ``DATASET_ROOT``."""
    # make sure the base label directories exist before anything is copied
    for label in ('true', 'false'):
        (DATASET_ROOT / label).mkdir(parents=True, exist_ok=True)

    classify_actions(INPUT_ROOT, DATASET_ROOT)
    # the remaining steps all operate on the dataset root only
    remaining_steps = (
        convert_all_jpgs,
        extract_balanced_true,
        preprocess_videos,
        convert_csv_to_pkl,
    )
    for step in remaining_steps:
        step(DATASET_ROOT)
    print("All preprocessing steps completed.")


if __name__ == '__main__':
    main()


## Split off the test data

In [None]:
import os
import shutil
import random
from pathlib import Path

# ---------------- Configuration ----------------
# Root of the preprocessed training dataset (one sub-directory per label).
TRAIN_ROOT = Path(r"D:/golfDataset/dataset/train")
# Destination root of the test split; mirrors the <label>/<subfolder> layout.
TEST_ROOT = Path(r"D:/golfDataset/dataset/test")
# Backup root: each file is copied here before it is moved to the test split.
TMP_ROOT = Path(r"D:/golfDataset/dataset/tmp")
# Fraction of each label's videos to move from train to test.
SPLIT_RATIO = 0.2
# Random seed for the shuffle that picks the test videos.
SEED = 42


def _belongs_to_selected(file: Path, subfolder: str, selected: set) -> bool:
    """Return True if *file* belongs to one of the *selected* video stems.

    A file belongs to a video when its stem equals the video stem. In the
    ``json`` subfolder, annotation files named ``<video stem>_<frame>.json``
    (numeric frame suffix) belong to the video as well.
    """
    if not file.suffix:
        return False
    if file.stem in selected:
        return True
    if subfolder != 'json':
        return False
    prefix, sep, frame = file.stem.rpartition('_')
    return bool(sep) and frame.isdigit() and prefix in selected


def split_train_test(train_root: Path, test_root: Path, tmp_root: Path,
                     split_ratio: float = SPLIT_RATIO, seed: int = SEED):
    """Move a random fraction of every label's videos from train to test.

    For each label directory below *train_root*, ``int(n * split_ratio)`` of
    its ``video/*.mp4`` stems are chosen at random. Every file belonging to
    a chosen stem in the ``json``, ``video``, ``crop_video``,
    ``crop_keypoint`` and ``crop_pkl`` subfolders is first copied to
    *tmp_root* (backup) and then moved to *test_root*, keeping the
    ``<label>/<subfolder>/<file>`` layout.

    The selection is reproducible for a given *seed*: labels and stems are
    sorted before shuffling, and a private random generator is used so the
    module-level ``random`` state is left untouched.

    Args:
        train_root: Root of the training dataset.
        test_root: Root that receives the test split (created if needed).
        tmp_root: Root that receives the backup copies (created if needed).
        split_ratio: Fraction of each label's videos to move.
        seed: Seed of the random generator used for the selection.
    """
    rng = random.Random(seed)
    test_root.mkdir(parents=True, exist_ok=True)
    tmp_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): each label is split independently, so a video present
    # under two labels (e.g. a copy of a 'true' video in 'balanced_true')
    # can end up in train for one and in test for the other - confirm this
    # is acceptable.
    for label_dir in sorted(train_root.iterdir()):
        if not label_dir.is_dir():
            continue
        # gather all video stems in a filesystem-independent order
        stems = sorted(v.stem for v in (label_dir / 'video').glob('*.mp4'))
        if not stems:
            continue
        rng.shuffle(stems)
        k = int(len(stems) * split_ratio)
        selected = set(stems[:k])
        if not selected:
            continue

        # move every file that belongs to a selected stem, one pass per folder
        for subfolder in ['json', 'video', 'crop_video', 'crop_keypoint', 'crop_pkl']:
            src_dir = label_dir / subfolder
            if not src_dir.exists():
                continue
            # snapshot the listing: files are moved out while we iterate
            for file in list(src_dir.iterdir()):
                if not file.is_file():
                    continue
                if not _belongs_to_selected(file, subfolder, selected):
                    continue
                rel_path = Path(label_dir.name) / subfolder / file.name
                dest_tmp = tmp_root / rel_path
                dest_test = test_root / rel_path
                dest_tmp.parent.mkdir(parents=True, exist_ok=True)
                dest_test.parent.mkdir(parents=True, exist_ok=True)
                # backup original
                shutil.copy2(file, dest_tmp)
                # move to test
                shutil.move(str(file), str(dest_test))

    print(f"Splitting complete: {split_ratio*100}% of training data moved to test.")


if __name__ == '__main__':
    split_train_test(TRAIN_ROOT, TEST_ROOT, TMP_ROOT)
