# Import and configs


In [25]:
!pip install --no-deps koolbox



In [26]:
import json

from collections import defaultdict

import pandas as pd
import polars as pl


class HostVisibleError(Exception):
    """Error raised by the metric code when a submission is invalid.

    In this file it is raised by ``single_lab_f1`` when one agent/target pair
    has more than one prediction covering the same frame.
    """

def single_lab_f1(lab_solution: pl.DataFrame, lab_submission: pl.DataFrame, beta: float = 1) -> float:
    """Return the mean per-action F-beta score for the videos of one lab.

    Frames are compared as sets: every annotation / prediction row contributes
    the half-open frame range ``range(start_frame, stop_frame)``.

    Args:
        lab_solution: Ground-truth rows. The code reads the columns
            ``label_key``, ``video_id``, ``behaviors_labeled`` (a JSON list of
            ``"agent,target,action"`` strings), ``start_frame`` and ``stop_frame``.
        lab_submission: Predicted rows. The code reads ``prediction_key``,
            ``video_id``, ``agent_id``, ``target_id``, ``action``,
            ``start_frame`` and ``stop_frame``.
        beta: Weight of recall relative to precision in the F-beta formula.

    Returns:
        The unweighted average of the F-beta scores of all distinct actions.

    Raises:
        HostVisibleError: If, within one video, two kept predictions for the
            same agent/target pair cover the same frame.
        ZeroDivisionError: If no action is collected at all (e.g. an empty
            ``lab_solution``), because the final mean divides by zero.
    """
    label_frames: defaultdict[str, set[int]] = defaultdict(set)
    prediction_frames: defaultdict[str, set[int]] = defaultdict(set)

    # Collect the labelled frames of every label key.
    for row in lab_solution.to_dicts():
        label_frames[row['label_key']].update(range(row['start_frame'], row['stop_frame']))

    for video in lab_solution['video_id'].unique():
        # `behaviors_labeled` is read from the first solution row of the video.
        active_labels: str = lab_solution.filter(pl.col('video_id') == video)['behaviors_labeled'].first()  # ty: ignore
        active_labels: set[str] = set(json.loads(active_labels))
        # Frames already claimed per "agent,target" pair; reset for every video.
        predicted_mouse_pairs: defaultdict[str, set[int]] = defaultdict(set)

        for row in lab_submission.filter(pl.col('video_id') == video).to_dicts():
            # Since the labels are sparse, we can't evaluate prediction keys not in the active labels.
            if ','.join([str(row['agent_id']), str(row['target_id']), row['action']]) not in active_labels:
                continue

            new_frames = set(range(row['start_frame'], row['stop_frame']))

            # Ignore truly redundant predictions.
            new_frames = new_frames.difference(prediction_frames[row['prediction_key']])
            prediction_pair = ','.join([str(row['agent_id']), str(row['target_id'])])
            if predicted_mouse_pairs[prediction_pair].intersection(new_frames):
                # A single agent can have multiple targets per frame (ex: evading all other mice) but only one action per target per frame.
                raise HostVisibleError('Multiple predictions for the same frame from one agent/target pair')

            prediction_frames[row['prediction_key']].update(new_frames)
            predicted_mouse_pairs[prediction_pair].update(new_frames)

    # Per-action frame counts: true positives, false negatives, false positives.
    tps = defaultdict(int)
    fns = defaultdict(int)
    fps = defaultdict(int)
    for key, pred_frames in prediction_frames.items():
        # The action is the text after the last '_' of the key.
        action = key.split('_')[-1]
        # NOTE: indexing the defaultdict inserts `key` (with an empty set) when it
        # has no labels, so the loop over `label_frames` below also visits
        # prediction-only keys and adds their action to `distinct_actions`.
        matched_label_frames = label_frames[key]
        tps[action] += len(pred_frames.intersection(matched_label_frames))
        fns[action] += len(matched_label_frames.difference(pred_frames))
        fps[action] += len(pred_frames.difference(matched_label_frames))

    distinct_actions = set()
    for key, frames in label_frames.items():
        action = key.split('_')[-1]
        distinct_actions.add(action)
        # Label keys without any prediction count entirely as false negatives.
        if key not in prediction_frames:
            fns[action] += len(frames)

    action_f1s = []
    for action in distinct_actions:
        if tps[action] + fns[action] + fps[action] == 0:
            # No labelled and no predicted frames for this action: score it 0.
            action_f1s.append(0)
        else:
            action_f1s.append((1 + beta**2) * tps[action] / ((1 + beta**2) * tps[action] + beta**2 * fns[action] + fps[action]))
    return sum(action_f1s) / len(action_f1s)

def mouse_fbeta(solution: pd.DataFrame, submission: pd.DataFrame, beta: float = 1) -> float:
    """Return the F-beta score averaged over labs.

    Both frames are validated, converted to polars, restricted to the videos
    present in ``solution`` and given a ``video_agent_target_action`` key
    column; each lab is then scored with ``single_lab_f1``.

    Args:
        solution: Ground-truth rows; must also provide ``lab_id`` (and the
            columns ``single_lab_f1`` reads).
        submission: Predicted rows.
        beta: Forwarded to ``single_lab_f1``.

    Returns:
        The unweighted mean of the per-lab scores.

    Raises:
        ValueError: If either frame is empty or misses a required column.
        AssertionError: If any row has ``start_frame > stop_frame``.
    """
    if len(solution) == 0 or len(submission) == 0:
        raise ValueError('Missing solution or submission data')

    required = ('video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame')
    for name in required:
        if name not in solution.columns:
            raise ValueError(f'Solution is missing column {name}')
        if name not in submission.columns:
            raise ValueError(f'Submission is missing column {name}')

    def with_key(frame, alias):
        # Build "<video>_<agent>_<target>_<action>" and store it under `alias`.
        pieces = [pl.col(name).cast(pl.Utf8) for name in ('video_id', 'agent_id', 'target_id')]
        pieces.append(pl.col('action'))
        return frame.with_columns(pl.concat_str(pieces, separator='_').alias(alias))

    solution = pl.DataFrame(solution)
    submission = pl.DataFrame(submission)
    assert (solution['start_frame'] <= solution['stop_frame']).all()
    assert (submission['start_frame'] <= submission['stop_frame']).all()

    # Need to align based on video IDs as we can't rely on the row IDs for handling public/private splits.
    solution_videos = set(solution['video_id'].unique())
    submission = submission.filter(pl.col('video_id').is_in(solution_videos))

    solution = with_key(solution, 'label_key')
    submission = with_key(submission, 'prediction_key')

    per_lab = []
    for lab in solution['lab_id'].unique():
        lab_solution = solution.filter(pl.col('lab_id') == lab).clone()
        lab_videos = set(lab_solution['video_id'].unique())
        lab_submission = submission.filter(pl.col('video_id').is_in(lab_videos)).clone()
        per_lab.append(single_lab_f1(lab_solution, lab_submission, beta=beta))

    return sum(per_lab) / len(per_lab)

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, beta: float = 1) -> float:
    """Drop the row-id column (if present) from both frames and return ``mouse_fbeta``.

    Args:
        solution: Ground-truth rows.
        submission: Predicted rows.
        row_id_column_name: Column removed from both frames; ignored when absent.
        beta: Forwarded to ``mouse_fbeta``.
    """
    drop_options = dict(axis='columns', errors='ignore')
    solution_no_id = solution.drop(row_id_column_name, **drop_options)
    submission_no_id = submission.drop(row_id_column_name, **drop_options)
    return mouse_fbeta(solution_no_id, submission_no_id, beta=beta)

In [27]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score
from sklearn.base import clone
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from tqdm.notebook import tqdm
from koolbox import Trainer
import numpy as np
import itertools
import optuna
import warnings
import joblib
import glob
import gc
import logging
from sklearn.model_selection import StratifiedShuffleSplit

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
SEED = 1245

In [28]:

class CFG:
    """Notebook-wide configuration: input locations, model output paths and CV setup."""

    # Competition metadata tables (CSV).
    train_path = "/kaggle/input/MABe-mouse-behavior-detection/train.csv"
    test_path = "/kaggle/input/MABe-mouse-behavior-detection/test.csv"
    # Directories holding one parquet file per video, laid out as <lab_id>/<video_id>.parquet.
    train_annotation_path = "/kaggle/input/MABe-mouse-behavior-detection/train_annotation"
    train_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/train_tracking"
    test_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/test_tracking"

    # Output location, composed as <model_root>/<model_name>.
    model_root = "/kaggle/working"
    model_name = "tcnn"
    model_dir = f"{model_root}/{model_name}"

    # Presumably the early-stopping patience used during training -- TODO confirm at the use site.
    patience = 5

    # Run mode. "validate" makes the notebook build the ground-truth solution frame;
    # the alternative value left by the author is "submit".
    # mode = "submit"
    mode = "validate"

    # Cross-validation splitter shared by the notebook (default shuffle settings).
    n_splits = 3
    cv = StratifiedGroupKFold(n_splits)


# Loading and preprocessing

In [29]:
# Load the competition metadata tables.
train = pd.read_csv(CFG.train_path)
test = pd.read_csv(CFG.test_path)

# Mice per video = how many of the four strain columns are filled in.
train["n_mice"] = train[[f"mouse{i}_strain" for i in range(1, 5)]].notna().sum(axis=1)
# Training rows whose lab id does not start with the 'MABe22_' prefix.
train_without_mbae = train[~train["lab_id"].str.startswith("MABe22_")]


In [30]:
# Sorted unique raw `body_parts_tracked` entries found in the training metadata.
body_parts_tracked_list = list(np.unique(train.body_parts_tracked))

## Creating label dataframe

In [31]:
def create_solution_df(dataset):
    """Build the ground-truth frame used for validation scoring.

    For every row of ``dataset`` (skipping labs whose id starts with
    ``'MABe22'``) the annotation parquet
    ``<CFG.train_annotation_path>/<lab_id>/<video_id>.parquet`` is read and
    tagged with ``lab_id``, ``video_id`` and ``behaviors_labeled``. Ids are
    rewritten to strings: ``agent_id`` becomes ``"mouse<id>"`` and
    ``target_id`` becomes ``"mouse<id>"`` or ``"self"`` when it equals the agent.

    Args:
        dataset: Metadata frame with ``lab_id``, ``video_id`` and
            ``behaviors_labeled`` columns.

    Returns:
        ``(solution, missing_file)``: the concatenated annotations and the
        list of annotation paths that did not exist. ``solution`` is an empty
        ``DataFrame`` when no annotation could be loaded (previously
        ``pd.concat`` raised ``ValueError`` in that case).
    """
    annotations = []
    missing_file = []
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
        lab_id = row['lab_id']

        if lab_id.startswith('MABe22'):
            continue

        video_id = row['video_id']
        path = f"{CFG.train_annotation_path}/{lab_id}/{video_id}.parquet"
        try:
            anno = pd.read_parquet(path)
        except FileNotFoundError:
            # Some videos have no annotation file; record the path and move on.
            missing_file.append(path)
            continue

        anno['lab_id'] = lab_id
        anno['video_id'] = video_id
        anno['behaviors_labeled'] = row['behaviors_labeled']

        # target_id must be rewritten first: the comparison needs the raw agent_id.
        anno['target_id'] = np.where(
            anno.target_id != anno.agent_id,
            anno['target_id'].apply(lambda s: f"mouse{s}"),
            'self',
        )
        anno['agent_id'] = anno['agent_id'].apply(lambda s: f"mouse{s}")

        annotations.append(anno)

    if not annotations:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return pd.DataFrame(), missing_file

    return pd.concat(annotations), missing_file

# In validate mode, build the ground-truth frame from the non-MABe22 training rows
# and report the annotation files that could not be found.
if CFG.mode == 'validate':
    solution, missing = create_solution_df(train_without_mbae)
    logging.warning("Files not found:")
    print(missing)

  0%|          | 0/863 [00:00<?, ?it/s]



['/kaggle/input/MABe-mouse-behavior-detection/train_annotation/PleasantMeerkat/1375833299.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/139713291.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/167444193.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/329031399.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/361341393.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/484405601.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/610412175.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/687999061.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/801328824.parquet', '/kaggle/input/MABe-mouse-behavior-detection/train_annotation/SparklingTapir/834408298.parquet', '/kaggle/input/MABe-mouse-b

In [32]:
# Body parts removed from a video's tracking data by `generate_mouse_data`
# when that video tracks more than 5 distinct body parts.
DROP_BODY_PARTS = [
    'headpiece_bottombackleft', 'headpiece_bottombackright',
    'headpiece_bottomfrontleft', 'headpiece_bottomfrontright',
    'headpiece_topbackleft', 'headpiece_topbackright',
    'headpiece_topfrontleft', 'headpiece_topfrontright',
    'spine_1', 'spine_2',
    'tail_middle_1', 'tail_middle_2', 'tail_midpoint'
]

def generate_mouse_data(dataset, mode=None, is_train=True):
    """Yield per-mouse and per-pair tracking data (plus labels) for each video.

    Args:
        dataset: Metadata frame; the code reads ``lab_id``, ``video_id``,
            ``behaviors_labeled`` (JSON list of ``"agent,target,action"``
            strings) and ``pix_per_cm_approx`` from each row.
        mode: ``'single'`` to yield only self-directed data, ``'pair'`` to
            yield only agent/target pairs, ``None`` for both.
        is_train: When true, tracking is read from ``CFG.train_tracking_path``
            and annotations are loaded to build frame-level labels; otherwise
            tracking is read from ``CFG.test_tracking_path``.

    Yields:
        ``(kind, tracking, meta, target)`` where ``kind`` is ``"single"`` or
        ``"pair"``, ``tracking`` is the pivoted coordinate frame of the mouse
        (or of both mice under top-level keys ``"A"``/``"B"``), ``meta`` holds
        ``video_id``, ``agent_id``, ``target_id`` and ``video_frame``, and
        ``target`` is a frame x action label frame (train) or the array of
        action names (non-train).

    Videos from labs starting with ``"MABe22"``, rows whose
    ``behaviors_labeled`` is not a string, and training videos without an
    annotation file are skipped.
    """
    if is_train:
        data_dir = CFG.train_tracking_path
    else:
        data_dir = CFG.test_tracking_path

    for _, row in dataset.iterrows():
        lab_id = row.lab_id

        # Skip MABe22 labs and rows whose behaviors_labeled is not a string (e.g. missing).
        if lab_id.startswith("MABe22") or not isinstance(row.behaviors_labeled, str):
            continue

        video_id = row.video_id
        tracking_path =  f"{data_dir}/{lab_id}/{video_id}.parquet";

        vid = pd.read_parquet(tracking_path)

        # When more than 5 distinct body parts are tracked, drop the ones in DROP_BODY_PARTS.
        if len(np.unique(vid.bodypart)) > 5:
            vid = vid.query("~bodypart.isin(@DROP_BODY_PARTS)")

        # Long -> wide: one row per video_frame, columns indexed by (coordinate, mouse_id, bodypart).
        pvid = vid.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"],
        )

        # Free the long-format frame before further processing.
        del vid
        gc.collect()

        # Reorder column levels (coordinate, mouse, bodypart) -> (mouse, bodypart, coordinate) and sort the columns.
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).T.sort_index().T

        # Convert pixel coordinates using the video's approximate pixels-per-cm factor.
        pvid = pvid / row.pix_per_cm_approx

        # behaviors_labeled is a JSON list.
        raw_behaviors = json.loads(row.behaviors_labeled)

        # Strip single quotes, de-duplicate via a set, then sort.
        cleaned = {b.replace("'", "") for b in raw_behaviors} 
        cleaned = sorted(list(cleaned))

        # Split each "agent,target,action" entry into three columns.
        behaviors_split = [b.split(",") for b in cleaned]
        vid_beh = pd.DataFrame(behaviors_split, columns=["agent", "target", "action"])

        if is_train:
            try: 
                # The annotation file mirrors the tracking path under train_annotation.
                anno_path = tracking_path.replace("train_tracking", "train_annotation")
                anno = pd.read_parquet(anno_path)
            except FileNotFoundError:
                # No annotations for this video: nothing to train on.
                continue

        # ---- SINGLE MOUSE ----
        if mode is None or mode == 'single' :
            # Keep only self-directed behaviors.
            vid_beh_single = vid_beh.query("target == 'self'")

            for agent_str in np.unique(vid_beh_single.agent):
                try:
                    # Mouse id = last character of the agent string.
                    # NOTE(review): this only works for single-digit ids -- confirm ids never exceed 9.
                    mouse_id = int(agent_str[-1])

                    # All self-directed actions listed for this agent.
                    agent_actions = np.unique(vid_beh_single.query("agent == @agent_str").action)

                    # Tracking columns of this agent; raises KeyError (handled below) if the mouse is not tracked.
                    single_mouse = pvid.loc[:, mouse_id] 
                    assert len(single_mouse) == len(pvid)

                    single_meta = pd.DataFrame({
                        "video_id": video_id,
                        "agent_id": agent_str,
                        "target_id": "self",
                        "video_frame": single_mouse.index, # one row per frame
                    })

                    if is_train:
                        # Frame x action label matrix, initialised to 0.
                        single_label = pd.DataFrame(0.0, columns=agent_actions, index=single_mouse.index)
                        anno_single = anno.query("(agent_id == @mouse_id) & (target_id == @mouse_id)")

                        for _, anno_row in anno_single.iterrows():
                            start = anno_row['start_frame']
                            end = anno_row['stop_frame']
                            action = anno_row['action']
                            # NOTE(review): .loc slicing includes `end`, whereas the metric in this
                            # file treats stop_frame as exclusive (range(start, stop)) -- confirm intent.
                            single_label.loc[start:end, action] = 1.0

                        yield "single", single_mouse, single_meta, single_label

                    else:
                        yield "single", single_mouse, single_meta, agent_actions

                except KeyError:
                    # Agent listed in behaviors_labeled but absent from the tracking data.
                    continue

        # ---- PAIR MOUSE ----
        if mode is None or mode == 'pair':
            # Keep only behaviors directed at another mouse.
            vid_behaviors_pair = vid_beh.query("target != 'self'")

            if len(vid_behaviors_pair) == 0:
                continue

            # Ids of all mice present in the tracking data.
            mouse_ids = np.unique(pvid.columns.get_level_values("mouse_id"))

            # Every ordered (agent, target) pair with agent != target, including
            # pairs for which no action is listed (pair_actions is then empty).
            for agent_id, target_id in itertools.permutations(mouse_ids, 2):
                agent_str = f"mouse{agent_id}"
                target_str = f"mouse{target_id}"

                # Actions listed for this (agent, target) pair.
                pair_actions = np.unique(
                    vid_behaviors_pair.query("(agent == @agent_str) & (target == @target_str)").action
                )

                # Side-by-side tracking of the two mice.
                mouse_pair = pd.concat(
                    [pvid[agent_id], pvid[target_id]],
                    axis=1,
                    keys=["A", "B"],  # A = agent, B = target
                )
                assert len(mouse_pair) == len(pvid)

                # Per-frame metadata.
                pair_meta = pd.DataFrame({
                    "video_id": video_id,
                    "agent_id": agent_str,
                    "target_id": target_str,
                    "video_frame": mouse_pair.index,
                })

                if is_train:
                    # Frame x action label matrix, initialised to 0.
                    pair_label = pd.DataFrame(0.0, columns=pair_actions, index=mouse_pair.index)
                    anno_pair = anno.query(
                        "(agent_id == @agent_id) & (target_id == @target_id)"
                    )

                    for _, anno_row in anno_pair.iterrows():
                        start = anno_row["start_frame"]
                        end = anno_row["stop_frame"]
                        action = anno_row["action"]
                        # NOTE(review): inclusive of `end`, same caveat as in the single-mouse branch.
                        pair_label.loc[start:end, action] = 1.0

                    yield "pair", mouse_pair, pair_meta, pair_label

                else:
                    # Non-train: yield the action names instead of labels.
                    yield "pair", mouse_pair, pair_meta, pair_actions
    

# Feature Engineering


In [33]:
# Helpers: fps lookup and scaling of frame counts defined at 30 fps

def scaled_window(n_frames_30fps , fps , min_frac=0.2, min_abs=1):
    """Scale a window length defined at 30 fps to ``fps`` and derive ``min_periods``.

    Args:
        n_frames_30fps: Window length in frames at 30 fps.
        fps: Actual frame rate of the video.
        min_frac: Fraction of the scaled window required as minimum observations.
        min_abs: Lower bound for the minimum number of observations.

    Returns:
        ``(ws, min_periods)`` with ``ws >= 1`` and ``min_periods <= ws``.
    """
    ws = max(1, int(round(n_frames_30fps * float(fps) / 30.0)))
    min_periods = max(min_abs, int(round(ws * min_frac)))
    # pandas rolling raises ValueError when min_periods exceeds the window,
    # which could happen for small windows combined with a large min_abs.
    return ws, min(ws, min_periods)


def _fps_from_meta(meta_df, fallback_lookup: dict, default_fps: float = 30.0):
    if "frames_per_second" in meta_df.columns and pd.notnull(meta_df["frames_per_second"]).any():
        return float(meta_df["frames_per_second"].iloc[0])
    vid = meta_df["video_id"].iloc[0]
    return float(fallback_lookup.get(vid, default_fps))

def _scale(n_frames_at_30fps, fps, ref=30.0):
    return max(1, int(round(n_frames_at_30fps * float(fps) / ref)))

def _scale_signed(n_frames_at_30fps, fps, ref=30.0):
    if n_frames_at_30fps == 0:
        return 0
    s = 1 if n_frames_at_30fps > 0 else -1
    mag = max(1, int(round(abs(n_frames_at_30fps) * float(fps) / ref)))
    return s * mag

In [34]:
# Trajectory features computed for a single mouse

def add_curvature_features(X, center_x, center_y, fps):
    """Add path-curvature and turn-rate features to ``X`` and return it.

    Adds ``curv_mean_{25,50,75}`` (rolling mean of |v x a| / (|v|^3 + 1e-6))
    and ``turn_rate_30`` (rolling sum of absolute heading change). Window
    lengths are defined at 30 fps and rescaled with ``scaled_window``.
    """
    # First and second frame-to-frame differences of the centre position.
    vel_x, vel_y = center_x.diff(), center_y.diff()
    acc_x, acc_y = vel_x.diff(), vel_y.diff()

    cross_mag = np.abs(vel_x * acc_y - vel_y * acc_x)
    speed_cubed = np.sqrt(vel_x**2 + vel_y**2) ** 3
    curvature = cross_mag / (speed_cubed + 1e-6)

    for base_w in (25, 50, 75):
        window, min_obs = scaled_window(base_w, fps, min_frac=0.2)
        X[f"curv_mean_{base_w}"] = curvature.rolling(window, min_periods=min_obs).mean()

    # NOTE(review): heading differences are not wrapped to [-pi, pi], so a
    # crossing of the +/-pi boundary shows up as a change of about 2*pi.
    heading = np.arctan2(vel_y, vel_x)
    heading_change = np.abs(heading.diff())
    window, min_obs = scaled_window(30, fps, min_frac=0.2)
    X["turn_rate_30"] = heading_change.rolling(window, min_periods=min_obs).sum()

    return X

def add_multiscale_features(X, center_x, center_y, fps):
    """Add rolling mean/std of speed at several time scales to ``X`` and return it.

    Adds ``sp_m{w}`` / ``sp_s{w}`` for w in 20, 40, 60, 80 (30-fps frames,
    rescaled to ``fps``) whenever the series is at least one window long, and
    ``sp_ratio`` (shortest-scale mean over longest-scale mean) when both
    means are present in ``X``.
    """
    step = np.sqrt(center_x.diff()**2 + center_y.diff()**2)
    speed = step * float(fps)

    scales = [20, 40, 60, 80]
    for base_w in scales:
        window, min_obs = scaled_window(base_w, fps, min_frac=0.25)
        if len(speed) < window:
            continue
        rolled = speed.rolling(window, min_periods=min_obs)
        X[f"sp_m{base_w}"] = rolled.mean()
        X[f"sp_s{base_w}"] = speed.rolling(window, min_periods=min_obs).std()

    shortest, longest = f"sp_m{scales[0]}", f"sp_m{scales[-1]}"
    if shortest in X.columns and longest in X.columns:
        X["sp_ratio"] = X[shortest] / (X[longest] + 1e-6)

    return X

def add_state_features(X, center_x, center_y, fps):
    """Add discretised speed-state features to ``X`` and return it.

    The smoothed speed is binned into four states (thresholds 0.5, 2 and 5
    times ``fps``). For each window w in 20, 40, 60, 80 (30-fps frames,
    rescaled to ``fps``) the function adds ``s{state}_{w}`` (fraction of the
    window spent in each state) and ``trans_{w}`` (number of state changes).

    The computation is best-effort: if it fails, a warning is logged and
    ``X`` is returned with whatever columns were added so far (previously the
    failure was swallowed silently).
    """
    speed = np.sqrt(center_x.diff()**2 + center_y.diff()**2) * float(fps)

    ws_ma, mp_ma = scaled_window(15, fps, min_frac=1/3)
    speed_ma = speed.rolling(ws_ma, min_periods=mp_ma).mean()

    try:
        bins = [-np.inf, 0.5 * fps, 2.0 * fps, 5.0 * fps, np.inf]
        speed_states = pd.cut(speed_ma, bins=bins, labels=[0, 1, 2, 3]).astype(float)

        for base_w in [20, 40, 60, 80]:
            ws, mp = scaled_window(base_w, fps, min_frac=0.2)
            if len(speed_states) < ws:
                continue

            for state in [0, 1, 2, 3]:
                X[f"s{state}_{base_w}"] = (
                    (speed_states == state)
                    .astype(float)
                    .rolling(ws, min_periods=mp)
                    .mean()
                )

            # A frame counts as a transition when its state differs from the previous frame's.
            state_changes = (speed_states != speed_states.shift(1)).astype(float)
            X[f"trans_{base_w}"] = state_changes.rolling(ws, min_periods=mp).sum()
    except Exception as exc:
        # Stay best-effort, but make the failure visible instead of hiding it.
        logging.warning(
            "add_state_features: speed-state features skipped or incomplete (fps=%r): %s",
            fps,
            exc,
        )

    return X

def add_longrange_features(X, center_x, center_y, fps):
    """Add long-horizon position and speed features to ``X`` and return it.

    For horizons 30, 60 and 120 (30-fps frames, rescaled to ``fps``) adds
    rolling position means ``x_ml*`` / ``y_ml*``, exponentially weighted means
    ``x_e*`` / ``y_e*`` and the rolling percentile rank of speed ``sp_pct*``.
    Rolling features are skipped when the series is shorter than the window.
    """
    horizons = (30, 60, 120)

    # Long-range moving averages of the position.
    for base_w in horizons:
        window, min_obs = scaled_window(base_w, fps, min_frac=1/6, min_abs=5)
        if len(center_x) < window:
            continue
        X[f"x_ml{base_w}"] = center_x.rolling(window, min_periods=min_obs).mean()
        X[f"y_ml{base_w}"] = center_y.rolling(window, min_periods=min_obs).mean()

    # Exponentially weighted means; only the scaled span is taken from
    # scaled_window, min_periods is set separately below.
    for span in horizons:
        scaled_span = scaled_window(span, fps, min_frac=0.0)[0]
        X[f"x_e{span}"] = center_x.ewm(span=scaled_span, min_periods=1).mean()
        X[f"y_e{span}"] = center_y.ewm(span=scaled_span, min_periods=1).mean()

    # Percentile rank of the current speed within its rolling window.
    step = np.sqrt(center_x.diff()**2 + center_y.diff()**2)
    speed = step * float(fps)
    for base_w in horizons:
        window, min_obs = scaled_window(base_w, fps, min_frac=1 / 6, min_abs=5)
        if len(speed) < window:
            continue
        X[f"sp_pct{base_w}"] = speed.rolling(window, min_periods=min_obs).rank(pct=True)

    return X


In [35]:
def add_single_extra_features(X, single_mouse, available_parts, fps):
    """Add posture, nose-to-flank and ear-distance dynamics to ``X`` and return it.

    Each group is only added when all body parts it needs are in
    ``available_parts``:

    * ``pose_curv``: cosine between (body_center - tail_base) and (nose - body_center).
    * ``nose_lateral_dist`` / ``nose_lateral_vel``: distance from the nose to
      the midpoint of the two lateral points, and its frame-to-frame change.
    * ``ear_vel`` / ``ear_acc``: first and second difference of the ear-to-ear distance.

    ``fps`` is accepted for signature parity with the other feature helpers
    and is not used here.
    """
    def tracked(*parts):
        return all(part in available_parts for part in parts)

    # Posture curvature.
    if tracked('nose', 'body_center', 'tail_base'):
        rear = single_mouse['body_center'] - single_mouse['tail_base']
        front = single_mouse['nose'] - single_mouse['body_center']

        dot = rear['x'] * front['x'] + rear['y'] * front['y']
        rear_len = np.sqrt(rear['x']**2 + rear['y']**2)
        front_len = np.sqrt(front['x']**2 + front['y']**2)

        cosine = dot / (rear_len * front_len + 1e-6)
        X['pose_curv'] = cosine.astype(np.float32)

    # Verticality / rearing proxy.
    if tracked('nose', 'lateral_left', 'lateral_right'):
        nose = single_mouse['nose']
        left = single_mouse['lateral_left']
        right = single_mouse['lateral_right']

        mid_x = (left['x'] + right['x']) / 2.0
        mid_y = (left['y'] + right['y']) / 2.0
        nose_to_mid = np.sqrt((nose['x'] - mid_x)**2 + (nose['y'] - mid_y)**2)

        X["nose_lateral_dist"] = nose_to_mid.astype(np.float32)
        X["nose_lateral_vel"] = nose_to_mid.diff().astype(np.float32)

    # Ear dynamics.
    if tracked('ear_left', 'ear_right'):
        ear_l = single_mouse['ear_left']
        ear_r = single_mouse['ear_right']
        ear_gap = np.sqrt((ear_l['x'] - ear_r['x'])**2 + (ear_l['y'] - ear_r['y'])**2)

        ear_gap_change = ear_gap.diff()
        X["ear_vel"] = ear_gap_change.astype(np.float32)
        X["ear_acc"] = ear_gap_change.diff().astype(np.float32)

    return X

In [36]:
def transform_single(single_mouse, body_parts_tracked, fps):
    """Build the per-frame feature matrix for one mouse.

    Args:
        single_mouse: Tracking frame whose columns are indexed by
            (bodypart, coordinate).
        body_parts_tracked: Body-part names defining the full, ordered set of
            pairwise-distance columns (missing parts yield NaN columns).
        fps: Frame rate used to rescale windows and lags defined at 30 fps.

    Returns:
        A float32 ``DataFrame`` with squared pairwise distances followed by
        lagged-displacement, shape, centre-trajectory, nose-tail and ear
        features, in a fixed column order.
    """
    available_parts = single_mouse.columns.get_level_values(0)

    def tracked(*parts):
        return all(part in available_parts for part in parts)

    # Squared distance between every pair of tracked body parts.
    sq_dists = {
        f"{p1}+{p2}": np.square(single_mouse[p1] - single_mouse[p2]).sum(axis=1, skipna=False)
        for p1, p2 in itertools.combinations(body_parts_tracked, 2)
        if p1 in available_parts and p2 in available_parts
    }
    X = pd.DataFrame(sq_dists)

    # Fix the column set and order, independent of which parts are available.
    ordered_cols = [f"{p1}+{p2}" for p1, p2 in itertools.combinations(body_parts_tracked, 2)]
    X = X.reindex(columns=ordered_cols, copy=False)

    # Squared displacement of the ears relative to lagged ear / tail-base positions (lag ~10 frames at 30 fps).
    if tracked('ear_left', 'ear_right', 'tail_base'):
        lag = _scale(10, fps)
        lagged = single_mouse[['ear_left', 'ear_right', 'tail_base']].shift(lag)
        now_then = [
            ('ear_left', 'ear_left'),
            ('ear_right', 'ear_right'),
            ('ear_left', 'tail_base'),
            ('ear_right', 'tail_base'),
        ]
        displacements = [
            np.square(single_mouse[now] - lagged[then]).sum(axis=1, skipna=False)
            for now, then in now_then
        ]
        X[['sp_lf', 'sp_rt', 'sp_lf2', 'sp_rt2']] = np.column_stack(displacements)

    # Elongation: nose-tail over ear-ear (both squared distances).
    if 'nose+tail_base' in X.columns and 'ear_left+ear_right' in X.columns:
        X['elong'] = X['nose+tail_base'] / (X['ear_left+ear_right'] + 1e-6)

    # Body angle: cosine between (nose - centre) and (tail_base - centre), plus its differences.
    if tracked('nose', 'body_center', 'tail_base'):
        to_nose = single_mouse['nose'] - single_mouse['body_center']
        to_tail = single_mouse['tail_base'] - single_mouse['body_center']

        dot = to_nose['x'] * to_tail['x'] + to_nose['y'] * to_tail['y']
        len_nose = np.sqrt(to_nose['x']**2 + to_nose['y']**2)
        len_tail = np.sqrt(to_tail['x']**2 + to_tail['y']**2)
        body_ang = dot / (len_nose * len_tail + 1e-6)

        X['body_ang'] = body_ang.astype(np.float32)
        X['body_ang_vel'] = body_ang.diff().astype(np.float32)
        X['body_ang_acc'] = body_ang.diff().diff().astype(np.float32)

    # Features derived from the body-centre trajectory.
    if 'body_center' in available_parts:
        cx = single_mouse['body_center']['x']
        cy = single_mouse['body_center']['y']

        # Frame-to-frame steps do not depend on the window, so compute them once.
        step_x = cx.diff()
        step_y = cy.diff()

        for base_w in [5, 15, 30, 60]:
            ws = _scale(base_w, fps)
            centered_x = cx.rolling(window=ws, min_periods=1, center=True)
            centered_y = cy.rolling(window=ws, min_periods=1, center=True)

            X[f'cx_m{base_w}'] = centered_x.mean()
            X[f'cy_m{base_w}'] = centered_y.mean()
            X[f'cx_s{base_w}'] = centered_x.std()
            X[f'cy_s{base_w}'] = centered_y.std()

            X[f'x_rng{base_w}'] = centered_x.max() - centered_x.min()
            X[f'y_rng{base_w}'] = centered_y.max() - centered_y.min()

            # Net displacement and activity over a trailing (non-centred) window.
            trailing_x = step_x.rolling(ws, min_periods=1)
            trailing_y = step_y.rolling(ws, min_periods=1)
            X[f'disp{base_w}'] = np.sqrt(trailing_x.sum()**2 + trailing_y.sum()**2)
            X[f'act{base_w}'] = np.sqrt(trailing_x.var() + trailing_y.var())

        # Higher-level trajectory features.
        X = add_curvature_features(X, cx, cy, fps)
        X = add_multiscale_features(X, cx, cy, fps)
        X = add_state_features(X, cx, cy, fps)
        X = add_longrange_features(X, cx, cy, fps)

    # Nose-to-tail-base distance: lagged value and change over the lag.
    if tracked('nose', 'tail_base'):
        nose_tail = np.sqrt(
            (single_mouse['nose']['x'] - single_mouse['tail_base']['x'])**2 +
            (single_mouse['nose']['y'] - single_mouse['tail_base']['y'])**2
        )
        for lag in [10, 20, 40]:
            lagged_nt = nose_tail.shift(_scale(lag, fps))
            X[f'nt_lg{lag}'] = lagged_nt
            X[f'nt_df{lag}'] = nose_tail - lagged_nt

    # Ear-to-ear distance at past/future offsets and its variability.
    if tracked('ear_left', 'ear_right'):
        ear_d = np.sqrt(
            (single_mouse['ear_left']['x'] - single_mouse['ear_right']['x'])**2 +
            (single_mouse['ear_left']['y'] - single_mouse['ear_right']['y'])**2
        )
        for off in [-30, -20, -10, 10, 20, 30]:
            X[f'ear_o{off}'] = ear_d.shift(-_scale_signed(off, fps))

        ear_roll = ear_d.rolling(window=_scale(30, fps), min_periods=1, center=True)
        ear_mean = ear_roll.mean()
        ear_std = ear_roll.std()
        # Coefficient of variation of the ear distance.
        X['ear_con'] = ear_std / (ear_mean + 1e-6)

    X = add_single_extra_features(X, single_mouse, available_parts, fps)

    return X.astype(np.float32, copy=False)


In [37]:
# Interaction features for an (agent, target) pair of mice
def add_interaction_features(X, mouse_pair, avail_A, avail_B, fps):
    """Add relative-motion features of mouse A (agent) and B (target) to ``X``.

    Requires ``body_center`` for both mice; otherwise ``X`` is returned
    unchanged. Adds ``A_ld*`` / ``B_ld*`` (rolling mean cosine between each
    mouse's velocity and the A-B separation vector), ``chase_30`` and
    ``sp_cor*`` (rolling correlation of the two speeds).
    """
    if "body_center" not in avail_A or "body_center" not in avail_B:
        return X

    center_A = mouse_pair["A"]["body_center"]
    center_B = mouse_pair["B"]["body_center"]
    Ax, Ay = center_A["x"], center_A["y"]
    Bx, By = center_B["x"], center_B["y"]

    # Separation vector (B -> A) and its length.
    rel_x = Ax - Bx
    rel_y = Ay - By
    rel_dist = np.sqrt(rel_x**2 + rel_y**2)

    # Frame-to-frame velocities and speeds.
    A_vx, A_vy = Ax.diff(), Ay.diff()
    B_vx, B_vy = Bx.diff(), By.diff()
    A_sp = np.sqrt(A_vx**2 + A_vy**2)
    B_sp = np.sqrt(B_vx**2 + B_vy**2)

    # Cosine between each velocity and the separation vector (negated for B).
    A_lead = (A_vx * rel_x + A_vy * rel_y) / (A_sp * rel_dist + 1e-6)
    B_lead = (B_vx * (-rel_x) + B_vy * (-rel_y)) / (B_sp * rel_dist + 1e-6)

    for base_w in (30, 60):
        window, min_obs = scaled_window(base_w, fps, min_frac=1 / 6)
        X[f"A_ld{base_w}"] = A_lead.rolling(window, min_periods=min_obs).mean()
        X[f"B_ld{base_w}"] = B_lead.rolling(window, min_periods=min_obs).mean()

    # Approach rate (decrease of the distance) weighted by B's lead cosine.
    closing = -rel_dist.diff()
    window, min_obs = scaled_window(30, fps, min_frac=1/6)
    X["chase_30"] = (closing * B_lead).rolling(window, min_periods=min_obs).mean()

    # Rolling correlation of the two speeds over long windows.
    for base_w in (60, 120):
        window, min_obs = scaled_window(base_w, fps, min_frac=1 / 6)
        X[f"sp_cor{base_w}"] = A_sp.rolling(window, min_periods=min_obs).corr(B_sp)

    return X


In [38]:
def add_egocentric_interaction_features(X, mouse_pair, avail_A, avail_B, fps):
    """Add egocentric bearing features for mouse A and mouse B to ``X``.

    Requires ``nose``, ``tail_base`` and ``body_center`` for both mice;
    otherwise ``X`` is returned unchanged. For each mouse the cosine and sine
    of the angle between its tail-to-nose direction and the vector towards
    the other mouse's centre are added (``{A,B}_bearing_{cos,sin}``), plus
    centred rolling means over 15 and 30 frames (30-fps frames, rescaled).
    """
    # Check that every required body part is available for both mice.
    required = ('nose', 'tail_base', 'body_center')
    if not all(p in avail_A for p in required) or not all(p in avail_B for p in required):
        return X

    mouse_A = mouse_pair['A']
    mouse_B = mouse_pair['B']

    def bearing(head_x, head_y, to_x, to_y):
        # Cosine (normalised dot) and sine (normalised cross; its sign
        # distinguishes left from right) between the heading and the
        # vector to the other mouse.
        dot = head_x * to_x + head_y * to_y
        norm_head = np.sqrt(head_x**2 + head_y**2) + 1e-6
        norm_to = np.sqrt(to_x**2 + to_y**2) + 1e-6
        cos_b = dot / (norm_head * norm_to)
        cross = head_x * to_y - head_y * to_x
        sin_b = cross / (norm_head * norm_to)
        return cos_b, sin_b

    # Centre positions.
    Ax, Ay = mouse_A['body_center']['x'], mouse_A['body_center']['y']
    Bx, By = mouse_B['body_center']['x'], mouse_B['body_center']['y']

    # Heading of each mouse: tail_base -> nose.
    headA_x = mouse_A['nose']['x'] - mouse_A['tail_base']['x']
    headA_y = mouse_A['nose']['y'] - mouse_A['tail_base']['y']
    headB_x = mouse_B['nose']['x'] - mouse_B['tail_base']['x']
    headB_y = mouse_B['nose']['y'] - mouse_B['tail_base']['y']

    # A looking at B.
    cos_A, sin_A = bearing(headA_x, headA_y, Bx - Ax, By - Ay)
    X['A_bearing_cos'] = cos_A.astype(np.float32)
    X['A_bearing_sin'] = sin_A.astype(np.float32)

    # B looking at A.
    cos_B, sin_B = bearing(headB_x, headB_y, Ax - Bx, Ay - By)
    X['B_bearing_cos'] = cos_B.astype(np.float32)
    X['B_bearing_sin'] = sin_B.astype(np.float32)

    # Centred rolling means of the stored bearing columns.
    bearing_cols = ('A_bearing_cos', 'A_bearing_sin', 'B_bearing_cos', 'B_bearing_sin')
    for base_w in (15, 30):
        window = _scale(base_w, fps)
        for col in bearing_cols:
            X[f'{col}_m{base_w}'] = X[col].rolling(window=window, min_periods=1, center=True).mean()

    return X


In [39]:
def add_asymmetry_features(X, mouse_pair, avail_A, avail_B, fps):
    """Add speed-asymmetry features between mouse A and mouse B to ``X``.

    Requires ``body_center`` for both mice; otherwise ``X`` is returned
    unchanged. Adds the instantaneous speed difference and ratio
    (``sp_diff_inst``, ``sp_ratio_inst``) and their centred rolling means
    over 30 and 60 frames (30-fps frames, rescaled to ``fps``).
    """
    if not ('body_center' in avail_A and 'body_center' in avail_B):
        return X

    def frame_speed(center):
        # Per-frame displacement magnitude of a body centre.
        return np.sqrt(center['x'].diff()**2 + center['y'].diff()**2)

    A_sp = frame_speed(mouse_pair['A']['body_center'])
    B_sp = frame_speed(mouse_pair['B']['body_center'])

    # Asymmetry as a difference and as a ratio (epsilon avoids division by zero).
    sp_diff = A_sp - B_sp
    sp_ratio = A_sp / (B_sp + 1e-6)

    X['sp_diff_inst'] = sp_diff.astype(np.float32)
    X['sp_ratio_inst'] = sp_ratio.astype(np.float32)

    # Centred rolling means over short and medium windows.
    for base_w in (30, 60):
        window = _scale(base_w, fps)
        diff_mean = sp_diff.rolling(window=window, min_periods=1, center=True).mean()
        ratio_mean = sp_ratio.rolling(window=window, min_periods=1, center=True).mean()
        X[f'sp_diff_m{base_w}'] = diff_mean.astype(np.float32)
        X[f'sp_ratio_m{base_w}'] = ratio_mean.astype(np.float32)

    return X


In [40]:
def transform_pair(mouse_pair, body_parts_tracked, fps):
    """Build the per-frame feature table for one (A, B) mouse pair.

    Args:
        mouse_pair: tracking table indexed as
            ``mouse_pair[mouse][body_part][coord]`` with ``mouse`` in
            ``{'A', 'B'}`` and ``coord`` in ``{'x', 'y'}``.
        body_parts_tracked: body-part names; every ordered pair (p1, p2)
            gets a ``"12+{p1}+{p2}"`` distance column, filled with NaN when
            a part is not available for that mouse.
        fps: passed to ``_scale`` / ``_scale_signed`` (presumably to rescale
            frame counts to the video frame rate -- verify in those helpers).

    Returns:
        DataFrame of features cast to float32. Feature groups that need a
        missing body part are simply not added, so the set of non-distance
        columns depends on which parts are available.

    NOTE: columns are inserted in statement order; keep that order stable if
    downstream code consumes the result positionally.
    """
    # Body parts present for each mouse (level 0 of the per-mouse columns).
    avail_A = mouse_pair['A'].columns.get_level_values(0)
    avail_B = mouse_pair['B'].columns.get_level_values(0)

    # pairwise SQUARED distance between A[p1] and B[p2]
    # (skipna=False: a missing coordinate yields NaN instead of a partial sum)
    features = {}
    for p1, p2 in itertools.product(body_parts_tracked, repeat=2):
        if p1 in avail_A and p2 in avail_B:
            diff = mouse_pair['A'][p1] - mouse_pair['B'][p2]
            dist2 = np.square(diff).sum(axis=1, skipna=False)
            features[f"12+{p1}+{p2}"] = dist2

    X = pd.DataFrame(features)
    # Reindex to the full product so the distance columns are always the same
    # set, in the same order, regardless of which parts were tracked.
    full_cols = [f"12+{p1}+{p2}" for p1, p2 in itertools.product(body_parts_tracked, repeat=2)]
    X = X.reindex(columns=full_cols, copy=False)

    # ear-left squared displacement over a lag of _scale(10, fps) frames:
    #   sp_A  : A now vs. A `lag` frames ago
    #   sp_AB : A now vs. B `lag` frames ago
    #   sp_B  : B now vs. B `lag` frames ago
    if ('A', 'ear_left') in mouse_pair.columns and ('B', 'ear_left') in mouse_pair.columns:
        lag = _scale(10, fps)
        shA = mouse_pair['A']['ear_left'].shift(lag)
        shB = mouse_pair['B']['ear_left'].shift(lag)

        sp_A  = np.square(mouse_pair['A']['ear_left'] - shA).sum(axis=1, skipna=False)
        sp_AB = np.square(mouse_pair['A']['ear_left'] - shB).sum(axis=1, skipna=False)
        sp_B  = np.square(mouse_pair['B']['ear_left'] - shB).sum(axis=1, skipna=False)

        X[['sp_A', 'sp_AB', 'sp_B']] = np.column_stack([sp_A, sp_AB, sp_B])

    # elongation = squared dist(nose, tail_base) / squared dist(ear_left, ear_right)
    have_A_elong = all(p in avail_A for p in ['nose', 'tail_base', 'ear_left', 'ear_right'])
    have_B_elong = all(p in avail_B for p in ['nose', 'tail_base', 'ear_left', 'ear_right'])

    if have_A_elong:
        nose_A = mouse_pair['A']['nose']
        tail_A = mouse_pair['A']['tail_base']
        el_A_l = mouse_pair['A']['ear_left']
        el_A_r = mouse_pair['A']['ear_right']

        nose_tail_A = np.square(nose_A - tail_A).sum(axis=1, skipna=False)
        ear_dist_A  = np.square(el_A_l - el_A_r).sum(axis=1, skipna=False)
        X['elong_A'] = nose_tail_A / (ear_dist_A + 1e-6)

    if have_B_elong:
        nose_B = mouse_pair['B']['nose']
        tail_B = mouse_pair['B']['tail_base']
        el_B_l = mouse_pair['B']['ear_left']
        el_B_r = mouse_pair['B']['ear_right']

        nose_tail_B = np.square(nose_B - tail_B).sum(axis=1, skipna=False)
        ear_dist_B  = np.square(el_B_l - el_B_r).sum(axis=1, skipna=False)
        X['elong_B'] = nose_tail_B / (ear_dist_B + 1e-6)

    # elongation difference and ratio (only when both mice have it)
    if 'elong_A' in X.columns and 'elong_B' in X.columns:
        X['elong_diff']  = X['elong_A'] - X['elong_B']
        X['elong_ratio'] = X['elong_A'] / (X['elong_B'] + 1e-6)

    # relative body angle between A and B: cosine between the
    # tail_base -> nose vectors of the two mice
    if all(p in avail_A for p in ['nose', 'tail_base']) and all(p in avail_B for p in ['nose', 'tail_base']):
        dir_A = mouse_pair['A']['nose'] - mouse_pair['A']['tail_base']
        dir_B = mouse_pair['B']['nose'] - mouse_pair['B']['tail_base']

        dot = dir_A['x'] * dir_B['x'] + dir_A['y'] * dir_B['y']
        nA = np.sqrt(dir_A['x']**2 + dir_A['y']**2)
        nB = np.sqrt(dir_B['x']**2 + dir_B['y']**2)
        X['rel_ori'] = dot / (nA * nB + 1e-6)

    # nose-nose approach: current squared nose distance minus the squared
    # distance `lag` frames earlier (negative when the noses got closer)
    if 'nose' in avail_A and 'nose' in avail_B:
        nn_cur = np.square(mouse_pair['A']['nose'] - mouse_pair['B']['nose']).sum(axis=1, skipna=False)
        lag = _scale(10, fps)
        shA_n = mouse_pair['A']['nose'].shift(lag)
        shB_n = mouse_pair['B']['nose'].shift(lag)
        nn_past = np.square(shA_n - shB_n).sum(axis=1, skipna=False)
        X['appr'] = nn_cur - nn_past

    # distance categories by body_center
    if 'body_center' in avail_A and 'body_center' in avail_B:
        Ax = mouse_pair['A']['body_center']['x']
        Ay = mouse_pair['A']['body_center']['y']
        Bx = mouse_pair['B']['body_center']['x']
        By = mouse_pair['B']['body_center']['y']

        cd = np.sqrt((Ax - Bx)**2 + (Ay - By)**2)

        # One-hot distance bins with edges at 5 / 15 / 30
        # (units are those of the tracking coordinates -- TODO confirm).
        X['v_cls'] = (cd < 5.0).astype(float)
        X['cls']   = ((cd >= 5.0)  & (cd < 15.0)).astype(float)
        X['med']   = ((cd >= 15.0) & (cd < 30.0)).astype(float)
        X['far']   = (cd >= 30.0).astype(float)

        # stats on squared distance
        cd2 = np.square(mouse_pair['A']['body_center'] - mouse_pair['B']['body_center']).sum(axis=1, skipna=False)

        # NOTE(review): as indented, everything down to the three add_*
        # calls is inside this loop. Only the d_*/int*/co_* columns depend on
        # base_w; the va_*, int_con and add_* features are recomputed and
        # overwritten on every iteration. Hoisting them would change the
        # column insertion order, so confirm downstream expectations first.
        for base_w in [5, 15, 30, 60]:
            ws = _scale(base_w, fps)
            roll_c = dict(window=ws, min_periods=1, center=True)

            # centered rolling mean / std / min / max of the squared distance
            X[f'd_m{base_w}']  = cd2.rolling(**roll_c).mean()
            X[f'd_s{base_w}']  = cd2.rolling(**roll_c).std()
            X[f'd_mn{base_w}'] = cd2.rolling(**roll_c).min()
            X[f'd_mx{base_w}'] = cd2.rolling(**roll_c).max()

            # inverse of (1 + rolling variance): close to 1 when the
            # distance is stable inside the window
            d_var = cd2.rolling(**roll_c).var()
            X[f'int{base_w}'] = 1.0 / (1.0 + d_var)

            # dot product of the body_center velocities of A and B
            Axd = Ax.diff()
            Ayd = Ay.diff()
            Bxd = Bx.diff()
            Byd = By.diff()
            coord = Axd * Bxd + Ayd * Byd
            X[f'co_m{base_w}'] = coord.rolling(**roll_c).mean()
            X[f'co_s{base_w}'] = coord.rolling(**roll_c).std()

            # cosine similarity between the velocities of A and B,
            # sampled at several temporal offsets
            Avx = Ax.diff()
            Avy = Ay.diff()
            Bvx = Bx.diff()
            Bvy = By.diff()
            vel_cos = (Avx * Bvx + Avy * Bvy) / (
                np.sqrt(Avx**2 + Avy**2) * np.sqrt(Bvx**2 + Bvy**2) + 1e-6
            )

            # Column names use the nominal offset `off`; the actual shift is
            # the value returned by _scale_signed(off, fps), negated.
            for off in [-30, -20, -10, 0, 10, 20, 30]:
                o = _scale_signed(off, fps)
                X[f'va_{off}'] = vel_cos.shift(-o)

            # rolling std / mean of the squared distance over a fixed
            # _scale(30, fps) window (independent of base_w)
            w = _scale(30, fps)
            roll_c30 = dict(window=w, min_periods=1, center=True)
            cd2_mean = cd2.rolling(**roll_c30).mean()
            cd2_std  = cd2.rolling(**roll_c30).std()
            X['int_con'] = cd2_std / (cd2_mean + 1e-6)

            # advanced features (each helper receives X and returns it)
            X = add_asymmetry_features(X, mouse_pair, avail_A, avail_B, fps)
            X = add_egocentric_interaction_features(X, mouse_pair, avail_A, avail_B, fps)
            X = add_interaction_features(X, mouse_pair, avail_A, avail_B, fps)

    # nose-nose distance + close percentage
    if 'nose' in avail_A and 'nose' in avail_B:
        nn = np.sqrt(
            (mouse_pair['A']['nose']['x'] - mouse_pair['B']['nose']['x'])**2 +
            (mouse_pair['A']['nose']['y'] - mouse_pair['B']['nose']['y'])**2
        )
        for lag in [10, 20, 40]:
            l = _scale(lag, fps)
            # lagged distance and its change over the lag
            X[f'nn_lg{lag}'] = nn.shift(l)
            X[f'nn_ch{lag}'] = nn - nn.shift(l)
            # fraction of the trailing (non-centered) l-frame window in which
            # the noses were closer than 10.0
            is_close = (nn < 10.0).astype(float)
            X[f'cl_ps{lag}'] = is_close.rolling(l, min_periods=1).mean()


    return X.astype(np.float32, copy=False)


# Preparation for training

In [41]:
def clean_and_fill_submission(submission, meta_df, is_train=True):
    """Sanitize a submission table and add dummy rows for videos without predictions.

    Steps:
      1. Drop rows whose ``start_frame`` is not strictly before ``stop_frame``.
      2. Within each (video_id, agent_id, target_id) group, walk the rows in
         ``start_frame`` order and drop every row that starts before the stop
         of the last kept row.
      3. For every video in ``meta_df`` that has no prediction left (skipping
         labs whose id starts with ``"MABe22"`` and rows whose
         ``behaviors_labeled`` is not a string), read its tracking parquet and
         split the frame range evenly between the labelled actions of each
         (agent, target) pair.

    Args:
        submission: DataFrame with columns video_id, agent_id, target_id,
            action, start_frame, stop_frame. May be empty.
        meta_df: DataFrame with columns lab_id, video_id, behaviors_labeled
            (a JSON list of ``"agent,target,action"`` strings).
        is_train: selects ``CFG.train_tracking_path`` (True) or
            ``CFG.test_tracking_path`` (False) as the tracking directory.

    Returns:
        The cleaned submission with the dummy rows appended, index reset.
    """
    # remove rows where start >= stop
    prev_len = len(submission)
    submission = submission[submission['start_frame'] < submission['stop_frame']].copy()
    if len(submission) != prev_len:
        print("Dropped rows with start_frame >= stop_frame")

    # remove overlaps inside each (video, agent, target) group
    prev_len = len(submission)
    cleaned_groups = []

    for _, grp in submission.groupby(['video_id', 'agent_id', 'target_id']):
        grp = grp.sort_values('start_frame')
        keep_mask = np.ones(len(grp), dtype=bool)

        last_stop = -1
        for i, (start, stop) in enumerate(zip(grp['start_frame'], grp['stop_frame'])):
            if start < last_stop:
                keep_mask[i] = False
            else:
                last_stop = stop

        cleaned_groups.append(grp[keep_mask])

    if cleaned_groups:
        submission = pd.concat(cleaned_groups, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError. No group means nothing survived the
        # grouping, so keep an empty frame with the original columns; this is
        # precisely the case the dummy fill below has to handle.
        submission = submission.iloc[:0].reset_index(drop=True)
    if len(submission) != prev_len:
        print("Dropped rows with overlapped intervals")

    # Built once instead of scanning the whole column for every metadata row.
    videos_with_predictions = set(submission["video_id"].unique())

    # dummy predictions for videos that have no prediction
    dummy_rows = []

    for _, row in meta_df.iterrows():
        lab_id = row["lab_id"]

        # skip MABe22 videos
        if isinstance(lab_id, str) and lab_id.startswith("MABe22"):
            continue

        # skip videos whose behaviors_labeled is not a string
        if not isinstance(row.get("behaviors_labeled", None), str):
            continue

        video_id = row["video_id"]

        # already has predictions -> nothing to fill
        if video_id in videos_with_predictions:
            continue

        print(f"Video {video_id} has no predictions. Filling dummy segments...")

        # read tracking (the config is only consulted when a fill is needed)
        tracking_dir = CFG.train_tracking_path if is_train else CFG.test_tracking_path
        path = f"{tracking_dir}/{lab_id}/{video_id}.parquet"
        vid = pd.read_parquet(path)

        # list of (agent, target, action) triplets from the metadata
        raw_behaviors = json.loads(row["behaviors_labeled"])
        cleaned = {b.replace("'", "") for b in raw_behaviors}
        triplets = [b.split(",") for b in sorted(cleaned)]
        beh_df = pd.DataFrame(triplets, columns=["agent", "target", "action"])

        # frame range of this video: [start_frame, stop_frame)
        start_frame = vid["video_frame"].min()
        stop_frame = vid["video_frame"].max() + 1
        total_frames = stop_frame - start_frame

        # divide the range uniformly between the actions of each pair
        for (agent, target), actions in beh_df.groupby(["agent", "target"]):
            batch_len = int(np.ceil(total_frames / len(actions)))

            for i, action in enumerate(actions["action"]):
                batch_start = start_frame + i * batch_len
                batch_stop = min(batch_start + batch_len, stop_frame)

                # With more actions than frames the rounding-up above runs
                # past the end of the video; never emit an empty or inverted
                # segment (it would violate the start < stop rule of step 1).
                if batch_start >= batch_stop:
                    break

                dummy_rows.append((
                    video_id,
                    agent,
                    target,
                    action,
                    batch_start,
                    batch_stop,
                ))

    if dummy_rows:
        dummy_df = pd.DataFrame(
            dummy_rows,
            columns=["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"],
        )
        submission = pd.concat([submission, dummy_df], ignore_index=True)
        print(f"Filled {len(dummy_rows)} dummy segments for empty videos")

    return submission


In [42]:
def tune_thresholds_per_class(y_true, y_pred_prob, class_names):
    """Pick, for every class, the decision threshold that maximises F1.

    Args:
        y_true: 2-D array, one column per class, 1 for positive frames and 0
            for negative ones.
        y_pred_prob: 2-D array of predicted scores with the same layout.
        class_names: names of the columns, in column order.

    Returns:
        ``(best_thrs, best_f1s)``: two dicts keyed by class name. A class
        without any positive sample gets threshold 0.5 and F1 0.0. On ties the
        lowest threshold wins.
    """
    # Candidate thresholds 0.01, 0.02, ..., 0.99.
    grid = np.arange(0.01, 1.00, 0.01)
    eps = 1e-7  # keeps the F1 denominator non-zero

    thr_by_class = {}
    f1_by_class = {}

    for col, label in enumerate(class_names):
        truth = y_true[:, col]
        scores = y_pred_prob[:, col]

        # Nothing to tune when the class never occurs.
        if truth.sum() == 0:
            thr_by_class[label] = 0.5
            f1_by_class[label] = 0.0
            continue

        # Column vectors so they broadcast against the (N, n_thresholds) grid.
        is_pos = (truth == 1)[:, None]
        is_neg = (truth == 0)[:, None]

        # fired[i, j] is True when sample i is predicted positive at grid[j];
        # all thresholds are evaluated in one vectorised comparison.
        fired = scores[:, None] >= grid[None, :]

        tp = np.sum(fired & is_pos, axis=0)
        fp = np.sum(fired & is_neg, axis=0)
        fn = np.sum(~fired & is_pos, axis=0)

        # F1 = 2TP / (2TP + FP + FN), one value per threshold.
        f1_per_thr = 2 * tp / (2 * tp + fp + fn + eps)

        winner = np.argmax(f1_per_thr)
        thr_by_class[label] = float(grid[winner])
        f1_by_class[label] = float(f1_per_thr[winner])

    return thr_by_class, f1_by_class

In [43]:
import torch
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import gc
import os
import joblib
import json
import warnings

# Silence all warnings
warnings.filterwarnings('ignore')

# Directory used for model checkpoints, scalers and threshold files; falls back
# to CFG.model_name when CFG has no `model_dir` attribute.
MODEL_DIR = getattr(CFG, "model_dir", CFG.model_name)
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(MODEL_DIR, exist_ok=True)


# --- CLASS: EARLY STOPPING ---
class EarlyStopping:
    """Stop training when the validation score stops improving.

    A call counts as an improvement when the new score is at least
    ``best_score + min_delta`` (the very first call always is). Every
    improvement writes a checkpoint and resets the patience counter; after
    ``patience`` consecutive non-improving calls ``early_stop`` becomes True.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best validation score seen so far
        self.early_stop = False   # set once patience is exhausted

    def __call__(self, val_f1, model, path):
        """Record ``val_f1``; checkpoint ``model`` to ``path`` if it improved."""
        is_first = self.best_score is None

        if not is_first and val_f1 < self.best_score + self.min_delta:
            # No (sufficient) improvement: burn one unit of patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.best_score = val_f1
        self.save_checkpoint(val_f1, model, path)
        if not is_first:
            self.counter = 0

    def save_checkpoint(self, val_f1, model, path):
        """Write the model's state dict to ``path`` (``val_f1`` is not used)."""
        weights = model.state_dict()
        torch.save(weights, path)

# --- LOSS: FOCAL LOSS ---
class FocalBCEWithLogitsLoss(nn.Module):
    """Binary cross-entropy on logits, down-weighted for easy examples.

    Each element's BCE (optionally scaled by ``pos_weight`` on positive
    targets) is multiplied by ``(1 - p_t) ** gamma``, where ``p_t`` is the
    predicted probability of the element's true class.

    ``reduction`` is ``'mean'``, ``'sum'``, or anything else to get the
    unreduced per-element loss.
    """

    def __init__(self, pos_weight=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.pos_weight = pos_weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        # Per-element BCE; reduction is applied only after the focal scaling.
        elementwise_bce = F.binary_cross_entropy_with_logits(
            logits,
            targets,
            pos_weight=self.pos_weight,
            reduction='none',
        )

        # Probability assigned to the correct class of each element.
        prob_pos = torch.sigmoid(logits)
        prob_true = torch.where(targets == 1, prob_pos, 1 - prob_pos)

        # Confident, correct predictions (prob_true near 1) contribute little.
        modulator = (1 - prob_true) ** self.gamma
        focal = modulator * elementwise_bce

        if self.reduction == 'mean':
            return focal.mean()
        if self.reduction == 'sum':
            return focal.sum()
        return focal

# ==========================================
# 1. PARAMETERS
# ==========================================
# Number of consecutive frames in one sample window (used by the dataset and
# by the pos_weight computation in train_evaluate_cnn).
WINDOW_SIZE = 30  
# NOTE(review): not referenced by the training code in this part of the file,
# which reads CFG.train_batch_size (default 256) instead -- confirm whether
# this constant is still needed.
BATCH_SIZE = 128
# Maximum number of training epochs per fold.
EPOCHS = 2       
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ==========================================
# 2. DATASET
# ==========================================
class MABeLazyDataset(Dataset):
    """Sliding-window dataset over a list of per-sequence feature arrays.

    Every sequence of length ``L >= window_size`` contributes the
    ``L - window_size + 1`` windows starting at ``t = 0 .. L - window_size``;
    shorter sequences contribute nothing. Windows are cut (and scaled) lazily
    in ``__getitem__`` so no window is materialised up front.

    Args:
        feat_list_np: list of 2-D arrays, one per sequence, rows = frames and
            columns = features.
        scaler: object with a ``transform`` method applied to each window, or
            None to skip scaling.
        label_list: optional list of 2-D label arrays aligned frame-by-frame
            with ``feat_list_np``.
        window_size: number of frames per window.

    Items are ``(X, y)`` when labels are given, otherwise just ``X``. ``X``
    has shape (features, window_size); ``y`` is the label row of the window's
    centre frame ``t + window_size // 2``.
    """

    def __init__(self, feat_list_np, scaler, label_list=None, window_size=30):
        self.window_size = window_size
        self.feat_list = feat_list_np
        self.label_list = label_list
        self.scaler = scaler

        # Flat index: one (sequence index, window start) pair per sample.
        self.index_map = []
        for v_idx, arr in enumerate(self.feat_list):
            # +1 so the last full window (start = L - window_size) is
            # included; a sequence of exactly window_size frames yields one
            # window. range() of a non-positive count is empty, which covers
            # sequences that are too short.
            n_windows = len(arr) - window_size + 1
            self.index_map.extend((v_idx, t) for t in range(n_windows))

    def __len__(self):
        return len(self.index_map)

    def __getitem__(self, idx):
        v_idx, t = self.index_map[idx]

        window = self.feat_list[v_idx][t : t + self.window_size]

        # Scale on the fly so only the raw arrays are kept in memory.
        if self.scaler is not None:
            window = self.scaler.transform(window)

        # (seq, feat) -> (feat, seq)
        X_out = torch.tensor(window, dtype=torch.float32).transpose(0, 1)

        if self.label_list is None:
            return X_out

        center_frame_idx = t + self.window_size // 2
        y_out = torch.tensor(self.label_list[v_idx][center_frame_idx], dtype=torch.float32)
        return X_out, y_out

# ==========================================
# 3. MODEL: RESNET-1D
# ==========================================
class ResidualBlock1D(nn.Module):
    """Two dilated Conv1d + BatchNorm layers wrapped in a residual connection.

    Padding is ``(kernel_size - 1) * dilation // 2`` on both convolutions.
    When the channel count changes, the skip path is a 1x1 convolution;
    otherwise it is an empty ``nn.Sequential`` (i.e. the input itself).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, dropout=0.2):
        super().__init__()
        conv_kwargs = dict(
            padding=(kernel_size - 1) * dilation // 2,
            dilation=dilation,
        )

        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, **conv_kwargs)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, **conv_kwargs)
        self.bn2 = nn.BatchNorm1d(out_channels)

        # Skip path: project with a 1x1 conv only when the widths differ.
        if in_channels != out_channels:
            self.shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=1)
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        skip = self.shortcut(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)

        out = self.conv2(out)
        out = self.bn2(out)

        out += skip
        return self.relu(out)

class MouseTCN(nn.Module):
    """ResNet-style 1D classifier: conv stem, three residual stages, pooled head.

    Input is (batch, n_feat, time); output is (batch, n_class) logits. The
    residual stages use dilations 1, 2, 4 and widths ``base_filters``,
    ``2 * base_filters``, ``4 * base_filters``.

    NOTE: this file defines a second class named ``MouseTCN`` further down.
    That later definition rebinds the name, so code that runs after it gets
    the later class, not this one.
    """

    def __init__(self, n_feat=151, n_class=8, base_filters=64):
        super().__init__()
        width1 = base_filters
        width2 = base_filters * 2
        width4 = base_filters * 4

        self.stem = nn.Sequential(
            nn.Conv1d(n_feat, width1, kernel_size=7, padding=3),
            nn.BatchNorm1d(width1),
            nn.ReLU(),
        )
        self.layer1 = ResidualBlock1D(width1, width1, dilation=1)
        self.layer2 = ResidualBlock1D(width1, width2, dilation=2)
        self.layer3 = ResidualBlock1D(width2, width4, dilation=4)

        # Average over the time axis, then classify.
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(width4, n_class)

    def forward(self, x):
        feats = self.stem(x)
        for stage in (self.layer1, self.layer2, self.layer3):
            feats = stage(feats)
        pooled = self.global_pool(feats).squeeze(-1)
        return self.fc(pooled)


class Chomp1d(nn.Module):
    """Drop the last ``chomp`` positions of the final axis (no-op if ``chomp <= 0``)."""

    def __init__(self, chomp):
        super().__init__()
        self.chomp = chomp

    def forward(self, x):
        if self.chomp > 0:
            return x[..., :-self.chomp]
        return x

class TemporalBlock(nn.Module):
    """Residual block of two dilated convolutions with right-side trimming.

    Each convolution pads both sides by ``(k - 1) * d`` and a ``Chomp1d``
    then removes that many trailing steps, so the output length equals the
    input length. Each conv is followed by BatchNorm, ReLU and Dropout. The
    skip path is a 1x1 convolution when the channel count changes, otherwise
    the identity.
    """

    def __init__(self, in_ch, out_ch, k=3, d=1, dropout=0.1):
        super().__init__()
        pad = (k - 1) * d

        # Two identical stages; only the first one changes the channel count.
        # The layer order fixes the indices inside `self.net`.
        stages = []
        stage_in = in_ch
        for _ in range(2):
            stages.extend([
                nn.Conv1d(stage_in, out_ch, k, padding=pad, dilation=d),
                Chomp1d(pad),
                nn.BatchNorm1d(out_ch),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            stage_in = out_ch
        self.net = nn.Sequential(*stages)

        if in_ch != out_ch:
            self.down = nn.Conv1d(in_ch, out_ch, 1)
        else:
            self.down = nn.Identity()

    def forward(self, x):
        summed = self.net(x) + self.down(x)
        return torch.relu(summed)

class MouseTCN(nn.Module):
    """Stack of ``TemporalBlock``s followed by time-average pooling and a linear head.

    Block ``i`` maps ``channels[i - 1]`` (``n_feat`` for the first block) to
    ``channels[i]`` with dilation ``2 ** i``. Input is (batch, n_feat, time);
    output is (batch, n_class) logits.
    """

    def __init__(self, n_feat=151, n_class=8, channels=(64, 64, 128, 128), k=3, dropout=0.1):
        super().__init__()
        # widths[i] -> widths[i + 1] is the channel change of block i.
        widths = [n_feat, *channels]
        self.tcn = nn.Sequential(*[
            TemporalBlock(c_in, c_out, k=k, d=2**level, dropout=dropout)
            for level, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]))
        ])
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(widths[-1], n_class)

    def forward(self, x):
        hidden = self.tcn(x)
        pooled = self.pool(hidden).squeeze(-1)
        return self.fc(pooled)
# ==========================================
# 4. TRAINING FUNCTION (GPU OPTIMIZED)
# ==========================================
def _collect_sigmoid_outputs(model, loader):
    """Run ``model`` over ``loader`` without gradients.

    Returns two lists with one CPU tensor per batch: the sigmoid of the model
    output and the batch targets. The caller puts the model in eval mode.
    """
    probs, targets = [], []
    with torch.no_grad():
        for x_b, y_b in loader:
            probs.append(torch.sigmoid(model(x_b.to(DEVICE))).cpu())
            targets.append(y_b)
    return probs, targets


def train_evaluate_cnn(feat_list, label_list, meta_list, section, mode, n_splits=3):
    """Train one ``MouseTCN`` per cross-validation fold and tune thresholds.

    For every fold this fits a ``StandardScaler`` on the training arrays,
    trains with focal BCE (class-balanced ``pos_weight``), keeps the
    checkpoint with the best validation macro-F1, then tunes one threshold per
    class on the validation windows.

    Files written under ``MODEL_DIR`` for each fold ``k``:
        model_sec{section}_{mode}_fold{k}.pth
        thresholds_sec{section}_{mode}_fold{k}.json
        scaler_sec{section}_{mode}_fold{k}.pkl

    Args:
        feat_list: list of feature DataFrames (rows = frames); the union of
            their columns, sorted, defines the model input.
        label_list: list of label DataFrames aligned with ``feat_list``; the
            union of their columns, sorted, defines the classes.
        meta_list: not used by this function.
        section, mode: only used to build the output file names.
        n_splits: number of folds.

    Returns:
        ``(model, scaler)`` of the last fold.

    NOTE(review): the training loop unpacks ``(x, y)`` batches, so the
    label-free branch below (``label_list`` empty) does not look usable for
    training -- confirm whether it is still needed.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    # --- CONVERT DATA TO NUMPY ---
    print(">>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...")

    # Union of feature columns across all frames, in sorted order.
    ref_cols = sorted({c for df in feat_list for c in df.columns})

    # Align every frame to ref_cols (missing columns -> 0) and replace
    # NaN / +-inf by 0 so the scaler and the network never see them.
    feat_list_np = []
    for df in tqdm(feat_list, desc="To Numpy"):
        arr = df.reindex(columns=ref_cols, fill_value=0).values.astype(np.float32)
        arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
        feat_list_np.append(arr)

    del feat_list
    gc.collect()

    # --- LABELS ---
    if label_list:
        ref_label_cols = sorted({c for df in label_list for c in df.columns})
        label_list_np = [
            df.reindex(columns=ref_label_cols, fill_value=0).values.astype(np.float32)
            for df in label_list
        ]
    else:
        label_list_np = None
        # Placeholder class list when no labels are given.
        ref_label_cols = ["dummy"]

    # --- FOLD SPLIT ---
    # Stratify on each entry's most frequent positive class (-1 when it has no
    # positive frame); every entry is its own group so it is never split.
    video_indices = np.arange(len(feat_list_np))
    if label_list_np:
        video_labels = []
        for arr in label_list_np:
            counts = arr.sum(axis=0)
            video_labels.append(-1 if counts.sum() == 0 else np.argmax(counts))
    else:
        video_labels = [0] * len(feat_list_np)

    video_labels = np.array(video_labels)
    video_groups = video_indices

    sgkf = StratifiedGroupKFold(n_splits=n_splits)
    fold_scores = []

    # Returned at the end (values of the last fold).
    model = None
    scaler = None

    for fold, (train_idx, val_idx) in enumerate(sgkf.split(video_indices, video_labels, video_groups)):
        print(f"\n>>> TRAINING FOLD {fold+1}/{n_splits} <<<")

        train_feats = [feat_list_np[i] for i in train_idx]
        val_feats = [feat_list_np[i] for i in val_idx]

        train_lbls = [label_list_np[i] for i in train_idx] if label_list_np else None
        val_lbls = [label_list_np[i] for i in val_idx] if label_list_np else None

        # --- SCALING ---
        print("Fitting Scaler...")
        scaler = StandardScaler()
        # Incremental fit, one array at a time, on the training split only.
        for arr in tqdm(train_feats, desc="Fitting Scaler", leave=False):
            scaler.partial_fit(arr)

        # --- DATASET & DATALOADER ---
        ds_train = MABeLazyDataset(train_feats, scaler, train_lbls, window_size=WINDOW_SIZE)
        ds_val = MABeLazyDataset(val_feats, scaler, val_lbls, window_size=WINDOW_SIZE)

        # Batch sizes come from CFG when present (default 256 for training,
        # twice the training batch size for validation); 2 loader workers.
        train_bs = getattr(CFG, 'train_batch_size', 256)
        val_bs = getattr(CFG, 'val_batch_size', max(train_bs*2, train_bs))
        loader_train = DataLoader(ds_train, batch_size=train_bs, shuffle=True, num_workers=2, pin_memory=True)
        loader_val = DataLoader(ds_val, batch_size=val_bs, shuffle=False, num_workers=2, pin_memory=True)

        # --- CALCULATE POS_WEIGHT ---
        # negatives / positives per class, counted on the frames that can be
        # a window centre, clipped to [1, 100].
        pos_counts = np.zeros(len(ref_label_cols))
        total_samples = 0
        if train_lbls:
            for arr in train_lbls:
                if len(arr) >= WINDOW_SIZE:
                    valid_lbls = arr[WINDOW_SIZE//2 : -(WINDOW_SIZE//2)]
                    pos_counts += valid_lbls.sum(axis=0)
                    total_samples += len(valid_lbls)

        raw_weights = (total_samples - pos_counts) / (pos_counts + 1e-6)
        raw_weights = np.clip(raw_weights, 1.0, 100.0)
        pos_weight_val = torch.tensor(raw_weights, dtype=torch.float32).to(DEVICE)

        # --- MODEL SETUP ---
        n_feat_in = len(ref_cols)
        model = MouseTCN(n_feat=n_feat_in, n_class=len(ref_label_cols)).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
        criterion = FocalBCEWithLogitsLoss(pos_weight=pos_weight_val, gamma=1.0)
        # NOTE(review): `verbose` is deprecated in recent PyTorch releases --
        # confirm against the installed version before upgrading.
        scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)
        save_path = f"{MODEL_DIR}/model_sec{section}_{mode}_fold{fold}.pth"
        early_stopping = EarlyStopping(patience=CFG.patience, min_delta=0.001)

        # --- TRAINING LOOP ---
        for epoch in range(EPOCHS):
            model.train()
            train_loss = 0
            for x_batch, y_batch in tqdm(loader_train, desc=f"Ep {epoch+1}", leave=False):
                x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
                optimizer.zero_grad()
                pred = model(x_batch)
                loss = criterion(pred, y_batch)
                # Skip the update for a batch whose loss is NaN.
                if torch.isnan(loss): continue
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                train_loss += loss.item()

            # Validation: macro-F1 at a fixed 0.5 threshold drives the LR
            # scheduler and early stopping.
            model.eval()
            val_probs, val_true = _collect_sigmoid_outputs(model, loader_val)

            mean_f1 = 0.0
            if val_probs:
                vp = torch.cat(val_probs).numpy()
                vt = torch.cat(val_true).numpy()
                mean_f1 = f1_score(vt, (vp > 0.5).astype(int), average='macro', zero_division=0)

                avg_train_loss = train_loss / len(loader_train) if len(loader_train) > 0 else 0
                print(f"Ep {epoch+1}: Loss {avg_train_loss:.4f} | Val F1: {mean_f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

                scheduler.step(mean_f1)
                early_stopping(mean_f1, model, save_path)
                if early_stopping.early_stop:
                    print("Early stopping triggered!")
                    break

        # --- RESULT ---
        # Reload the best checkpoint of this fold and score it once more.
        model.load_state_dict(torch.load(save_path, map_location=DEVICE))
        model.eval()
        val_probs, val_true = _collect_sigmoid_outputs(model, loader_val)
        vp, vt = torch.cat(val_probs).numpy(), torch.cat(val_true).numpy()

        print(f"\n--- Final Threshold Tuning Fold {fold+1} ---")
        best_thrs, best_f1s = tune_thresholds_per_class(vt, vp, ref_label_cols)
        fold_scores.append(np.mean(list(best_f1s.values())))
        for k, v in best_f1s.items():
            print(f"\t{k}: {v:.4f} (thr={best_thrs[k]:.2f})")

        with open(f"{MODEL_DIR}/thresholds_sec{section}_{mode}_fold{fold}.json", "w") as f: json.dump(best_thrs, f)
        joblib.dump(scaler, f"{MODEL_DIR}/scaler_sec{section}_{mode}_fold{fold}.pkl")

        # --- CLEANUP ---
        del optimizer, loader_train, loader_val, ds_train, ds_val
        del train_feats, val_feats, train_lbls, val_lbls

        # Keep the last fold's model and scaler: they are the return value.
        if fold < n_splits - 1:
            del model
            del scaler

        torch.cuda.empty_cache()
        gc.collect()

    print(f"\n>>> MEAN F1 OVER {n_splits} FOLDS: {np.mean(fold_scores):.4f} <<<")
    return model, scaler

In [48]:

import torch
from torch.cuda.amp import autocast
import numpy as np
import pandas as pd
import joblib
import json
import os
import gc
from tqdm import tqdm

# ==========================================
# 1. MODEL + SCALER
# ==========================================
def load_resnet_resources(section, mode, n_folds=3, device='cuda'):
    """Load the per-fold models, scalers and averaged thresholds of one section.

    Files are looked up in ``CFG.model_dir`` (falling back to
    ``CFG.model_name``) under the names written by training:
    ``model_sec{section}_{mode}_fold{k}.pth``,
    ``scaler_sec{section}_{mode}_fold{k}.pkl`` and
    ``thresholds_sec{section}_{mode}_fold{k}.json``.

    Args:
        section, mode: identify the file set.
        n_folds: folds ``0 .. n_folds - 1`` are probed; missing ones are skipped.
        device: device the state dicts and models are mapped to.

    Returns:
        dict with keys
            ``scalers`` / ``models``: parallel lists, one entry per fold for
                which BOTH files exist;
            ``thresholds``: class name -> threshold averaged over the folds
                that have a thresholds file (0.5 per ``action_{i}`` when none
                exists);
            ``n_feat``: input width read from the checkpoints (0 if none);
            ``action_names``: the keys of ``thresholds``, in order.

    Raises:
        KeyError: a checkpoint contains no 3-D ``weight`` tensor.
        FileNotFoundError: neither a checkpoint nor a thresholds file exists,
            so the class list cannot be determined.
    """
    print(f"Loading TCN Ensemble ({n_folds} Folds) for Section {section}...")
    base_dir = getattr(CFG, 'model_dir', CFG.model_name)
    os.makedirs(base_dir, exist_ok=True)

    pack = {
        'scalers': [],
        'models': [],
        'thresholds': {},
        'n_feat': 0,
        'action_names': []
    }
    n_class = None  # set from the first checkpoint that loads

    # Load scalers + models
    for fold in range(n_folds):
        scaler_path = f"{base_dir}/scaler_sec{section}_{mode}_fold{fold}.pkl"
        model_path = f"{base_dir}/model_sec{section}_{mode}_fold{fold}.pth"
        # A fold is usable only when both artefacts exist. Appending the
        # scaler alone would shift `scalers` against `models`.
        if not (os.path.exists(scaler_path) and os.path.exists(model_path)):
            continue

        state = torch.load(model_path, map_location=device)

        # Recover the architecture sizes from the checkpoint itself: the
        # input width from the alphabetically first 3-D (conv) weight and the
        # class count from the last weight tensor in the state dict.
        conv_weight_keys = [k for k in state.keys() if getattr(state[k], 'ndim', 0) == 3 and 'weight' in k]
        if not conv_weight_keys:
            raise KeyError('No conv1d weight found in state dict')
        first_conv = sorted(conv_weight_keys)[0]
        pack['n_feat'] = state[first_conv].shape[1]
        last_key = [k for k in state.keys() if 'weight' in k][-1]
        n_class = state[last_key].shape[0]

        model = MouseTCN(n_feat=pack['n_feat'], n_class=n_class).to(device)
        model.load_state_dict(state)
        model.eval()

        # NOTE: joblib.load unpickles the file; only load trusted artefacts.
        pack['scalers'].append(joblib.load(scaler_path))
        pack['models'].append(model)

    # thresholds: average each class over the folds that provide it
    avg_thrs = {}
    counts = {}
    json_found = False
    for fold in range(n_folds):
        json_path = f"{base_dir}/thresholds_sec{section}_{mode}_fold{fold}.json"
        if not os.path.exists(json_path):
            continue
        json_found = True
        with open(json_path, 'r') as f:
            thrs = json.load(f)
        for k, v in thrs.items():
            avg_thrs[k] = avg_thrs.get(k, 0.0) + v
            counts[k] = counts.get(k, 0) + 1

    if not json_found:
        if n_class is None:
            # Without a checkpoint there is no class count to build the
            # default names from; fail with a clear message instead of an
            # unbound-variable error.
            raise FileNotFoundError(
                f"No model checkpoints or threshold files found for section {section} ({mode}) in {base_dir}"
            )
        print(f"Warning: No threshold files found for Sec {section}. Using default names.")
        dummy_actions = [f"action_{i}" for i in range(n_class)]
        for act in dummy_actions:
            avg_thrs[act] = 0.5
        pack['action_names'] = dummy_actions
    else:
        for k in avg_thrs:
            if counts.get(k, 0) > 0:
                avg_thrs[k] /= counts[k]
        pack['action_names'] = list(avg_thrs.keys())
    pack['thresholds'] = avg_thrs

    print(f"-> Detected: n_feat={pack['n_feat']}, n_class={len(pack['action_names'])}")
    print(f"-> Loaded {len(pack['models'])} models.")
    return pack

# ==========================================
# 2. PREDICT MULTICLASS (from xgboost)
# ==========================================
def predict_multiclass(pred, meta, thresholds):
    """Turn per-frame class scores into (start_frame, stop_frame) action segments.

    Args:
        pred: DataFrame of scores, one row per frame and one column per action.
        meta: DataFrame aligned row-by-row with ``pred``; the columns
            video_id, agent_id, target_id and video_frame are read.
        thresholds: dict action name -> threshold; actions missing from it
            use 0.27.

    Returns:
        DataFrame with columns video_id, agent_id, target_id, action,
        start_frame, stop_frame, restricted to segments of at least 3 frames.
    """
    # Smooth each score column with a centered 5-frame rolling mean.
    pred_smoothed = pred.rolling(window=5, min_periods=1, center=True).mean()
    threshold_array = np.array([thresholds.get(col, 0.27) for col in pred.columns])
    # Margin of every score over its class threshold.
    margins = pred_smoothed.values - threshold_array[None, :]

    # Per frame: index of the class with the largest margin, or -1 when even
    # the best class is below its threshold (no action).
    ama = np.argmax(margins, axis=1)
    max_margin = margins[np.arange(len(ama)), ama]
    ama = np.where(max_margin >= 0.0, ama, -1)
    ama = pd.Series(ama, index=meta.video_frame)

    # Change points: frames whose class differs from the previous frame
    # (the first frame always counts because shift() yields NaN there).
    changes_mask = (ama != ama.shift(1)).values
    ama_changes = ama[changes_mask]
    meta_changes = meta[changes_mask]
    # A change point starts a segment when it switches to a real action.
    mask = ama_changes.values >= 0
    # The last change point has no following change point to supply its stop
    # frame, so it is never used as a start.
    # NOTE(review): this appears to drop an action that is still running at
    # the last change point -- confirm that this is intended.
    mask[-1] = False

    # Each segment runs from its change point to the next change point.
    submission_part = pd.DataFrame({
        'video_id': meta_changes['video_id'][mask].values,
        'agent_id': meta_changes['agent_id'][mask].values,
        'target_id': meta_changes['target_id'][mask].values,
        'action': pred.columns[ama_changes[mask].values],
        'start_frame': ama_changes.index[mask],
        'stop_frame': ama_changes.index[1:][mask[:-1]]
    })

    # Identity of the row that supplied each stop frame.
    stop_video_id = meta_changes['video_id'][1:][mask[:-1]].values
    stop_agent_id = meta_changes['agent_id'][1:][mask[:-1]].values
    stop_target_id = meta_changes['target_id'][1:][mask[:-1]].values
    for i in range(len(submission_part)):
        vid = submission_part.video_id.iloc[i]
        ag = submission_part.agent_id.iloc[i]
        tg = submission_part.target_id.iloc[i]
        if i < len(stop_video_id):
            # The next change point belongs to another video / agent /
            # target: close the segment at the end of its own video instead.
            if stop_video_id[i] != vid or stop_agent_id[i] != ag or stop_target_id[i] != tg:
                new_stop_frame = meta.query('(video_id == @vid)').video_frame.max() + 1
                submission_part.iat[i, submission_part.columns.get_loc('stop_frame')] = new_stop_frame
        else:
            # No stop row at all: also close at the end of the video.
            # (Looks unreachable while mask[-1] is forced to False, since the
            # stop arrays then have one entry per segment -- TODO confirm.)
            new_stop_frame = meta.query('(video_id == @vid)').video_frame.max() + 1
            submission_part.iat[i, submission_part.columns.get_loc('stop_frame')] = new_stop_frame

    # Discard segments shorter than 3 frames.
    duration = submission_part.stop_frame - submission_part.start_frame
    submission_part = submission_part[duration >= 3].reset_index(drop=True)
    return submission_part

# ==========================================
# 3. SUBMIT
# ==========================================
def _first_meta_value(meta_df, column, default):
    """Return the first value of ``column`` in ``meta_df``, or ``default`` if it is absent or empty."""
    if column in meta_df.columns and len(meta_df) > 0:
        return meta_df[column].iloc[0]
    return default


def submit(test_subset, fps_lookup, body_parts, mode, section, thresholds=None, is_train=False):
    """Run the fold ensemble over every sample of ``test_subset`` and build interval predictions.

    For each sample yielded by ``generate_mouse_data`` the tracking data is
    turned into features (``transform_single`` or ``transform_pair``), padded
    with zeros or truncated to the feature count stored with the models, scored
    by every fold on all sliding windows of length ``WINDOW_SIZE``, averaged
    across folds and handed to ``predict_multiclass`` together with the
    thresholds stored with the models.

    Args:
        test_subset: Passed to ``generate_mouse_data``.
        fps_lookup: Passed to ``_fps_from_meta`` to resolve the frame rate.
        body_parts: Passed to the feature transform.
        mode: ``"single"`` selects ``transform_single``; anything else selects
            ``transform_pair``.
        section: Section index used to load the models and in log messages.
        thresholds: Unused. The thresholds returned by
            ``load_resnet_resources`` are applied instead; the parameter is
            kept so existing callers keep working.
        is_train: Forwarded to ``generate_mouse_data``.

    Returns:
        ``[]`` when the models cannot be loaded, no fold model is available or
        no sample produced predictions; otherwise a one-element list holding
        the concatenated prediction DataFrame.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    window_size = WINDOW_SIZE

    try:
        pack = load_resnet_resources(section, mode, n_folds=CFG.n_splits, device=device)
        action_names = pack['action_names']
        final_thrs = pack['thresholds']
        expected_feat = pack['n_feat']
        models = pack['models']
        scalers = pack['scalers']
    except Exception as e:
        print(f"Error loading models for Sec {section}: {e}")
        return []

    n_folds = len(models)
    if n_folds == 0:
        # Averaging over zero folds would divide by zero and produce NaN probabilities.
        print(f"Error loading models for Sec {section}: no fold models available")
        return []

    # Loop invariants.
    infer_bs = getattr(CFG, 'infer_batch_size', 512)
    offset = window_size // 2
    submission_parts = []

    sample_gen = generate_mouse_data(test_subset, mode, is_train=is_train)
    for _, track_df, meta_df, _ in tqdm(sample_gen, desc=f"Infer Sec {section}"):
        # Identifiers are only used for error reporting.
        video_id = _first_meta_value(meta_df, "video_id", "Unknown")
        agent_id = _first_meta_value(meta_df, "agent_id", "unknown_agent")
        target_id = _first_meta_value(meta_df, "target_id", "unknown_target")
        try:
            fps = _fps_from_meta(meta_df, fps_lookup, default_fps=30)
            if mode == "single":
                X = transform_single(track_df, body_parts, fps)
            else:
                X = transform_pair(track_df, body_parts, fps)
            X = X.values if hasattr(X, 'values') else X
            X = np.nan_to_num(X.astype(np.float32), nan=0.0, posinf=0.0, neginf=0.0)

            # Align the feature count with what the models were trained on.
            if expected_feat is None:
                expected_feat = X.shape[1]
            if X.shape[1] < expected_feat:
                pad = np.zeros((X.shape[0], expected_feat - X.shape[1]), dtype=np.float32)
                X = np.hstack([X, pad])
            elif X.shape[1] > expected_feat:
                X = X[:, :expected_feat]

            n_frames = len(X)
            if n_frames < window_size or not action_names:
                continue
            # n_frames >= window_size here, so there is always at least one window.
            n_windows = n_frames - window_size + 1
            n_actions = len(action_names)
            accum_probs = np.zeros((n_windows, n_actions), dtype=np.float32)

            for fold in range(n_folds):
                scaler = scalers[fold]
                model = models[fold]
                # np.nan_to_num returns a copy, so every fold's scaler starts from the same unscaled X.
                X_scaled = np.asarray(scaler.transform(np.nan_to_num(X)))
                # Strided view of shape (n_windows, n_feat, window_size): the same values and
                # layout the model received before (windows stacked, then axes 1 and 2 swapped),
                # but built without a Python-level loop over windows.
                win_view = np.lib.stride_tricks.sliding_window_view(X_scaled, window_size, axis=0)
                for start in range(0, n_windows, infer_bs):
                    end = min(start + infer_bs, n_windows)
                    # Materialise only the current batch as a writable, contiguous float32 array.
                    batch = np.array(win_view[start:end], dtype=np.float32, order='C')
                    inp = torch.from_numpy(batch).to(device)
                    with torch.no_grad():
                        probs = torch.sigmoid(model(inp)).cpu().numpy()
                    accum_probs[start:end] += probs
                    del inp, batch
                del X_scaled, win_view
                torch.cuda.empty_cache()

            center_probs = accum_probs / float(n_folds)
            # Window t is written to frame t + window_size // 2; frames before the first
            # and after the last window repeat the nearest window's probabilities.
            full_probs = np.zeros((n_frames, n_actions), dtype=float)
            full_probs[offset: offset + n_windows] = center_probs
            full_probs[:offset] = center_probs[0]
            full_probs[offset + n_windows:] = center_probs[-1]

            pred_df = pd.DataFrame(full_probs, columns=action_names, index=meta_df['video_frame'])
            meta_use = meta_df[['video_id', 'agent_id', 'target_id', 'video_frame']]
            submission_parts.append(predict_multiclass(pred_df, meta_use, final_thrs))

        except Exception as e:
            # Best effort: a failing sample is reported and skipped, not fatal.
            print(f"!!! Error processing video {video_id} (agent={agent_id}, target={target_id}): {e}")
            gc.collect()
            continue

    if submission_parts:
        return [pd.concat(submission_parts, ignore_index=True)]
    return []


In [45]:

import os 

def process_mode(mode, subset, body_parts, fps_lookup, section, thresholds, f1_list, submission_list):
    """Train (validate mode only) and run inference for one section/mode.

    When ``CFG.mode == "validate"`` the samples of ``subset`` whose switch
    equals ``mode`` are collected, converted to float32 features, used to train
    the CNN via ``train_evaluate_cnn``, and the thresholds saved with the models
    are stored in ``thresholds[mode][str(section)]``. In every mode ``submit``
    is then called and its results are appended to ``submission_list``.

    Args:
        mode: ``"single"`` or ``"pair"``.
        subset: Rows to process; passed to ``generate_mouse_data`` and ``submit``.
        body_parts: Passed to the feature transforms and to ``submit``.
        fps_lookup: Passed to ``_fps_from_meta`` and to ``submit``.
        section: Section index used for model loading and log messages.
        thresholds: Nested dict updated in place in validate mode.
        f1_list: Unused here; kept so existing callers keep working.
        submission_list: List extended in place with the frames returned by ``submit``.

    Returns:
        None. In validate mode the function returns early, without running
        inference, when ``subset`` holds no sample for ``mode``.
    """
    validating = CFG.mode == "validate"

    if validating:
        data_list, label_list, meta_list = [], [], []

        for switch, data, meta, label in generate_mouse_data(subset):
            if switch != mode:
                continue
            data_list.append(data)
            meta_list.append(meta)
            label_list.append(label)
            del data, meta, label
        gc.collect()

        if not data_list:
            return  # no sample for this mode

        # Features for each sample.
        transform = transform_single if mode == "single" else transform_pair
        feats_parts = []
        for data_i, meta_i in zip(data_list, meta_list):
            fps_i = _fps_from_meta(meta_i, fps_lookup, default_fps=30.0)
            feats_parts.append(transform(data_i, body_parts, fps_i).astype(np.float32))
        # The raw tracking frames are not needed once features exist; drop them
        # before training to lower peak memory.
        del data_list, data_i, meta_i
        gc.collect()

        # --- CNN ---
        print(f"Start training CNN for section {section}")
        # The return value is not bound: nothing below uses it, and keeping it
        # would hold it in memory during inference.
        train_evaluate_cnn(feats_parts, label_list, meta_list, section, mode)

        del feats_parts, label_list, meta_list
        gc.collect()

        # Load the thresholds that were saved with the trained models.
        try:
            pack = load_resnet_resources(
                section, mode, n_folds=CFG.n_splits,
                device=("cuda" if torch.cuda.is_available() else "cpu"),
            )
            thresholds[mode][str(section)] = pack.get("thresholds", {})
            # submit() loads its own copy; release this one first.
            del pack
        except Exception as e:
            print(f"Warning: could not load thresholds for section {section}: {e}")
    else:
        print(f"Predicting Section {section} using ResNet Ensemble...")

    # After training (validate) or directly (otherwise), run inference on the
    # subset and collect the resulting prediction frames.
    temp_sub_list = submit(
        test_subset=subset,
        body_parts=body_parts,
        fps_lookup=fps_lookup,
        mode=mode,
        section=section,
        thresholds=thresholds if validating else None,
        is_train=validating,
    )
    submission_list.extend(temp_sub_list)

    del temp_sub_list
    gc.collect()


In [46]:

# Per-mode threshold registry; process_mode stores entries as thresholds[mode][str(section)].
thresholds = {mode_name: {} for mode_name in ("single", "pair")}


In [49]:
import traceback

# Accumulators filled in place by process_mode across all sections.
f1_list = []
submission_list = []

# Process every body-part configuration ("section"), first in single mode, then in pair mode.
# NOTE(review): index 0 of body_parts_tracked_list is never processed — confirm this is intentional.
for section in range(1, len(body_parts_tracked_list)):
    body_parts_tracked_str = body_parts_tracked_list[section]

    try:
        body_parts = json.loads(body_parts_tracked_str)
        print(f"{section}/{len(body_parts_tracked_list)-1} Processing videos with: {body_parts}\n")

        # Only configurations with more than 5 body parts are pruned.
        if len(body_parts) > 5:
            body_parts = [b for b in body_parts if b not in DROP_BODY_PARTS]

        if CFG.mode == "validate":
            subset = train[train.body_parts_tracked == body_parts_tracked_str]
        else:
            subset = test[test.body_parts_tracked == body_parts_tracked_str]

        # video_id -> frames_per_second lookup for this subset
        fps_lookup = (
            subset[["video_id", "frames_per_second"]]
            .drop_duplicates("video_id")
            .set_index("video_id")["frames_per_second"]
            .to_dict()
        )

        for section_mode in ("single", "pair"):
            process_mode(
                mode=section_mode,
                subset=subset,
                body_parts=body_parts,
                fps_lookup=fps_lookup,
                section=section,
                thresholds=thresholds,
                f1_list=f1_list,
                submission_list=submission_list,
            )

        print(f"Length of submission_list: {len(submission_list)}\n")

    except Exception as e:
        # Keep going with the next section, but show where the failure came from;
        # the bare message alone is rarely enough to debug it.
        print(f"\tError: {e}")
        traceback.print_exc()



1/9 Processing videos with: ['body_center', 'ear_left', 'ear_right', 'headpiece_bottombackleft', 'headpiece_bottombackright', 'headpiece_bottomfrontleft', 'headpiece_bottomfrontright', 'headpiece_topbackleft', 'headpiece_topbackright', 'headpiece_topfrontleft', 'headpiece_topfrontright', 'lateral_left', 'lateral_right', 'neck', 'nose', 'tail_base', 'tail_midpoint', 'tail_tip']

Start training CNN for section 1
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 22/22 [00:01<00:00, 18.53it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.2265 | Val F1: 0.6886 | LR: 0.001000


                                                       

Ep 2: Loss 0.1215 | Val F1: 0.7210 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	rear: 0.5617 (thr=0.82)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.2255 | Val F1: 0.7255 | LR: 0.001000


                                                       

Ep 2: Loss 0.1245 | Val F1: 0.7123 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	rear: 0.5381 (thr=0.65)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.2252 | Val F1: 0.7785 | LR: 0.001000


                                                       

Ep 2: Loss 0.1278 | Val F1: 0.7524 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	rear: 0.6346 (thr=0.66)

>>> MEAN F1 OVER 3 FOLDS: 0.5782 <<<
Loading TCN Ensemble (3 Folds) for Section 1...
-> Detected: n_feat=142, n_class=1
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 1...
-> Detected: n_feat=142, n_class=1
-> Loaded 3 models.


Infer Sec 1: 22it [01:09,  3.15s/it]


Start training CNN for section 1
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 72/72 [00:04<00:00, 17.37it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0382 | Val F1: 0.2476 | LR: 0.001000


                                                         

Ep 2: Loss 0.0190 | Val F1: 0.2880 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	approach: 0.3531 (thr=0.94)
	attack: 0.4680 (thr=0.92)
	avoid: 0.4980 (thr=0.92)
	chase: 0.5390 (thr=0.92)
	chaseattack: 0.5379 (thr=0.97)
	submit: 0.1103 (thr=0.30)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0439 | Val F1: 0.2043 | LR: 0.001000


                                                         

Ep 2: Loss 0.0231 | Val F1: 0.2325 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	approach: 0.3962 (thr=0.87)
	attack: 0.4918 (thr=0.91)
	avoid: 0.4875 (thr=0.83)
	chase: 0.2641 (thr=0.95)
	chaseattack: 0.5471 (thr=0.90)
	submit: 0.1795 (thr=0.57)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0380 | Val F1: 0.1494 | LR: 0.001000


                                                         

Ep 2: Loss 0.0202 | Val F1: 0.1981 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	approach: 0.3539 (thr=0.92)
	attack: 0.3458 (thr=0.95)
	avoid: 0.4182 (thr=0.81)
	chase: 0.5001 (thr=0.84)
	chaseattack: 0.4040 (thr=0.99)
	submit: 0.0147 (thr=0.08)

>>> MEAN F1 OVER 3 FOLDS: 0.3839 <<<
Loading TCN Ensemble (3 Folds) for Section 1...
-> Detected: n_feat=164, n_class=6
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 1...
-> Detected: n_feat=164, n_class=6
-> Loaded 3 models.


Infer Sec 1: 72it [04:11,  3.49s/it]


Length of submission_list: 94

2/9 Processing videos with: ['body_center', 'ear_left', 'ear_right', 'hip_left', 'hip_right', 'lateral_left', 'lateral_right', 'nose', 'spine_1', 'spine_2', 'tail_base', 'tail_middle_1', 'tail_middle_2', 'tail_tip']

Start training CNN for section 2
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 32/32 [00:00<00:00, 32.49it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0545 | Val F1: 0.0630 | LR: 0.001000


                                                       

Ep 2: Loss 0.0330 | Val F1: 0.0535 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	huddle: 0.2463 (thr=0.82)
	rear: 0.0000 (thr=0.50)
	selfgroom: 0.0000 (thr=0.50)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0500 | Val F1: 0.1153 | LR: 0.001000


                                                       

Ep 2: Loss 0.0285 | Val F1: 0.1365 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	huddle: 0.4298 (thr=0.71)
	rear: 0.0000 (thr=0.50)
	selfgroom: 0.0000 (thr=0.50)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.0666 | Val F1: 0.1991 | LR: 0.001000


                                                       

Ep 2: Loss 0.0389 | Val F1: 0.1934 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	huddle: 0.6253 (thr=0.71)
	rear: 0.0000 (thr=0.50)
	selfgroom: 0.0000 (thr=0.50)

>>> MEAN F1 OVER 3 FOLDS: 0.1446 <<<
Loading TCN Ensemble (3 Folds) for Section 2...
-> Detected: n_feat=151, n_class=3
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 2...
-> Detected: n_feat=151, n_class=3
-> Loaded 3 models.


Infer Sec 2: 32it [01:09,  2.17s/it]


Start training CNN for section 2
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 42/42 [00:01<00:00, 28.49it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1394 | Val F1: 0.1026 | LR: 0.001000


                                                       

Ep 2: Loss 0.0964 | Val F1: 0.1227 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	intromit: 0.0000 (thr=0.50)
	mount: 0.0000 (thr=0.50)
	reciprocalsniff: 0.3711 (thr=0.74)
	sniff: 0.0000 (thr=0.50)
	sniffgenital: 0.3778 (thr=0.75)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1413 | Val F1: 0.1032 | LR: 0.001000


                                                       

Ep 2: Loss 0.0933 | Val F1: 0.1245 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	intromit: 0.0000 (thr=0.50)
	mount: 0.0000 (thr=0.50)
	reciprocalsniff: 0.3181 (thr=0.78)
	sniff: 0.0000 (thr=0.50)
	sniffgenital: 0.5082 (thr=0.80)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1439 | Val F1: 0.1087 | LR: 0.001000


                                                       

Ep 2: Loss 0.0942 | Val F1: 0.1323 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	intromit: 0.0000 (thr=0.50)
	mount: 0.0000 (thr=0.50)
	reciprocalsniff: 0.3365 (thr=0.70)
	sniff: 0.0000 (thr=0.50)
	sniffgenital: 0.3769 (thr=0.58)

>>> MEAN F1 OVER 3 FOLDS: 0.1526 <<<
Loading TCN Ensemble (3 Folds) for Section 2...
-> Detected: n_feat=183, n_class=5
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 2...
-> Detected: n_feat=183, n_class=5
-> Loaded 3 models.


Infer Sec 2: 42it [01:57,  2.79s/it]


Length of submission_list: 168

3/9 Processing videos with: ['body_center', 'ear_left', 'ear_right', 'lateral_left', 'lateral_right', 'neck', 'nose', 'tail_base', 'tail_midpoint', 'tail_tip']

Start training CNN for section 3
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 37/37 [00:04<00:00,  8.54it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1732 | Val F1: 0.6819 | LR: 0.001000


                                                         

Ep 2: Loss 0.0944 | Val F1: 0.7306 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	rear: 0.5163 (thr=0.73)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1578 | Val F1: 0.6435 | LR: 0.001000


                                                         

Ep 2: Loss 0.0888 | Val F1: 0.6514 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	rear: 0.4648 (thr=0.87)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1434 | Val F1: 0.6074 | LR: 0.001000


                                                         

Ep 2: Loss 0.0782 | Val F1: 0.6181 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	rear: 0.4028 (thr=0.91)

>>> MEAN F1 OVER 3 FOLDS: 0.4613 <<<
Loading TCN Ensemble (3 Folds) for Section 3...
-> Detected: n_feat=142, n_class=1
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 3...
-> Detected: n_feat=142, n_class=1
-> Loaded 3 models.


Infer Sec 3: 37it [04:11,  6.79s/it]


Start training CNN for section 3
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 114/114 [00:15<00:00,  7.33it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1296 | Val F1: 0.0894 | LR: 0.001000


                                                         

Ep 2: Loss 0.0857 | Val F1: 0.0853 | LR: 0.001000

--- Final Threshold Tuning Fold 1 ---
	approach: 0.3685 (thr=0.70)
	attack: 0.0902 (thr=0.89)
	avoid: 0.1824 (thr=0.88)
	chase: 0.0204 (thr=0.78)
	chaseattack: 0.1263 (thr=0.87)
	submit: 0.1276 (thr=0.56)

>>> TRAINING FOLD 2/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1408 | Val F1: 0.0603 | LR: 0.001000


                                                         

Ep 2: Loss 0.0893 | Val F1: 0.0523 | LR: 0.001000

--- Final Threshold Tuning Fold 2 ---
	approach: 0.1710 (thr=0.84)
	attack: 0.1327 (thr=0.76)
	avoid: 0.2225 (thr=0.90)
	chase: 0.0263 (thr=0.67)
	chaseattack: 0.0860 (thr=0.62)
	submit: 0.0525 (thr=0.96)

>>> TRAINING FOLD 3/3 <<<
Fitting Scaler...


                                                               

Ep 1: Loss 0.1471 | Val F1: 0.0479 | LR: 0.001000


                                                         

Ep 2: Loss 0.0981 | Val F1: 0.0719 | LR: 0.001000

--- Final Threshold Tuning Fold 3 ---
	approach: 0.3418 (thr=0.82)
	attack: 0.0430 (thr=0.95)
	avoid: 0.1339 (thr=0.84)
	chase: 0.0207 (thr=0.78)
	chaseattack: 0.0766 (thr=0.95)
	submit: 0.0638 (thr=0.56)

>>> MEAN F1 OVER 3 FOLDS: 0.1270 <<<
Loading TCN Ensemble (3 Folds) for Section 3...
-> Detected: n_feat=164, n_class=6
-> Loaded 3 models.
Loading TCN Ensemble (3 Folds) for Section 3...
-> Detected: n_feat=164, n_class=6
-> Loaded 3 models.


Infer Sec 3: 114it [14:55,  7.86s/it]


Length of submission_list: 319

4/9 Processing videos with: ['body_center', 'ear_left', 'ear_right', 'lateral_left', 'lateral_right', 'nose', 'tail_base', 'tail_tip']

Start training CNN for section 4
>>> Optimizing: Converting DataFrames to Numpy (CPU heavy once, then fast)...


To Numpy: 100%|██████████| 82/82 [00:05<00:00, 14.38it/s]



>>> TRAINING FOLD 1/3 <<<
Fitting Scaler...


                                                               

KeyboardInterrupt: 

In [None]:


# ==========================================
# FINAL SUBMISSION GENERATION
# ==========================================

def convert_frames_to_intervals(df_frames, meta_df):
    """Convert frame-wise 0/1 action indicators into merged start/stop intervals.

    ``df_frames`` must contain the columns ``video_id``, ``agent_id``,
    ``target_id`` and ``frame``; every other column is treated as an action
    indicator that is expected to hold only 0 and 1. For each
    (video_id, agent_id, target_id) group the rows are ordered by ``frame`` and
    every run of consecutive 1s becomes one interval whose ``stop_frame`` is
    the frame of the run's last row plus one (exclusive end). Runs of the same
    action whose start lies at most one frame after the previous stop are fused,
    so a single inactive frame between two runs is bridged. Intervals shorter
    than 3 frames are dropped.

    Args:
        df_frames: Frame-wise predictions as described above.
        meta_df: Unused; accepted so existing call sites keep working.

    Returns:
        DataFrame with columns ``video_id``, ``agent_id``, ``target_id``,
        ``action``, ``start_frame``, ``stop_frame``, ordered by those columns,
        with a fresh integer index. Empty (same columns) when no run is found.

    Raises:
        ValueError: If a required column is missing from ``df_frames``.
    """
    required_cols = {"video_id", "agent_id", "target_id", "frame"}
    missing = required_cols - set(df_frames.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    key_cols = ['video_id', 'agent_id', 'target_id']
    out_cols = key_cols + ['action', 'start_frame', 'stop_frame']
    # Everything that is not an identifier column is an action indicator column.
    act_cols = [c for c in df_frames.columns if c not in required_cols]

    merged = []

    print("Converting frames to intervals...")
    for (vid, agent, target), grp in tqdm(df_frames.groupby(key_cols)):
        grp = grp.sort_values('frame')
        frames = grp['frame'].values

        for action in act_cols:
            vals = grp[action].values
            if vals.sum() == 0:
                continue

            # Zero-pad both ends so runs touching the first or last row still produce an edge.
            edges = np.diff(np.r_[0, vals, 0])
            run_starts = frames[np.where(edges == 1)[0]]
            # The falling edge sits one row past the run's last row; +1 makes the stop exclusive.
            run_stops = frames[np.where(edges == -1)[0] - 1] + 1

            # Runs arrive ordered by start frame, so touching/overlapping runs can be fused
            # in a single pass (frame numbers are integers, hence "<= stop + 1").
            cur_start = cur_stop = None
            for start_f, stop_f in zip(run_starts, run_stops):
                if cur_start is None:
                    cur_start, cur_stop = start_f, stop_f
                elif start_f <= cur_stop + 1:
                    cur_stop = max(cur_stop, stop_f)
                else:
                    merged.append((vid, agent, target, action, cur_start, cur_stop))
                    cur_start, cur_stop = start_f, stop_f
            if cur_start is not None:
                merged.append((vid, agent, target, action, cur_start, cur_stop))

    if not merged:
        return pd.DataFrame(columns=out_cols)

    # Order by (video, agent, target, action, start_frame).
    df_int = pd.DataFrame(merged, columns=out_cols).sort_values(out_cols[:-1])
    dur = df_int['stop_frame'] - df_int['start_frame']
    return df_int[dur >= 3].reset_index(drop=True)

# --- CREATE SUBMISSION ---

# Validate mode: score the collected predictions and persist F1 scores / thresholds.
# Submit mode: write submission.csv, falling back to a single dummy row when there is nothing to submit.
if CFG.mode == 'validate':
    print(">>> GENERATING VALIDATION SCORES...")
    if not submission_list:
        print("Error: Submission list is empty!")
    else:
        submission_raw = pd.concat(submission_list, axis=0, ignore_index=True).fillna(0)

        # Frame-wise output (has a "frame" column) still needs converting to intervals.
        if "frame" in submission_raw.columns:
            submission = convert_frames_to_intervals(submission_raw, train)
        else:
            submission = submission_raw

        if len(submission) == 0:
            print("Warning: No events detected after conversion.")
        else:
            cleaned_submission = clean_and_fill_submission(submission, train)
            print(f"Competition metric: {score(solution, cleaned_submission, ''):.4f}")

        if f1_list:
            f1_df = pd.DataFrame(f1_list, columns=['body_parts_tracked_str', 'action', 'binary F1 score'])
            print(f"Mean F1:             {f1_df['binary F1 score'].mean():.4f}")
            joblib.dump(f1_df, f"{getattr(CFG, 'model_dir', CFG.model_name)}/scores.pkl")

        joblib.dump(thresholds, f"{getattr(CFG, 'model_dir', CFG.model_name)}/final_thresholds_dict.pkl")

elif CFG.mode == 'submit':
    print(">>> GENERATING FINAL SUBMISSION.CSV...")

    # Set to the warning to print when the dummy row has to be used.
    fallback_reason = None
    if submission_list:
        print("Concatenating raw predictions...")
        submission_raw = pd.concat(submission_list, axis=0, ignore_index=True).fillna(0)

        if "frame" in submission_raw.columns:
            submission = convert_frames_to_intervals(submission_raw, test)
        else:
            submission = submission_raw

        if len(submission) == 0:
            fallback_reason = "Warning: No events found! Using dummy row."
    else:
        fallback_reason = "Warning: Creating dummy submission because list is empty."

    if fallback_reason is not None:
        print(fallback_reason)
        submission = pd.DataFrame({
            'video_id': [438887472],
            'agent_id': ['mouse1'],
            'target_id': ['self'],
            'action': ['rear'],
            'start_frame': [0],
            'stop_frame': [10],
        })

    cleaned_submission = clean_and_fill_submission(submission, test, is_train=False)
    cleaned_submission.index.name = 'row_id'
    cleaned_submission.to_csv('submission.csv')

    print("Success! saved to submission.csv")
    print(cleaned_submission.head())
