# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">🚆 Model Training</div>

For Competition notebook visit ---> [https://www.kaggle.com/code/taimour/cmi-deep-dive-eda-boosters](https://www.kaggle.com/code/taimour/cmi-deep-dive-eda-boosters)

Due to time limitations, model was trained separately in this notebook.

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">📚 Libraries / Packages</div>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import polars as pl
import kaggle_evaluation.cmi_inference_server
import joblib

import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from scipy.spatial.transform import Rotation as R

from sklearn.preprocessing import LabelEncoder

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">📖 Load Data</div>

In [2]:
train = pl.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv')
train_demo = pl.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv')

# Drop these columns from training data
train = train.drop(['phase', 'orientation', 'behavior', 'sequence_type'])

data = train.join(train_demo,on="subject",how="left")
# data = train

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">💎 Feature Engineering</div>

**Helper Functions**

In [3]:
def remove_gravity_from_acc(acc_data, rot_data):

    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    
    gravity_world = np.array([0, 0, 9.81])

    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i, :] = acc_values[i, :] 
            continue

        try:
            rotation = R.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
             linear_accel[i, :] = acc_values[i, :]
             
    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))

    for i in range(num_samples - 1):
        q_t = quat_values[i]
        q_t_plus_dt = quat_values[i+1]

        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \
           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):
            continue

        try:
            rot_t = R.from_quat(q_t)
            rot_t_plus_dt = R.from_quat(q_t_plus_dt)

            # Calculate the relative rotation
            delta_rot = rot_t.inv() * rot_t_plus_dt
            
            # Convert delta rotation to angular velocity vector
            # The rotation vector (Euler axis * angle) scaled by 1/dt
            # is a good approximation for small delta_rot
            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            # If quaternion is invalid, angular velocity remains zero
            pass
            
    return angular_vel

def calculate_angular_distance(rot_data):
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)

    for i in range(num_samples - 1):
        q1 = quat_values[i]
        q2 = quat_values[i+1]

        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \
           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):
            angular_dist[i] = 0 # Или np.nan, в зависимости от желаемого поведения
            continue
        try:
            # Converting quaternions to Rotation objects
            r1 = R.from_quat(q1)
            r2 = R.from_quat(q2)

            # Calculating the angular distance: 2 * arccos(|real(p * q*)|)
            # where q* is the conjugate of quaternion q
            # In scipy.spatial.transform.Rotation, r1.inv() * r2 gives the relative rotation.
            # The angle of this relative rotation is the angular distance.
            relative_rotation = r1.inv() * r2
            
            # The angle of the rotation vector corresponds to the angular distance
            # The norm of the rotation vector is the angle in radians
            angle = np.linalg.norm(relative_rotation.as_rotvec())
            angular_dist[i] = angle
        except ValueError:
            angular_dist[i] = 0 # In case of invalid quaternions
            pass
            
    return angular_dist

**IMU**

In [4]:
def feature_engineering_imu(data:pl.DataFrame):
    data = data.to_pandas()
    data['acc_mag'] = np.sqrt(data['acc_x']**2 + data['acc_y']**2 + data['acc_z']**2)
    data['rot_angle'] = 2 * np.arccos(data['rot_w'].clip(-1, 1))
    data['acc_mag_jerk'] = data.groupby('sequence_id')['acc_mag'].diff().fillna(0)
    data['rot_angle_vel'] = data.groupby('sequence_id')['rot_angle'].diff().fillna(0)

    linear_accel_list = []
    for _, group in data.groupby('sequence_id'):
        acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]
        rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)
        linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))
    
    df_linear_accel = pd.concat(linear_accel_list)
    data = pd.concat([data, df_linear_accel], axis=1)
    data['linear_acc_mag'] = np.sqrt(data['linear_acc_x']**2 + data['linear_acc_y']**2 + data['linear_acc_z']**2)
    data['linear_acc_mag_jerk'] = data.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)
    angular_vel_list = []
    for _, group in data.groupby('sequence_id'):
        rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        angular_vel_group = calculate_angular_velocity_from_quat(rot_data_group)
        angular_vel_list.append(pd.DataFrame(angular_vel_group, columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index))
    
    df_angular_vel = pd.concat(angular_vel_list)
    data = pd.concat([data, df_angular_vel], axis=1)
    
    print("  Calculating angular distance between successive quaternions...")
    angular_distance_list = []
    for _, group in data.groupby('sequence_id'):
        rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        angular_dist_group = calculate_angular_distance(rot_data_group)
        angular_distance_list.append(pd.DataFrame(angular_dist_group, columns=['angular_distance'], index=group.index))
    
    df_angular_distance = pd.concat(angular_distance_list)
    data = pd.concat([data, df_angular_distance], axis=1)
    data = pl.from_pandas(data)
    return data

In [5]:
train = feature_engineering_imu(train)

  return op(a, b)
  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


  Calculating angular distance between successive quaternions...


In [6]:
train.head(2)

row_id,sequence_id,sequence_counter,subject,gesture,acc_x,acc_y,acc_z,rot_w,rot_x,rot_y,rot_z,thm_1,thm_2,thm_3,thm_4,thm_5,tof_1_v0,tof_1_v1,tof_1_v2,tof_1_v3,tof_1_v4,tof_1_v5,tof_1_v6,tof_1_v7,tof_1_v8,tof_1_v9,tof_1_v10,tof_1_v11,tof_1_v12,tof_1_v13,tof_1_v14,tof_1_v15,tof_1_v16,tof_1_v17,tof_1_v18,tof_1_v19,…,tof_5_v40,tof_5_v41,tof_5_v42,tof_5_v43,tof_5_v44,tof_5_v45,tof_5_v46,tof_5_v47,tof_5_v48,tof_5_v49,tof_5_v50,tof_5_v51,tof_5_v52,tof_5_v53,tof_5_v54,tof_5_v55,tof_5_v56,tof_5_v57,tof_5_v58,tof_5_v59,tof_5_v60,tof_5_v61,tof_5_v62,tof_5_v63,acc_mag,rot_angle,acc_mag_jerk,rot_angle_vel,linear_acc_x,linear_acc_y,linear_acc_z,linear_acc_mag,linear_acc_mag_jerk,angular_vel_x,angular_vel_y,angular_vel_z,angular_distance
str,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SEQ_000007_000000""","""SEQ_000007""",0,"""SUBJ_059520""","""Cheek - pinch skin""",6.683594,6.214844,3.355469,0.134399,-0.355164,-0.447327,-0.809753,28.943842,31.822186,29.553024,28.592863,28.310535,131.0,134.0,132.0,135.0,98.0,74.0,64.0,60.0,-1.0,-1.0,152.0,153.0,141.0,89.0,68.0,63.0,-1.0,-1.0,-1.0,-1.0,…,113.0,124.0,122.0,131.0,-1.0,-1.0,-1.0,-1.0,120.0,127.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9.723882,2.871978,0.0,0.0,-0.13854,0.044578,-0.053696,0.155125,0.0,-6.600812,9.554974,2.140512,0.059044
"""SEQ_000007_000001""","""SEQ_000007""",1,"""SUBJ_059520""","""Cheek - pinch skin""",6.949219,6.214844,3.125,0.143494,-0.340271,-0.42865,-0.824524,29.340816,31.874645,29.79174,28.663383,28.406172,130.0,138.0,131.0,135.0,101.0,76.0,66.0,61.0,-1.0,-1.0,156.0,155.0,141.0,93.0,74.0,64.0,-1.0,-1.0,-1.0,-1.0,…,116.0,122.0,123.0,126.0,-1.0,-1.0,-1.0,-1.0,122.0,129.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9.832678,2.853611,0.108796,-0.018367,0.237503,0.238219,-0.808055,0.875276,0.720151,-16.678839,44.579519,21.057952,0.260238


**Statistical**

In [7]:
def feature_engineering_stat(data:pl.DataFrame):
    non_sensor_cols = []
    if "gesture" in data.columns:
        non_sensor_cols = ["gesture"]
        
    # All numeric sensor columns (everything except id, demo, target)
    stat_cols = [
        c for c in data.columns
        if c not in non_sensor_cols + ["sequence_id", "row_id","sequence_counter","subject"]
    ]
    
    # Build aggregation expressions
    agg_exprs = []
    
    # full-stats bundle for sensor columns
    for c in stat_cols:
        agg_exprs += [
            pl.col(c).mean().alias(f"{c}_mean"),
            pl.col(c).std().alias(f"{c}_std"),
            pl.col(c).var().alias(f"{c}_var"),
            pl.col(c).quantile(0.25).alias(f"{c}_q25"),
            pl.col(c).median().alias(f"{c}_q50"),
            pl.col(c).quantile(0.75).alias(f"{c}_q75"),
            pl.col(c).max().alias(f"{c}_max"),
            pl.col(c).min().alias(f"{c}_min"),
            pl.col(c).first().alias(f"{c}_first"),
            pl.col(c).last().alias(f"{c}_last"),
            pl.col(c).quantile(0.25, "nearest").alias(f"{c}_t25"),
            pl.col(c).quantile(0.75, "nearest").alias(f"{c}_t75"),
            (pl.col(c).last() - pl.col(c).first()).alias(f"{c}_delta"),
            pl.corr("sequence_counter", c).alias(f"{c}_corr_time"),
            pl.col(c).diff().mean().alias(f"{c}_diff_mean"),
            pl.col(c).diff().std().alias(f"{c}_diff_std"),
            pl.col(c).skew().alias(f"{c}_skew"),
            pl.col(c).kurtosis().alias(f"{c}_kurt"),
            pl.col(c).diff().abs().gt(0).sum().alias(f"{c}_n_changes")
        ]
        agg_exprs += [
            pl.when(pl.col("sequence_counter") < 0.1 * pl.max("sequence_counter"))
              .then(pl.col(c)).otherwise(None).mean().alias(f"{c}_seg1_mean"),
            pl.when(pl.col("sequence_counter") > 0.9 * pl.max("sequence_counter"))
              .then(pl.col(c)).otherwise(None).mean().alias(f"{c}_seg3_mean"),
        ]
    
    # first() for demographics and target
    agg_exprs += [
        pl.col(c).first().alias(c) for c in non_sensor_cols
    ]
    
    # Group-by and aggregate
    cleaned_data = (
        data
        .group_by("sequence_id", maintain_order=True)
        .agg(agg_exprs)
    )
    return cleaned_data

In [8]:
train_demographic_target_cols = [
    "gesture"
    ]
cleaned_data = feature_engineering_stat(data)
cleaned_data.shape

(8151, 7121)

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">🎯 Target Label Encoding</div>

In [9]:
# Assume cleaned_data is already a Polars DataFrame
target_col = "gesture"

# --- Convert Polars DataFrame to Pandas only if needed ---
# CatBoost does not yet fully support Polars directly
df = cleaned_data.to_pandas()

# --- Define X and y properly ---
X = df.drop(columns=[target_col, "sequence_id"])  # Feature matrix
y = df[target_col].values # Target

# Encode target
le = LabelEncoder()
y = le.fit_transform(y)

joblib.dump(le, 'Target_LabelEncoder.joblib')

['Target_LabelEncoder.joblib']

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">🚅 Train the Model</div>

In [10]:
# CAT Set up stratified KFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_acc, fold_f1 = [], []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    final_model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.06,
        loss_function='MultiClass',
        task_type="GPU",
        devices='0',
        verbose=False
    )

    final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    y_pred = final_model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro")

    fold_acc.append(acc)
    fold_f1.append(f1)
    print(f"Fold {fold}: Accuracy={acc:.4f}, Macro-F1={f1:.4f}")

print("\n======  5-Fold Summary  ======")
print(f"Accuracy:  mean={np.mean(fold_acc):.4f}  std={np.std(fold_acc):.4f}")
print(f"Macro-F1 : mean={np.mean(fold_f1):.4f}  std={np.std(fold_f1):.4f}")

Fold 1: Accuracy=0.6450, Macro-F1=0.6564
Fold 2: Accuracy=0.6454, Macro-F1=0.6440
Fold 3: Accuracy=0.6485, Macro-F1=0.6577
Fold 4: Accuracy=0.6313, Macro-F1=0.6467
Fold 5: Accuracy=0.6509, Macro-F1=0.6531

Accuracy:  mean=0.6442  std=0.0068
Macro-F1 : mean=0.6516  std=0.0054


In [11]:
final_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.06,
    loss_function='MultiClass',
    task_type="GPU",
    devices='0',
    verbose=False
)
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7e64f97e4550>

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">💾 Save the Model</div>

In [12]:
joblib.dump(final_model, f'model_catboost.joblib')

['model_catboost.joblib']

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">🌟 Predict</div>

Not needed as we are just trainig the model. It is only kept here to be used for testing only, whenever needed.

In [13]:
# def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
#     # data =sequence
#     data = sequence.join(demographics,on="subject",how="left")
#     # print(data.schema)
#     cleaned_data = feature_engineering_stat(data)
#     pdf = cleaned_data.to_pandas().drop(columns=["sequence_id"])
#     predictions = final_model.predict(pdf).ravel()
#     predictions = le.inverse_transform(predictions)
#     return predictions[0]

# <div style="color:#2d83ed; font-family: 'Segoe UI'; text-align: center; border-top:5px solid green; padding-left:10px; background-color:#F8F9F9; padding:10px; border-radius:5px;font-weight: bold">🪐 Inference Server</div>
Not needed as we are just trainig the model. It is only kept here to be used for testing only, whenever needed.

In [14]:
# inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway(
#         data_paths=(
#             '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv',
#             '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv',
#         )
#     )