In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# -----------------------------
# STEP 1: Feature Extraction
# -----------------------------
def extract_advanced_features(df):
    import numpy as np
    import pandas as pd

    # Define sensor columns
    imu_sensors = [col for col in df.columns if col.startswith(('acc_', 'rot_', 'thm_'))]
    tof_sensors = [f'tof_{i}_v{v}' for i in range(1, 6) for v in [0, 7, 56, 63]]
    sensors = imu_sensors + tof_sensors

    # Replace -1 with NaN
    df[sensors] = df[sensors].replace(-1, np.nan)

    # Group by sequence_id and behavior
    grouped = df.groupby(['sequence_id', 'behavior'])[sensors]
    stats = grouped.agg(['mean', 'std', 'min', 'max'])

    # Flatten sensor + stat into one level
    stats.columns = ['_'.join(col) for col in stats.columns]

    # Pivot so we get one row per sequence_id with behavior in column names
    features = stats.reset_index().pivot(index='sequence_id', columns='behavior')

    # Flatten the column MultiIndex
    features.columns = [f'{behavior}_{col}' for behavior, col in features.columns]

    # Fill missing values (due to missing behaviors in some sequences)
    return features.fillna(0)



train = pd.read_csv("train.csv")
X = extract_advanced_features(train)
sequence_targets = train[train['behavior'] == 'Performs gesture'].groupby('sequence_id').first()
y_binary = (sequence_targets['sequence_type'] == 'Target').astype(int)
y_multi = sequence_targets['gesture']
X = X.loc[y_binary.index]
X.fillna(X.median(), inplace=True)

# -----------------------------
# STEP 2: Preprocessing
# -----------------------------
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
X_train, X_val, yb_train, yb_val = train_test_split(X_scaled, y_binary, stratify=y_binary, test_size=0.2, random_state=42)
pos, neg = yb_train.sum(), len(yb_train) - yb_train.sum()
scale = neg / pos

# -----------------------------
# STEP 3: Feature Selection
# -----------------------------
xgb_feat = XGBClassifier(n_estimators=100, max_depth=6, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale)
xgb_feat.fit(X_train, yb_train)
selector = SelectFromModel(xgb_feat, threshold="median", prefit=True)
X_train_sel = selector.transform(X_train)
X_val_sel = selector.transform(X_val)

# -----------------------------
# STEP 4: Train Final Binary Model
# -----------------------------
clf_binary = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale)
clf_binary.fit(X_train_sel, yb_train)
yb_preds = clf_binary.predict(X_val_sel)
print("✅ Binary Accuracy:", accuracy_score(yb_val, yb_preds))
print("✅ Binary F1 Score:", f1_score(yb_val, yb_preds))

# -----------------------------
# STEP 5: Multiclass Model
# -----------------------------
X_multi = X_scaled.loc[y_multi.index.intersection(X_scaled.index)]
X_multi_sel = selector.transform(X_multi)
ym_encoded = LabelEncoder().fit_transform(y_multi.loc[X_multi.index])
X_train_m, X_val_m, ym_train, ym_val = train_test_split(X_multi_sel, ym_encoded, stratify=ym_encoded, test_size=0.2, random_state=42)
clf_multi = XGBClassifier(n_estimators=250, max_depth=10, learning_rate=0.05, use_label_encoder=False, eval_metric='mlogloss')
clf_multi.fit(X_train_m, ym_train)
ym_preds = clf_multi.predict(X_val_m)
print("🎯 Multiclass Accuracy:", accuracy_score(ym_val, ym_preds))
print("🎯 Multiclass Macro F1 Score:", f1_score(ym_val, ym_preds, average='macro'))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Binary Accuracy: 0.9766871165644172
✅ Binary F1 Score: 0.9815175097276264


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


🎯 Multiclass Accuracy: 0.6938650306748466
🎯 Multiclass Macro F1 Score: 0.6827982716457871


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier

# -----------------------------
# STEP 1: Feature Engineering
# -----------------------------
def extract_advanced_features(df):
    imu_sensors = [col for col in df.columns if col.startswith(('acc_', 'rot_', 'thm_'))]
    tof_sensors = [f'tof_{i}_v{v}' for i in range(1, 6) for v in [0, 7, 56, 63]]
    sensors = imu_sensors + tof_sensors

    # Only sort if 'timestamp' is present
    if 'timestamp' in df.columns:
        df = df.sort_values(['sequence_id', 'timestamp'])    
        df[sensors] = df[sensors].replace(-1, np.nan)

    # Derivative features
    for col in sensors:
        df[f'{col}_diff'] = df.groupby('sequence_id')[col].diff()
    sensors += [f'{col}_diff' for col in sensors]

    grouped = df.groupby(['sequence_id', 'behavior'])[sensors]
    stats = grouped.agg(['mean', 'std', 'min', 'max', 'median', 'skew'])
    stats.columns = ['_'.join([sensor, stat]) for sensor, stat in stats.columns]

    features = stats.reset_index().pivot(index='sequence_id', columns='behavior')
    features.columns = [f'{behavior}_{col}' for behavior, col in features.columns]
    return features.fillna(0)

# -----------------------------
# STEP 2: Load Data
# -----------------------------
train = pd.read_csv("train.csv")
X = extract_advanced_features(train)

sequence_targets = train[train['behavior'] == 'Performs gesture'].groupby('sequence_id').first()
y_binary = (sequence_targets['sequence_type'] == 'Target').astype(int)
y_multi = sequence_targets['gesture']
X = X.loc[y_binary.index]
X.columns = X.columns.str.replace(r"[\[\]<>]", "", regex=True)
X.fillna(X.median(), inplace=True)

# -----------------------------
# STEP 3: Preprocessing
# -----------------------------
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
X_train, X_val, yb_train, yb_val = train_test_split(X_scaled, y_binary, stratify=y_binary, test_size=0.2, random_state=42)
pos, neg = yb_train.sum(), len(yb_train) - yb_train.sum()
scale = neg / pos

# -----------------------------
# STEP 4: Feature Selection via RFECV
# -----------------------------
xgb_base = XGBClassifier(n_estimators=100, max_depth=6, scale_pos_weight=scale, eval_metric='logloss', random_state=42)
selector = RFECV(xgb_base, step=0.2, scoring='f1', cv=3, n_jobs=-1)
selector.fit(X_train, yb_train)
X_train_sel = selector.transform(X_train)
X_val_sel = selector.transform(X_val)

# -----------------------------
# STEP 5: Binary Classifier
# -----------------------------
clf_binary = XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.03,
                           scale_pos_weight=scale, eval_metric='logloss', random_state=42)
clf_binary.fit(X_train_sel, yb_train)
yb_preds = clf_binary.predict(X_val_sel)
print("✅ Binary F1 Score:", f1_score(yb_val, yb_preds))

# -----------------------------
# STEP 6: Multiclass Classifier
# -----------------------------
X_multi = X_scaled.loc[y_multi.index.intersection(X_scaled.index)]
X_multi_sel = selector.transform(X_multi)
label_encoder = LabelEncoder()
ym_encoded = label_encoder.fit_transform(y_multi.loc[X_multi.index])
X_train_m, X_val_m, ym_train, ym_val = train_test_split(X_multi_sel, ym_encoded, stratify=ym_encoded, test_size=0.2, random_state=42)

clf_multi = XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.03,
                          subsample=0.8, colsample_bytree=0.7,
                          reg_alpha=1, reg_lambda=2,
                          eval_metric='mlogloss', random_state=42)
clf_multi.fit(X_train_m, ym_train)
ym_preds = clf_multi.predict(X_val_m)
print("🎯 Multiclass Macro F1 Score:", f1_score(ym_val, ym_preds, average='macro'))


✅ Binary F1 Score: 0.9805068226120858
🎯 Multiclass Macro F1 Score: 0.7135704731960818


In [5]:
# -----------------------------
# STEP 7: Test Predictions
# -----------------------------
def extract_advanced_feature(df):
    imu_sensors = [col for col in df.columns if col.startswith(('acc_', 'rot_', 'thm_'))]
    tof_sensors = [f'tof_{i}_v{v}' for i in range(1, 6) for v in [0, 7, 56, 63]]
    sensors = imu_sensors + tof_sensors

    if 'timestamp' in df.columns:
        df = df.sort_values(['sequence_id', 'timestamp'])
        df[sensors] = df[sensors].replace(-1, np.nan)

    # Derivative features
    for col in sensors:
        df[f'{col}_diff'] = df.groupby('sequence_id')[col].diff()
    sensors += [f'{col}_diff' for col in sensors]

    # Detect if 'behavior' column exists
    group_cols = ['sequence_id']
    if 'behavior' in df.columns:
        group_cols.append('behavior')

    grouped = df.groupby(group_cols)[sensors]
    stats = grouped.agg(['mean', 'std', 'min', 'max', 'median', 'skew'])
    stats.columns = ['_'.join([sensor, stat]) for sensor, stat in stats.columns]

    features = stats.reset_index()

    if 'behavior' in features.columns:
        # Pivot only if behavior exists
        features = features.pivot(index='sequence_id', columns='behavior')
        features.columns = [f'{behavior}_{col}' for behavior, col in features.columns]
    else:
        # Otherwise just set sequence_id as index
        features = features.set_index('sequence_id')

    return features.fillna(0)
# -----------------------------
# STEP 7: Test Predictions
# -----------------------------
test = pd.read_csv("test.csv")
X_test = extract_advanced_feature(test)
X_test.columns = X_test.columns.str.replace(r"[\[\]<>]", "", regex=True)

# Add missing columns all at once
missing_cols = [col for col in X.columns if col not in X_test.columns]
X_test = pd.concat([X_test, pd.DataFrame(0, index=X_test.index, columns=missing_cols)], axis=1)
X_test = X_test[X.columns]  # reorder to match training

# Fill NaNs and scale
X_test.fillna(X.median(), inplace=True)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Feature selection
X_test_sel = selector.transform(X_test_scaled)

# Binary predictions
binary_preds = clf_binary.predict(X_test_sel)

# Gesture predictions only on Target sequences
gesture_preds = np.full(len(X_test_sel), fill_value=-1)
target_indices = np.where(binary_preds == 1)[0]

if len(target_indices) > 0:
    predicted_gestures = clf_multi.predict(X_test_sel[target_indices])
    gesture_preds[target_indices] = predicted_gestures
    gesture_labels = np.full(len(gesture_preds), "None", dtype=object)
    gesture_labels[target_indices] = label_encoder.inverse_transform(predicted_gestures)
else:
    gesture_labels = np.full(len(gesture_preds), "None", dtype=object)

# Save submission
submission = pd.DataFrame({
    "sequence_id": X_test.index,
    "target": binary_preds,
    "gesture": gesture_labels
})
submission.to_csv("submission.parquet", index=False)
print("✅ submission.csv saved")
print(submission.head())


✅ submission.csv saved


In [4]:
# ==================== 0. Imports ====================
import os, gc, warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

warnings.filterwarnings("ignore")

# ==================== 1. Feature Extraction ====================
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.replace(-1, np.nan)
    features = {}

    sensor_cols = [col for col in df.columns if col.startswith(('acc_', 'rot_', 'thm_', 'tof_'))]

    for col in sensor_cols:
        x = df[col].dropna()
        features[f"{col}_mean"] = x.mean()
        features[f"{col}_std"] = x.std()
        features[f"{col}_min"] = x.min()
        features[f"{col}_max"] = x.max()
        features[f"{col}_median"] = x.median()
        features[f"{col}_range"] = x.max() - x.min()
        dx = x.diff().dropna()
        features[f"{col}_diff_mean"] = dx.mean()
        features[f"{col}_diff_std"] = dx.std()

    return pd.DataFrame([features])

# ==================== 2. Load Data & Extract Features ====================
df = pd.read_csv("train.csv")

# Filter gesture sequences
label_df = df[df["behavior"] == "Performs gesture"]
label_df = label_df[["sequence_id", "gesture"]].drop_duplicates()

# Extract features for all sequences
feature_list = []
for seq_id, group in df.groupby("sequence_id"):
    feats = extract_features(group)
    feats["sequence_id"] = seq_id
    feature_list.append(feats)

feat_df = pd.concat(feature_list, ignore_index=True)
train_df = feat_df.merge(label_df, on="sequence_id", how="left")
train_df["gesture"] = train_df["gesture"].fillna("None")

# ==================== 3. Label Encoding ====================
le = LabelEncoder()
train_df["gesture"] = train_df["gesture"].astype(str)

# Drop classes with <2 samples
class_counts = train_df["gesture"].value_counts()
valid_classes = class_counts[class_counts >= 2].index
train_df = train_df[train_df["gesture"].isin(valid_classes)].copy()

train_df["gesture_enc"] = le.fit_transform(train_df["gesture"])

X = train_df.drop(columns=["sequence_id", "gesture", "gesture_enc"])
y = train_df["gesture_enc"]

# ==================== 4. Train-Validation Split ====================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


dtrain = lgb.Dataset(X_train, y_train)
dval = lgb.Dataset(X_val, y_val)



TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [5]:
# ==================== 5. Train LightGBM Model ====================
params = {
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "verbosity": -1,
    "seed": 42,
}

model = lgb.train(
    params,
    dtrain,
    num_boost_round=500,
    valid_sets=[dval],
    callbacks=[
        lgb.early_stopping(40),
        lgb.log_evaluation(100)
    ]
)


# ==================== 6. Validation Score ====================
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)
y_val_pred_cls = np.argmax(y_val_pred, axis=1)
macro_f1 = f1_score(y_val, y_val_pred_cls, average="macro")

print("🎯 Validation Macro F1 Score:", macro_f1)



Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 1.17318
Early stopping, best iteration is:
[113]	valid_0's multi_logloss: 1.17164
🎯 Validation Macro F1 Score: 0.5927662315843657


In [1]:
# ==================== 0. Dependencies ====================
import os, gc, warnings, itertools
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb

warnings.filterwarnings("ignore")

# ==================== 1. Utility Functions ====================
def safe(func, arr, default=np.nan):
    try:
        val = func(arr)
        return default if (val is None or np.isnan(val)) else val
    except Exception:
        return default

def series_fft_dom_freq(a: np.ndarray):
    if a.size < 2:
        return np.nan
    fft = np.fft.rfft(a)
    freqs = np.fft.rfftfreq(len(a))
    return np.abs(freqs[np.argmax(np.abs(fft))])


# ==================== 2. Feature Extraction ====================
NUM_COLS = ['acc_x', 'acc_y', 'acc_z',
            'rot_w', 'rot_x', 'rot_y', 'rot_z']
THM_COLS = [f'thm_{i}' for i in range(1, 6)]
TOF_COLS = list(itertools.product(range(1, 6), range(64)))  # (sensor, voxel)

def extract_one_sequence(seq_id: int, g: pd.DataFrame) -> dict:
    feat = {'sequence_id': seq_id}

    for col in NUM_COLS:
        arr = g[col].to_numpy()
        feat[f'{col}_mean']   = safe(np.mean, arr)
        feat[f'{col}_std']    = safe(np.std,  arr)
        feat[f'{col}_max']    = safe(np.max,  arr)
        feat[f'{col}_min']    = safe(np.min,  arr)
        feat[f'{col}_skew']   = safe(stats.skew, arr)
        feat[f'{col}_kurt']   = safe(stats.kurtosis, arr)
        feat[f'{col}_q25']    = safe(lambda x: np.quantile(x, .25), arr)
        feat[f'{col}_q75']    = safe(lambda x: np.quantile(x, .75), arr)
        feat[f'{col}_domf']   = safe(series_fft_dom_freq, arr)

    acc_norm = np.linalg.norm(g[['acc_x', 'acc_y', 'acc_z']].to_numpy(), axis=1)
    feat['acc_norm_mean'] = safe(np.mean, acc_norm)
    feat['acc_norm_std']  = safe(np.std,  acc_norm)

    for col in THM_COLS:
        if col in g.columns:
            arr = g[col].to_numpy(dtype=float)
            feat[f'{col}_mean'] = safe(np.nanmean, arr)
            feat[f'{col}_std']  = safe(np.nanstd, arr)

    for sensor, voxel in TOF_COLS:
        col = f'tof_{sensor}_v{voxel}'
        if col in g.columns:
            arr = g[col].replace(-1, np.nan).to_numpy(dtype=float)
            feat[f'{col}_mean'] = safe(np.nanmean, arr)

    return feat

def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    feats = [extract_one_sequence(seq_id, group)
             for seq_id, group in df.groupby('sequence_id')]
    return pd.DataFrame(feats)


# ==================== 3. Read and Process Train ====================
ROOT = '/kaggle/input/cmi-detect-behavior-with-sensor-data'
train_raw = pd.read_csv('train.csv')

train_feats = extract_features(train_raw)

labels = (train_raw[['sequence_id', 'gesture']]
          .drop_duplicates()
          .rename(columns={'gesture': 'label'}))

df_train = train_feats.merge(labels, on='sequence_id')
le = LabelEncoder()
df_train['label_enc'] = le.fit_transform(df_train['label'])

X = df_train.drop(['sequence_id', 'label', 'label_enc'], axis=1)
y = df_train['label_enc']

# ==================== 4. CV + Training + Evaluation ====================
lgb_params = dict(
    objective='multiclass',
    num_class=len(le.classes_),
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for fold, (trn_idx, val_idx) in enumerate(kfold.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dval   = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        lgb_params, dtrain, num_boost_round=500,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    y_val_pred_classes = np.argmax(y_val_pred, axis=1)

    acc = accuracy_score(y_val, y_val_pred_classes)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_val, y_val_pred_classes, target_names=le.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred_classes))
    accuracies.append(acc)
    gc.collect()

print(f"\n=== Average CV Accuracy: {np.mean(accuracies):.4f} ===")



=== Fold 1 ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100103
[LightGBM] [Info] Number of data points in the train set: 6520, number of used features: 394
[LightGBM] [Info] Start training from score -2.546260
[LightGBM] [Info] Start training from score -2.550182
[LightGBM] [Info] Start training from score -3.922817
[LightGBM] [Info] Start training from score -2.548219
[LightGBM] [Info] Start training from score -2.544305
[LightGBM] [Info] Start training from score -3.922817
[LightGBM] [Info] Start training from score -2.544305
[LightGBM] [Info] Start training from score -2.544305
[LightGBM] [Info] Start training from score -3.930599
[LightGBM] [Info] Start training from score -2.544305
[LightGBM] [Info] Start training from score -2.544305
[LightGBM] [Info] Start training from score -3.922817
[LightGBM] [Info] Start training from score -