In [None]:
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
import warnings 
warnings.filterwarnings("ignore")

from scipy.spatial.transform import Rotation as R

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.utils import Sequence, to_categorical, pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Conv1D, BatchNormalization, LayerNormalization, Activation, add, MaxPooling1D, Dropout,
    Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Multiply, Reshape,
    Lambda, Concatenate, GRU, GaussianNoise
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
import tensorflow as tf
import polars as pl

import matplotlib.pyplot as plt

2025-08-19 20:26:22.536405: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-19 20:26:22.751360: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755624382.833538    2937 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755624382.855714    2937 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-19 20:26:23.055462: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
import keras
# Report the installed TensorFlow and Keras versions so runs can be reproduced.
print(tf.__version__)
print(keras.__version__)

2.18.0
3.8.0


In [None]:
# List the physical GPU devices TensorFlow reports for this kernel.
print(tf.config.list_physical_devices("GPU"))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import tensorflow as tf
# Print the build metadata returned by tf.sysconfig.get_build_info().
print(tf.sysconfig.get_build_info())

OrderedDict([('cpu_compiler', '/usr/lib/llvm-18/bin/clang'), ('cuda_compute_capabilities', ['sm_60', 'sm_70', 'sm_80', 'sm_89', 'compute_90']), ('cuda_version', '12.5.1'), ('cudnn_version', '9'), ('is_cuda_build', True), ('is_rocm_build', False), ('is_tensorrt_build', False)])


In [None]:
import tensorflow as tf
# Print how many GPU devices TensorFlow lists ("GPU sayısı" is Turkish for "GPU count").
print("GPU sayısı:", len(tf.config.list_physical_devices('GPU')))

GPU sayısı: 1


In [None]:
import random

# Seed shared by every random number generator seeded below.
state_num = 17


def seed_everything(seed):
    """Seed Python, NumPy and TensorFlow RNGs and set determinism env flags.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random``,
            ``tf.random`` and ``tf.experimental.numpy.random``; also written
            (as a string) to ``PYTHONHASHSEED``.

    NOTE(review): the environment flags are written after TensorFlow has been
    imported; whether they still take effect at that point should be confirmed.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_CUDNN_DETERMINISTIC': '1',
        'TF_DETERMINISTIC_OPS': '1',
    })
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.experimental.numpy.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything(seed=state_num)

In [None]:
# --- Run configuration ---------------------------------------------------
# The consumers of these constants are outside this cell; notes marked
# "presumably" are assumptions to confirm against the training code.
# (Competition metric will only be imported when TRAINing)
TRAIN = True                # presumably switches between training and inference-only runs
DEBUG_GATE = False          # debug switch; its consumer is not visible here
                     
RAW_DIR = Path("")          # empty path: resolves relative to the current working directory
PRETRAINED_DIR = Path("new_model_10_fold")              # presumably where saved models are read from
EXPORT_DIR = Path("81lb_test_3kol_imu_tof_tofraw")      # presumably where artefacts are written
BATCH_SIZE = 64
PAD_PERCENTILE = 95         # presumably the sequence-length percentile used to pick the pad length
LR_INIT = 5e-4              # presumably the initial learning rate
WD = 3e-3                   # presumably the weight-decay / L2 factor passed to the model builders
MIXUP_ALPHA = 0.4           # presumably the Beta(alpha, alpha) parameter for mixup
EPOCHS = 160
PATIENCE = 40               # presumably the early-stopping patience, in epochs
N_SPLITS = 5                # presumably the number of cross-validation folds
MASKING_PROB = 0.25         # presumably the per-sample probability of masking in the generator
GATE_LOSS_WEIGHT = 0.20     # presumably the loss weight of the auxiliary gate output

print("▶ imports ready · tensorflow", tf.__version__)

▶ imports ready · tensorflow 2.18.0


In [None]:
def remove_gravity_from_acc(acc_data, rot_data):
    """Subtract gravity from accelerometer readings using orientation quaternions.

    The world-frame gravity vector ``[0, 0, 9.81]`` is rotated into the sensor
    frame with the inverse of each row's rotation and subtracted from that
    row's measured acceleration.

    Args:
        acc_data: pandas DataFrame with columns ``acc_x``, ``acc_y``, ``acc_z``.
        rot_data: pandas DataFrame with columns ``rot_x``, ``rot_y``, ``rot_z``,
            ``rot_w`` (scalar-last order, as ``Rotation.from_quat`` expects),
            aligned row-for-row with ``acc_data``.

    Returns:
        ``(n, 3)`` float array. Rows whose quaternion is unusable (any
        non-finite component, or all components ~0) keep the raw acceleration.
    """
    acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].to_numpy(dtype=float)
    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].to_numpy(dtype=float)

    # Start from the raw readings so unusable rows fall back to them untouched.
    linear_accel = acc_values.copy()
    gravity_world = np.array([0.0, 0.0, 9.81])

    # A quaternion with even one NaN/inf component cannot be normalised; treat
    # it like a missing one instead of letting NaN leak into the output.
    usable = np.isfinite(quat_values).all(axis=1) & ~np.isclose(quat_values, 0).all(axis=1)
    if usable.any():
        # One batched rotation replaces the per-row Python loop.
        rotations = R.from_quat(quat_values[usable])
        gravity_sensor_frame = rotations.apply(gravity_world, inverse=True)
        linear_accel[usable] = acc_values[usable] - gravity_sensor_frame
    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200):
    """Estimate angular velocity from consecutive orientation quaternions.

    For each row ``i`` the relative rotation ``q_i^-1 * q_{i+1}`` is converted
    to a rotation vector and divided by ``time_delta``.

    Args:
        rot_data: pandas DataFrame with columns ``rot_x``, ``rot_y``, ``rot_z``,
            ``rot_w`` (scalar-last order, as ``Rotation.from_quat`` expects).
        time_delta: Time between consecutive rows; defaults to ``1/200``.

    Returns:
        ``(n, 3)`` float array. The last row is always zero, and so is any row
        whose quaternion pair contains a non-finite or zero-norm quaternion.
    """
    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].to_numpy(dtype=float)
    angular_vel = np.zeros((len(quat_values), 3))
    if len(quat_values) < 2:
        return angular_vel

    # A quaternion is usable only if every component is finite and the norm is
    # non-zero; partially-NaN rows would otherwise produce NaN velocities.
    usable = np.isfinite(quat_values).all(axis=1) & (np.linalg.norm(quat_values, axis=1) > 0)
    pair_rows = np.flatnonzero(usable[:-1] & usable[1:])
    if pair_rows.size:
        # Batched equivalent of the per-row loop: delta_i = q_i^-1 * q_{i+1}.
        rot_t = R.from_quat(quat_values[pair_rows])
        rot_t_plus_dt = R.from_quat(quat_values[pair_rows + 1])
        delta_rot = rot_t.inv() * rot_t_plus_dt
        angular_vel[pair_rows] = delta_rot.as_rotvec() / time_delta
    return angular_vel

def calculate_angular_distance(rot_data):
    """Compute the rotation angle between consecutive orientation quaternions.

    For each row ``i`` the relative rotation ``q_i^-1 * q_{i+1}`` is converted
    to a rotation vector whose Euclidean norm (the rotation angle in radians)
    is stored at index ``i``.

    Args:
        rot_data: pandas DataFrame with columns ``rot_x``, ``rot_y``, ``rot_z``,
            ``rot_w`` (scalar-last order, as ``Rotation.from_quat`` expects).

    Returns:
        Length-``n`` float array. The last entry is always zero, and so is any
        entry whose quaternion pair contains a non-finite or zero-norm
        quaternion.
    """
    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].to_numpy(dtype=float)
    angular_dist = np.zeros(len(quat_values))
    if len(quat_values) < 2:
        return angular_dist

    # Reject quaternions with any NaN/inf component or zero norm so a partly
    # missing reading yields 0 rather than NaN.
    usable = np.isfinite(quat_values).all(axis=1) & (np.linalg.norm(quat_values, axis=1) > 0)
    pair_rows = np.flatnonzero(usable[:-1] & usable[1:])
    if pair_rows.size:
        r1 = R.from_quat(quat_values[pair_rows])
        r2 = R.from_quat(quat_values[pair_rows + 1])
        relative_rotation = r1.inv() * r2
        angular_dist[pair_rows] = np.linalg.norm(relative_rotation.as_rotvec(), axis=1)
    return angular_dist

class GatedMixupGenerator(Sequence):
    """Keras batch generator with optional non-IMU masking and mixup (scalar gate target).

    Each batch is ``(X, {'main_output': y, 'aux_gates': gate}, sample_weights)``.
    ``gate`` has shape ``(batch,)``: 1.0 for untouched samples and 0.0 for
    samples whose features at index ``>= imu_dim`` were zeroed.

    NOTE: a class with this same name is defined again further down in this
    file; when both definitions are executed the later one is bound to the name.

    Args:
        X: Array indexable as ``X[idx]`` with features on the last axis
            (indexed as ``X[b, :, f]`` here).
        y: One-hot label array aligned with ``X``.
        batch_size: Number of samples per batch.
        imu_dim: Index of the first non-IMU feature; features from here on
            are the ones that may be masked.
        class_weight: Optional mapping ``class index -> weight`` used to build
            per-sample weights.
        alpha: Mixup Beta(alpha, alpha) parameter; ``0`` disables mixup.
        masking_prob: Per-sample probability of zeroing the non-IMU features.
        shuffle: Whether to reshuffle the sample order after every epoch.
        **kwargs: Forwarded to the Keras base class (e.g. ``workers``).
    """

    def __init__(self, X, y, batch_size, imu_dim, class_weight=None, alpha=0,
                 masking_prob=0.0, shuffle=True, **kwargs):
        # The Keras 3 base class expects its constructor to run.
        super().__init__(**kwargs)
        self.X, self.y = X, y
        self.batch = batch_size
        self.imu_dim = imu_dim
        self.class_weight = class_weight
        self.alpha = alpha
        self.masking_prob = masking_prob
        self.indices = np.arange(len(X))
        self.shuffle = shuffle

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        """Build batch ``i``: apply class weights, random masking, then mixup."""
        idx = self.indices[i * self.batch:(i + 1) * self.batch]
        Xb, yb = self.X[idx].copy(), self.y[idx].copy()
        n_rows = len(Xb)

        if self.class_weight:
            labels = yb.argmax(axis=1)
            # Explicit float32 keeps the dtype the same as the unweighted path.
            sample_weights = np.array([self.class_weight[c] for c in labels], dtype='float32')
        else:
            sample_weights = np.ones(n_rows, dtype='float32')

        gate_target = np.ones(n_rows, dtype='float32')
        if self.masking_prob > 0:
            # One vectorised draw consumes the same number of uniforms as a
            # per-row loop and no longer reuses the batch index name ``i``.
            masked = np.random.rand(n_rows) < self.masking_prob
            Xb[masked, :, self.imu_dim:] = 0
            gate_target[masked] = 0.0

        if self.alpha > 0:
            # Mix every sample with a randomly permuted partner from the same
            # batch; targets, gate targets and weights are blended identically.
            lam = np.random.beta(self.alpha, self.alpha)
            perm = np.random.permutation(n_rows)
            X_mix = lam * Xb + (1 - lam) * Xb[perm]
            y_mix = lam * yb + (1 - lam) * yb[perm]
            gate_target_mix = lam * gate_target + (1 - lam) * gate_target[perm]
            sample_weights_mix = lam * sample_weights + (1 - lam) * sample_weights[perm]
            return X_mix, {'main_output': y_mix, 'aux_gates': gate_target_mix}, sample_weights_mix

        return Xb, {'main_output': yb, 'aux_gates': gate_target}, sample_weights

    def on_epoch_end(self):
        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
import numpy as np
from tensorflow.keras.utils import Sequence

class GatedMixupGenerator(Sequence):
    """Keras batch generator with optional non-IMU masking and mixup (two-column gate target).

    Each batch is ``(X, {'main_output': y, 'aux_gates': gate}, sample_weights)``.
    ``gate`` has shape ``(batch, 2)``: column 0 is 1.0 for unmasked samples and
    column 1 is 1.0 for samples whose features at index ``>= imu_dim`` were
    zeroed.

    NOTE(review): the gate heads built by the model functions in this file are
    ``Dense(1)`` layers; confirm that the loss attached to ``'aux_gates'``
    really expects this two-column target.

    Args:
        X: Array indexable as ``X[idx]`` with features on the last axis
            (indexed as ``X[b, :, f]`` here).
        y: One-hot label array aligned with ``X``.
        batch_size: Number of samples per batch.
        imu_dim: Index of the first non-IMU feature; features from here on
            are the ones that may be masked.
        class_weight: Optional mapping ``class index -> weight`` used to build
            per-sample weights.
        alpha: Mixup Beta(alpha, alpha) parameter; ``0`` disables mixup.
        masking_prob: Per-sample probability of zeroing the non-IMU features.
        shuffle: Whether to reshuffle the sample order after every epoch.
        **kwargs: Forwarded to the Keras base class (e.g. ``workers``).
    """

    def __init__(self, X, y, batch_size, imu_dim, class_weight=None, alpha=0,
                 masking_prob=0.0, shuffle=True, **kwargs):
        # The Keras 3 base class expects its constructor to run.
        super().__init__(**kwargs)
        self.X, self.y = X, y
        self.batch = batch_size
        self.imu_dim = imu_dim
        self.class_weight = class_weight
        self.alpha = alpha
        self.masking_prob = masking_prob
        self.indices = np.arange(len(X))
        self.shuffle = shuffle

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        """Build batch ``i``: apply class weights, random masking, then mixup."""
        idx = self.indices[i * self.batch:(i + 1) * self.batch]
        Xb, yb = self.X[idx].copy(), self.y[idx].copy()
        n_rows = len(Xb)

        if self.class_weight:
            labels = yb.argmax(axis=1)
            # Explicit float32 keeps the dtype the same as the unweighted path.
            sample_weights = np.array([self.class_weight[c] for c in labels], dtype='float32')
        else:
            sample_weights = np.ones(n_rows, dtype='float32')

        # Gate target is (batch, 2): column 0 flags "not masked", column 1
        # flags "masked". Every sample starts out unmasked.
        gate_target = np.zeros((n_rows, 2), dtype='float32')
        gate_target[:, 0] = 1.0

        if self.masking_prob > 0:
            # One vectorised draw consumes the same number of uniforms as a
            # per-row loop.
            masked = np.random.rand(n_rows) < self.masking_prob
            # Zero the features after the IMU block (index >= imu_dim); the
            # IMU features themselves are left untouched.
            Xb[masked, :, self.imu_dim:] = 0
            gate_target[masked, 0] = 0.0
            gate_target[masked, 1] = 1.0

        if self.alpha > 0:
            # Mix every sample with a randomly permuted partner from the same
            # batch; targets, gate targets and weights are blended identically.
            lam = np.random.beta(self.alpha, self.alpha)
            perm = np.random.permutation(n_rows)

            X_mix = lam * Xb + (1 - lam) * Xb[perm]
            y_mix = lam * yb + (1 - lam) * yb[perm]
            gate_target_mix = lam * gate_target + (1 - lam) * gate_target[perm]
            sample_weights_mix = lam * sample_weights + (1 - lam) * sample_weights[perm]

            # Inputs, a dict of targets keyed by output name, and per-sample
            # weights are returned together as one batch tuple.
            return X_mix, {'main_output': y_mix, 'aux_gates': gate_target_mix}, sample_weights_mix

        return Xb, {'main_output': yb, 'aux_gates': gate_target}, sample_weights

    def on_epoch_end(self):
        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)




def build_gated_two_branch_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build a two-branch (IMU + ToF) classifier with a scalar ToF gate.

    The single input of shape ``(pad_len, imu_dim + tof_dim)`` is sliced into
    an IMU part (first ``imu_dim`` features) and a ToF part (the rest). The
    ToF branch is scaled by a sigmoid gate computed from the time-averaged raw
    ToF features, then both branches are concatenated and fed to recurrent
    layers, an attention pooling step and a dense head.

    Relies on ``residual_se_cnn_block`` and ``attention_layer``, which are not
    defined in this part of the file.

    Args:
        pad_len: Number of time steps of the (padded) input sequences.
        imu_dim: Number of leading IMU features.
        tof_dim: Number of trailing ToF features.
        n_classes: Size of the softmax output.
        wd: L2 regularisation factor applied to the regularised kernels.

    Returns:
        A Keras ``Model`` with outputs ``[main_output, tof_gate]``.
    """
    inp = Input(shape=(pad_len, imu_dim+tof_dim))
    # Slice the shared input into the IMU and ToF feature groups.
    imu = Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch: two residual SE CNN blocks (helper defined elsewhere).
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.1, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.1, wd=wd)

    # ToF branch: two Conv1D -> LayerNorm -> ReLU -> MaxPool(2) -> Dropout stages.
    x2_base = Conv1D(64, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof)
    x2_base = LayerNormalization()(x2_base); x2_base = Activation('relu')(x2_base)
    x2_base = MaxPooling1D(2)(x2_base); x2_base = Dropout(0.2)(x2_base)
    x2_base = Conv1D(128, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(x2_base)
    x2_base = LayerNormalization()(x2_base); x2_base = Activation('relu')(x2_base)
    x2_base = MaxPooling1D(2)(x2_base); x2_base = Dropout(0.2)(x2_base)

    # Gate: time-average of the raw ToF features -> Dense(16) -> one sigmoid unit.
    gate_input = GlobalAveragePooling1D()(tof)
    gate_input = Dense(16, activation='relu')(gate_input)

    gate = Dense(1, activation='sigmoid', name='tof_gate')(gate_input)

    # Scale the ToF features by the gate value.
    x2 = Multiply()([x2_base, gate])

    merged = Concatenate()([x1, x2])
    # Three parallel views of the merged features: BiLSTM, BiGRU and a noisy dense projection.
    xa = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    xb = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    xc = GaussianNoise(0.09)(merged)
    xc = Dense(16, activation='elu')(xc)
    x = Concatenate()([xa, xb, xc])
    x = Dropout(0.4)(x)
    # Pooling over time is delegated to attention_layer (helper defined elsewhere).
    x = attention_layer(x)
    # Dense head: (units, dropout) pairs, each Dense -> BatchNorm -> ReLU -> Dropout.
    for units, drop in [(256, 0.5), (128, 0.3)]:
        x = Dense(units, use_bias=False, kernel_regularizer=l2(wd))(x)
        x = BatchNormalization()(x); x = Activation('relu')(x)
        x = Dropout(drop)(x)

    out = Dense(n_classes, activation='softmax', name='main_output', kernel_regularizer=l2(wd))(x)


    # Second output exposes the gate so it can receive its own loss.
    return Model(inputs=inp, outputs=[out, gate])

# TRANSFORMER - (LS'den 0.816LB 0.8359 CV)
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D,
    Activation, Multiply, Bidirectional, LSTM, GRU, Concatenate,
    GlobalAveragePooling1D, Lambda, RepeatVector, Reshape, add,
    LayerNormalization, MultiHeadAttention, Flatten
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import tensorflow.keras.backend as K

# -----------------
# SE Block
# -----------------
class SEBlock(tf.keras.layers.Layer):
    """Squeeze-and-Excitation block: rescales channels by a learned gate.

    The input is averaged over time, passed through a bottleneck Dense
    (``channels // reduction`` units, ReLU) and a sigmoid Dense back to the
    channel count, reshaped to ``(1, channels)`` and multiplied with the input.

    Args:
        reduction: Divisor for the bottleneck width.
    """

    def __init__(self, reduction=8, **kwargs):
        super().__init__(**kwargs)
        self.reduction = reduction

    def build(self, input_shape):
        ch = int(input_shape[-1])
        self.gap = GlobalAveragePooling1D()
        # ch // reduction is 0 when ch < reduction, which is not a valid Dense
        # width; floor it at 1. Widths for ch >= reduction are unchanged.
        self.fc1 = Dense(max(ch // self.reduction, 1), activation='relu')
        self.fc2 = Dense(ch, activation='sigmoid')
        self.reshape = Reshape((1, ch))
        self.multiply = Multiply()

    def call(self, x):
        se = self.gap(x)
        se = self.fc1(se)
        se = self.fc2(se)
        se = self.reshape(se)
        return self.multiply([x, se])

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from config.
        return {"reduction": self.reduction, **super().get_config()}

# -----------------
# Residual SE CNN Block
# -----------------
class ResidualSEBlock(tf.keras.layers.Layer):
    """Residual block: two Conv1D/BatchNorm/ReLU stages, SE, skip add, pool, dropout.

    Args:
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel size of both convolutions.
        pool_size: Window of the final max pooling.
        drop: Rate of the final dropout.
        wd: L2 regularisation factor for the convolution kernels.
    """

    def __init__(self, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.filters, self.kernel_size = filters, kernel_size
        self.pool_size = pool_size
        self.drop = drop
        self.wd = wd

    def build(self, input_shape):
        # Main path: Conv1D -> BatchNorm -> ReLU, twice, stored as a flat list.
        self.conv_layers = []
        for _ in range(2):
            stage = (
                Conv1D(self.filters, self.kernel_size, padding='same',
                       use_bias=False, kernel_regularizer=l2(self.wd)),
                BatchNormalization(),
                Activation('relu'),
            )
            for stage_layer in stage:
                self.conv_layers.append(stage_layer)

        self.se_block = SEBlock()

        # A 1x1 projection is only needed when the channel count changes.
        needs_projection = input_shape[-1] != self.filters
        self.shortcut_conv = None
        if needs_projection:
            self.shortcut_conv = Conv1D(self.filters, 1, padding='same',
                                        use_bias=False, kernel_regularizer=l2(self.wd))
            self.shortcut_bn = BatchNormalization()

        self.add_layer = add
        self.relu = Activation('relu')
        self.pool = MaxPooling1D(self.pool_size)
        self.dropout = Dropout(self.drop)

    def call(self, x):
        residual = x
        features = x
        for stage_layer in self.conv_layers:
            features = stage_layer(features)
        features = self.se_block(features)

        if self.shortcut_conv is not None:
            residual = self.shortcut_bn(self.shortcut_conv(residual))

        merged = self.relu(self.add_layer([features, residual]))
        return self.dropout(self.pool(merged))

# -----------------
# Attention Layer
# -----------------
class AttentionLayer(tf.keras.layers.Layer):
    """Attention pooling: weighted sum of the inputs over axis 1.

    A single tanh Dense unit scores every step; the scores are squeezed,
    passed through a softmax activation and used as weights for the sum.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.score_dense = Dense(1, activation='tanh')
        self.softmax = Activation('softmax')
        self.multiply = Multiply()

    def call(self, inputs):
        # One score per step, with the trailing singleton axis removed.
        step_scores = tf.squeeze(self.score_dense(inputs), axis=-1)
        # Normalise, then restore the trailing axis so it broadcasts over channels.
        step_weights = tf.expand_dims(self.softmax(step_scores), axis=-1)
        weighted = self.multiply([inputs, step_weights])
        return tf.reduce_sum(weighted, axis=1)

# -----------------
# Transformer Encoder Block
# -----------------
class TransformerEncoderBlock(tf.keras.layers.Layer):
    """Transformer encoder block with LayerNorm applied before each sub-layer.

    Args:
        head_num: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate applied after attention and after the feed-forward.
    """

    def __init__(self, head_num=4, ff_dim=256, dropout=0.2, **kwargs):
        super().__init__(**kwargs)
        self.head_num, self.ff_dim = head_num, ff_dim
        self.dropout_rate = dropout

    def build(self, input_shape):
        # Self-attention sub-layer.
        self.ln1 = LayerNormalization(epsilon=1e-6)
        self.attn = MultiHeadAttention(num_heads=self.head_num, key_dim=input_shape[-1] // self.head_num)
        self.drop1 = Dropout(self.dropout_rate)

        # Feed-forward sub-layer, projecting back to the input width.
        self.ln2 = LayerNormalization(epsilon=1e-6)
        self.ffn_dense1 = Dense(self.ff_dim, activation='relu')
        self.ffn_dense2 = Dense(input_shape[-1])
        self.drop2 = Dropout(self.dropout_rate)

    def call(self, inputs):
        normed = self.ln1(inputs)
        attended = self.drop1(self.attn(normed, normed))
        after_attn = attended + inputs

        hidden = self.ffn_dense1(self.ln2(after_attn))
        projected = self.drop2(self.ffn_dense2(hidden))
        return projected + after_attn

# -----------------
# Model Builder
# -----------------
def build_competition_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build the IMU + gated-ToF model with BiLSTM/BiGRU and two transformer blocks.

    The input of shape ``(pad_len, imu_dim + tof_dim)`` is sliced into IMU and
    ToF parts. IMU goes through two ``ResidualSEBlock`` layers; ToF goes
    through two Conv1D stages and a BiGRU and is scaled by a scalar sigmoid
    gate computed from the time-averaged raw ToF features. The merged features
    pass through parallel BiLSTM/BiGRU layers, two transformer encoder blocks,
    attention pooling and a dense head.

    Args:
        pad_len: Number of time steps of the (padded) input sequences.
        imu_dim: Number of leading IMU features.
        tof_dim: Number of trailing ToF features.
        n_classes: Size of the softmax output.
        wd: L2 regularisation factor applied to the regularised kernels.

    Returns:
        A Keras ``Model`` with outputs ``[main_output, tof_gate]``.
    """
    inp = Input(shape=(pad_len, imu_dim + tof_dim), name='input_all')

    # Slice the shared input into the IMU and ToF feature groups.
    imu = Lambda(lambda t: t[:, :, :imu_dim], name='imu_slice')(inp)
    tof = Lambda(lambda t: t[:, :, imu_dim:], name='tof_slice')(inp)

    # IMU branch: two pooled residual SE blocks.
    x_imu = ResidualSEBlock(64, 3, pool_size=2, drop=0.1, wd=wd)(imu)
    x_imu = ResidualSEBlock(128, 5, pool_size=2, drop=0.1, wd=wd)(x_imu)

    # ToF branch: two Conv1D -> BatchNorm -> ReLU -> MaxPool(2) stages, then a BiGRU.
    x_tof = Conv1D(64, 3, padding='same', kernel_regularizer=l2(wd), use_bias=False)(tof)
    x_tof = BatchNormalization()(x_tof)
    x_tof = Activation('relu')(x_tof)
    x_tof = MaxPooling1D(2)(x_tof)

    x_tof = Conv1D(128, 3, padding='same', kernel_regularizer=l2(wd), use_bias=False)(x_tof)
    x_tof = BatchNormalization()(x_tof)
    x_tof = Activation('relu')(x_tof)
    x_tof = MaxPooling1D(2)(x_tof)

    x_tof = Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=l2(wd)))(x_tof)
    x_tof = Dropout(0.2)(x_tof)

    # Scalar gate from the time-averaged raw ToF features.
    gate_input = GlobalAveragePooling1D()(tof)
    gate_dense = Dense(16, activation='relu')(gate_input)
    gate = Dense(1, activation='sigmoid', name='tof_gate')(gate_dense)

    # Repeat the gate once per remaining ToF time step so it can be multiplied in.
    gate_timesteps = K.int_shape(x_tof)[1]
    gate_expanded = RepeatVector(gate_timesteps)(gate)
    x_tof = Multiply(name='gated_tof_output')([x_tof, gate_expanded])

    merged = Concatenate(name='merged_features')([x_imu, x_tof])

    # Earlier, narrower recurrent layers (128 units) kept for reference:
    #x_lstm = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    #x_gru = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    x_lstm = Bidirectional(LSTM(160, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    x_gru = Bidirectional(GRU(160, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    x_rnn_concat = Concatenate()([x_lstm, x_gru])

    # Earlier single-block transformer configuration kept for reference:
    #x_trans = TransformerEncoderBlock(head_num=8, ff_dim=512, dropout=0.3)(x_rnn_concat)
    x_trans = TransformerEncoderBlock(head_num=12, ff_dim=768, dropout=0.3)(x_rnn_concat)
    x_trans = TransformerEncoderBlock(head_num=8, ff_dim=512, dropout=0.3)(x_trans)

    # Pool over time with attention.
    attn_out = AttentionLayer()(x_trans)

    # Dense head: (units, dropout) pairs, each Dense -> BatchNorm -> ReLU -> Dropout.
    x = attn_out
    for units, drop_rate in [(256, 0.5), (128, 0.3)]:
        x = Dense(units, kernel_regularizer=l2(wd), use_bias=False)(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(drop_rate)(x)

    out = Dense(n_classes, activation='softmax', name='main_output', kernel_regularizer=l2(wd))(x)

    # Second output exposes the gate so it can receive its own loss.
    return Model(inputs=inp, outputs=[out, gate])


# TRANSFORMER OPTIMIZE  (Lambda fix for H5) 0.8429 CV - 0.826 LB -------------- BEST TRANSFORMER
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D,
    Activation, Multiply, Bidirectional, LSTM, GRU, Concatenate,
    GlobalAveragePooling1D, Lambda, RepeatVector, Reshape, add,
    LayerNormalization, MultiHeadAttention
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# -----------------
# SE Block
# -----------------
class SEBlock(tf.keras.layers.Layer):
    """Squeeze-and-Excitation block: rescales channels by a learned gate.

    The bottleneck Dense has ``max(channels // reduction, 4)`` units.

    Args:
        reduction: Divisor for the bottleneck width.
    """

    def __init__(self, reduction=8, **kwargs):
        super().__init__(**kwargs)
        self.reduction = reduction

    def build(self, input_shape):
        ch = int(input_shape[-1])
        bottleneck_units = max(ch // self.reduction, 4)
        self.gap = GlobalAveragePooling1D()
        self.fc1 = Dense(bottleneck_units, activation='relu')
        self.fc2 = Dense(ch, activation='sigmoid')
        self.reshape = Reshape((1, ch))

    def call(self, x):
        # Squeeze over time, excite through the two Dense layers, then
        # reshape to (1, channels) so the gate broadcasts over time.
        channel_gate = self.reshape(self.fc2(self.fc1(self.gap(x))))
        return x * channel_gate

# -----------------
# Residual SE Block (with pooling)
# -----------------
class ResidualSEBlock(tf.keras.layers.Layer):
    """Residual block: two Conv1D/BatchNorm/ReLU stages, SE, skip add, pool, dropout.

    Args:
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel size of both convolutions.
        pool_size: Window of the final max pooling.
        drop: Rate of the final dropout.
        wd: L2 regularisation factor for the convolution kernels.
    """

    def __init__(self, filters, kernel_size, pool_size=2, drop=0.25, wd=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.filters, self.kernel_size = filters, kernel_size
        self.pool_size = pool_size
        self.drop = drop
        self.wd = wd

    def build(self, input_shape):
        def make_conv(kernel):
            # Bias-free, L2-regularised 'same' convolution used throughout the block.
            return Conv1D(self.filters, kernel, padding='same', use_bias=False,
                          kernel_regularizer=l2(self.wd))

        # First conv stage.
        self.conv1 = make_conv(self.kernel_size)
        self.bn1 = BatchNormalization()
        self.act1 = Activation('relu')

        # Second conv stage.
        self.conv2 = make_conv(self.kernel_size)
        self.bn2 = BatchNormalization()
        self.act2 = Activation('relu')

        self.se = SEBlock(reduction=8)

        # A 1x1 projection is only needed when the channel count changes.
        self.shortcut_conv = None
        if int(input_shape[-1]) != self.filters:
            self.shortcut_conv = make_conv(1)
            self.shortcut_bn = BatchNormalization()

        self.pool = MaxPooling1D(self.pool_size)
        self.dropout = Dropout(self.drop)
        self.add = add
        self.relu = Activation('relu')

    def call(self, x, training=False):
        residual = x
        stages = (
            (self.conv1, self.bn1, self.act1),
            (self.conv2, self.bn2, self.act2),
        )
        for conv, norm, act in stages:
            x = act(norm(conv(x), training=training))

        x = self.se(x)

        if self.shortcut_conv is not None:
            residual = self.shortcut_bn(self.shortcut_conv(residual), training=training)

        x = self.relu(self.add([x, residual]))
        return self.dropout(self.pool(x), training=training)

# -----------------
# Attention pooling (time-wise)
# -----------------
class AttentionLayer(tf.keras.layers.Layer):
    """Time-wise attention pooling: (B, T, C) -> (B, C).

    A single tanh Dense unit scores each time step; a softmax over the time
    axis turns the scores into weights for a weighted sum of the inputs.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.score_dense = Dense(1, activation='tanh')
        self.softmax = tf.keras.layers.Softmax(axis=1)

    def call(self, inputs):
        # (B, T, 1) -> (B, T): one scalar score per time step.
        step_scores = tf.squeeze(self.score_dense(inputs), axis=-1)
        # Softmax over time, then back to (B, T, 1) so it broadcasts over channels.
        step_weights = tf.expand_dims(self.softmax(step_scores), axis=-1)
        # Weighted sum over the time axis -> (B, C).
        return tf.reduce_sum(inputs * step_weights, axis=1)

# -----------------
# Transformer Encoder (pre-LN)
# -----------------
class TransformerEncoderBlock(tf.keras.layers.Layer):
    """Pre-LN transformer encoder block (self-attention followed by feed-forward).

    Args:
        head_num: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate applied after attention and after the feed-forward.
    """

    def __init__(self, head_num=8, ff_dim=512, dropout=0.2, **kwargs):
        super().__init__(**kwargs)
        self.head_num, self.ff_dim = head_num, ff_dim
        self.dropout_rate = dropout

    def build(self, input_shape):
        d_model = int(input_shape[-1])
        # Per-head width: d_model // head_num, but never below 1.
        per_head_dim = max(1, d_model // self.head_num)

        self.ln1 = LayerNormalization(epsilon=1e-6)
        self.mha = MultiHeadAttention(num_heads=self.head_num, key_dim=per_head_dim)
        self.dropout1 = Dropout(self.dropout_rate)

        self.ln2 = LayerNormalization(epsilon=1e-6)
        self.ffn_dense1 = Dense(self.ff_dim, activation='relu', kernel_regularizer=l2(1e-4))
        self.ffn_dense2 = Dense(d_model, kernel_regularizer=l2(1e-4))
        self.dropout2 = Dropout(self.dropout_rate)

    def call(self, x, training=False):
        # Attention sub-layer: normalise first, then add the residual.
        normed = self.ln1(x)
        attended = self.dropout1(self.mha(normed, normed), training=training)
        x = attended + x

        # Feed-forward sub-layer, again normalising before the transform.
        hidden = self.ffn_dense1(self.ln2(x))
        projected = self.dropout2(self.ffn_dense2(hidden), training=training)
        return x + projected

# -----------------
# Model builder (Lambda-safe)
# -----------------
def build_competition_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build the IMU + channel-gated-ToF model with BiLSTM/BiGRU and transformers.

    The input of shape ``(pad_len, imu_dim + tof_dim)`` is sliced into IMU and
    ToF parts. IMU goes through two ``ResidualSEBlock`` layers; ToF goes
    through two Conv1D stages and a BiGRU and is rescaled per channel by a
    sigmoid gate. A separate scalar sigmoid gate (``tof_gate``) is computed
    from the same pooled ToF features and returned as the second output. The
    merged features pass through parallel BiLSTM/BiGRU layers, two transformer
    encoder blocks, attention pooling, LayerNorm and a dense head.

    Args:
        pad_len: Number of time steps of the (padded) input sequences.
        imu_dim: Number of leading IMU features.
        tof_dim: Number of trailing ToF features.
        n_classes: Size of the softmax output.
        wd: L2 regularisation factor applied to the regularised kernels.

    Returns:
        A Keras ``Model`` with outputs ``[main_output, tof_gate]``.
    """
    inp = Input(shape=(pad_len, imu_dim + tof_dim), name='input_all')

    # The slicing Lambdas are given an explicit output_shape (safe for H5 loading).
    imu = Lambda(lambda t: t[:, :, :imu_dim],
                 output_shape=(pad_len, imu_dim),
                 name='imu_slice')(inp)
    tof = Lambda(lambda t: t[:, :, imu_dim:],
                 output_shape=(pad_len, tof_dim),
                 name='tof_slice')(inp)

    # IMU path
    x_imu = ResidualSEBlock(64, 3, pool_size=2, drop=0.12, wd=wd)(imu)
    x_imu = ResidualSEBlock(128, 5, pool_size=2, drop=0.12, wd=wd)(x_imu)

    # ToF path
    x_tof = Conv1D(64, 3, padding='same', kernel_regularizer=l2(wd), use_bias=False)(tof)
    x_tof = BatchNormalization()(x_tof)
    x_tof = Activation('relu')(x_tof)
    x_tof = MaxPooling1D(2)(x_tof)

    x_tof = Conv1D(128, 3, padding='same', kernel_regularizer=l2(wd), use_bias=False)(x_tof)
    x_tof = BatchNormalization()(x_tof)
    x_tof = Activation('relu')(x_tof)
    x_tof = MaxPooling1D(2)(x_tof)

    x_tof = Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=l2(wd)))(x_tof)
    x_tof = Dropout(0.22)(x_tof)

    # --- Channel-wise ToF gate (the Lambda was replaced with RepeatVector) ---
    gate_feat = GlobalAveragePooling1D()(x_tof)  # (B, C)
    gate_chan = Dense(int(x_tof.shape[-1]), activation='sigmoid', name='tof_gate_chan')(gate_feat)  # (B, C)
    gate_chan_expand = RepeatVector(1, name='tof_gate_chan_expand')(gate_chan)  # (B, 1, C) — shape is explicit
    x_tof = Multiply(name='tof_channel_gate')([x_tof, gate_chan_expand])  # (B, T, C) * (B, 1, C)

    # Scalar aux gate (unchanged); it is only returned as an output and does
    # not scale x_tof in this builder.
    gate_scalar = Dense(1, activation='sigmoid', name='tof_gate')(gate_feat)  # (B, 1)

    # Merge
    merged = Concatenate(name='merged_features')([x_imu, x_tof])

    # RNN layer
    x_lstm = Bidirectional(LSTM(160, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    x_gru  = Bidirectional(GRU(160, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    x_rnn_concat = Concatenate(name='rnn_concat')([x_lstm, x_gru])

    # Transformer stack
    x_trans = TransformerEncoderBlock(head_num=12, ff_dim=768, dropout=0.28)(x_rnn_concat)
    x_trans = TransformerEncoderBlock(head_num=8, ff_dim=512, dropout=0.28)(x_trans)

    # Temporal attention pooling
    attn_out = AttentionLayer()(x_trans)

    # Dense head: LayerNorm, then (units, dropout) pairs of Dense -> BatchNorm -> ReLU -> Dropout.
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_out)
    for units, drop_rate in [(256, 0.48), (128, 0.32)]:
        x = Dense(units, kernel_regularizer=l2(wd), use_bias=False)(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(drop_rate)(x)

    out = Dense(n_classes, activation='softmax', name='main_output', kernel_regularizer=l2(wd))(x)

    return Model(inputs=inp, outputs=[out, gate_scalar])


# cv: 0.8400
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import (Layer, Conv1D, SeparableConv1D, BatchNormalization, Activation,
                                     Add, Dropout, GlobalAveragePooling1D, Dense, Multiply, Bidirectional,
                                     GRU, LSTM, Input, Concatenate, MaxPooling1D, LayerNormalization)
from tensorflow.keras.models import Model

# ---------- Utility Blocks (Class-based, no Lambda) ----------

class SplitIMUToF(Layer):
    """Split one (B, T, imu_dim + tof_dim) tensor into imu (B, T, imu_dim) and tof (B, T, tof_dim)."""

    def __init__(self, imu_dim, tof_dim, **kwargs):
        super().__init__(**kwargs)
        self.imu_dim, self.tof_dim = int(imu_dim), int(tof_dim)

    def call(self, x):
        # IMU features come first; the next ``tof_dim`` features are ToF.
        tof_start = self.imu_dim
        tof_stop = tof_start + self.tof_dim
        return x[..., :tof_start], x[..., tof_start:tof_stop]

    def get_config(self):
        own_args = {"imu_dim": self.imu_dim, "tof_dim": self.tof_dim}
        return {**own_args, **super().get_config()}


class SE1D(Layer):
    """Squeeze-and-Excitation over channels for (B, T, C) inputs.

    The input is averaged over the time axis, passed through a bottleneck
    Dense (``max(C // reduction, 8)`` units, ReLU) and a sigmoid Dense of
    width ``C``, and the resulting per-channel gate is multiplied with the
    input.

    Args:
        reduction: Divisor for the bottleneck width.
    """

    def __init__(self, reduction=8, **kwargs):
        super().__init__(**kwargs)
        self.reduction = reduction

    def build(self, input_shape):
        c = int(input_shape[-1])
        self.d1 = Dense(max(c // self.reduction, 8), activation='relu')
        self.d2 = Dense(c, activation='sigmoid')
        super().build(input_shape)

    def call(self, x):
        # Mean over the time axis; a plain op avoids instantiating a new
        # pooling layer on every forward pass.
        se = tf.reduce_mean(x, axis=1)           # (B,C)
        se = self.d1(se)
        se = self.d2(se)                         # (B,C)
        se = tf.expand_dims(se, axis=1)          # (B,1,C)
        return x * se

    def get_config(self):
        return {"reduction": self.reduction, **super().get_config()}


class ResidualSEBlock1D(Layer):
    """Residual + (Separable)Conv + SE + optional pool + dropout.

    Main path: conv -> BatchNorm -> ReLU -> conv -> BatchNorm -> SE. The
    result is added to the (optionally 1x1-projected) input, passed through
    ReLU, max-pooled when ``pool > 1`` and finally dropped out.

    Args:
        filters: Number of output channels.
        ksize: Kernel size of both convolutions.
        wd: L2 regularisation factor for the convolution kernels.
        drop: Dropout rate applied at the end of the block.
        pool: Max-pooling window; pooling is skipped when falsy or ``<= 1``.
        depthwise: Use ``SeparableConv1D`` instead of ``Conv1D`` when true.
    """

    def __init__(self, filters, ksize=5, wd=1e-4, drop=0.2, pool=2, depthwise=False, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.ksize = ksize
        self.wd = wd
        self.drop = drop
        self.pool = pool
        self.depthwise = depthwise

        # convs created in build() because for SeparableConv we set different args
        self.c1 = None
        self.b1 = BatchNormalization()
        self.a1 = Activation('relu')

        self.c2 = None
        self.b2 = BatchNormalization()

        self.se = SE1D()
        self.shortcut_conv = None
        self.shortcut_bn = None
        self.out_act = Activation('relu')
        self.drop_layer = Dropout(drop)
        self.pool_layer = MaxPooling1D(self.pool) if self.pool and self.pool > 1 else None

    def build(self, input_shape):
        in_c = int(input_shape[-1])
        if self.depthwise:
            # SeparableConv1D: use depthwise_regularizer & pointwise_regularizer
            self.c1 = SeparableConv1D(self.filters, self.ksize, padding='same', use_bias=False,
                                      depthwise_regularizer=regularizers.l2(self.wd),
                                      pointwise_regularizer=regularizers.l2(self.wd))
            self.c2 = SeparableConv1D(self.filters, self.ksize, padding='same', use_bias=False,
                                      depthwise_regularizer=regularizers.l2(self.wd),
                                      pointwise_regularizer=regularizers.l2(self.wd))
        else:
            self.c1 = Conv1D(self.filters, self.ksize, padding='same', use_bias=False,
                             kernel_regularizer=regularizers.l2(self.wd))
            self.c2 = Conv1D(self.filters, self.ksize, padding='same', use_bias=False,
                             kernel_regularizer=regularizers.l2(self.wd))

        # Project the shortcut with a 1x1 conv only when the channel count changes.
        if in_c != self.filters:
            self.shortcut_conv = Conv1D(self.filters, 1, padding='same', use_bias=False,
                                        kernel_regularizer=regularizers.l2(self.wd))
            self.shortcut_bn = BatchNormalization()
        super().build(input_shape)

    def call(self, x, training=False):
        shortcut = x
        y = self.c1(x)
        y = self.b1(y, training=training)
        y = self.a1(y)
        y = self.c2(y)
        y = self.b2(y, training=training)
        y = self.se(y)

        if self.shortcut_conv is not None:
            shortcut = self.shortcut_conv(shortcut)
            shortcut = self.shortcut_bn(shortcut, training=training)

        # Plain tensor addition; avoids instantiating a new Add layer on
        # every forward pass.
        y = y + shortcut
        y = self.out_act(y)
        if self.pool_layer is not None:
            y = self.pool_layer(y)
        y = self.drop_layer(y, training=training)
        return y

    def get_config(self):
        return {
            "filters": self.filters, "ksize": self.ksize, "wd": self.wd, "drop": self.drop,
            "pool": self.pool, "depthwise": self.depthwise, **super().get_config()
        }


class TemporalAttention1D(Layer):
    """Global temporal attention pooling: (B,T,C) -> (B,C).

    Each time step gets a scalar score from a Dense(1, tanh) layer; the scores
    are softmax-normalised over the time axis and used as weights for a
    weighted sum of the input over time.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # One tanh-squashed score per time step.
        self.score = Dense(1, activation='tanh')
        super().build(input_shape)

    def call(self, x):
        """Collapse the time axis of `x` with learned attention weights."""
        scores = tf.squeeze(self.score(x), axis=-1)      # (B,T)
        weights = tf.nn.softmax(scores, axis=1)          # (B,T), sums to 1 over T
        weighted = x * weights[..., tf.newaxis]          # (B,T,C)
        return tf.reduce_sum(weighted, axis=1)           # (B,C)


class MHARefine(Layer):
    """Light Transformer encoder block (self-attention + feed-forward, post-norm).

    Args:
        num_heads: Number of attention heads.
        proj_dim: Working width of the block. ``None`` means "use the input
            feature size"; otherwise the input is first projected with a Dense
            layer when its feature size differs.
        dropout: Dropout rate used inside the attention layer, inside the
            feed-forward network and on both residual branches.
    """
    def __init__(self, num_heads=8, proj_dim=None, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.proj_dim = proj_dim
        self.dropout_rate = dropout
        # Shape-dependent sub-layers are filled in by build().
        self.mha = self.proj_in = self.ffn = None
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.drop1 = Dropout(dropout)
        self.drop2 = Dropout(dropout)

    def build(self, input_shape):
        in_features = int(input_shape[-1])
        # Resolve the working width; it is stored back so get_config() saves it.
        if self.proj_dim is None:
            self.proj_dim = in_features
        if in_features != self.proj_dim:
            self.proj_in = Dense(self.proj_dim)

        # Per-head key size; never below 1 even with more heads than features.
        head_dim = max(self.proj_dim // self.num_heads, 1)
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=self.num_heads,
            key_dim=head_dim,
            dropout=self.dropout_rate,
        )
        # Position-wise feed-forward network with a 4x hidden expansion.
        self.ffn = tf.keras.Sequential([
            Dense(self.proj_dim * 4, activation='relu'),
            Dropout(self.dropout_rate),
            Dense(self.proj_dim),
        ])
        super().build(input_shape)

    def call(self, x, training=False):
        """Refine a sequence with one attention + feed-forward round."""
        hidden = x if self.proj_in is None else self.proj_in(x)

        # Self-attention sub-block with residual connection and LayerNorm.
        attn_out = self.mha(hidden, hidden, training=training)
        attn_out = self.drop1(attn_out, training=training)
        hidden = self.norm1(hidden + attn_out)

        # Feed-forward sub-block with residual connection and LayerNorm.
        ffn_out = self.ffn(hidden, training=training)
        ffn_out = self.drop2(ffn_out, training=training)
        return self.norm2(hidden + ffn_out)

    def get_config(self):
        """Return the constructor arguments merged with the parent layer config."""
        config = {
            "num_heads": self.num_heads,
            "proj_dim": self.proj_dim,
            "dropout": self.dropout_rate,
        }
        config.update(super().get_config())
        return config


class ChannelGate1D(Layer):
    """Channel-wise re-weighting gate (used on the ToF path; adds no model output).

    The input is averaged over time, passed through a two-layer bottleneck that
    ends in a sigmoid, and the resulting per-channel weights rescale every time
    step of the input.

    Args:
        reduction: Divisor applied to the channel count to size the bottleneck
            (the bottleneck never has fewer than 16 units).
    """
    def __init__(self, reduction=2, **kwargs):
        super().__init__(**kwargs)
        self.reduction = reduction

    def build(self, input_shape):
        c = int(input_shape[-1])
        # Created once here instead of on every call(), so the pooling layer is
        # a tracked sub-layer rather than a throw-away object per forward pass.
        self.squeeze_pool = GlobalAveragePooling1D()
        self.d1 = Dense(max(c // self.reduction, 16), activation='relu')
        self.d2 = Dense(c, activation='sigmoid')
        super().build(input_shape)

    def call(self, x):
        """Rescale the channels of `x` (B,T,C) by learned weights in (0, 1)."""
        g = self.squeeze_pool(x)            # (B,C)
        g = self.d1(g)
        g = self.d2(g)                      # (B,C)
        g = tf.expand_dims(g, axis=1)       # (B,1,C)
        return x * g

    def get_config(self):
        """Return the constructor arguments merged with the parent layer config."""
        return {"reduction": self.reduction, **super().get_config()}


class ScalarGateBroadcast(Layer):
    """Scale a sequence by a per-sample scalar gate.

    The layer takes two arguments: (seq, gate_scalar).
    """
    def call(self, seq, gate):
        """Multiply every time step and channel of `seq` by the sample's gate.

        Args:
            seq: Sequence tensor of shape (B,T,C).
            gate: One scalar per sample, shaped (B,1) or (B,).

        Returns:
            Tensor of shape (B,T,C).
        """
        # Reshape instead of expand_dims so that both (B,1) and (B,) gates end
        # up as (B,1,1); expand_dims alone turns (B,) into (B,1), which cannot
        # be multiplied with a (B,T,C) sequence.
        g = tf.reshape(gate, (-1, 1, 1))    # (B,1,1)
        # Broadcasting spreads the gate over the time and channel axes.
        return seq * g                      # (B,T,C)


# ---------- Full Single-Input Architecture (IMU+ToF inside) ----------

def build_full_single_input_cnn_mha(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """
    Single-input FULL model:
      - IMU/ToF split happens inside the model
      - Both branches are downsampled by 4 (so they can be concatenated)
      - ToF branch has a scalar 'tof_gate' (also a model output) plus an
        internal channel-wise gate
      - CNN + BiRNN + MHA refine + TemporalAttention + Dense head

    Args:
        pad_len: Length of the padded sequences.
        imu_dim: Number of IMU features (leading columns of the input).
        tof_dim: Number of ToF/THM features (remaining columns).
        n_classes: Number of output classes.
        wd: L2 weight-decay factor used by the regularised layers.

    Returns:
        Keras Model with outputs [main_output (softmax), tof_gate (sigmoid)].
    """
    def _l2():
        # Fresh regulariser instance for each layer that asks for one.
        return regularizers.l2(wd)

    full_input = Input(shape=(pad_len, imu_dim + tof_dim), name="full_input")

    # Split the joint input into its IMU and ToF parts.
    imu_seq, tof_seq = SplitIMUToF(imu_dim=imu_dim, tof_dim=tof_dim, name="split_imutof")(full_input)

    # ----- IMU branch: two residual SE blocks (each pools by 2) + BiGRU -----
    imu_feat = ResidualSEBlock1D(128, ksize=7, wd=wd, drop=0.15, pool=2, depthwise=True,
                                 name="imu_block1")(imu_seq)
    imu_feat = ResidualSEBlock1D(192, ksize=5, wd=wd, drop=0.15, pool=2, depthwise=True,
                                 name="imu_block2")(imu_feat)  # /4
    imu_gru = GRU(128, return_sequences=True, kernel_regularizer=_l2())
    imu_feat = Bidirectional(imu_gru, name="imu_bigru")(imu_feat)
    imu_feat = Dropout(0.15)(imu_feat)

    # ----- ToF/THM branch: two residual SE blocks (each pools by 2) + BiLSTM -----
    tof_feat = ResidualSEBlock1D(128, ksize=5, wd=wd, drop=0.20, pool=2, depthwise=False,
                                 name="tof_block1")(tof_seq)
    tof_feat = ResidualSEBlock1D(192, ksize=3, wd=wd, drop=0.20, pool=2, depthwise=False,
                                 name="tof_block2")(tof_feat)  # /4
    tof_lstm = LSTM(128, return_sequences=True, kernel_regularizer=_l2())
    tof_feat = Bidirectional(tof_lstm, name="tof_bilstm")(tof_feat)
    tof_feat = Dropout(0.20)(tof_feat)

    # ---- Scalar tof_gate (exposed as the second MODEL OUTPUT) ----
    # Computed from the raw (unprocessed) ToF slice, averaged over time.
    gate_features = GlobalAveragePooling1D(name="tof_gate_gap")(tof_seq)
    gate_features = Dense(32, activation='relu', name="tof_gate_dense")(gate_features)
    tof_gate_out = Dense(1, activation='sigmoid', name="tof_gate")(gate_features)

    # ---- Internal channel-wise gate, then the scalar gate broadcast over time ----
    tof_feat = ChannelGate1D(reduction=2, name="tof_channel_gate")(tof_feat)
    tof_feat = ScalarGateBroadcast(name="tof_scalar_gate_apply")(tof_feat, tof_gate_out)

    # ---- Concatenate (time axes match: both are /4) ----
    fused = Concatenate(name="merge_imu_tof")([imu_feat, tof_feat])  # (B, T/4, C_imu+C_tof)

    # ---- Temporal encoder: parallel BiGRU and BiLSTM -> concat -> MHA refine ----
    enc_gru = GRU(160, return_sequences=True, kernel_regularizer=_l2())
    gru_path = Bidirectional(enc_gru, name="enc_bigru")(fused)
    enc_lstm = LSTM(160, return_sequences=True, kernel_regularizer=_l2())
    lstm_path = Bidirectional(enc_lstm, name="enc_bilstm")(fused)
    encoded = Concatenate(name="enc_concat")([gru_path, lstm_path])

    encoded = MHARefine(num_heads=8, proj_dim=None, dropout=0.1, name="mha_refine")(encoded)

    # ---- Global temporal attention pooling ----
    pooled = TemporalAttention1D(name="temporal_attention")(encoded)

    # ---- Dense head: (units, dropout) per stage ----
    head = LayerNormalization(epsilon=1e-6)(pooled)
    head_stages = ((384, 0.45), (192, 0.30))
    for n_units, drop_rate in head_stages:
        head = Dense(n_units, use_bias=False, kernel_regularizer=_l2())(head)
        head = BatchNormalization()(head)
        head = Activation(tf.nn.gelu)(head)
        head = Dropout(drop_rate)(head)

    main_out = Dense(n_classes, activation='softmax', name="main_output",
                     kernel_regularizer=_l2())(head)

    return Model(inputs=full_input, outputs=[main_out, tof_gate_out],
                 name="FullSingleInput_CNN_MHA_Gated")


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Conv2D, MaxPooling2D,
    Flatten, Activation, Add, MultiHeadAttention, Multiply, Concatenate,
    GlobalAveragePooling1D, Layer, LayerNormalization, TimeDistributed, Reshape
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import numpy as np

# -------------------------
# Custom layers (H5-safe)
# -------------------------
class ResidualSEConv1D(Layer):
    """
    Single-stream residual block built from Conv1D and Squeeze-and-Excitation.

    The SE block dynamically recalibrates the channel-wise feature responses.

    Args:
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel size of both convolutions.
        drop: Dropout rate applied after the residual sum.
        wd: L2 weight-decay factor for the convolution and SE Dense kernels.
    """
    def __init__(self, filters, kernel_size=3, drop=0.2, wd=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size
        self.drop = drop
        self.wd = wd

    def build(self, input_shape):
        self.conv1 = Conv1D(self.filters, self.kernel_size, padding='same', use_bias=False, kernel_regularizer=l2(self.wd))
        self.bn1 = BatchNormalization()
        self.act1 = Activation('relu')

        self.conv2 = Conv1D(self.filters, self.kernel_size, padding='same', use_bias=False, kernel_regularizer=l2(self.wd))
        self.bn2 = BatchNormalization()
        self.add = Add()

        # Squeeze-and-Excitation block
        self.se_reduce = Dense(max(4, self.filters // 8), activation='relu', kernel_regularizer=l2(self.wd))
        self.se_expand = Dense(self.filters, activation='sigmoid', kernel_regularizer=l2(self.wd))

        # Shortcut projection, only needed to match the number of filters
        self.shortcut_conv = None
        if int(input_shape[-1]) != self.filters:
            self.shortcut_conv = Conv1D(self.filters, 1, padding='same', use_bias=False, kernel_regularizer=l2(self.wd))

        self.dropout = Dropout(self.drop)

        # Weight-free helper layers. They are created once here so call() does
        # not instantiate new, untracked layers on every forward pass.
        self.se_pool = GlobalAveragePooling1D()
        self.se_scale = Multiply()
        self.out_act = Activation('relu')
        super().build(input_shape)

    def call(self, x, training=False):
        """Run conv -> BN -> ReLU -> conv -> BN -> SE, add the shortcut, drop, ReLU.

        Args:
            x: Input tensor of shape (batch, time, channels).
            training: Forwarded to the BatchNormalization and Dropout layers.

        Returns:
            Tensor of shape (batch, time, filters).
        """
        shortcut = x
        y = self.conv1(x)
        y = self.bn1(y, training=training)
        y = self.act1(y)

        y = self.conv2(y)
        y = self.bn2(y, training=training)

        # Squeeze-and-Excitation: per-channel weights from the time average
        se_path = self.se_pool(y)
        se_path = self.se_reduce(se_path)
        se_path = self.se_expand(se_path)
        se_path = tf.expand_dims(se_path, axis=1)
        y = self.se_scale([y, se_path])

        # Explicit None check rather than relying on the truthiness of a layer.
        if self.shortcut_conv is not None:
            shortcut = self.shortcut_conv(shortcut)

        y = self.add([y, shortcut])
        y = self.dropout(y, training=training)
        return self.out_act(y)

    def get_config(self):
        """Return the constructor arguments merged into the parent layer config."""
        config = super().get_config()
        config.update({
            "filters": self.filters, "kernel_size": self.kernel_size,
            "drop": self.drop, "wd": self.wd
        })
        return config

class TransformerEncoderBlock(Layer):
    """
    Transformer encoder block: multi-head self-attention followed by a
    feed-forward network, each wrapped in dropout, a residual connection and
    layer normalisation.

    Args:
        embed_dim: Width of the block output; the residual additions require
            the input's last dimension to match it.
        num_heads: Number of attention heads (key size is embed_dim // num_heads).
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate on both residual branches.
        wd: L2 weight-decay factor for the feed-forward Dense kernels.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.3, wd=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim, self.num_heads, self.ff_dim = embed_dim, num_heads, ff_dim
        self.rate, self.wd = rate, wd

        head_dim = embed_dim // num_heads
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=head_dim)
        ffn_layers = [
            Dense(ff_dim, activation="relu", kernel_regularizer=l2(wd)),
            Dense(embed_dim, kernel_regularizer=l2(wd)),
        ]
        self.ffn = tf.keras.Sequential(ffn_layers)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.drop1 = Dropout(rate)
        self.drop2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply self-attention and the feed-forward network to `inputs`."""
        # Attention sub-block
        attended = self.att(inputs, inputs, training=training)
        attended = self.drop1(attended, training=training)
        hidden = self.norm1(inputs + attended)

        # Feed-forward sub-block
        projected = self.ffn(hidden)
        projected = self.drop2(projected, training=training)
        return self.norm2(hidden + projected)

    def get_config(self):
        """Return the parent layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
            "wd": self.wd,
        }

# -------------------------
# Model Builder
# -------------------------
def build_three_headed_model_single_input(pad_len, imu_dim, tof_features_dim, tof_raw_shape, n_classes, wd=5e-4):
    """
    Build a model that splits a single input tensor into three feature groups
    (IMU, ToF features, raw ToF) and processes each group in its own branch.

    Args:
        pad_len (int): Length of the padded sequences.
        imu_dim (int): Number of IMU features.
        tof_features_dim (int): Number of ToF + THM features.
        tof_raw_shape (tuple): Shape of the raw ToF data (e.g. (5, 64)).
        n_classes (int): Number of output classes.
        wd (float): L2 weight-decay factor.

    Returns:
        tf.keras.Model: Keras model with two outputs: the class probabilities
        ('main_output') and the two sigmoid gate values ('aux_gates').
    """
    # The three feature groups arrive concatenated along the last axis.
    n_total_features = imu_dim + tof_features_dim + np.prod(tof_raw_shape)
    model_input = Input(shape=(pad_len, n_total_features), name='main_input')

    # Lambda layers carve the single input into the three branch inputs.
    imu_seq = tf.keras.layers.Lambda(
        lambda t: t[:, :, :imu_dim],
        output_shape=lambda input_shape: (input_shape[0], input_shape[1], imu_dim),
        name='imu_branch_input'
    )(model_input)

    tof_feat_seq = tf.keras.layers.Lambda(
        lambda t: t[:, :, imu_dim:imu_dim + tof_features_dim],
        output_shape=lambda input_shape: (input_shape[0], input_shape[1], tof_features_dim),
        name='tof_features_branch_input'
    )(model_input)

    tof_raw_seq = tf.keras.layers.Lambda(
        lambda t: t[:, :, imu_dim + tof_features_dim:],
        output_shape=lambda input_shape: (input_shape[0], input_shape[1], np.prod(tof_raw_shape)),
        name='tof_raw_branch_input'
    )(model_input)

    # --- IMU branch: CNN + Transformer (main signal) ---
    imu_feat = ResidualSEConv1D(64, kernel_size=3, drop=0.3, wd=wd)(imu_seq)
    imu_feat = MaxPooling1D(2)(imu_feat)
    imu_feat = ResidualSEConv1D(128, kernel_size=5, drop=0.3, wd=wd)(imu_feat)
    imu_feat = MaxPooling1D(2)(imu_feat)
    for encoder_dropout in (0.3, 0.2):
        imu_feat = TransformerEncoderBlock(
            embed_dim=128, num_heads=8, ff_dim=256, rate=encoder_dropout, wd=wd
        )(imu_feat)
    imu_vec = GlobalAveragePooling1D()(imu_feat)

    # --- ToF features branch: CNN (auxiliary signal) ---
    tof_feat = Conv1D(64, 3, padding='same', activation='relu', kernel_regularizer=l2(wd))(tof_feat_seq)
    tof_feat = BatchNormalization()(tof_feat)
    tof_feat = MaxPooling1D(2)(tof_feat)
    tof_feat = Dropout(0.2)(tof_feat)
    tof_feat = Conv1D(128, 3, padding='same', dilation_rate=2, activation='relu',
                      kernel_regularizer=l2(wd))(tof_feat)
    tof_feat = BatchNormalization()(tof_feat)
    tof_feat = Dropout(0.2)(tof_feat)
    tof_feat = MaxPooling1D(2)(tof_feat)
    tof_feat = Dropout(0.2)(tof_feat)
    tof_feat_vec = GlobalAveragePooling1D()(tof_feat)

    # --- Raw ToF branch: 1-D CNN over the flattened pixels ---
    # Per the original author's note, this branch was redesigned to address an
    # out-of-memory error: 1-D convolutions over the flattened raw ToF values
    # replace TimeDistributed(Conv2D) to reduce memory use.
    tof_raw = tf.keras.layers.Reshape((pad_len, np.prod(tof_raw_shape)))(tof_raw_seq)
    for n_filters in (32, 64):
        tof_raw = Conv1D(n_filters, kernel_size=3, activation='relu', padding='same',
                         kernel_regularizer=l2(wd))(tof_raw)
        tof_raw = BatchNormalization()(tof_raw)
        tof_raw = MaxPooling1D(pool_size=2)(tof_raw)
        tof_raw = Dropout(0.2)(tof_raw)
    tof_raw_vec = GlobalAveragePooling1D()(tof_raw)

    # --- Gated fusion of the auxiliary signals ---
    gate_features = Concatenate()([tof_feat_vec, tof_raw_vec])
    gate_hidden = Dense(64, activation='relu', kernel_regularizer=l2(wd))(gate_features)
    aux_gates = Dense(2, activation='sigmoid', name='aux_gates', kernel_regularizer=l2(wd))(gate_hidden)

    # Column 0 gates the ToF-feature vector, column 1 gates the raw-ToF vector.
    tof_feat_gate = tf.keras.layers.Reshape((1,))(aux_gates[:, 0:1])
    tof_raw_gate = tf.keras.layers.Reshape((1,))(aux_gates[:, 1:2])

    tof_feat_gated = Multiply()([tof_feat_vec, tof_feat_gate])
    tof_raw_gated = Multiply()([tof_raw_vec, tof_raw_gate])

    # --- Join the main data stream ---
    aux_vec = Concatenate()([tof_feat_gated, tof_raw_gated])
    fused_vec = Concatenate()([imu_vec, aux_vec])

    # --- Classification head: (units, dropout) per stage ---
    head = fused_vec
    for n_units, drop_rate in ((256, 0.4), (128, 0.3)):
        head = Dense(n_units, activation='relu', kernel_regularizer=l2(wd))(head)
        head = BatchNormalization()(head)
        head = Dropout(drop_rate)(head)

    class_probs = Dense(n_classes, activation='softmax', name='main_output',
                        kernel_regularizer=l2(wd))(head)

    return Model(inputs=model_input, outputs=[class_probs, aux_gates])



def build_gated_two_branch_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build a two-branch (IMU + ToF) model with a scalar gate on the ToF branch.

    The single input is split into its first `imu_dim` columns (IMU) and the
    remaining columns (ToF). The ToF branch output is multiplied by a sigmoid
    gate derived from the time-averaged raw ToF slice; the gate is also
    returned as the second model output.

    Args:
        pad_len: Length of the padded sequences.
        imu_dim: Number of IMU features.
        tof_dim: Number of ToF features.
        n_classes: Number of output classes.
        wd: L2 weight-decay factor.

    Returns:
        Keras Model with outputs [main_output (softmax), tof_gate (sigmoid)].
    """
    model_input = Input(shape=(pad_len, imu_dim+tof_dim))
    imu_seq = Lambda(lambda t: t[:, :, :imu_dim])(model_input)
    tof_seq = Lambda(lambda t: t[:, :, imu_dim:])(model_input)

    # IMU branch: two residual SE CNN blocks (helper defined elsewhere in the notebook).
    imu_feat = residual_se_cnn_block(imu_seq, 64, 3, drop=0.1, wd=wd)
    imu_feat = residual_se_cnn_block(imu_feat, 128, 5, drop=0.1, wd=wd)

    # ToF branch: two Conv -> BN -> ReLU -> MaxPool(2) -> Dropout stages.
    tof_feat = tof_seq
    for n_filters in (64, 128):
        tof_feat = Conv1D(n_filters, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof_feat)
        tof_feat = BatchNormalization()(tof_feat)
        tof_feat = Activation('relu')(tof_feat)
        tof_feat = MaxPooling1D(2)(tof_feat)
        tof_feat = Dropout(0.2)(tof_feat)

    # Scalar gate computed from the time-averaged raw ToF slice.
    gate_hidden = GlobalAveragePooling1D()(tof_seq)
    gate_hidden = Dense(16, activation='relu')(gate_hidden)
    tof_gate = Dense(1, activation='sigmoid', name='tof_gate')(gate_hidden)

    tof_gated = Multiply()([tof_feat, tof_gate])

    # Fuse both branches, then run three parallel paths over the fused sequence.
    fused = Concatenate()([imu_feat, tof_gated])
    lstm_path = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(fused)
    gru_path = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd)))(fused)
    noisy_path = GaussianNoise(0.09)(fused)
    noisy_path = Dense(16, activation='elu')(noisy_path)

    head = Concatenate()([lstm_path, gru_path, noisy_path])
    head = Dropout(0.4)(head)
    head = attention_layer(head)  # helper defined elsewhere in the notebook

    # Dense head: (units, dropout) per stage.
    for n_units, drop_rate in ((256, 0.5), (128, 0.3)):
        head = Dense(n_units, use_bias=False, kernel_regularizer=l2(wd))(head)
        head = BatchNormalization()(head)
        head = Activation('relu')(head)
        head = Dropout(drop_rate)(head)

    class_probs = Dense(n_classes, activation='softmax', name='main_output', kernel_regularizer=l2(wd))(head)

    return Model(inputs=model_input, outputs=[class_probs, tof_gate])

In [None]:
from scipy.ndimage import sobel

# Spatial-gradient (Sobel) based features for ToF
def calculate_spatial_tof_features(seq_df, sensor_id):
    """Per-frame Sobel gradient statistics for one ToF sensor.

    The 64 pixel columns ``tof_{sensor_id}_v0 .. v63`` of every row are
    reshaped to an 8x8 image. Missing readings (-1) are replaced by NaN and
    filled forward, then backward, along the sequence (remaining NaNs become
    0). For each frame the Sobel gradient magnitude is computed and summarised
    by its mean, standard deviation and maximum.

    The Sobel operator is applied over the two spatial axes only.
    ``scipy.ndimage.sobel`` on the stacked (N, 8, 8) array would also smooth
    along the frame axis, mixing each frame's gradients with those of its
    neighbours in time.

    Args:
        seq_df: DataFrame holding the rows (frames) of one sequence; must
            contain the 64 pixel columns of the requested sensor.
        sensor_id: Sensor number used to build the column names.

    Returns:
        DataFrame indexed like ``seq_df`` with the columns
        ``tof_{sensor_id}_grad_mean``, ``tof_{sensor_id}_grad_std`` and
        ``tof_{sensor_id}_grad_max``.
    """
    # Local import keeps this cell self-contained.
    from scipy.ndimage import correlate1d

    out_cols = [f'tof_{sensor_id}_grad_{stat}' for stat in ('mean', 'std', 'max')]
    n_frames = len(seq_df)
    if n_frames == 0:
        # Nothing to filter; return an empty frame with the expected schema.
        return pd.DataFrame({c: np.empty(0, dtype='float64') for c in out_cols},
                            index=seq_df.index)

    # Flat 64-pixel rows -> (N, 8, 8) image stack
    pixel_cols = [f"tof_{sensor_id}_v{p}" for p in range(64)]
    tof_data = (
        seq_df[pixel_cols]
        .replace(-1, np.nan)
        .ffill().bfill().fillna(0)
        .to_numpy(dtype='float64')
    )
    frames = tof_data.reshape(n_frames, 8, 8)

    def _sobel_2d(stack, deriv_axis, smooth_axis):
        # Sobel = central difference along one spatial axis, [1, 2, 1]
        # smoothing along the other; axis 0 (frames) is never touched.
        deriv = correlate1d(stack, [-1, 0, 1], axis=deriv_axis, mode='reflect')
        return correlate1d(deriv, [1, 2, 1], axis=smooth_axis, mode='reflect')

    grad_rows = _sobel_2d(frames, deriv_axis=1, smooth_axis=2)
    grad_cols = _sobel_2d(frames, deriv_axis=2, smooth_axis=1)
    grad_mag = np.sqrt(grad_rows ** 2 + grad_cols ** 2)

    # Summary statistics per frame
    return pd.DataFrame({
        out_cols[0]: grad_mag.mean(axis=(1, 2)),
        out_cols[1]: grad_mag.std(axis=(1, 2)),
        out_cols[2]: grad_mag.max(axis=(1, 2)),
    }, index=seq_df.index)

In [None]:
from scipy.signal import find_peaks

def count_peaks(series):
    """Count the local maxima of `series` that reach at least its mean value."""
    min_height = np.mean(series)
    peak_positions = find_peaks(series, height=min_height)[0]
    return len(peak_positions)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Training pipeline (runs only when TRAIN is truthy):
#   1. load the raw CSVs and encode the gesture labels
#   2. engineer IMU features (linear acceleration, angular velocity/distance, diffs)
#   3. build one feature matrix per sequence (incl. per-frame ToF aggregates)
#   4. fit/export a StandardScaler, scale and pad the sequences
#   5. train one model per StratifiedGroupKFold fold and collect OOF predictions
#   6. report the out-of-fold hierarchical F1 score
# RAW_DIR, EXPORT_DIR, PAD_PERCENTILE, N_SPLITS, state_num, WD, LR_INIT,
# GATE_LOSS_WEIGHT, BATCH_SIZE, MIXUP_ALPHA, MASKING_PROB, PATIENCE and EPOCHS,
# as well as the helpers remove_gravity_from_acc,
# calculate_angular_velocity_from_quat, calculate_angular_distance and
# GatedMixupGenerator, are defined outside this cell.
if TRAIN:
    print("▶ TRAIN MODE – loading dataset ...")
    df = pd.read_csv(RAW_DIR / "train.csv")
    
    # Loaded here but not referenced again in this cell.
    train_dem_df = pd.read_csv(RAW_DIR / "train_demographics.csv")

    # Encode gesture names as integers and export the class order.
    le = LabelEncoder()
    df['gesture_int'] = le.fit_transform(df['gesture'])
    np.save(EXPORT_DIR / "gesture_classes.npy", le.classes_)
    
    # Subjects whose mean acc_y over all their rows is negative.
    acc_y_neg_subjects = (
        df.groupby('subject')['acc_y']
        .mean()
        .loc[lambda x: x < 0]
        .index
        .tolist()
    )
    
    # Drop these subjects entirely
    df = df[~df['subject'].isin(acc_y_neg_subjects)].reset_index(drop=True)
    
    # --- [Important change] Advanced physical and statistical features ---
    print("  Removing gravity and calculating linear acceleration features...")
    # Computed per sequence; each result keeps the group's index so the
    # column-wise concat below aligns rows correctly.
    linear_accel_list = [pd.DataFrame(remove_gravity_from_acc(group[['acc_x', 'acc_y', 'acc_z']], group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]), columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index) for _, group in df.groupby('sequence_id')]
    df = pd.concat([df, pd.concat(linear_accel_list)], axis=1)
    
    # Linear acceleration features
    df['linear_acc_mag'] = np.sqrt(df['linear_acc_x']**2 + df['linear_acc_y']**2 + df['linear_acc_z']**2)
    # df['linear_acc_mag_jerk'] already exists, but consider a smoother derivative or higher order jerks if needed
    df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0) # Keep current for now
    
   
    
    print("  Calculating angular velocity and distance from quaternions...")
    angular_vel_list = [pd.DataFrame(calculate_angular_velocity_from_quat(group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]), columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index) for _, group in df.groupby('sequence_id')]
    df = pd.concat([df, pd.concat(angular_vel_list)], axis=1)
    angular_dist_list = [pd.DataFrame(calculate_angular_distance(group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]), columns=['angular_distance'], index=group.index) for _, group in df.groupby('sequence_id')]
    df = pd.concat([df, pd.concat(angular_dist_list)], axis=1)

  
    # - HUSEYIN GUR (GMN) 1st FE (author's note: 80>81)
    # Instantaneous statistical features for velocity and acceleration (in addition to the existing sensor readings)
    for col in ['acc_x', 'acc_y', 'acc_z',  'linear_acc_x', 'linear_acc_y', 'linear_acc_z', 'angular_vel_x', 'angular_vel_y', 'angular_vel_z']:  # 'rot_w', 'rot_x', 'rot_y', 'rot_z' are not included
        if col in df.columns:
            # First difference within each sequence; the first row of a sequence becomes 0.
            df[f'{col}_diff'] = df.groupby('sequence_id')[col].diff().fillna(0)
            df[f'{col}_abs_diff'] = np.abs(df.groupby('sequence_id')[col].diff()).fillna(0) # absolute difference
    # - HUSEYIN GUR (GMN) 1st FE (author's note: 80>81)

    # --- [Important change] Feature list reflecting the physical and new statistical FE ---
    imu_cols_base = ['acc_x', 'acc_y', 'acc_z'] + [c for c in df.columns if c.startswith('rot_')]  #+ ['handedness', 'height_cm']
    # , 'acc_x_spectral_energy','acc_y_spectral_energy','acc_z_spectral_energy', 'linear_acc_mag_spectral_energy'
    imu_engineered = [
    'linear_acc_mag', 'linear_acc_mag_jerk',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z', 'angular_distance',
    #'jerk_mad_25'
    #'damj_linear_acc_mag_jerk_mean', 'damj_linear_acc_mag_jerk_std', 'damj_linear_acc_mag_jerk_skew',
    #'damj_jerk_mad_25_mean','damj_jerk_mad_25_std'
   
    ] 
    # - HUSEYIN GUR (GMN) 1st FE (author's note: 80>81)
    # Newly added difference and absolute-difference features
    for col in ['acc_x', 'acc_y', 'acc_z', 'linear_acc_x', 'linear_acc_y', 'linear_acc_z', 'angular_vel_x', 'angular_vel_y', 'angular_vel_z']:
        if col in df.columns:
            imu_engineered.append(f'{col}_diff')
            imu_engineered.append(f'{col}_abs_diff')
    # - HUSEYIN GUR (GMN) 1st FE (author's note: 80>81)

    # De-duplicate while keeping the first-seen order.
    imu_cols = list(dict.fromkeys(imu_cols_base + imu_engineered))
    
    # HUSEYIN GUR - THM - CGPT 3rd FE (author's note: marginally unsuccessful, 81>81 but CV 83)
    #df = extract_temporal_thm_features(df)
    #df = extract_spatial_thm_features(df)
    # HUSEYIN GUR - THM - CGPT 3rd FE
    
    thm_cols_original = [c for c in df.columns if c.startswith('thm_')]
    # At this point df holds only the raw ToF pixel columns under the 'tof_'
    # prefix; the aggregated tof_* columns are added later, per sequence copy.
    tof_cols_raw = [c for c in df.columns if c.startswith('tof_')]

    # Names of the per-frame ToF aggregates that are computed in the sequence loop below.
    tof_aggregated_cols_template = []
    for i in range(1, 6): tof_aggregated_cols_template.extend([f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max'])

    # Spatial Gradient ToF - HUSEYIN GUR - cgpt 2nd FE (author's note: 81>better 81)
    for i in range(1, 6):
        tof_aggregated_cols_template.extend([
            f'tof_{i}_grad_mean', f'tof_{i}_grad_std', f'tof_{i}_grad_max'
        ])
    # Spatial Gradient ToF - HUSEYIN GUR - cgpt 2nd FE (author's note: 81>better 81)
    
    # Column order matters: IMU | aggregated ToF | THM | raw ToF. The model built
    # below is given imu_dim_final, tof_dim + thm_dim and (5, 64) as its split sizes.
    final_feature_cols = imu_cols  + tof_aggregated_cols_template + thm_cols_original + tof_cols_raw

    imu_dim_final = len(imu_cols)
    #tof_thm_aggregated_dim_final = len(thm_cols_original) + len(tof_aggregated_cols_template)
    tof_dim = len(tof_aggregated_cols_template)
    thm_dim = len(thm_cols_original)
    tof_raw_dim = len(tof_cols_raw)

    print(f"  IMU (phys-based + enhanced) {imu_dim_final} | THM + Aggregated TOF {tof_dim + thm_dim} | ToF raw {tof_raw_dim}| total {len(final_feature_cols)} features")
    np.save(EXPORT_DIR / "feature_cols.npy", np.array(final_feature_cols))
    
    print("  Building sequences...")
    seq_gp = df.groupby('sequence_id') 
    X_list_unscaled, y_list_int, groups_list, lens = [], [], [], [] 
    for seq_id, seq_df in seq_gp:
        seq_df_copy = seq_df.copy()
        for i in range(1, 6):
            # Per-frame statistics over the 64 pixels of sensor i; -1 readings are treated as missing.
            pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]; tof_data = seq_df_copy[pixel_cols].replace(-1, np.nan)
            seq_df_copy[f'tof_{i}_mean'], seq_df_copy[f'tof_{i}_std'], seq_df_copy[f'tof_{i}_min'], seq_df_copy[f'tof_{i}_max'] = tof_data.mean(axis=1), tof_data.std(axis=1), tof_data.min(axis=1), tof_data.max(axis=1)
            
            # Spatial Gradient ToF - HUSEYIN GUR - cgpt 2nd FE (author's note: 81>better 81)
            spatial_feats = calculate_spatial_tof_features(seq_df_copy, i)
            seq_df_copy = pd.concat([seq_df_copy, spatial_feats], axis=1)
            # Spatial Gradient ToF - HUSEYIN GUR - cgpt 2nd FE (author's note: 81>better 81)
        
        # Use only the final feature columns selected above
        X_list_unscaled.append(seq_df_copy[final_feature_cols].ffill().bfill().fillna(0).values.astype('float32'))
        y_list_int.append(seq_df_copy['gesture_int'].iloc[0])
        groups_list.append(seq_df_copy['subject'].iloc[0])
        lens.append(len(seq_df_copy))

    print("  Fitting StandardScaler...")
    # NOTE(review): the scaler is fit on the time steps of ALL sequences before
    # the CV split below, so each fold's validation data contributes to the
    # scaling statistics.
    all_steps_concatenated = np.concatenate(X_list_unscaled, axis=0)
    scaler = StandardScaler().fit(all_steps_concatenated)
    joblib.dump(scaler, EXPORT_DIR / "scaler.pkl")
    
    print("  Scaling and padding sequences...")
    X_scaled_list = [scaler.transform(x_seq) for x_seq in X_list_unscaled]
    # Pad/truncate (at the end) to the PAD_PERCENTILE-th percentile of the sequence lengths.
    pad_len = int(np.percentile(lens, PAD_PERCENTILE)); np.save(EXPORT_DIR / "sequence_maxlen.npy", pad_len)
    X = pad_sequences(X_scaled_list, maxlen=pad_len, padding='post', truncating='post', dtype='float32')
    #y_stratify = np.array(y_list_int)

    # --- CHANGE START --- author's note: distribute left-handed subjects evenly across folds.
    # The code uses the sign of each subject's mean acc_x as the stratification flag.
    subject_acc_x_mean_global = df.groupby('subject')['acc_x'].mean()
    subject_is_acc_x_mean_negative = (subject_acc_x_mean_global < 0).astype(str) # boolean flag as the string 'True' or 'False'
    
    # Stratification key: "<gesture_int>_<flag>" per sequence.
    y_stratify = np.array([f"{gesture_label}_{subject_is_acc_x_mean_negative.loc[sub_id]}"
                           for gesture_label, sub_id in zip(y_list_int, groups_list)])
    # --- CHANGE END ---
    
    groups, y = np.array(groups_list), to_categorical(y_list_int, num_classes=len(le.classes_))
    print("  Starting training with Stratified Group K-Fold CV...")
    # Groups are subjects, so a subject never appears in both train and validation of a fold.
    sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=state_num)
    oof_preds = np.zeros_like(y, dtype='float32')
    
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y_stratify, groups)):
        print(f"\n===== FOLD {fold+1}/{N_SPLITS} =====")
        X_tr, X_val, y_tr, y_val = X[train_idx], X[val_idx], y[train_idx], y[val_idx]
       
        # --- [Important change] Model compilation and callbacks ---
        model = build_three_headed_model_single_input(pad_len, imu_dim_final, tof_dim+ thm_dim, (5,64), len(le.classes_), wd=WD)
        
        # Adding a learning-rate scheduler (author's note: on its own 80>81)
        # This scheduler lowers the learning rate when the monitored metric stops improving.
        lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_main_output_accuracy',
            mode='max',
            factor=0.5,
            patience=8,
            cooldown=2,
            min_lr=3e-6,
            verbose=1
        )
        
        # Two losses, keyed by output layer name; the gate loss is weighted by GATE_LOSS_WEIGHT.
        model.compile(optimizer=Adam(LR_INIT),
                      loss={'main_output': tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
                             'aux_gates': tf.keras.losses.BinaryCrossentropy()
                             },
                      loss_weights={'main_output': 1.0,
                                     'aux_gates': GATE_LOSS_WEIGHT,
                                     },
                      metrics={'main_output': 'accuracy'})
        
        # 'balanced' class weights computed from this fold's training labels.
        class_weight_dict = dict(enumerate(compute_class_weight('balanced', classes=np.arange(len(le.classes_)), y=y_tr.argmax(1))))
        
        # Updated the imu_dim parameter of GatedMixupGenerator
      #  train_gen = MixupGenerator(X_tr, y_tr, batch_size=BATCH_SIZE, imu_dim=imu_dim_final, class_weight=class_weight_dict, alpha=MIXUP_ALPHA, shuffle=True)
      #  val_gen = MixupGenerator(X_val, y_val, batch_size=BATCH_SIZE, imu_dim=imu_dim_final,class_weight=None, alpha=0.0,shuffle=False) # imu_dim must be correct here as well

        # Validation generator: no class weights, alpha=0.0, masking_prob=0.0, no shuffling.
        train_gen = GatedMixupGenerator(X_tr, y_tr, batch_size=BATCH_SIZE, imu_dim=imu_dim_final, class_weight=class_weight_dict, alpha=MIXUP_ALPHA, masking_prob=MASKING_PROB, shuffle=True)
        val_gen = GatedMixupGenerator(X_val, y_val, batch_size=BATCH_SIZE, imu_dim=imu_dim_final,class_weight=None, alpha=0.0, masking_prob=0.0,shuffle=False) # imu_dim must be correct here as well

        # Use EarlyStopping together with the learning-rate scheduler
        cb = [
            EarlyStopping(patience=PATIENCE, restore_best_weights=True, verbose=1, monitor='val_main_output_accuracy', mode='max'),
            lr_scheduler
        ]
        
        model.fit(train_gen, epochs=EPOCHS, validation_data=val_gen, callbacks=cb, verbose=1)
        model.save(EXPORT_DIR / f"gesture_model_fold_{fold}.h5")
        preds_val,_ = model.predict(X_val) # split off the gate output
        oof_preds[val_idx] = preds_val

    print("\n✔ Training done.")
    
    # --- [OOF score computation] ---
    from metric import CompetitionMetric
    true_oof_int = y.argmax(1)
    pred_oof_int = oof_preds.argmax(1)
    
    # Map integer labels back to gesture names before scoring.
    h_f1_oof = CompetitionMetric().calculate_hierarchical_f1(
        pd.DataFrame({'gesture': le.classes_[true_oof_int]}),
        pd.DataFrame({'gesture': le.classes_[pred_oof_int]}))
    print(f"Overall OOF H‑F1 Score = {h_f1_oof:.4f}")

▶ TRAIN MODE – loading dataset ...
  Removing gravity and calculating linear acceleration features...
  Calculating angular velocity and distance from quaternions...
  IMU (phys-based + enhanced) 31 | THM + Aggregated TOF 40 | ToF raw 320| total 391 features
  Building sequences...
  Fitting StandardScaler...
  Scaling and padding sequences...
  Starting training with Stratified Group K-Fold CV...

===== FOLD 1/5 =====


I0000 00:00:1755624010.386805     997 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21770 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/160


I0000 00:00:1755624030.387895    1601 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-08-19 20:20:42.400459: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:497] Allocator (GPU_0_bfc) ran out of memory trying to allocate 317.50MiB (rounded to 332922880)requested by op StatefulPartitionedCall/gradient_tape/functional_2_1/time_distributed_1_2/strided_slice_74/StridedSliceGrad
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-08-19 20:20:42.400528: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1053] BFCAllocator dump for GPU_0_bfc
2025-08-19 20:20:42.400539: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1060] Bin (256): 	Total Chunks: 241, Chunks in use: 223. 60.2KiB allocated for chunks. 55.8KiB in use in bin. 9.5KiB client-requested in use in bin.
2025-08-19 20:20:42.400545: I external/local_xla/xla/