## Preparación de datos para modelos mTAND con series irregulares

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Load the simulated heart-rate CSV from an absolute path on the author's machine.
# NOTE(review): no `parse_dates` is passed, so 'timestamp' is read as text;
# code doing datetime arithmetic on it needs `pd.to_datetime` first — confirm
# whether a later cell converts it.
df = pd.read_csv('/home/gmartinez/Tesis/Datasets/Synthetic data/simulated_hr_data.csv')
# Bare expression: shows the frame as the notebook cell output.
df

Unnamed: 0,minute,state,heart_rate,timestamp,user_id,day
0,0,heart_rate_sensor_failure,,2024-01-19 00:00:00,1,2024-01-19
1,1,heart_rate_sensor_failure,,2024-01-19 00:01:00,1,2024-01-19
2,2,heart_rate_sensor_failure,,2024-01-19 00:02:00,1,2024-01-19
3,3,heart_rate_sensor_failure,,2024-01-19 00:03:00,1,2024-01-19
4,4,heart_rate_sensor_failure,,2024-01-19 00:04:00,1,2024-01-19
...,...,...,...,...,...,...
698395,1435,heart_rate_high_activity,147.443179,2024-12-15 23:55:00,30,2024-12-15
698396,1436,heart_rate_high_activity,140.558608,2024-12-15 23:56:00,30,2024-12-15
698397,1437,heart_rate_exercise_recovery,94.523685,2024-12-15 23:57:00,30,2024-12-15
698398,1438,heart_rate_exercise_recovery,99.619873,2024-12-15 23:58:00,30,2024-12-15


## Tokenización temporal irregular

In [5]:
def sequences_from_df(df, by='day', time_unit='m'):
    """
    Split the DataFrame into one sequence per (user_id, by) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'timestamp', 'heart_rate',
        'state' and the column named by `by`. 'timestamp' may already be
        datetime64 or anything `pd.to_datetime` can parse (e.g. the strings
        produced by a plain `pd.read_csv`). The caller's frame is not
        modified.
    by : str
        Column that, together with 'user_id', defines a sequence.
    time_unit : {'s', 'm'}
        Unit of the relative time axis `t`: seconds or minutes.

    Returns
    -------
    list of dict
        One dict per group, in sorted group order, with keys:
        'user_id', 'key' (group value as str), 't' (float32 ndarray, time
        elapsed since the first sample of the group), 'x' (float32 ndarray,
        heart rate with NaN replaced by 0), 'm' (float32 ndarray, 1 where the
        heart rate was observed and 0 where it was missing), 'state' (array
        of str) and 'timestamp' (the group's sorted timestamps).

    Raises
    ------
    KeyError
        If `time_unit` is not 's' or 'm'.
    """
    # Divisor that converts elapsed seconds into the requested unit.
    seconds_per_unit = {'s': 1.0, 'm': 60.0}[time_unit]

    # Timestamp subtraction below needs real datetimes; parse once up front
    # on a copy so the caller's DataFrame keeps its original dtype.
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

    seqs = []
    for (uid, key), g in df.groupby(['user_id', by], sort=True):
        # Stable sort keeps the input order of rows that share a timestamp.
        g = g.sort_values('timestamp', kind='mergesort')
        ts = g['timestamp']
        elapsed = (ts - ts.iloc[0]).dt.total_seconds().to_numpy()
        x = g['heart_rate'].astype(float).to_numpy()
        # `x` is already an ndarray, so the mask is computed with NumPy only.
        observed = ~np.isnan(x)
        seqs.append({
            'user_id': uid,
            'key': str(key),
            't': (elapsed / seconds_per_unit).astype(np.float32),
            'x': np.nan_to_num(x, nan=0.0).astype(np.float32),
            'm': observed.astype(np.float32),
            'state': g['state'].astype(str).to_numpy(),
            'timestamp': ts.to_numpy(),
        })
    return seqs

In [None]:
def sequences_from_df(df, by='day', time_unit='m'):
    """
    Split the DataFrame into one sequence per (user_id, by) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'timestamp', 'heart_rate',
        'state' and the column named by `by`. 'timestamp' may already be
        datetime64 or anything `pd.to_datetime` can parse (e.g. the strings
        produced by a plain `pd.read_csv`). The caller's frame is not
        modified.
    by : str
        Column that, together with 'user_id', defines a sequence.
    time_unit : {'s', 'm'}
        Unit of the relative time axis `t`: seconds or minutes.

    Returns
    -------
    list of dict
        One dict per group, in sorted group order, with keys:
        'user_id', 'key' (group value as str), 't' (float32 ndarray, time
        elapsed since the first sample of the group), 'x' (float32 ndarray,
        heart rate with NaN replaced by 0), 'm' (float32 ndarray, 1 where the
        heart rate was observed and 0 where it was missing), 'state' (array
        of str) and 'timestamp' (the group's sorted timestamps).

    Raises
    ------
    KeyError
        If `time_unit` is not 's' or 'm'.
    """
    # Divisor that converts elapsed seconds into the requested unit.
    seconds_per_unit = {'s': 1.0, 'm': 60.0}[time_unit]

    # Timestamp subtraction below needs real datetimes; parse once up front
    # on a copy so the caller's DataFrame keeps its original dtype.
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

    seqs = []
    for (uid, key), g in df.groupby(['user_id', by], sort=True):
        # Stable sort keeps the input order of rows that share a timestamp.
        g = g.sort_values('timestamp', kind='mergesort')
        ts = g['timestamp']
        elapsed = (ts - ts.iloc[0]).dt.total_seconds().to_numpy()
        x = g['heart_rate'].astype(float).to_numpy()
        # `x` is already an ndarray, so the mask is computed with NumPy only.
        observed = ~np.isnan(x)
        seqs.append({
            'user_id': uid,
            'key': str(key),
            't': (elapsed / seconds_per_unit).astype(np.float32),
            'x': np.nan_to_num(x, nan=0.0).astype(np.float32),
            'm': observed.astype(np.float32),
            'state': g['state'].astype(str).to_numpy(),
            'timestamp': ts.to_numpy(),
        })
    return seqs

In [None]:
def pad_batch(seqs, pad_to=None):
    """
    Stack a list of variable-length sequences into right-padded arrays.

    Parameters
    ----------
    seqs : list of dict
        Each dict must provide the array-like fields 't', 'x', 'm', 'dt'
        and 'state_idx', plus a scalar 'user_id'. The length of 't' is taken
        as the sequence length.
    pad_to : int, optional
        Padded length T_max. Defaults to the longest sequence in the batch.

    Returns
    -------
    dict of numpy.ndarray
        't', 'x', 'm', 'dt' : [B, T_max], zero-padded, dtype of the first
        sequence's field.
        'state_idx' : [B, T_max] int64, zero-padded.
        'lengths' : [B] int32, unpadded length of each sequence.
        'attn_mask' : [B, T_max] float32, 1 on valid steps and 0 on padding.
        'user_id' : [B] int32.

    Raises
    ------
    ValueError
        If `seqs` is empty, or if `pad_to` is smaller than the longest
        sequence (padding never truncates data).
    KeyError
        If a sequence lacks one of the required fields.
    """
    if not seqs:
        raise ValueError("pad_batch requires at least one sequence")

    B = len(seqs)
    lengths = np.array([len(s['t']) for s in seqs], dtype=np.int32)
    longest = int(lengths.max())
    T_max = longest if pad_to is None else pad_to
    if T_max < longest:
        raise ValueError(
            f"pad_to={pad_to} is shorter than the longest sequence ({longest})"
        )

    def pad_field(key, fill_value, dtype=None):
        # np.asarray lets pandas Series or plain lists be used as fields too.
        arrays = [np.asarray(s[key]) for s in seqs]
        if dtype is not None:
            arrays = [a.astype(dtype) for a in arrays]
        out = np.full((B, T_max), fill_value, dtype=arrays[0].dtype)
        for row, a in zip(out, arrays):
            row[:len(a)] = a
        return out

    # Attention mask: position j of row i is valid iff j < lengths[i].
    attn_mask = (np.arange(T_max)[None, :] < lengths[:, None]).astype(np.float32)

    return {
        't': pad_field('t', 0.0),
        'x': pad_field('x', 0.0),
        'm': pad_field('m', 0.0),
        'dt': pad_field('dt', 0.0),
        'state_idx': pad_field('state_idx', 0, dtype=np.int64),
        'lengths': lengths,
        'attn_mask': attn_mask,
        'user_id': np.array([s['user_id'] for s in seqs], dtype=np.int32),
    }