In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Root folder with the dataset exports (change it to match your environment).
DATA_DIR = Path('/home/gmartinez/Tesis/Datasets/Synthetic-data/outputs')

# Input tables, all resolved relative to DATA_DIR.
HR_CSV = DATA_DIR.joinpath("hr_series.csv")
SLEEP_CSV = DATA_DIR.joinpath("sleep_series.csv")
LABELS_CSV = DATA_DIR.joinpath("nightly_labeled.csv")
PROFILES_CSV = DATA_DIR.joinpath("user_profiles.csv")

In [3]:
# Load the three raw tables; the date-like columns are parsed while reading.
hr_data = pd.read_csv(HR_CSV, parse_dates=['timestamp'])
sleep_data = pd.read_csv(SLEEP_CSV, parse_dates=['night_date'])
labels = pd.read_csv(LABELS_CSV, parse_dates=['night_date'])

# Normalise types: user_id is handled as a string in every table.
hr_data['user_id'] = hr_data['user_id'].astype(str)
sleep_data['user_id'] = sleep_data['user_id'].astype(str)
labels['user_id'] = labels['user_id'].astype(str)

# Calendar day of every HR reading (timestamp floored to the day).
hr_data['date'] = hr_data['timestamp'].dt.floor('D')

# Daily HR series per user: number of readings and median HR for each day.
# The median is optional, kept in case it is preferred over the count later on.
hr_daily = hr_data.groupby(['user_id', 'date'], as_index=False).agg(
    hr_count=pd.NamedAgg(column='hr', aggfunc='count'),
    hr_med=pd.NamedAgg(column='hr', aggfunc='median'),
)

# The baseline uses hr_count as the main daily series.

In [4]:
# Preview the first rows of the labels table to check its column layout.
labels.head()

Unnamed: 0,user_id,night_date,total_hours,deep_minutes,rem_minutes,awake_minutes,sleep_latency_minutes,awakenings,rhr_median,hrv_proxy_median,...,score_S_rem,score_S_awake,score_S_latency,score_C_rhr,score_C_hrv,score_C_resp,recovery_score_0_1,recovery_threshold,recovery_label_binary,failure_minutes
0,1,2024-01-01,5.766667,128,8,95,2,42,,,...,0.0,0.0,1.0,,,,0.310667,0.349333,Deficient Recovery,63
1,1,2024-01-02,6.333333,140,10,118,6,32,,,...,0.0,0.0,0.975,,,,0.330633,0.349333,Deficient Recovery,29
2,1,2024-01-03,6.233333,146,16,77,1,34,,,...,0.0,0.0,1.0,,,,0.329333,0.349333,Deficient Recovery,46
3,1,2024-01-04,7.266667,176,16,82,1,34,,,...,0.0,0.0,1.0,,,,0.370667,0.349333,Adequate Recovery,56
4,1,2024-01-05,7.416667,134,12,128,2,45,,,...,0.0,0.0,1.0,,,,0.376667,0.349333,Adequate Recovery,49


In [5]:
# Map the textual binary label to 0/1 (adjust the column name to your real file).
label_map = {
    'Adequate Recovery': 1,
    'Deficient Recovery': 0
}
# Leave missing / unknown labels as NaN at this point: casting to int before
# dropping them would raise on the first NaN.
labels['target'] = labels['recovery_label_binary'].map(label_map)

# Master index of labelled nights: rows lacking user, date or a known label are dropped,
# and only then is the target cast to an integer.
nights = labels[['user_id','night_date','target']].dropna().copy()
nights['target'] = nights['target'].astype(int)
nights['night_date'] = pd.to_datetime(nights['night_date'])

In [6]:
from datetime import timedelta

def extract_hr_window(hr_daily_user, night_date, past_days=7, include_night=True, feature='hr_count'):
    """
    Return the daily HR vector for the window that ends at ``night_date``.

    Parameters
    ----------
    hr_daily_user : DataFrame
        Daily series of one user; must contain a 'date' column and ``feature``.
    night_date : datetime-like
        Target night. The window starts ``past_days`` days before it.
    past_days : int
        Number of days before ``night_date`` to include.
    include_night : bool
        When True the window also contains ``night_date`` itself; otherwise it
        stops the day before.
    feature : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``past_days + (1 if include_night else 0)``,
        oldest day first. Days without data are filled with 0.0 (baseline
        choice; alternatives are forward/back-fill or a median).
    """
    start_date = night_date - timedelta(days=past_days)
    end_date = night_date if include_night else (night_date - timedelta(days=1))
    idx_days = pd.date_range(start=start_date, end=end_date, freq='D')

    # One row per day at most: duplicated dates would add rows to the left
    # merge and shift every later value out of its slot.
    daily = hr_daily_user[['date', feature]].drop_duplicates(subset='date', keep='last')

    # The left merge on a unique key returns exactly one row per calendar day,
    # so the result already has the expected length and needs no resizing.
    series = (
        pd.DataFrame({'date': idx_days})
        .merge(daily, on='date', how='left')
        [feature]
        .fillna(0.0)
        .to_numpy(dtype=float)
    )
    return series

# Per-night sleep columns used as the sleep channels of every sample.
SLEEP_FEATURES = [
    'total_hours',
    'deep_minutes',
    'rem_minutes',
    'awake_minutes',
    'sleep_latency_minutes',
    'awakenings',
    'failure_minutes',
]

def extract_sleep_features_for_night(sleep_user, night_date, features=SLEEP_FEATURES):
    """
    Return the ``features`` of the row whose 'night_date' equals ``night_date``.

    The first matching row is used. When there is no match the result is a
    float vector of NaN with one entry per requested feature.
    """
    is_target_night = sleep_user['night_date'] == night_date
    matches = sleep_user.loc[is_target_night, features]
    if matches.empty:
        return np.full(len(features), np.nan, dtype=float)
    first_match = matches.iloc[0]
    return first_match.astype(float).to_numpy()

def standardize_fit(X, eps=1e-8):
    """
    Compute NaN-aware per-column mean and standard deviation (population, ddof=0).

    Parameters
    ----------
    X : array-like
        Data whose statistics are taken along axis 0; NaN entries are ignored.
    eps : float
        Standard deviations below this value are replaced by 1.0 so that a
        later division leaves constant columns unchanged in scale.

    Returns
    -------
    (mean, std)
        Columns without a single observed value get mean 0.0 and std 1.0
        instead of NaN, so they cannot spread NaN through the transform.
    """
    X = np.asarray(X, dtype=float)
    # Count observed entries per column; the floor of 1 avoids 0/0 on empty columns.
    n_obs = np.maximum((~np.isnan(X)).sum(axis=0), 1)
    mean = np.nansum(X, axis=0) / n_obs
    var = np.nansum((X - mean) ** 2, axis=0) / n_obs
    std = np.sqrt(var)
    std = np.where(std < eps, 1.0, std)
    return mean, std

def standardize_transform(X, mean, std):
    """Z-score ``X`` with the given statistics: subtract ``mean``, divide by ``std``."""
    centered = X - mean
    return centered / std

In [7]:
def build_samples(hr_daily, sleep_data, nights, past_days=7, use_hr_feature='hr_count'):
    """
    Build one (features, timestamps, mask) sample per labelled night.

    Every sample has ``L1 + 1`` time steps and ``C = 1 + len(SLEEP_FEATURES)``
    channels, where ``L1`` is the length of the HR window:

    * steps ``0 .. L1-1`` carry the daily HR value in channel 0, with
      timestamps spread uniformly over [0, 1];
    * the last step (timestamp 1.0) carries the sleep features of the target
      night in channels ``1 .. C-1``.

    Positions that hold no value are 0 in the features and 0 in the mask.

    Parameters
    ----------
    hr_daily : DataFrame with 'user_id', 'date' and ``use_hr_feature``.
    sleep_data : DataFrame with 'user_id', 'night_date' and SLEEP_FEATURES.
    nights : DataFrame with 'user_id', 'night_date' and 'target'.
    past_days : int, days of HR history before the night (the night is included).
    use_hr_feature : str, column of ``hr_daily`` used as the HR channel.

    Returns
    -------
    (X_list, T_list, M_list, y, meta_list)
        Lists of per-sample arrays, the integer target vector and a list of
        ``(user_id, night_date)`` tuples in the same order.

    NOTE(review): the sleep features belong to the same night as the target;
    if the label is derived from those columns this is target leakage, which
    would explain near-perfect scores. Confirm against the labelling code.
    """
    # Pre-index by user so that each night only touches its own user's rows.
    hr_by_user = {u: df.sort_values('date') for u, df in hr_daily.groupby('user_id')}
    sleep_by_user = {u: df.sort_values('night_date') for u, df in sleep_data.groupby('user_id')}

    # Fallback frames for users without any HR / sleep rows (built once, not per night).
    empty_hr = pd.DataFrame(columns=['date', use_hr_feature])
    empty_sleep = pd.DataFrame(columns=['night_date'] + SLEEP_FEATURES)

    n_channels = 1 + len(SLEEP_FEATURES)

    X_list, T_list, M_list = [], [], []
    y_list, meta_list = [], []

    for u, night, target in zip(nights['user_id'], nights['night_date'], nights['target']):
        hr_user = hr_by_user.get(u, empty_hr)
        sleep_user = sleep_by_user.get(u, empty_sleep)

        # HR channel: window of past_days days plus the night itself.
        hr_vec = extract_hr_window(hr_user, night_date=night, past_days=past_days,
                                   include_night=True, feature=use_hr_feature)
        n_hr = len(hr_vec)
        hr_observed = ~np.isnan(hr_vec)

        # Sleep channels: a single step, placed at the end of the window.
        s_vec = extract_sleep_features_for_night(sleep_user, night, features=SLEEP_FEATURES)
        sleep_observed = ~np.isnan(s_vec)

        # Timeline: uniform HR timestamps in [0, 1] followed by the sleep step at 1.0.
        t_seq = np.concatenate([np.linspace(0.0, 1.0, n_hr), np.array([1.0])], axis=0)

        X = np.zeros((n_hr + 1, n_channels), dtype=float)
        M = np.zeros((n_hr + 1, n_channels), dtype=float)

        # HR goes to channel 0 of the first n_hr steps.
        X[:n_hr, 0] = np.nan_to_num(hr_vec, nan=0.0)
        M[:n_hr, 0] = hr_observed

        # Sleep goes to channels 1..C-1 of the last step.
        X[n_hr, 1:] = np.nan_to_num(s_vec, nan=0.0)
        M[n_hr, 1:] = sleep_observed

        X_list.append(X)
        T_list.append(t_seq)
        M_list.append(M)
        y_list.append(target)
        meta_list.append((u, night))

    return X_list, T_list, M_list, np.array(y_list, dtype=int), meta_list

# Build the per-night samples (7 days of HR history, hr_count as the HR channel).
X_list, T_list, M_list, y, meta = build_samples(hr_daily, sleep_data, nights, past_days=7, use_hr_feature='hr_count')

In [8]:
from sklearn.model_selection import GroupShuffleSplit

# Split by user to avoid leakage: GroupShuffleSplit keeps all nights of a user on one side.
users = np.array([u for u, _ in meta])
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
(train_idx, test_idx) = next(gss.split(np.zeros(len(users)), groups=users))

def stack_fixed_length(X_list, T_list, M_list):
    """
    Stack per-sample arrays of identical length into batch arrays.

    Parameters
    ----------
    X_list, M_list : lists of arrays, one per sample, all of the same shape.
    T_list : list of timestamp vectors, one per sample, all of the same length.

    Returns
    -------
    (X, T, M)
        The inputs stacked along a new leading batch axis.

    Raises
    ------
    ValueError
        If the lists are empty, have different lengths, or (raised by
        ``np.stack``) contain arrays of different shapes.
    """
    if len(X_list) == 0:
        raise ValueError("stack_fixed_length needs at least one sample")
    if not (len(X_list) == len(T_list) == len(M_list)):
        raise ValueError(
            f"X_list, T_list and M_list must have the same length, "
            f"got {len(X_list)}, {len(T_list)} and {len(M_list)}"
        )
    X = np.stack(X_list, axis=0)
    T = np.stack(T_list, axis=0)
    M = np.stack(M_list, axis=0)
    return X, T, M

X_all, T_all, M_all = stack_fixed_length(X_list, T_list, M_list)

# Mean/std are estimated on the training rows only.
X_train = X_all[train_idx]
M_train = M_all[train_idx]

# Per-channel statistics over observed entries only (M == 1).
obs_mask = M_train.astype(bool)
count_x = np.clip(obs_mask.sum(axis=(0, 1)), 1, None)  # floor of 1 avoids dividing by zero
sum_x = np.sum(X_train * obs_mask, axis=(0, 1))
mean_x = sum_x / count_x

# Squared deviations, zeroed wherever the entry is not observed.
sum_sq = np.square((X_train - mean_x) * obs_mask)
var_x = np.sum(sum_sq, axis=(0, 1)) / count_x
std_x = np.sqrt(np.maximum(var_x, 1e-8))

def apply_standardize(X, M, mean_x, std_x):
    """
    Z-score ``X`` per channel and keep unobserved positions neutral.

    Parameters
    ----------
    X : array of features; the last axis must broadcast against the statistics.
    M : observation mask with the shape of ``X`` (values > 0 mean observed).
    mean_x, std_x : per-channel statistics.

    Returns
    -------
    Array with ``(X - mean_x) / std_x`` where ``M > 0`` and 0.0 elsewhere.
    Without the mask, padded zeros would turn into ``-mean_x / std_x``.
    """
    Xz = (X - mean_x) / std_x
    return np.where(np.asarray(M) > 0, Xz, 0.0)

# Standardise every sample with the train-only statistics, then slice both splits.
X_std = apply_standardize(X_all, M_all, mean_x, std_x)

X_train, T_train, M_train, y_train = X_std[train_idx], T_all[train_idx], M_all[train_idx], y[train_idx]
X_test,  T_test,  M_test,  y_test  = X_std[test_idx],  T_all[test_idx],  M_all[test_idx],  y[test_idx]

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TimeAwareAttention(nn.Module):
    """
    Multi-head self-attention whose scores are penalised by temporal distance.

    The score between steps i and j is the scaled dot product minus
    ``|time_decay| * |t_i - t_j|``, where ``time_decay`` is a learnable scalar.
    Key steps without any observed channel are excluded from the softmax.
    """

    def __init__(self, d_in, d_model, n_heads=4, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_model = d_model
        self.dk = d_model // n_heads
        self.Wq = nn.Linear(d_in, d_model)
        self.Wk = nn.Linear(d_in, d_model)
        self.Wv = nn.Linear(d_in, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.time_decay = nn.Parameter(torch.tensor(1.0))  # temporal decay factor
        self.dropout = nn.Dropout(dropout)

    def forward(self, X, T, M):
        """
        X: (B, L, C_in) input features
        T: (B, L) timestamps
        M: (B, L, C) observation mask; a step counts as observed when at least
           one of its channels is observed

        Returns the attended sequence (B, L, d_model) and the attention
        weights (B, n_heads, L, L).
        """
        B, L, _ = X.shape

        # Project to the attention space.
        Q = self.Wq(X)  # (B, L, d_model)
        K = self.Wk(X)
        V = self.Wv(X)

        def split_heads(Z):
            return Z.view(B, L, self.n_heads, self.dk).transpose(1, 2)  # (B, h, L, dk)
        Qh, Kh, Vh = split_heads(Q), split_heads(K), split_heads(V)

        # Scaled dot-product similarity: (B, h, L, L)
        scores = torch.matmul(Qh, Kh.transpose(-2, -1)) / np.sqrt(self.dk)

        # Temporal penalty, dist[i, j] = |t_i - t_j|: larger distance, less attention.
        Ti = T.unsqueeze(1).unsqueeze(-1)  # (B,1,L,1)
        Tj = T.unsqueeze(1).unsqueeze(-2)  # (B,1,1,L)
        dist = torch.abs(Ti - Tj)          # (B,1,L,L)
        scores = scores - self.time_decay.abs() * dist

        # Mask key steps without observations. The fill value is the most
        # negative finite number rather than -inf: masked keys still receive
        # zero weight, but a sample whose steps are ALL masked yields uniform
        # weights instead of NaN (softmax over a row of -inf is undefined).
        time_mask = (M.sum(dim=-1) > 0).unsqueeze(1).unsqueeze(2)  # (B,1,1,L_K)
        scores = scores.masked_fill(~time_mask, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        Z = torch.matmul(attn, Vh)  # (B, h, L, dk)
        Z = Z.transpose(1, 2).contiguous().view(B, L, self.d_model)
        return self.out(Z), attn

class MTANClassifier(nn.Module):
    """
    Stack of time-aware attention blocks followed by a binary classification head.

    Each block applies attention, a residual LayerNorm, a two-layer GELU
    feed-forward and a second residual LayerNorm. The sequence is then averaged
    over the observed steps and mapped to a single logit per sample.
    """

    def __init__(self, c_in, d_model=128, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(c_in, d_model)

        def make_block():
            # Order matters: it fixes both the sub-module indices and the init order.
            attention = TimeAwareAttention(d_model, d_model, n_heads=n_heads, dropout=dropout)
            norm_attn = nn.LayerNorm(d_model)
            feed_forward = nn.Sequential(
                nn.Linear(d_model, d_model*2),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(d_model*2, d_model)
            )
            norm_ff = nn.LayerNorm(d_model)
            return nn.ModuleList([attention, norm_attn, feed_forward, norm_ff])

        self.layers = nn.ModuleList([make_block() for _ in range(n_layers)])

        head_layers = [
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        ]
        self.cls = nn.Sequential(*head_layers)

    def forward(self, X, T, M):
        """X: (B,L,C), T: (B,L), M: (B,L,C) -> logits of shape (B,)."""
        _batch, _steps, _channels = X.shape  # fails early on inputs that are not 3-D
        hidden = self.input_proj(X)  # (B,L,d_model)
        for block in self.layers:
            attention, norm_attn, feed_forward, norm_ff = block
            attended, _ = attention(hidden, T, M)
            hidden = norm_attn(hidden + attended)
            hidden = norm_ff(hidden + feed_forward(hidden))

        # Masked temporal mean: only steps with at least one observed channel count.
        observed = (M.sum(dim=-1) > 0).float()                            # (B,L)
        n_observed = observed.sum(dim=1, keepdim=True).clamp_min(1.0)
        pooled = (hidden * observed.unsqueeze(-1)).sum(dim=1) / n_observed  # (B,d_model)
        return self.cls(pooled).squeeze(-1)                               # (B,)

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class TimeDataset(Dataset):
    """In-memory dataset that serves (X, T, M, y) tuples as float32 tensors."""

    def __init__(self, X, T, M, y):
        def as_float32(values):
            return torch.tensor(values, dtype=torch.float32)

        self.X = as_float32(X)
        self.T = as_float32(T)
        self.M = as_float32(M)
        self.y = as_float32(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.T[idx], self.M[idx], self.y[idx])
        return sample

# Wrap both splits as tensor datasets.
train_ds = TimeDataset(X_train, T_train, M_train, y_train)
test_ds  = TimeDataset(X_test,  T_test,  M_test,  y_test)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, drop_last=False)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False, drop_last=False)

In [11]:
from sklearn.metrics import roc_auc_score, average_precision_score
import math

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The number of input channels is read from the last axis of the training features.
model = MTANClassifier(c_in=X_train.shape[2], d_model=128, n_heads=4, n_layers=2, dropout=0.2).to(device)

# BCE-with-logits loss; alternative: focal loss if the classes are strongly imbalanced
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
# T_max=20 equals the EPOCHS value set in the training loop; the scheduler is stepped once per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)

def evaluate(model, loader):
    """
    Score ``model`` on ``loader`` and return ``(auroc, auprc)``.

    The model is switched to eval mode and run without gradients; logits are
    turned into probabilities with a sigmoid. Both metrics are NaN when the
    targets contain a single class.
    """
    model.eval()
    target_chunks, prob_chunks = [], []
    with torch.no_grad():
        for Xb, Tb, Mb, yb in loader:
            Xb = Xb.to(device)
            Tb = Tb.to(device)
            Mb = Mb.to(device)
            yb = yb.to(device)
            probs = torch.sigmoid(model(Xb, Tb, Mb))
            target_chunks.append(yb.detach().cpu().numpy())
            prob_chunks.append(probs.detach().cpu().numpy())

    y_true = np.concatenate(target_chunks)
    y_prob = np.concatenate(prob_chunks)

    # Ranking metrics are undefined with a single class present.
    if len(np.unique(y_true)) <= 1:
        return np.nan, np.nan
    return roc_auc_score(y_true, y_prob), average_precision_score(y_true, y_prob)

# Best test AUROC seen so far and a CPU copy of the weights that produced it.
# NOTE(review): the checkpoint is selected with the same test_loader whose scores are
# printed, so the reported AUROC/AUPRC are optimistic; a separate validation split
# would avoid that.
best = {'auroc': -1, 'state': None}
EPOCHS = 20

for epoch in range(1, EPOCHS+1):
    model.train()
    for Xb, Tb, Mb, yb in train_loader:
        Xb, Tb, Mb = Xb.to(device), Tb.to(device), Mb.to(device)
        yb = yb.to(device)
        logit = model(Xb, Tb, Mb)
        loss = criterion(logit, yb)
        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    # One scheduler step per epoch.
    scheduler.step()

    auroc, auprc = evaluate(model, test_loader)
    # A NaN AUROC never compares greater, so it never replaces the stored state.
    if auroc > best['auroc']:
        best['auroc'] = auroc
        best['state'] = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    print(f"Epoch {epoch:02d} | Test AUROC {auroc:.4f} | AUPRC {auprc:.4f}")

# Restore the best state (optional)
if best['state'] is not None:
    model.load_state_dict(best['state'])

Epoch 01 | Test AUROC 0.9970 | AUPRC 0.9952
Epoch 02 | Test AUROC 0.9987 | AUPRC 0.9984
Epoch 03 | Test AUROC 0.9993 | AUPRC 0.9987
Epoch 04 | Test AUROC 0.9946 | AUPRC 0.9958
Epoch 05 | Test AUROC 0.9972 | AUPRC 0.9977
Epoch 06 | Test AUROC 0.9875 | AUPRC 0.9901
Epoch 07 | Test AUROC 0.9981 | AUPRC 0.9979
Epoch 08 | Test AUROC 0.9991 | AUPRC 0.9985
Epoch 09 | Test AUROC 0.9996 | AUPRC 0.9994
Epoch 10 | Test AUROC 0.9995 | AUPRC 0.9993
Epoch 11 | Test AUROC 0.9997 | AUPRC 0.9995
Epoch 12 | Test AUROC 0.9997 | AUPRC 0.9996
Epoch 13 | Test AUROC 0.9998 | AUPRC 0.9997
Epoch 14 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 15 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 16 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 17 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 18 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 19 | Test AUROC 0.9999 | AUPRC 0.9999
Epoch 20 | Test AUROC 0.9999 | AUPRC 0.9999


**Adding Profile information**

In [12]:
# =========================================
# Paths y carga base
# =========================================
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score, average_precision_score

# Paths: dataset root and the input tables resolved relative to it.
DATA_DIR = Path('/home/gmartinez/Tesis/Datasets/Synthetic-data/outputs')
HR_CSV = DATA_DIR.joinpath("hr_series.csv")
SLEEP_CSV = DATA_DIR.joinpath("sleep_series.csv")
LABELS_CSV = DATA_DIR.joinpath("nightly_labeled.csv")
PROFILES_CSV = DATA_DIR.joinpath("user_profiles.csv")

# Seed and device
SEED = 42
def set_seed(seed=SEED):
    """
    Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    set cuDNN to deterministic mode with benchmarking disabled.
    """
    import random
    # Same seed for every generator, applied in a fixed order.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# =========================================
# Carga de datos de series y etiquetas
# =========================================
# Raw tables; the date-like columns are parsed while reading.
hr_data = pd.read_csv(HR_CSV, parse_dates=['timestamp'])
sleep_data = pd.read_csv(SLEEP_CSV, parse_dates=['night_date'])
labels = pd.read_csv(LABELS_CSV, parse_dates=['night_date'])

# user_id is handled as a string in every table.
hr_data['user_id'] = hr_data['user_id'].astype(str)
sleep_data['user_id'] = sleep_data['user_id'].astype(str)
labels['user_id'] = labels['user_id'].astype(str)

# Daily HR series per user: reading count and (optional) median for each calendar day.
hr_data['date'] = hr_data['timestamp'].dt.floor('D')
hr_daily = hr_data.groupby(['user_id', 'date'], as_index=False).agg(
    hr_count=pd.NamedAgg(column='hr', aggfunc='count'),
    hr_med=pd.NamedAgg(column='hr', aggfunc='median'),
)

# Binarised labels
label_map = {'Adequate Recovery': 1, 'Deficient Recovery': 0}
# Adjust the column name to your real file ('recovery_label_binary' here).
# Missing / unknown labels stay NaN at this point: casting to int before dropping
# them would raise on the first NaN.
labels['target'] = labels['recovery_label_binary'].map(label_map)
# Drop incomplete rows first, then cast the target to an integer.
nights = labels[['user_id','night_date','target']].dropna().copy()
nights['target'] = nights['target'].astype(int)
nights['night_date'] = pd.to_datetime(nights['night_date'])

# =========================================
# Extracción de ventanas HR y features de Sleep
# =========================================
from datetime import timedelta

# Per-night sleep columns used as the sleep channels of every sample.
SLEEP_FEATURES = ['total_hours','deep_minutes','rem_minutes','awake_minutes',
                  'sleep_latency_minutes','awakenings','failure_minutes']

def extract_hr_window(hr_daily_user, night_date, past_days=7, include_night=True, feature='hr_count'):
    """
    Return the daily HR vector for the window that ends at ``night_date``.

    Parameters
    ----------
    hr_daily_user : DataFrame
        Daily series of one user; must contain a 'date' column and ``feature``.
    night_date : datetime-like
        Target night. The window starts ``past_days`` days before it.
    past_days : int
        Number of days before ``night_date`` to include.
    include_night : bool
        When True the window also contains ``night_date`` itself; otherwise it
        stops the day before.
    feature : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``past_days + (1 if include_night else 0)``,
        oldest day first. Days without data stay NaN: nothing is imputed here,
        so that the observation mask built by the caller reflects the gaps.
    """
    start_date = night_date - timedelta(days=past_days)
    end_date = night_date if include_night else (night_date - timedelta(days=1))
    idx_days = pd.date_range(start=start_date, end=end_date, freq='D')

    # One row per day at most: duplicated dates would add rows to the left
    # merge and shift every later value out of its slot.
    daily = hr_daily_user[['date', feature]].drop_duplicates(subset='date', keep='last')

    # The left merge on a unique key returns exactly one row per calendar day,
    # so the result already has the expected length and needs no resizing.
    ser = (
        pd.DataFrame({'date': idx_days})
        .merge(daily, on='date', how='left')
        [feature]
        .to_numpy(dtype=float)
    )
    return ser

def extract_sleep_features_for_night(sleep_user, night_date, features=SLEEP_FEATURES):
    """
    Return the ``features`` of the row whose 'night_date' equals ``night_date``.

    The first matching row is used. When there is no match the result is a
    float vector of NaN with one entry per requested feature.
    """
    is_target_night = sleep_user['night_date'] == night_date
    matches = sleep_user.loc[is_target_night, features]
    if matches.empty:
        return np.full(len(features), np.nan, dtype=float)
    first_match = matches.iloc[0]
    return first_match.astype(float).to_numpy()

def build_samples(hr_daily, sleep_data, nights, past_days=7, use_hr_feature='hr_count'):
    """
    Build one (features, timestamps, mask) sample per labelled night.

    Every sample has ``L1 + 1`` time steps and ``C = 1 + len(SLEEP_FEATURES)``
    channels, where ``L1`` is the length of the HR window:

    * steps ``0 .. L1-1`` carry the daily HR value in channel 0, with
      timestamps spread uniformly over [0, 1];
    * the last step (timestamp 1.0) carries the sleep features of the target
      night in channels ``1 .. C-1``.

    Missing HR / sleep values are written to the features as NaN with mask 0;
    they are meant to be neutralised later, when the z-score is applied.
    Positions that never hold a value are 0 with mask 0.

    Returns
    -------
    (X_list, T_list, M_list, y, meta_list)
        Lists of per-sample arrays, the integer target vector and a list of
        ``(user_id, night_date)`` tuples in the same order.

    NOTE(review): the sleep features belong to the same night as the target;
    if the label is derived from those columns this is target leakage.
    Confirm against the labelling code.
    """
    hr_by_user = {u: df.sort_values('date') for u, df in hr_daily.groupby('user_id')}
    sleep_by_user = {u: df.sort_values('night_date') for u, df in sleep_data.groupby('user_id')}

    # Fallback frames for users without any HR / sleep rows (built once, not per night).
    empty_hr = pd.DataFrame(columns=['date', use_hr_feature])
    empty_sleep = pd.DataFrame(columns=['night_date'] + SLEEP_FEATURES)

    n_channels = 1 + len(SLEEP_FEATURES)

    X_list, T_list, M_list, y_list, meta_list = [], [], [], [], []

    for u, night, target in zip(nights['user_id'], nights['night_date'], nights['target']):
        hr_user = hr_by_user.get(u, empty_hr)
        sleep_user = sleep_by_user.get(u, empty_sleep)

        # HR in channel 0: window of past_days days plus the night itself.
        hr_vec = extract_hr_window(hr_user, night_date=night, past_days=past_days,
                                   include_night=True, feature=use_hr_feature)
        n_hr = len(hr_vec)
        hr_observed = ~np.isnan(hr_vec)

        # Sleep at the last instant of the window.
        s_vec = extract_sleep_features_for_night(sleep_user, night, features=SLEEP_FEATURES)
        sleep_observed = ~np.isnan(s_vec)

        # Timeline: uniform HR timestamps in [0, 1] followed by the sleep step at 1.0.
        t_seq = np.concatenate([np.linspace(0.0, 1.0, n_hr), np.array([1.0])], axis=0)

        X = np.zeros((n_hr + 1, n_channels), dtype=float)
        M = np.zeros((n_hr + 1, n_channels), dtype=float)

        # NaN is kept in X on purpose; imputation happens after the z-score.
        X[:n_hr, 0] = hr_vec
        M[:n_hr, 0] = hr_observed

        X[n_hr, 1:] = s_vec
        M[n_hr, 1:] = sleep_observed

        X_list.append(X)
        T_list.append(t_seq)
        M_list.append(M)
        y_list.append(target)
        meta_list.append((u, night))

    return X_list, T_list, M_list, np.array(y_list, dtype=int), meta_list

# Build the per-night samples (7 days of HR history, hr_count as the HR channel).
X_list, T_list, M_list, y, meta = build_samples(hr_daily, sleep_data, nights, past_days=7, use_hr_feature='hr_count')

# =========================================
# Apilado y estandarización por canal (solo train)
# =========================================
def stack_fixed_length(X_list, T_list, M_list):
    """
    Stack per-sample arrays of identical length into batch arrays.

    Parameters
    ----------
    X_list, M_list : lists of arrays, one per sample, all of the same shape.
    T_list : list of timestamp vectors, one per sample, all of the same length.

    Returns
    -------
    (X, T, M)
        The inputs stacked along a new leading batch axis.

    Raises
    ------
    ValueError
        If the lists are empty, have different lengths, or (raised by
        ``np.stack``) contain arrays of different shapes.
    """
    if len(X_list) == 0:
        raise ValueError("stack_fixed_length needs at least one sample")
    if not (len(X_list) == len(T_list) == len(M_list)):
        raise ValueError(
            f"X_list, T_list and M_list must have the same length, "
            f"got {len(X_list)}, {len(T_list)} and {len(M_list)}"
        )
    X = np.stack(X_list, axis=0)
    T = np.stack(T_list, axis=0)
    M = np.stack(M_list, axis=0)
    return X, T, M

X_all, T_all, M_all = stack_fixed_length(X_list, T_list, M_list)

# Split by user: GroupShuffleSplit keeps all nights of a user on one side.
users = np.array([u for u, _ in meta]).astype(str)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
train_idx, test_idx = next(gss.split(np.zeros(len(users)), groups=users))

# Per-channel statistics using only the observed entries of the training rows.
obs_mask = M_all[train_idx].astype(bool)
# Unobserved entries are replaced by NaN so that nansum skips them.
sum_x = np.nansum(np.where(obs_mask, X_all[train_idx], np.nan), axis=(0,1))
# Floor of 1 avoids dividing by zero for a channel that is never observed.
count_x = obs_mask.sum(axis=(0,1)).clip(min=1)
mean_x = sum_x / count_x

# Unobserved entries are set to the mean, so their squared deviation is zero.
sum_sq = np.nansum(((np.where(obs_mask, X_all[train_idx], mean_x)) - mean_x)**2, axis=(0,1))
var_x = sum_sq / count_x
# The variance is floored at 1e-8 before the square root.
std_x = np.sqrt(np.maximum(var_x, 1e-8))

def apply_standardize(X, M, mean_x, std_x):
    """
    Z-score the observed entries of ``X`` and set every other entry to 0.

    Entries with ``M > 0`` become ``(X - mean_x) / std_x``. Entries with
    ``M <= 0`` become 0.0 (neutral after the z-score), whatever ``X`` holds
    there, NaN included; the backbone relies on ``M`` to ignore those points.
    """
    observed = M > 0
    z_all = (X - mean_x) / std_x
    return np.where(observed, z_all, 0.0)

# Standardise every sample with the train-only statistics, then slice both splits.
X_std = apply_standardize(X_all, M_all, mean_x, std_x)

X_train, T_train, M_train, y_train = X_std[train_idx], T_all[train_idx], M_all[train_idx], y[train_idx]
X_test,  T_test,  M_test,  y_test  = X_std[test_idx],  T_all[test_idx],  M_all[test_idx],  y[test_idx]

# =========================================
# Modelo mTAN (Fase 1) y Dataset
# =========================================
class TimeAwareAttention(nn.Module):
    """
    Multi-head self-attention whose scores are penalised by temporal distance.

    The score between steps i and j is the scaled dot product minus
    ``|time_decay| * |t_i - t_j|``, where ``time_decay`` is a learnable scalar.
    Key steps without any observed channel are excluded from the softmax.
    """

    def __init__(self, d_in, d_model, n_heads=4, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_model = d_model
        self.dk = d_model // n_heads
        self.Wq = nn.Linear(d_in, d_model)
        self.Wk = nn.Linear(d_in, d_model)
        self.Wv = nn.Linear(d_in, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.time_decay = nn.Parameter(torch.tensor(1.0))
        self.dropout = nn.Dropout(dropout)

    def forward(self, X, T, M):
        """
        X: (B, L, C_in) features, T: (B, L) timestamps, M: (B, L, C) mask.

        Returns the attended sequence (B, L, d_model) and the attention
        weights (B, n_heads, L, L).
        """
        B, L, _ = X.shape
        Q = self.Wq(X)
        K = self.Wk(X)
        V = self.Wv(X)

        def split(Z):
            return Z.view(B, L, self.n_heads, self.dk).transpose(1, 2)  # (B, h, L, dk)
        Qh, Kh, Vh = split(Q), split(K), split(V)

        scores = torch.matmul(Qh, Kh.transpose(-2, -1)) / np.sqrt(self.dk)

        # Temporal penalty, dist[i, j] = |t_i - t_j|, broadcast over heads.
        Ti = T.unsqueeze(1).unsqueeze(-1)  # (B,1,L,1)
        Tj = T.unsqueeze(1).unsqueeze(-2)  # (B,1,1,L)
        dist = torch.abs(Ti - Tj)
        scores = scores - self.time_decay.abs() * dist

        # Mask key steps without observations. The fill value is the most
        # negative finite number rather than -inf: masked keys still receive
        # zero weight, but a sample whose steps are ALL masked (HR missing for
        # the whole window and no sleep row) yields uniform weights instead of
        # NaN, which would otherwise poison the loss and the gradients.
        time_mask = (M.sum(dim=-1) > 0).unsqueeze(1).unsqueeze(2)  # (B,1,1,L)
        scores = scores.masked_fill(~time_mask, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        Z = torch.matmul(attn, Vh)
        Z = Z.transpose(1, 2).contiguous().view(B, L, self.d_model)
        return self.out(Z), attn

class MTANBackbone(nn.Module):
    """
    Stack of time-aware attention blocks that encodes a sequence into one vector.

    Each block applies attention, a residual LayerNorm, a two-layer GELU
    feed-forward and a second residual LayerNorm. The output is the mean of
    the final states over the steps that have at least one observed channel.
    """

    def __init__(self, c_in, d_model=128, n_heads=4, n_layers=2, dropout=0.2):
        super().__init__()
        self.input_proj = nn.Linear(c_in, d_model)

        def make_block():
            # Order matters: it fixes both the sub-module indices and the init order.
            attention = TimeAwareAttention(d_model, d_model, n_heads=n_heads, dropout=dropout)
            norm_attn = nn.LayerNorm(d_model)
            feed_forward = nn.Sequential(
                nn.Linear(d_model, d_model*2),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(d_model*2, d_model)
            )
            norm_ff = nn.LayerNorm(d_model)
            return nn.ModuleList([attention, norm_attn, feed_forward, norm_ff])

        self.layers = nn.ModuleList([make_block() for _ in range(n_layers)])

    def forward(self, X, T, M):
        """Encode features ``X``, timestamps ``T`` and mask ``M`` into (B, d_model)."""
        hidden = self.input_proj(X)
        for block in self.layers:
            attention, norm_attn, feed_forward, norm_ff = block
            attended, _ = attention(hidden, T, M)
            hidden = norm_attn(hidden + attended)
            hidden = norm_ff(hidden + feed_forward(hidden))

        # Masked temporal mean; the clamp avoids 0/0 when no step is observed.
        observed = (M.sum(dim=-1) > 0).float()
        n_observed = observed.sum(dim=1, keepdim=True).clamp_min(1.0)
        pooled = (hidden * observed.unsqueeze(-1)).sum(dim=1) / n_observed
        return pooled  # (B, d_model)

class TSOnlyHead(nn.Module):
    """Classification head for the time-series embedding alone: one logit per sample."""

    def __init__(self, d_model=128, dropout=0.2):
        super().__init__()
        stages = [
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        ]
        self.cls = nn.Sequential(*stages)

    def forward(self, h):
        logits = self.cls(h)
        # Drop the trailing singleton dimension left by the last linear layer.
        return logits.squeeze(-1)

class ProfileMLP(nn.Module):
    """Two-layer GELU MLP that embeds a profile vector into ``d_hidden`` dimensions."""

    def __init__(self, p_dim, d_hidden=64, dropout=0.1):
        super().__init__()
        stages = [
            nn.Linear(p_dim, d_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_hidden, d_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, p):
        embedded = self.net(p)
        return embedded

class TSPlusProfileHead(nn.Module):
    """Joint head: concatenates the time-series and profile embeddings into one logit."""

    def __init__(self, d_model=128, p_hidden=64, d_joint=128, dropout=0.2):
        super().__init__()
        stages = [
            nn.Linear(d_model + p_hidden, d_joint),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_joint, 1),
        ]
        self.joint = nn.Sequential(*stages)

    def forward(self, h_mtan, h_prof):
        # Time-series embedding first, profile embedding second, along the feature axis.
        fused = torch.cat([h_mtan, h_prof], dim=-1)
        logits = self.joint(fused)
        return logits.squeeze(-1)

class MTANWithHeads(nn.Module):
    """
    Shared backbone with two heads.

    ``forward_phase1`` classifies from the time series alone.
    ``forward_phase2`` also embeds a profile vector and classifies from both;
    its modules exist only when ``p_dim`` is given at construction.
    """

    def __init__(self, c_in, d_model=128, n_heads=4, n_layers=2, p_dim=None, prof_hidden=64, dropout=0.2):
        super().__init__()
        self.backbone = MTANBackbone(c_in=c_in, d_model=d_model, n_heads=n_heads, n_layers=n_layers, dropout=dropout)
        self.ts_head = TSOnlyHead(d_model=d_model, dropout=dropout)
        self.use_profile = p_dim is not None
        if not self.use_profile:
            return
        # Profile branch (phase 2 only).
        self.prof_mlp = ProfileMLP(p_dim=p_dim, d_hidden=prof_hidden, dropout=0.1)
        self.ts_p_head = TSPlusProfileHead(d_model=d_model, p_hidden=prof_hidden, d_joint=d_model, dropout=dropout)

    def forward_phase1(self, X, T, M):
        """Time-series-only logits."""
        return self.ts_head(self.backbone(X, T, M))

    def forward_phase2(self, X, T, M, p):
        """Logits from the time-series embedding fused with the profile embedding."""
        ts_embedding = self.backbone(X, T, M)
        profile_embedding = self.prof_mlp(p)
        return self.ts_p_head(ts_embedding, profile_embedding)

class TimeDataset(Dataset):
    """In-memory dataset that serves (X, T, M, y) tuples as float32 tensors."""

    def __init__(self, X, T, M, y):
        def as_float32(values):
            return torch.tensor(values, dtype=torch.float32)

        self.X = as_float32(X)
        self.T = as_float32(T)
        self.M = as_float32(M)
        self.y = as_float32(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.T[idx], self.M[idx], self.y[idx])
        return sample

# Phase-1 loaders: only the training loader shuffles; the test loader keeps a fixed order.
train_loader_p1 = DataLoader(TimeDataset(X_train, T_train, M_train, y_train), batch_size=128, shuffle=True)
test_loader_p1  = DataLoader(TimeDataset(X_test,  T_test,  M_test,  y_test),  batch_size=256, shuffle=False)

# =========================================
# Entrenamiento y evaluación Fase 1
# =========================================
def evaluate_phase1(model, loader):
    """
    Score the time-series-only head of ``model`` on ``loader``.

    Returns ``(auroc, auprc)``; both are NaN when the targets contain a
    single class. The model is put in eval mode and run without gradients.
    """
    model.eval()
    target_chunks, prob_chunks = [], []
    with torch.no_grad():
        for Xb, Tb, Mb, yb in loader:
            Xb = Xb.to(device)
            Tb = Tb.to(device)
            Mb = Mb.to(device)
            yb = yb.to(device)
            probs = torch.sigmoid(model.forward_phase1(Xb, Tb, Mb))
            target_chunks.append(yb.detach().cpu().numpy())
            prob_chunks.append(probs.detach().cpu().numpy())

    y_true = np.concatenate(target_chunks)
    y_prob = np.concatenate(prob_chunks)

    # Ranking metrics are undefined with a single class present.
    if len(np.unique(y_true)) <= 1:
        return np.nan, np.nan
    return roc_auc_score(y_true, y_prob), average_precision_score(y_true, y_prob)

EPOCHS = 20
# Re-seed right before building the model so that its initialisation is repeatable.
set_seed(SEED)
# p_dim=None: the profile branch is not created for phase 1.
model_p1 = MTANWithHeads(c_in=X_train.shape[2], d_model=128, n_heads=4, n_layers=2, p_dim=None, prof_hidden=64, dropout=0.2).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model_p1.parameters(), lr=2e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
# Best test AUROC seen so far and a CPU copy of the weights that produced it.
# NOTE(review): the checkpoint is selected with the same test_loader_p1 whose scores are
# reported, so auroc_p1/auprc_p1 are optimistic; a separate validation split would avoid that.
best = {'auroc': -1, 'state': None}

for epoch in range(1, EPOCHS+1):
    model_p1.train()
    for Xb, Tb, Mb, yb in train_loader_p1:
        Xb, Tb, Mb, yb = Xb.to(device), Tb.to(device), Mb.to(device), yb.to(device)
        logit = model_p1.forward_phase1(Xb, Tb, Mb)
        loss = criterion(logit, yb)
        optimizer.zero_grad(); loss.backward()
        # Clip the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model_p1.parameters(), 1.0)
        optimizer.step()
    # One scheduler step per epoch.
    scheduler.step()
    auroc, auprc = evaluate_phase1(model_p1, test_loader_p1)
    # A NaN AUROC never compares greater, so it never replaces the stored state.
    if auroc > best['auroc']:
        best['auroc'] = auroc
        best['state'] = {k: v.detach().cpu().clone() for k, v in model_p1.state_dict().items()}
    print(f"[Phase 1] Epoch {epoch:02d} | Test AUROC {auroc:.4f} | AUPRC {auprc:.4f}")

# Reload the best checkpoint (if any) and score it once more.
if best['state'] is not None:
    model_p1.load_state_dict(best['state'])
auroc_p1, auprc_p1 = evaluate_phase1(model_p1, test_loader_p1)

# =========================================
# Perfiles: preparación desde PROFILES_CSV
# =========================================
# CSV with columns: user_id, age_group, gender, physical_activity_level, smoking_status,
# alcohol_consumption, diabetes, hypertension, age

profiles_df = pd.read_csv(PROFILES_CSV)
# Same string user_id convention as the other tables.
profiles_df['user_id'] = profiles_df['user_id'].astype(str)

# Users present in each split
train_users = sorted(set(users[train_idx]))
test_users  = sorted(set(users[test_idx]))

# Categorical and numeric profile columns, according to the CSV layout above
CAT_COLS = ['age_group','gender','physical_activity_level','smoking_status','alcohol_consumption','diabetes','hypertension']
NUM_COLS = ['age']

def fit_profile_artifacts(df_train, cat_cols, num_cols):
    """
    Fit the encoders used to turn profile rows into feature vectors.

    Parameters
    ----------
    df_train : DataFrame with the training profiles.
    cat_cols : names of the categorical columns.
    num_cols : names of the numeric columns.

    Returns
    -------
    (cat_maps, num_stats)
        ``cat_maps[c]`` is the sorted list of observed values of column ``c``
        (as strings) with a final "<UNK>" entry. ``num_stats[c]`` is a dict
        with the "mean" and "std" of column ``c``.

    Missing categorical values are not registered as a category of their own:
    ``one_hot_series`` sends missing values to "<UNK>", so a literal
    'nan'/'None' category would only ever be an all-zero column.
    """
    # Category maps with <UNK>
    cat_maps = {}
    for c in cat_cols:
        observed = df_train[c].dropna().astype(str)
        cats = sorted(observed.unique().tolist())
        cat_maps[c] = cats + (["<UNK>"] if "<UNK>" not in cats else [])

    # Numeric statistics
    num_stats = {}
    for c in num_cols:
        col = pd.to_numeric(df_train[c], errors='coerce')
        has_values = col.notna().any()
        mu = float(col.mean(skipna=True)) if has_values else 0.0
        sd = float(col.std(skipna=True)) if has_values else 1.0
        # The std of a single value is NaN and NaN passes a plain "< 1e-8"
        # test, so the finiteness check is needed to keep z-scores finite.
        if not np.isfinite(sd) or sd < 1e-8:
            sd = 1.0
        num_stats[c] = {"mean": mu, "std": sd}
    return cat_maps, num_stats

def one_hot_series(series, categories):
    """One-hot encode ``series`` against a fixed, ordered list of categories.

    Args:
        series: pandas Series of raw values; missing values are treated as "<UNK>".
        categories: ordered category labels (strings); column ``i`` of the result
            corresponds to ``categories[i]``.

    Returns:
        A float32 array of shape ``(len(series), len(categories))``. Values that are
        not in ``categories`` light up the "<UNK>" column when there is one; when
        ``categories`` has no "<UNK>" entry they yield an all-zero row.
    """
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}
    unk = cat_to_idx.get("<UNK>")
    labels = series.fillna("<UNK>").astype(str)
    # Fill a zero matrix directly instead of indexing an identity matrix: this
    # avoids a K x K allocation and lets an unmapped value (no "<UNK>" slot)
    # stay as a zero row rather than failing on a None index.
    mat = np.zeros((len(labels), len(categories)), dtype=np.float32)
    for row, label in enumerate(labels):
        col = cat_to_idx.get(label, unk)
        if col is not None:
            mat[row, col] = 1.0
    return mat

def zscore_series(series, mean, std):
    """Standardize ``series`` as ``(x - mean) / std`` and return a float32 array.

    Entries that cannot be parsed as numbers, as well as missing entries, are
    replaced by ``mean`` before scaling, so they come out as 0.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    filled = numeric.fillna(mean)
    values = filled.to_numpy(dtype=np.float32)
    centered = values - mean
    return (centered / std).astype(np.float32)

# Restrict the profile table to the users of each split
df_prof_train = profiles_df.loc[lambda frame: frame['user_id'].isin(train_users)].copy()
df_prof_test = profiles_df.loc[lambda frame: frame['user_id'].isin(test_users)].copy()

# Vocabularies and scaling statistics are fitted on the training users only
cat_maps, num_stats = fit_profile_artifacts(
    df_train=df_prof_train,
    cat_cols=CAT_COLS,
    num_cols=NUM_COLS,
)

def build_profile_matrix(df, user_col='user_id'):
    """Encode profile rows into a dense float32 feature matrix.

    Columns are laid out as: one z-scored column per entry of NUM_COLS, then the
    one-hot columns of each entry of CAT_COLS (in cat_maps order), then a final
    "profile_present" column filled with ones. Encoding uses the module-level
    ``num_stats`` and ``cat_maps``.

    Args:
        df: DataFrame containing the NUM_COLS and CAT_COLS columns.
        user_col: accepted for interface compatibility; not used by the encoding.

    Returns:
        ``(P, feature_names)`` where ``P`` has one row per row of ``df`` and
        ``feature_names`` lists the column names of ``P`` in order.
    """
    numeric_blocks = [
        zscore_series(df[col], num_stats[col]['mean'], num_stats[col]['std']).reshape(-1, 1)
        for col in NUM_COLS
    ]
    numeric_names = [f"{col}_z" for col in NUM_COLS]

    categorical_blocks = [one_hot_series(df[col], cat_maps[col]) for col in CAT_COLS]
    categorical_names = [f"{col}={label}" for col in CAT_COLS for label in cat_maps[col]]

    # Presence flag: every row that comes from the profile table gets a 1
    presence = np.ones((df.shape[0], 1), dtype=np.float32)

    blocks = numeric_blocks + categorical_blocks + [presence]
    names = numeric_names + categorical_names + ["profile_present"]
    return np.concatenate(blocks, axis=1).astype(np.float32), names

# Encode both splits with the artifacts fitted on the training users
P_train, profile_feature_order = build_profile_matrix(df_prof_train)
P_test, _ = build_profile_matrix(df_prof_test)

# Lookup tables: user_id -> profile vector (one row of the encoded matrix)
prof_by_user_train = dict(zip(df_prof_train['user_id'].astype(str).tolist(), P_train))
prof_by_user_test = dict(zip(df_prof_test['user_id'].astype(str).tolist(), P_test))

# Width of a profile vector
p_dim = P_train.shape[1]

# Build the per-sample profile matrix used by the loaders
def build_sample_profiles(meta, train_split_users, prof_train, prof_test, p_dim):
    """Return one profile vector per sample, aligned with ``meta``.

    Args:
        meta: iterable of ``(user_id, night)`` pairs, one per sample.
        train_split_users: container of user ids (strings) belonging to the train
            split; decides which lookup table is consulted for each sample.
        prof_train: dict user_id -> profile vector for train-split users.
        prof_test: dict user_id -> profile vector for the remaining users.
        p_dim: length of a profile vector.

    Returns:
        float32 array of shape ``(len(meta), p_dim)``. Users without a profile get
        an all-zero vector (so the trailing "profile_present" flag is 0). An empty
        ``meta`` yields an array of shape ``(0, p_dim)``.
    """
    # Shared placeholder for users without a profile; np.stack copies it per row.
    missing = np.zeros((p_dim,), dtype=np.float32)
    rows = []
    for (u, _night) in meta:
        u = str(u)
        source = prof_train if u in train_split_users else prof_test
        vec = source.get(u)
        rows.append(missing if vec is None else np.asarray(vec, dtype=np.float32))
    if not rows:
        # np.stack raises on an empty list; return a well-shaped empty matrix instead.
        return np.zeros((0, p_dim), dtype=np.float32)
    return np.stack(rows).astype(np.float32)

# Per-sample profile vectors for the whole dataset, then sliced by split indices
train_split_users_set = {*train_users}
profiles_all = build_sample_profiles(
    meta=meta,
    train_split_users=train_split_users_set,
    prof_train=prof_by_user_train,
    prof_test=prof_by_user_test,
    p_dim=p_dim,
)
profiles_train, profiles_test = profiles_all[train_idx], profiles_all[test_idx]

# =========================================
# Phase 2: mTAN + profile MLP
# =========================================
class TimeDatasetWithProfile(Dataset):
    """Dataset that pairs each time-series sample with its profile vector.

    The five inputs are copied into float32 tensors and stored as the attributes
    ``X``, ``T``, ``M``, ``P`` and ``y``. Indexing returns the tuple
    ``(X[idx], T[idx], M[idx], P[idx], y[idx])``; the length is ``len(y)``.
    """

    def __init__(self, X, T, M, P, y):
        for attr, values in (("X", X), ("T", T), ("M", M), ("P", P), ("y", y)):
            setattr(self, attr, torch.tensor(values, dtype=torch.float32))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        fields = (self.X, self.T, self.M, self.P, self.y)
        return tuple(field[idx] for field in fields)

# Phase 2 loaders: shuffled mini-batches for training, fixed order for evaluation
train_loader_p2 = DataLoader(
    TimeDatasetWithProfile(X=X_train, T=T_train, M=M_train, P=profiles_train, y=y_train),
    batch_size=128,
    shuffle=True,
)
test_loader_p2 = DataLoader(
    TimeDatasetWithProfile(X=X_test, T=T_test, M=M_test, P=profiles_test, y=y_test),
    batch_size=256,
    shuffle=False,
)

def evaluate_phase2(model, loader):
    """Score ``model`` on ``loader`` and return ``(AUROC, AUPRC)``.

    The model is switched to eval mode and run without gradient tracking. Each
    batch is moved to ``device``, passed through ``model.forward_phase2`` and the
    output is squashed with a sigmoid. Both metrics are ``nan`` when the collected
    targets contain a single class.
    """
    model.eval()
    targets, probabilities = [], []
    with torch.no_grad():
        for batch in loader:
            Xb, Tb, Mb, Pb, yb = (tensor.to(device) for tensor in batch)
            logits = model.forward_phase2(Xb, Tb, Mb, Pb)
            prob = torch.sigmoid(logits)
            targets.append(yb.detach().cpu().numpy())
            probabilities.append(prob.detach().cpu().numpy())

    y_true = np.concatenate(targets)
    y_prob = np.concatenate(probabilities)

    # Both metrics need at least two distinct classes in the targets
    if len(np.unique(y_true)) > 1:
        return roc_auc_score(y_true, y_prob), average_precision_score(y_true, y_prob)
    return np.nan, np.nan

# Train Phase 2 (re-initializing for a fair comparison)
# set_seed presumably re-seeds the RNGs so the Phase 2 model starts from a
# reproducible state -- TODO confirm against its definition.
set_seed(SEED)
model_p2 = MTANWithHeads(c_in=X_train.shape[2], d_model=128, n_heads=4, n_layers=2, p_dim=p_dim, prof_hidden=64, dropout=0.2).to(device)
criterion2 = nn.BCEWithLogitsLoss()
optimizer2 = torch.optim.AdamW(model_p2.parameters(), lr=2e-3, weight_decay=1e-4)
# Cosine learning-rate schedule spanning the whole run; stepped once per epoch below
scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer2, T_max=EPOCHS)
# Best AUROC seen so far and a CPU copy of the corresponding weights
best2 = {'auroc': -1, 'state': None}

for epoch in range(1, EPOCHS+1):
    model_p2.train()
    for Xb, Tb, Mb, Pb, yb in train_loader_p2:
        Xb, Tb, Mb, Pb, yb = Xb.to(device), Tb.to(device), Mb.to(device), Pb.to(device), yb.to(device)
        logit = model_p2.forward_phase2(Xb, Tb, Mb, Pb)
        loss = criterion2(logit, yb)
        optimizer2.zero_grad(); loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer update
        torch.nn.utils.clip_grad_norm_(model_p2.parameters(), 1.0)
        optimizer2.step()
    scheduler2.step()
    # NOTE(review): the checkpoint is selected with test_loader_p2, the same data
    # used for the final report, so the reported metrics are optimistically biased.
    # Consider holding out a validation split for model selection.
    auroc, auprc = evaluate_phase2(model_p2, test_loader_p2)
    # A NaN AUROC (single-class targets) never compares greater, so it never
    # replaces the stored checkpoint.
    if auroc > best2['auroc']:
        best2['auroc'] = auroc
        best2['state'] = {k: v.detach().cpu().clone() for k, v in model_p2.state_dict().items()}
    print(f"[Phase 2] Epoch {epoch:02d} | Test AUROC {auroc:.4f} | AUPRC {auprc:.4f}")

# Reload the best checkpoint (if one was stored) and compute the final Phase 2 metrics
if best2['state'] is not None:
    model_p2.load_state_dict(best2['state'])
auroc_p2, auprc_p2 = evaluate_phase2(model_p2, test_loader_p2)

# =========================================
# Results comparison
# =========================================
# Final metrics of both phases, each computed after reloading its best checkpoint
print("===== Final Results =====")
print(f"Phase 1 (TS only)  -> AUROC: {auroc_p1:.4f} | AUPRC: {auprc_p1:.4f}")
print(f"Phase 2 (TS+Profile)-> AUROC: {auroc_p2:.4f} | AUPRC: {auprc_p2:.4f}")

# Optional Markdown table (handy when reporting results)
def format_table(auroc1, auprc1, auroc2, auprc2):
    """Render the metrics of both models as a Markdown table.

    Args:
        auroc1, auprc1: AUROC and AUPRC of the time-series-only model.
        auroc2, auprc2: AUROC and AUPRC of the time-series + profile model.

    Returns:
        A string with a header row, a separator row and one row per model; every
        metric is formatted with four decimals and every row ends with a newline.
    """
    header = "| Model | AUROC | AUPRC |\n"
    divider = "|-------|-------|-------|\n"
    ts_row = f"| TS only | {auroc1:.4f} | {auprc1:.4f} |\n"
    profile_row = f"| TS + Profile | {auroc2:.4f} | {auprc2:.4f} |\n"
    return "".join((header, divider, ts_row, profile_row))

# Emit the same comparison as a Markdown table
print(format_table(auroc_p1, auprc_p1, auroc_p2, auprc_p2))


[Phase 1] Epoch 01 | Test AUROC 0.9984 | AUPRC 0.9976
[Phase 1] Epoch 02 | Test AUROC 0.9995 | AUPRC 0.9992
[Phase 1] Epoch 03 | Test AUROC 0.9997 | AUPRC 0.9995
[Phase 1] Epoch 04 | Test AUROC 0.9997 | AUPRC 0.9994
[Phase 1] Epoch 05 | Test AUROC 0.9968 | AUPRC 0.9952
[Phase 1] Epoch 06 | Test AUROC 0.9995 | AUPRC 0.9992
[Phase 1] Epoch 07 | Test AUROC 0.9998 | AUPRC 0.9997
[Phase 1] Epoch 08 | Test AUROC 0.9998 | AUPRC 0.9997
[Phase 1] Epoch 09 | Test AUROC 0.9996 | AUPRC 0.9993
[Phase 1] Epoch 10 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 11 | Test AUROC 0.9998 | AUPRC 0.9998
[Phase 1] Epoch 12 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 13 | Test AUROC 0.9999 | AUPRC 0.9998
[Phase 1] Epoch 14 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 15 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 16 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 17 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 18 | Test AUROC 0.9999 | AUPRC 0.9999
[Phase 1] Epoch 19 | Test AU