Del dataset maestro creo otros 4, cada uno para un modelo/plazo de inversión.

In [None]:
import pandas as pd
import numpy as np

# ================= CONFIG =================
# Path of the master feature CSV that every per-model dataset is cut from.
MASTER = "AMZN_features_10y_basicos_plus_options_with_news_feats.csv"

# Forward horizon of each model, counted in rows of the master table.
H = {"muy_corto": 1, "corto": 21, "medio": 252, "largo": 504}

# Reusable column groups. The group names are only labels for readability;
# what matters is that the concatenations below yield the EXACT columns, in
# the EXACT order, that each model expects.
_BASE = ["Date", "close", "volume"]
_FAST_INDICATORS = [
    "logret_1", "roc_5", "rsi_14", "macd_hist",
    "atr_14_pct", "vol_z_21", "bb_width_20_2", "vol_ratio_21_252",
    "dist_sma20", "sma_slope_20",
]
_SMA_DISTANCES = ["dist_sma50", "dist_sma200", "dist_sma400"]
_RANGE_52W = ["from_52w_high", "from_52w_low"]
_SLOW_TREND = ["drawdown_long", "trend_ok", "trend_slope_3y"]
_OPTIONS_30D_TAIL = [
    "iv_skew_30_ext",
    "pc_ratio_oi_30_ext", "pc_ratio_vol_30_ext",
    "iv_minus_hv30", "iv_ts_ratio_30_360",
]

# EXACT columns per model (list order is the column order of the output CSV).
REQ = {
    "muy_corto": (
        _BASE
        + _FAST_INDICATORS
        + _SMA_DISTANCES
        + _RANGE_52W
        + ["hv_yz_30_ext", "iv_atm_30_ext"]
        + _OPTIONS_30D_TAIL
        # News: daily value first, then the rolling-window derivatives.
        + ["news_sent_mean", "news_mean_w3", "news_std_w3"]
    ),
    "corto": (
        _BASE
        + _FAST_INDICATORS
        + _SMA_DISTANCES
        + _RANGE_52W
        + ["roc_21"]
        + ["hv_yz_30_ext", "iv_atm_30_ext", "iv_atm_90_ext"]
        + _OPTIONS_30D_TAIL
        # News: daily value first, then the rolling-window derivatives.
        + ["news_sent_mean", "news_mean_w21", "news_std_w21"]
    ),
    "medio": (
        _BASE
        + ["roc_21", "roc_1260"]
        + _SMA_DISTANCES
        + _RANGE_52W
        + ["from_ath"]
        + _SLOW_TREND
        + ["vol_ratio_21_252", "vol_ratio_252_1260"]
        + ["hv_yz_270_ext"]
        + ["iv_atm_360_ext", "iv_ts_ratio_30_360", "iv_minus_hv30"]
        + ["news_mean_w63", "news_std_w63"]
    ),
    "largo": (
        _BASE
        + ["roc_1260"]
        + _SMA_DISTANCES
        + _RANGE_52W
        + ["from_ath"]
        + _SLOW_TREND
        + ["vol_ratio_252_1260"]
        + ["hv_yz_270_ext"]
        + ["iv_atm_360_ext", "iv_ts_ratio_30_360"]
        + ["news_mean_w126", "news_std_w126"]
    ),
}

# Output CSV written for each model, keyed by the same tags as H and REQ.
OUT = {tag: f"AMZN_modelo_{tag}_train.csv" for tag in H}
# =========================================

def first_full_row_idx(df, cols):
    """Return the position of the first row where every column in `cols` is usable.

    A value counts as unusable when it is NaN or +/-infinity (infinities are
    mapped to NaN before the check).

    Args:
        df: DataFrame to scan.
        cols: List of column names that must all be populated.

    Returns:
        The 0-based positional index (plain ``int``) of the first complete
        row, or ``None`` when no row is complete.
    """
    cleaned = df[cols].replace([np.inf, -np.inf], np.nan)
    complete = cleaned.notna().all(axis=1).to_numpy()
    if not complete.any():
        return None
    # argmax on a boolean array yields the position of the first True.
    return int(complete.argmax())

def make_train_ready(master_df, feats, tgt, horizon, last_news_pos):
    """Cut a clean, training-ready slice out of `master_df`.

    The rows are never re-sorted. The slice is built in three steps:

    1. Head trim: start at the first row where ALL `feats` are populated
       (NaN and +/-inf both count as missing).
    2. Tail trim: stop (exclusive) at ``min(len(master_df) - horizon,
       last_news_pos + 1)``, so the slice never goes past the last row with
       real news nor into the last `horizon` rows.
    3. Inside that window, +/-inf is turned into NaN in every column and any
       row with a NaN in `feats` or `tgt` is dropped.

    Args:
        master_df: Source DataFrame; only row positions are used for slicing.
        feats: List of feature column names.
        tgt: Name of the target column.
        horizon: Number of trailing rows excluded from the window.
        last_news_pos: 0-based position of the last row with real news.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept).

    Raises:
        RuntimeError: If no row has all features populated, or if the window
            left after the head and tail trims is empty.
    """
    first_pos = first_full_row_idx(master_df, feats)
    if first_pos is None:
        raise RuntimeError("No hay ninguna fila completa de features.")

    stop_by_horizon = len(master_df) - horizon
    stop_by_news = last_news_pos + 1
    stop_pos = min(stop_by_horizon, stop_by_news)  # exclusive upper bound
    if stop_pos <= first_pos:
        raise RuntimeError("Tramo útil vacío tras aplicar warm-up y recorte por N/H.")

    window = master_df.iloc[first_pos:stop_pos, :].copy()
    window = window.replace([np.inf, -np.inf], np.nan)
    required = feats + [tgt]
    return window.dropna(subset=required)

# ---------- main ----------
# Builds one training CSV per model/horizon from the master table.
#
# NOTE(review): rows are used in file order (NOT sorted). The forward targets
# below are computed with row shifts, so the master is presumably already in
# chronological order — confirm against whatever produces MASTER.
m = pd.read_csv(MASTER)

# Validate the input BEFORE touching any column, so a missing column fails
# with a descriptive message instead of a bare KeyError further down.
if "news_sent_mean" not in m.columns:
    raise AssertionError("Falta 'news_sent_mean' en el maestro.")
for tag, cols in REQ.items():
    missing = [c for c in cols if c not in m.columns]
    if missing:
        raise AssertionError(f"[{tag}] FALTAN en maestro: {missing}")

# Close-to-close forward log-return targets, one per model. The column name
# is derived from the horizon in H so the two can never drift apart
# (with the current H: target_1d/21d/252d/504d_logret_fwd).
ALL_TARGETS = {tag: f"target_{h}d_logret_fwd" for tag, h in H.items()}
for tag, h in H.items():
    m[ALL_TARGETS[tag]] = np.log(m["close"].shift(-h) / m["close"])

# Locate N = last row with real news (news_sent_mean != 0; non-numeric or
# missing values are treated as 0, i.e. "no news").
nz = (pd.to_numeric(m["news_sent_mean"], errors="coerce").fillna(0).abs() > 0).to_numpy()
if not nz.any():
    raise RuntimeError("No se detectan días con noticias (todos 0).")
last_news_pos = int(np.where(nz)[0][-1])  # 0-based POSITION, not an index label
last_news_date = m["Date"].iloc[last_news_pos]  # positional lookup to match last_news_pos
print("Usando N (último día con noticia real):", last_news_date)

# Build and save one dataset per model.
for tag, horizon in H.items():
    feats = REQ[tag]
    tgt   = ALL_TARGETS[tag]
    cols  = feats + [tgt]
    dfw   = m.loc[:, cols]  # same row order; make_train_ready works on its own copy

    df_ready = make_train_ready(dfw, feats, tgt, horizon, last_news_pos)

    # Final sanity checks. Explicit raises instead of `assert` so they still
    # run when Python is started with -O.
    if df_ready[cols].isna().any().any():
        raise AssertionError(f"[{tag}] Quedan NaN.")
    if "Date" not in df_ready.columns or len(df_ready["Date"]) == 0:
        raise AssertionError(f"[{tag}] Dataset vacío o sin columna 'Date'.")

    df_ready.to_csv(OUT[tag], index=False)
    print(f"[{tag}] filas={len(df_ready)} | {df_ready['Date'].iloc[0]} → {df_ready['Date'].iloc[-1]} -> {OUT[tag]}")


Usando N (último día con noticia real): 01-08-25
[muy_corto] filas=1211 | 16-09-20 → 01-08-25 -> AMZN_modelo_muy_corto_train.csv
[corto] filas=1211 | 16-09-20 → 01-08-25 -> AMZN_modelo_corto_train.csv
[medio] filas=1001 | 21-09-20 → 13-09-24 -> AMZN_modelo_medio_train.csv
[largo] filas=749 | 21-09-20 → 13-09-23 -> AMZN_modelo_largo_train.csv
