## 1. Imports y config

In [None]:
# --- Imports y config ---
import os
import numpy as np
import pandas as pd
import pandas_ta as ta
from datetime import timedelta
from scipy.stats import linregress
from alpha_vantage.timeseries import TimeSeries
import warnings
warnings.filterwarnings("ignore", message="pkg_resources is deprecated as an API")


pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

# Clave eliminada por seguridad para publicación en repositorio
ALPHAVANTAGE_API_KEY = "TU_API_KEY_AQUI"


  from pkg_resources import get_distribution, DistributionNotFound


## 2.Funciones auxiliares (pendiente, ancho de Bollinger y ajuste por splits)

In [None]:
def _rolling_slope(x_ndarray: np.ndarray) -> float:
    x = np.asarray(x_ndarray, dtype=float)
    m = np.isfinite(x)
    if m.sum() < 2:
        return np.nan
    xx = np.arange(m.sum(), dtype=float)
    return linregress(xx, x[m]).slope

def _bb_width_from_df(bbands_df: pd.DataFrame) -> pd.Series:
    """Ancho de Bollinger: (upper - lower) / media. Devuelve razón (no %)."""
    if bbands_df is None or not isinstance(bbands_df, pd.DataFrame) or bbands_df.empty:
        return pd.Series(np.nan, index=[])
    cols = bbands_df.columns
    if "BBB_20_2.0" in cols:  
        return bbands_df["BBB_20_2.0"]
    upper = bbands_df[[c for c in cols if c.startswith("BBU_")]].iloc[:, 0]
    lower = bbands_df[[c for c in cols if c.startswith("BBL_")]].iloc[:, 0]
    mid   = bbands_df[[c for c in cols if c.startswith("BBM_")]].iloc[:, 0]
    denom = mid.replace(0, np.nan)
    return (upper - lower) / denom

def _apply_manual_splits(df: pd.DataFrame, splits: list[tuple[str, float]]) -> pd.DataFrame:
    """Aplica splits: divide OHLC por factor en fechas ANTERIORES; multiplica volumen."""
    if not splits:
        return df
    df = df.copy()
    df["adj_factor"] = 1.0
    for date_str, factor in sorted(splits, key=lambda x: pd.to_datetime(x[0])):
        d = pd.to_datetime(date_str)
        df.loc[df.index < d, "adj_factor"] *= float(factor)
    for col in ["open", "high", "low", "close"]:
        df[col] = df[col] / df["adj_factor"]
    df["volume"] = df["volume"] * df["adj_factor"]
    return df.drop(columns="adj_factor")


## 3. Descarga OHLCV (Alpha Vantage + caché)

In [3]:
def descargar_datos_alphavantage(ticker: str, api_key: str) -> pd.DataFrame:
    print(f"Descargando {ticker} (Alpha Vantage, daily full)...")
    ts = TimeSeries(key=api_key, output_format='pandas')
    df, _ = ts.get_daily(symbol=ticker, outputsize='full')
    if df is None or df.empty:
        return pd.DataFrame()
    df = df.rename(columns={
        '1. open':'open','2. high':'high','3. low':'low','4. close':'close','5. volume':'volume'
    })
    df.index = pd.to_datetime(df.index, errors='coerce')
    df = df[['open','high','low','close','volume']].sort_index()
    df = df[~df.index.duplicated(keep='last')]
    return df


## 4. Descarga OHLCV (Alpha Vantage + caché)

In [4]:
TICKER = "AMZN"
ANIOS_HIST = 10
AJUSTAR_AMZN_SPLIT = True

cache_file = f"{TICKER}_alphavantage_cache.csv"


## 5. Obtener OHLCV (con caché y split)

In [5]:
# Carga desde caché si es reciente; si no, descarga y guarda
use_cache = False
if os.path.exists(cache_file):
    last_mtime = pd.to_datetime(os.path.getmtime(cache_file), unit='s')
    use_cache = (pd.Timestamp.now() - last_mtime) < pd.Timedelta(hours=24)

if use_cache:
    print(f"Cargando desde caché: {cache_file}")
    df = pd.read_csv(cache_file, index_col=0, parse_dates=True)
else:
    df = descargar_datos_alphavantage(TICKER, ALPHAVANTAGE_API_KEY)
    if df.empty:
        raise RuntimeError("No hay datos OHLCV.")
    fecha_inicio = pd.Timestamp.today().normalize() - pd.DateOffset(years=ANIOS_HIST)
    df = df[df.index >= fecha_inicio]
    df.to_csv(cache_file)
    print(f"Caché guardado: {cache_file} ({len(df)} filas)")

# Ajuste por split de AMZN 20:1 (2022-06-06)
if AJUSTAR_AMZN_SPLIT and TICKER.upper() == "AMZN" and (df.index < "2022-06-06").any():
    df = _apply_manual_splits(df, [("2022-06-06", 20)])

df.tail(3)


Descargando AMZN (Alpha Vantage, daily full)...
Caché guardado: AMZN_alphavantage_cache.csv (2514 filas)


Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-09-15,230.625,233.73,230.32,231.43,33243328.0
2025-09-16,232.935,235.9,232.23,234.05,38203912.0
2025-09-17,233.77,234.3,228.71,231.62,42815230.0


## 6. Técnicos básicos

In [6]:
feats = df.copy()
feats.index.name = "Date"

close = feats['close'].astype(float)
high  = feats['high'].astype(float)
low   = feats['low'].astype(float)
opn   = feats['open'].astype(float)
vol   = feats['volume'].astype(float)

# Rendimientos
feats['logret_1']      = np.log(close / close.shift(1))
feats['overnight_ret'] = np.log(opn / close.shift(1))

# Momentum/RSI/MACD
feats['roc_5']  = ta.roc(close, length=5)
feats['rsi_14'] = ta.rsi(close, length=14)
_macd = ta.macd(close, fast=12, slow=26, signal=9)
feats['macd_hist'] = _macd['MACDh_12_26_9'] if isinstance(_macd, pd.DataFrame) and 'MACDh_12_26_9' in _macd.columns else np.nan

# Volatilidad de rango (ATR relativo en %)
atr14 = ta.atr(high, low, close, length=14)
feats['atr_14_pct'] = (atr14 / close) * 100.0

# Volumen (z-score 21d)
vol_mu  = vol.rolling(21, min_periods=21).mean()
vol_sig = vol.rolling(21, min_periods=21).std().replace(0, np.nan)
feats['vol_z_21'] = (vol - vol_mu) / vol_sig

# Medias y distancias
sma20  = ta.sma(close, length=20)
sma50  = ta.sma(close, length=50)
sma200 = ta.sma(close, length=200)
sma400 = ta.sma(close, length=400)

feats['dist_sma20']  = (close / sma20) - 1
feats['roc_21']      = ta.roc(close, length=21)
feats['dist_sma50']  = (close / sma50) - 1
feats['dist_sma200'] = (close / sma200) - 1
feats['dist_sma400'] = (close / sma400) - 1

# Bandas de Bollinger (20, 2σ): ancho relativo
_bbands = ta.bbands(close, length=20, std=2)
feats['bb_width_20_2'] = _bb_width_from_df(_bbands)

# Pendiente de SMA20 (suavizado corto)
feats['sma_slope_20'] = sma20.rolling(window=5, min_periods=5).apply(_rolling_slope, raw=True)

# 52 semanas (usar min_periods=252 para NaNs iniciales correctos)
roll_max_252 = close.rolling(252, min_periods=252).max()
roll_min_252 = close.rolling(252, min_periods=252).min()
feats['from_52w_high'] = (close / roll_max_252) - 1
feats['from_52w_low']  = (close / roll_min_252) - 1

# Ratios de volatilidad a partir de std de logret (sin HV de CSV)
logret   = feats['logret_1']
vol_21   = logret.rolling(21,   min_periods=21).std()
vol_252  = logret.rolling(252,  min_periods=252).std()
vol_1260 = logret.rolling(1260, min_periods=1260).std()
feats['vol_ratio_21_252']   = vol_21 / vol_252
feats['vol_ratio_252_1260'] = vol_252 / vol_1260

# Tendencia y drawdown largos
feats['trend_ok'] = (close > sma200).astype('int8')
roll_max_1260 = close.rolling(1260, min_periods=1).max()
feats['drawdown_long'] = ((close - roll_max_1260) / roll_max_1260).rolling(1260, min_periods=1).min()

# Largos plazos (ROC largo, slope 3y, ATH)
feats['roc_1260']       = ta.roc(close, length=1260)
feats['trend_slope_3y'] = np.log(close).rolling(756, min_periods=756).apply(_rolling_slope, raw=True)
feats['from_ath']       = (close / close.expanding().max()) - 1

feats.tail(3)


Unnamed: 0_level_0,open,high,low,close,volume,logret_1,overnight_ret,roc_5,rsi_14,macd_hist,atr_14_pct,vol_z_21,dist_sma20,roc_21,dist_sma50,dist_sma200,dist_sma400,bb_width_20_2,sma_slope_20,from_52w_high,from_52w_low,vol_ratio_21_252,vol_ratio_252_1260,trend_ok,drawdown_long,roc_1260,trend_slope_3y,from_ath
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2025-09-15,230.625,233.73,230.32,231.43,33243328.0,0.014274,0.01079,-1.869912,53.178209,-0.221057,2.14953,-0.116439,0.007586,0.194822,0.019867,0.080736,0.163129,6.767259,0.131,-0.043915,0.383158,0.811235,0.945739,1,-0.561453,46.947147,0.001264,-0.043915
2025-09-16,232.935,235.9,232.23,234.05,38203912.0,0.011257,0.006482,-1.758731,56.033155,-0.067845,2.110066,0.375896,0.018426,1.30719,0.030452,0.092139,0.175416,6.934389,0.04305,-0.033091,0.398817,0.81964,0.946469,1,-0.561453,43.210723,0.001267,-0.033091
2025-09-17,233.77,234.3,228.71,231.62,42815230.0,-0.010437,-0.001197,0.560066,52.816467,-0.141009,2.152291,0.75047,0.007061,0.056158,0.018653,0.080201,0.162348,6.921793,0.0522,-0.04313,0.384294,0.826962,0.947525,1,-0.561453,45.897307,0.00127,-0.04313


## 7. Limpieza, guardado preliminar y vista rápida

In [None]:
# Reemplazar infinitos por NaN
feats = feats.replace([np.inf, -np.inf], np.nan)

base_min = ['rsi_14', 'macd_hist', 'atr_14_pct', 'dist_sma200']
present = [c for c in base_min if c in feats.columns]
feats_clean = feats.dropna(subset=present)

# Guardar features técnicas básicas
out_path = f"{TICKER}_features_{ANIOS_HIST}y_basicos.csv"
feats_clean.to_csv(out_path, float_format="%.8g")
print(f"Guardado: {out_path} | shape: {feats_clean.shape}")

feats_clean.tail(3)


Guardado: AMZN_features_10y_basicos.csv | shape: (2315, 28)


Unnamed: 0_level_0,open,high,low,close,volume,logret_1,overnight_ret,roc_5,rsi_14,macd_hist,atr_14_pct,vol_z_21,dist_sma20,roc_21,dist_sma50,dist_sma200,dist_sma400,bb_width_20_2,sma_slope_20,from_52w_high,from_52w_low,vol_ratio_21_252,vol_ratio_252_1260,trend_ok,drawdown_long,roc_1260,trend_slope_3y,from_ath
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2025-09-15,230.625,233.73,230.32,231.43,33243328.0,0.014274,0.01079,-1.869912,53.178209,-0.221057,2.14953,-0.116439,0.007586,0.194822,0.019867,0.080736,0.163129,6.767259,0.131,-0.043915,0.383158,0.811235,0.945739,1,-0.561453,46.947147,0.001264,-0.043915
2025-09-16,232.935,235.9,232.23,234.05,38203912.0,0.011257,0.006482,-1.758731,56.033155,-0.067845,2.110066,0.375896,0.018426,1.30719,0.030452,0.092139,0.175416,6.934389,0.04305,-0.033091,0.398817,0.81964,0.946469,1,-0.561453,43.210723,0.001267,-0.033091
2025-09-17,233.77,234.3,228.71,231.62,42815230.0,-0.010437,-0.001197,0.560066,52.816467,-0.141009,2.152291,0.75047,0.007061,0.056158,0.018653,0.080201,0.162348,6.921793,0.0522,-0.04313,0.384294,0.826962,0.947525,1,-0.561453,45.897307,0.00127,-0.04313


## 8. Adición de métricas de opciones/volatilidad

In [None]:
# Añadir métricas de opciones/volatilidad desde CSV a las features básicas
import pandas as pd
import numpy as np

def _load_series(path, new_col):
    df = pd.read_csv(path)
    # detectar columna de fecha
    date_col = None
    for c in df.columns:
        if c.lower() in ("date", "fecha"):
            date_col = c
            break
    if date_col is None:
        date_col = df.columns[0]
    df = df.rename(columns={date_col: "Date"})
    df["Date"] = pd.to_datetime(df["Date"]).dt.tz_localize(None)
    # escoger primera numérica como valor
    for c in df.columns:
        if c == "Date":
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            value_col = c
            break
    if value_col is None and len(df.columns) > 1:
        value_col = df.columns[1]
        df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
    out = df[["Date", value_col]].copy()
    out = out.rename(columns={value_col: new_col})
    return out

# archivos csv
base_path   = "AMZN_features_10y_basicos.csv"
hv30_path   = "AMZN_hv_yz_30.csv"
hv270_path  = "AMZN_hv_yz_270.csv"
iv30_path   = "AMZN_iv_mean_30.csv"
iv90_path   = "AMZN_iv_mean_90.csv"
iv360_path  = "AMZN_iv_mean_360.csv"
skew30_path = "AMZN_iv_skew_30.csv"
pcr_oi_path = "AMZN_pcr_oi_30.csv"
pcr_v_path  = "AMZN_pcr_v_30.csv"

# leer base
base = pd.read_csv(base_path)
if "Date" not in base.columns:
    base = base.rename(columns={base.columns[0]: "Date"})
base["Date"] = pd.to_datetime(base["Date"]).dt.tz_localize(None)

# leer externos
hv30   = _load_series(hv30_path,   "hv_yz_30_ext")
hv270  = _load_series(hv270_path,  "hv_yz_270_ext")
iv30   = _load_series(iv30_path,   "iv_atm_30_ext")
iv90   = _load_series(iv90_path,   "iv_atm_90_ext")
iv360  = _load_series(iv360_path,  "iv_atm_360_ext")
skew30 = _load_series(skew30_path, "iv_skew_30_ext")
pcr_oi = _load_series(pcr_oi_path, "pc_ratio_oi_30_ext")
pcr_v  = _load_series(pcr_v_path,  "pc_ratio_vol_30_ext")

# merge por Date (left)
df = base.merge(hv30,   on="Date", how="left") \
         .merge(hv270,  on="Date", how="left") \
         .merge(iv30,   on="Date", how="left") \
         .merge(iv90,   on="Date", how="left") \
         .merge(iv360,  on="Date", how="left") \
         .merge(skew30, on="Date", how="left") \
         .merge(pcr_oi, on="Date", how="left") \
         .merge(pcr_v,  on="Date", how="left")

# normalizar escalas de IV/HV: si parecen %, dividir por 100
for col in ["hv_yz_30_ext","hv_yz_270_ext","iv_atm_30_ext","iv_atm_90_ext","iv_atm_360_ext"]:
    if col in df.columns and df[col].median(skipna=True) is not None:
        med = df[col].median(skipna=True)
        if pd.notna(med) and med > 5:
            df[col] = df[col] / 100.0

# derivadas
if {"iv_atm_30_ext","iv_atm_360_ext"}.issubset(df.columns):
    df["iv_ts_ratio_30_360"] = df["iv_atm_30_ext"] / df["iv_atm_360_ext"]
if {"iv_atm_30_ext","hv_yz_30_ext"}.issubset(df.columns):
    df["iv_minus_hv30"] = df["iv_atm_30_ext"] - df["hv_yz_30_ext"]

# guardar
out_path = "AMZN_features_10y_basicos_plus_options.csv"
df.to_csv(out_path, index=False)
print("Guardado:", out_path, "| filas:", len(df))
print(df.tail(3)[["Date","iv_atm_30_ext","iv_atm_360_ext","hv_yz_30_ext","iv_ts_ratio_30_360","iv_minus_hv30"]])


Guardado: AMZN_features_10y_basicos_plus_options.csv | filas: 2315
           Date  iv_atm_30_ext  iv_atm_360_ext  hv_yz_30_ext  iv_ts_ratio_30_360  iv_minus_hv30
2312 2025-09-15         0.2706          0.3194        0.2317            0.847214         0.0389
2313 2025-09-16         0.2674          0.3184        0.2300            0.839824         0.0374
2314 2025-09-17            NaN             NaN           NaN                 NaN            NaN
