In [2]:
# ./scripts/build_master_dataset.py
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob

# ---------- 0 · Parámetros globales ----------
DATA_DIR      = Path("../datasets")
OUT_FILE      = Path("../datasets/merged/master_2014-09-17_2025-06-09.csv")
START_DATE    = "2014-09-17"
END_DATE      = "2025-06-09"
RSI_WINDOW    = 14         # estándar técnico
RETURN_PERIOD = 1          # delta en días (log‑return diario)

# ---------- 1 · Helpers ----------
def read_price_folder(folder: Path, col_name: str) -> pd.Series:
    """Lee todos los CSV con cabecera Yahoo/Investing y devuelve Serie Close."""
    files = sorted(glob(str(folder / f"*/*.csv")) + glob(str(folder / "*.csv")))
    df_list = []
    for f in files:
        # Cabecera especial: saltar fila 'Ticker' y fila 'Date' adicional
        tmp = pd.read_csv(
            f,
            skiprows=[1],          # salta fila 'Ticker,...'
            parse_dates=["Price"], # la columna 'Price' contiene fechas
        )
        tmp = tmp.rename(columns={"Price": "date", "Close": col_name})
        df_list.append(tmp[["date", col_name]])
    return (
        pd.concat(df_list)
        .drop_duplicates("date")
        .set_index("date")
        .sort_index()
        .squeeze()
    )

def read_simple_csv(pattern: str, date_col: str, value_col: str) -> pd.Series:
    """Lee CSV simples (1 fila de cabecera) que podrían estar por año."""
    files = sorted(glob(pattern))
    df = pd.concat([pd.read_csv(f) for f in files])
    df[date_col] = pd.to_datetime(df[date_col])
    return (
        df.set_index(date_col)[value_col]
        .drop_duplicates()
        .sort_index()
    )

def log_return(series: pd.Series, period: int = 1) -> pd.Series:
    return np.log(series).diff(periods=period)

def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    delta = series.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    roll_up = up.ewm(span=window, adjust=False).mean()
    roll_down = down.ewm(span=window, adjust=False).mean()
    rs = roll_up / roll_down
    return 100 - (100 / (1 + rs))

# ---------- 2 · Lectura de series ----------
btc_close  = read_price_folder(DATA_DIR / "btc",  "btc_close")
eth_close  = read_price_folder(DATA_DIR / "eth",  "eth_close")
sp_close   = read_price_folder(DATA_DIR / "sp500", "sp500_close")
dxy_close  = read_price_folder(DATA_DIR / "dxy",  "dxy_close")
gold_close = read_price_folder(DATA_DIR / "gold", "gold_close")

active_addr = read_simple_csv(str(DATA_DIR / "active_addresses" / "*.csv"),
                              "date", "active_addresses")

interest_rate = read_simple_csv(str(DATA_DIR / "interest_rate" / "*.csv"),
                                "date", "interest_rate")

trend_btc = read_simple_csv(str(DATA_DIR / "trend" / "**/bitcoin_trend_*.csv"),
                            "date", "bitcoin")

fear_greed = (pd.read_csv(DATA_DIR / "fear_and_greed_index" / "fear_and_greed_index.csv",
                          parse_dates=["timestamp"])
              .set_index("timestamp")["value"]
              .rename("fear_greed"))

# ---------- 3 · Merge y alineación ----------
idx = pd.date_range(start=START_DATE, end=END_DATE, freq="D")

dfs = [
    btc_close, eth_close, sp_close, dxy_close, gold_close,
    active_addr, interest_rate, trend_btc, fear_greed
]

master = pd.concat(dfs, axis=1).reindex(idx)

# ---------- 4 · Imputación de huecos ----------
master = master.ffill().bfill()   # primero hacia atrás por seguridad

# ---------- 5 · Features derivados ----------
master["btc_logret"] = log_return(master["btc_close"], period=RETURN_PERIOD)
master["btc_rsi14"]  = rsi(master["btc_close"], window=RSI_WINDOW)

# ---------- 6 · Guardado ----------
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
master.to_csv(OUT_FILE, index_label="date")
print(f"✓ Dataset guardado en: {OUT_FILE.resolve()}")


  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.read_csv(
  tmp = pd.re

✓ Dataset guardado en: /Users/cbarril/dev/posgrado/tp_ast1_19co2024/datasets/merged/master_2014-09-17_2025-06-09.csv
