In [None]:
import torch

print("Torch:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())
print("MPS built:", torch.backends.mps.is_built())



Torch: 2.8.0
MPS available: True
MPS built: True


In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


Using device: mps


In [None]:


from pathlib import Path
import re
import numpy as np
import pandas as pd

IN_PATH  = Path("merged_all_30y_enriched.parquet")
OUT_PATH = Path("features_lgbm_final.parquet")

# --------------------------- utilities ---------------------------

def to_snake(name: str) -> str:
    """Lowercase, replace non-alnum with _, collapse repeats, strip edges."""
    name = str(name)
    name = name.replace("^", "")  # remove carets e.g., ^GSPC
    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)  # camel/Pascal -> snake
    name = re.sub("[^0-9a-zA-Z]+", "_", name)
    name = re.sub("_+", "_", name).strip("_")
    return name.lower()

def snake_case_columns(df: pd.DataFrame) -> pd.DataFrame:
    new_cols, seen = {}, {}
    for c in df.columns:
        sc = to_snake(c)
        if sc in seen:
            seen[sc] += 1
            sc = f"{sc}_dup{seen[sc]}"
        else:
            seen[sc] = 0
        new_cols[c] = sc
    return df.rename(columns=new_cols)

def safe_div(a, b):
    return a / b.replace({0: np.nan})

def ema(s, span):
    return s.ewm(span=span, adjust=False, min_periods=span).mean()

def rsi(price, window=14):
    d = price.diff()
    up = d.clip(lower=0.0)
    dn = -d.clip(upper=0.0)
    ag = up.ewm(alpha=1/window, min_periods=window, adjust=False).mean()
    al = dn.ewm(alpha=1/window, min_periods=window, adjust=False).mean()
    rs = ag / al.replace(0, np.nan)
    return 100 - (100/(1+rs))

def macd(price, fast=12, slow=26, signal=9):
    e_fast  = ema(price, fast)
    e_slow  = ema(price, slow)
    line    = e_fast - e_slow
    signal_ = ema(line, signal)
    hist    = line - signal_
    return line, signal_, hist

def dmi_adx(high, low, close, n=14):
    pc = close.shift(1)
    tr = pd.concat([(high-low).abs(), (high-pc).abs(), (low-pc).abs()], axis=1).max(axis=1)
    atr = tr.ewm(alpha=1/n, min_periods=n, adjust=False).mean()
    up_move, down_move = high.diff(), -low.diff()
    pos_dm = ((up_move > down_move) & (up_move > 0)).astype(float) * up_move
    neg_dm = ((down_move > up_move) & (down_move > 0)).astype(float) * down_move
    pos_dm_ema = pos_dm.ewm(alpha=1/n, min_periods=n, adjust=False).mean()
    neg_dm_ema = neg_dm.ewm(alpha=1/n, min_periods=n, adjust=False).mean()
    pos_di = 100 * safe_div(pos_dm_ema, atr)
    neg_di = 100 * safe_div(neg_dm_ema, atr)
    dx = 100 * safe_div((pos_di - neg_di).abs(), (pos_di + neg_di))
    adx = dx.ewm(alpha=1/n, min_periods=n, adjust=False).mean()
    return pos_di, neg_di, adx

def roc(price, n):
    return safe_div(price, price.shift(n)) - 1.0

def kst(price):
    r1 = roc(price, 10).rolling(10, min_periods=10).mean()
    r2 = roc(price, 15).rolling(10, min_periods=10).mean()
    r3 = roc(price, 20).rolling(10, min_periods=10).mean()
    r4 = roc(price, 30).rolling(15, min_periods=15).mean()
    kst_line = 1*r1 + 2*r2 + 3*r3 + 4*r4
    kst_sig  = kst_line.rolling(9, min_periods=9).mean()
    return kst_line, kst_sig

def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce").astype("float64")

def roll_zscore(s, win=63):
    m  = s.rolling(win, min_periods=win).mean()
    sd = s.rolling(win, min_periods=win).std()
    return safe_div(s - m, sd)

# ------ keep these encoders for rating + sector/industry keys ------
def encode_rating(series: pd.Series) -> pd.Series:
    """Ordinal map: S+..D+ -> 13..1. Unknown -> NaN."""
    order = ["S+","S","S-","A+","A","A-","B+","B","B-","C+","C","C-","D+"]
    mapping = {g: (len(order) - i) for i, g in enumerate(order)}  # S+=13 ... D+=1
    vals = (series.astype(str)
                  .str.strip()
                  .str.upper()
                  .replace({"NAN": np.nan, "NONE": np.nan, "": np.nan}))
    out = vals.map(mapping)
    return out.astype("Float32")

def encode_as_int(series: pd.Series, start_at_one: bool = True) -> pd.Series:
    """Deterministic ordinal encoding for object/categorical to 1..K (NaN -> 0)."""
    cat = pd.Categorical(series.astype("string"))
    codes = cat.codes.astype(np.int32)   # -1 for NaN
    if start_at_one:
        codes = codes + 1
        codes = np.where(codes > 0, codes, 0)
    return pd.Series(codes, index=series.index, dtype=np.int32)

# ----------------------- per-ticker feature builder -----------------------

ROLL_WINS = [5, 10, 21, 63, 126, 252]

def build_features_per_ticker(g: pd.DataFrame) -> pd.DataFrame:
    """
    Core STOCK-ONLY technical features per ticker (no external series).
    """
    g = g.copy()
    g["date"] = pd.to_datetime(g["date"], errors="coerce")
    g = g.dropna(subset=["date"])
    g = g.sort_values("date")

    # ensure numeric dtypes
    g["close"] = to_float(g["close"])
    g["high"]  = to_float(g["high"])
    g["low"]   = to_float(g["low"])
    g["open"]  = to_float(g["open"])
    if "adj_close" in g.columns:
        g["adj_close"] = to_float(g["adj_close"])
    if "vwap" in g.columns:
        g["vwap"] = to_float(g["vwap"])
    if "market_cap" in g.columns:
        g["market_cap"] = to_float(g["market_cap"])
    g["volume"] = to_float(g["volume"])

    price = g["adj_close"] if "adj_close" in g.columns else g["close"]
    c, h, l, o = g["close"], g["high"], g["low"], g["open"]
    v          = g["volume"]
    vwap       = g["vwap"] if "vwap" in g.columns else pd.Series(index=g.index, dtype="float64")
    mcap       = g["market_cap"] if "market_cap" in g.columns else pd.Series(index=g.index, dtype="float64")

    # robust log(market_cap)
    log_mcap = np.log(mcap.mask(mcap <= 0, np.nan))

    # returns
    g["ret_1d"]      = price.pct_change(1)
    g["ret_log_1d"]  = np.log(price).diff()

    # momentum & volatility windows
    for w in ROLL_WINS:
        g[f"mom_{w}"] = roc(price, w)
    for w in [21, 63, 126]:
        g[f"vol_{w}"] = g["ret_1d"].rolling(w, min_periods=w).std()
    g["vol_21_over_63"]  = safe_div(g.get("vol_21"), g.get("vol_63"))
    g["vol_21_over_126"] = safe_div(g.get("vol_21"), g.get("vol_126"))

    # SMA/EMA
    for w in [10, 20, 50, 100, 200]:
        g[f"sma_{w}"] = price.rolling(w, min_periods=w).mean()
        g[f"ema_{w}"] = ema(price, w)
    g["sma20_over_sma50"]  = safe_div(g["sma_20"], g["sma_50"])
    g["sma50_over_sma200"] = safe_div(g["sma_50"], g["sma_200"])
    g["price_over_sma20"]  = safe_div(price, g["sma_20"])
    g["price_over_sma50"]  = safe_div(price, g["sma_50"])
    g["price_over_sma200"] = safe_div(price, g["sma_200"])

    # EMA slopes / diffs
    g["ema20_diff_1"]          = g["ema_20"] - g["ema_20"].shift(1)
    g["ema50_diff_1"]          = g["ema_50"] - g["ema_50"].shift(1)
    g["ema20_slope_5"]         = (g["ema_20"] - g["ema_20"].shift(5)) / 5.0
    g["ema50_slope_5"]         = (g["ema_50"] - g["ema_50"].shift(5)) / 5.0
    g["ema20_diff_over_price"] = safe_div(g["ema20_diff_1"], price)
    g["ema50_diff_over_price"] = safe_div(g["ema50_diff_1"], price)

    # RSI / Bollinger
    g["rsi_14"] = rsi(price, 14)
    ma20 = price.rolling(20, min_periods=20).mean()
    sd20 = price.rolling(20, min_periods=20).std()
    up20 = ma20 + 2 * sd20
    lo20 = ma20 - 2 * sd20
    g["bb_ma20"]  = ma20
    g["bb_up20"]  = up20
    g["bb_lo20"]  = lo20
    g["bb_pct20"] = (price - lo20) / (up20 - lo20)
    g["bb_bw20"]  = safe_div(up20 - lo20, ma20)

    # ATR & ranges
    pc = c.shift(1)
    tr = pd.concat([(h-l), (h-pc).abs(), (l-pc).abs()], axis=1).max(axis=1)
    g["atr_14"]     = tr.rolling(14, min_periods=14).mean()
    g["hl_range"]   = safe_div(h - l, l)
    g["co_return"]  = safe_div(c - o, o)
    g["gap_return"] = safe_div(o - pc, pc)

    # Z-scored log return (63d)
    g["ret_log_1d_z63"] = roll_zscore(g["ret_log_1d"], win=63)

    # Stochastic
    ll = l.rolling(14, min_periods=14).min()
    hh = h.rolling(14, min_periods=14).max()
    k  = 100 * (c - ll) / (hh - ll)
    g["stoch_k14"] = k
    g["stoch_d3"]  = k.rolling(3, min_periods=3).mean()

    # MACD / DMI-ADX / KST
    macd_line, macd_sig, macd_hist = macd(price, 12, 26, 9)
    g["macd_line"]   = macd_line
    g["macd_signal"] = macd_sig
    g["macd_hist"]   = macd_hist
    pos_di, neg_di, adx = dmi_adx(h, l, c, n=14)
    g["dmi_pos14"] = pos_di
    g["dmi_neg14"] = neg_di
    g["adx_14"]    = adx
    kst_line, kst_sig = kst(price)
    g["kst_line"]   = kst_line
    g["kst_signal"] = kst_sig

    # Volume features
    g["log_volume"] = np.log1p(v)
    for w in [20, 60, 120]:
        g[f"vol_sma_{w}"] = v.rolling(w, min_periods=w).mean()
        g[f"vol_std_{w}"] = v.rolling(w, min_periods=w).std()
        g[f"vol_z_{w}"]   = safe_div(v - g[f"vol_sma_{w}"], g[f"vol_std_{w}"])
        g[f"dollar_vol_sma_{w}"] = (price * v).rolling(w, min_periods=w).mean()
    g["vol_ratio_20"]  = safe_div(v, g["vol_sma_20"])
    g["dollar_volume"] = price * v

    # Cumulative / long-horizon returns
    for w in [21, 63, 126, 252]:
        g[f"cumret_{w}"] = roc(price, w)

    # VWAP ratios
    if "vwap" in g.columns:
        g["close_over_vwap"] = safe_div(c, vwap)
        g["high_over_vwap"]  = safe_div(h, vwap)
        g["low_over_vwap"]   = safe_div(l, vwap)
        g["vwap_spread"]     = safe_div(c - vwap, vwap)

    # Market-cap scaled features
    if "market_cap" in g.columns:
        g["log_mcap"] = log_mcap
        g["ret1_over_logmcap"]   = safe_div(g["ret_1d"], g["log_mcap"])
        g["vol21_over_logmcap"]  = safe_div(g.get("vol_21"), g["log_mcap"])
        g["atr14_over_logmcap"]  = safe_div(g["atr_14"], g["log_mcap"])
        g["dollarvol_over_mcap"] = safe_div(g["dollar_volume"], mcap)
        g["turnover_approx"]     = safe_div(v * price, mcap)
        if "vwap" in g.columns:
            g["vwap_over_logmcap"] = safe_div(vwap, g["log_mcap"])

    # Calendar (weekday)
    dow = g["date"].dt.weekday
    g["dow_sin"] = np.sin(2*np.pi*dow/7.0)
    g["dow_cos"] = np.cos(2*np.pi*dow/7.0)

    return g

# ------------------------------- main -------------------------------

# Base columns we consider STOCK-SPECIFIC (from your list)
STOCK_BASE_COLS = {
    # --- strict stock OHLCV ---
    "ticker","date","market_cap","open","high","low","close","adj_close",
    "volume","unadjusted_volume","change","change_percent","vwap",
    "change_over_time",

    # --- ratings & fundamentals ---
    "rating",
    "overall_score","discounted_cashflow_score",
    "return_on_equity_score","return_on_assets_score",
    "debt_to_equity_score","price_to_earnings_score","price_to_book_score",

    # --- sector/industry descriptors ---
    "sector","industry","sector_key","industry_key",
    "sector_average_change","sector_pe",
    "industry_average_change","industry_pe",
}

# Everything else (macro indices/FX/commodities/volumes, sector/industry aggregates) will be dropped.

def main():
    # 1) Load & normalize columns (Parquet)
    df = pd.read_parquet(IN_PATH)
    df = snake_case_columns(df)

    # robust datetime parse
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    bad = df["date"].isna().sum()
    if bad:
        print(f"Dropping {bad} row(s) with unparsable date.")
        df = df.dropna(subset=["date"])

    # strip timezone if present
    if getattr(df["date"].dt, "tz", None) is not None:
        df["date"] = df["date"].dt.tz_localize(None)

    # unify a few common aliases
    rename_map = {
        "adjclose": "adj_close",
        "unadjustedvolume": "unadjusted_volume",
        "changepercent": "change_percent",
        "changeovertime": "change_over_time",
        "overallscore": "overall_score",
        "discountedcashflowscore": "discounted_cashflow_score",
        "returnonequityscore": "return_on_equity_score",
        "returnonassetsscore": "return_on_assets_score",
        "debttoequityscore": "debt_to_equity_score",
        "pricetoearningsscore": "price_to_earnings_score",
        "pricetobookscore": "price_to_book_score",
        "marketcap": "market_cap",
    }
    df = df.rename(columns=rename_map)

    # 2) Drop anything that is NOT stock-specific from your provided list
    available_stock_cols = [c for c in df.columns if c in STOCK_BASE_COLS]
    df = df[available_stock_cols].copy()

    # 3) drop 'label' if it slipped in (prevent leakage)
    if "label" in df.columns:
        df = df.drop(columns=["label"])

    # 4) encodings: rating + sector/industry keys (keep encoders as requested)
    if "rating" in df.columns:
        df["rating_num"] = encode_rating(df["rating"])
        df.drop(columns=["rating"], inplace=True)

    for cat_col in ["sector", "industry", "sector_key", "industry_key"]:
        if cat_col in df.columns:
            df[f"{cat_col}_code"] = encode_as_int(df[cat_col], start_at_one=True)
            df.drop(columns=[cat_col], inplace=True)

    # 5) sort/dedupe rows by (ticker, date)
    if "ticker" not in df.columns:
        raise ValueError("Expected 'ticker' column is missing after filtering.")
    df["ticker"] = df["ticker"].astype("string")
    df = df.sort_values(["ticker", "date"]).drop_duplicates(["ticker","date"], keep="last")

    # 6) build per-ticker STOCK-ONLY technical features
    feats = (df.groupby("ticker", group_keys=False)
               .apply(build_features_per_ticker)
               .reset_index(drop=True))

    # 7) next-day target (stock-only, safe)
    feats = feats.sort_values(["ticker","date"])
    feats["y_ret_next"] = feats.groupby("ticker")["close"].transform(lambda s: s.pct_change().shift(-0)).shift(-1)

    # 8) final cleanup: drop rows with any NaNs (from rolling ops)
    before = len(feats)
    feats = feats.dropna().reset_index(drop=True)
    print(f"Dropped {before - len(feats):,} rows due to NaNs from rolling windows.")

    # 9) downcast numerics for size
    for col in feats.select_dtypes(include=["float64"]).columns:
        feats[col] = feats[col].astype("float32")
    for col in feats.select_dtypes(include=["int64"]).columns:
        feats[col] = pd.to_numeric(feats[col], downcast="integer")

    # 10) save ONE final file (Parquet)
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    feats.to_parquet(OUT_PATH, engine="pyarrow", compression="snappy", index=False)
    print(f"Saved features: {OUT_PATH} | rows={len(feats):,} cols={feats.shape[1]}")

if __name__ == "__main__":
    main()


  .apply(build_features_per_ticker)


Dropped 65,631 rows due to NaNs from rolling windows.
Saved features: features_lgbm_final.parquet | rows=680,355 cols=118


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import shap

# ---------- PARAMETERS ----------
FILE_PATH = "features_lgbm_final.parquet"
TARGET = "y_ret_next"
DATE_COL = "date"
TICKER_COL = "ticker"

TOP_K = 30               # how many to select in each selection stage
TRAIN_YEARS = 1
TEST_YEARS = 1
LAG_STEP = 7
MAX_LAG = 120

# Whether to include *all original base features* in the final parquet
INCLUDE_ALL_BASE_FEATURES = True

# ---------- LOAD DATA ----------
df = pd.read_parquet(FILE_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
df = df.sort_values([TICKER_COL, DATE_COL]).reset_index(drop=True)

# identify feature columns (original/base features, non-lagged)
ignore_cols = {TICKER_COL, DATE_COL, TARGET}
X_cols = [c for c in df.columns if c not in ignore_cols]

# ---------- HELPER: rolling window importance ----------
def compute_feature_importance(data, x_cols, target, date_col, train_years=1, test_years=1):
    start_date = data[date_col].min()
    end_date = data[date_col].max()
    feature_importance = pd.Series(0.0, index=x_cols, dtype=float)
    n_windows = 0

    current_start = start_date
    # slide by TEST_YEARS each time: [start, start+train)->train, next year -> test
    while current_start + pd.DateOffset(years=train_years + test_years) <= end_date:
        train_end = current_start + pd.DateOffset(years=train_years)
        test_end = train_end + pd.DateOffset(years=test_years)

        train = data[(data[date_col] >= current_start) & (data[date_col] < train_end)]
        test = data[(data[date_col] >= train_end) & (data[date_col] < test_end)]

        # basic guardrails
        if len(train) < 1000 or len(test) < 100:
            current_start += pd.DateOffset(years=test_years)
            continue

        X_train = train[x_cols]
        y_train = train[target]

        model = lgb.LGBMRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            force_col_wise=True,
        )
        model.fit(X_train, y_train)

        # SHAP importance on TRAIN ONLY (no leakage)
        explainer = shap.TreeExplainer(model)
        shap_vals = explainer.shap_values(X_train, check_additivity=False)
        imp = np.abs(shap_vals).mean(axis=0)

        feature_importance = feature_importance.add(
            pd.Series(imp, index=x_cols, dtype=float), fill_value=0.0
        )
        n_windows += 1
        current_start += pd.DateOffset(years=test_years)

    if n_windows == 0:
        raise ValueError("No rolling windows computed (not enough data per window).")

    return (feature_importance / n_windows).sort_values(ascending=False)

# ---------- STEP 1: get top features on original/base set ----------
imp1 = compute_feature_importance(df, X_cols, TARGET, DATE_COL, TRAIN_YEARS, TEST_YEARS)
topk_base = imp1.head(TOP_K).index.tolist()
print(f"[INFO] First-pass TOP-{TOP_K} base features: {topk_base}")

# ---------- STEP 2: build lagged dataset for those TOP-K features ----------
lagged_df = df[[TICKER_COL, DATE_COL, TARGET]].copy()
for col in topk_base:
    # create 7, 14, ..., MAX_LAG
    for lag in range(LAG_STEP, MAX_LAG + 1, LAG_STEP):
        lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)

# drop rows with NaNs introduced by lagging
lagged_df = lagged_df.dropna().reset_index(drop=True)

lag_cols = [c for c in lagged_df.columns if c not in [TICKER_COL, DATE_COL, TARGET]]

# ---------- STEP 3: re-rank importance on lagged set, pick final TOP-K lagged ----------
imp2 = compute_feature_importance(lagged_df, lag_cols, TARGET, DATE_COL, TRAIN_YEARS, TEST_YEARS)
final_topk_lagged = imp2.head(TOP_K).index.tolist()
print(f"[INFO] Second-pass TOP-{TOP_K} lagged features: {final_topk_lagged}")

# ---------- STEP 4: build final output with requested columns ----------
# Start from lagged_df (already trimmed by valid lags)
final_cols = [TICKER_COL, DATE_COL, TARGET]

# Include all original/base features if requested (non-lagged)
if INCLUDE_ALL_BASE_FEATURES:
    base_df = df[[TICKER_COL, DATE_COL] + X_cols].copy()
    # Merge to align rows to the (shorter) lagged_df index by keys
    out_df = lagged_df.merge(base_df, on=[TICKER_COL, DATE_COL], how="left")
    # Add only the final top-k lagged features (avoid dumping all lagged)
    out_df = out_df[final_cols + X_cols + final_topk_lagged]
else:
    # Include only the final top-k lagged features (original behavior)
    out_df = lagged_df[final_cols + final_topk_lagged]

out_df.to_parquet("final_topk_with_all_base.parquet", index=False)
print(f"[DONE] Saved: final_topk_with_all_base.parquet")
print(f"[SHAPE] {out_df.shape[0]:,} rows x {out_df.shape[1]:,} cols")


[LightGBM] [Info] Total Bins 35286
[LightGBM] [Info] Number of data points in the train set: 26263, number of used features: 159
[LightGBM] [Info] Start training from score -0.000623
[LightGBM] [Info] Total Bins 35251
[LightGBM] [Info] Number of data points in the train set: 27950, number of used features: 159
[LightGBM] [Info] Start training from score 0.000312
[LightGBM] [Info] Total Bins 35246
[LightGBM] [Info] Number of data points in the train set: 29728, number of used features: 159
[LightGBM] [Info] Start training from score 0.001713
[LightGBM] [Info] Total Bins 35272
[LightGBM] [Info] Number of data points in the train set: 30356, number of used features: 159
[LightGBM] [Info] Start training from score 0.001436
[LightGBM] [Info] Total Bins 35325
[LightGBM] [Info] Number of data points in the train set: 31504, number of used features: 159
[LightGBM] [Info] Start training from score 0.000415
[LightGBM] [Info] Total Bins 35249
[LightGBM] [Info] Number of data points in the train s

  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shift(lag)
  lagged_df[f"{col}_lag{lag}"] = df.groupby(TICKER_COL, sort=False)[col].shi

[LightGBM] [Info] Total Bins 111817
[LightGBM] [Info] Number of data points in the train set: 26385, number of used features: 510
[LightGBM] [Info] Start training from score -0.001101
[LightGBM] [Info] Total Bins 110451
[LightGBM] [Info] Number of data points in the train set: 28088, number of used features: 510
[LightGBM] [Info] Start training from score 0.002495
[LightGBM] [Info] Total Bins 111317
[LightGBM] [Info] Number of data points in the train set: 29873, number of used features: 510
[LightGBM] [Info] Start training from score 0.001462
[LightGBM] [Info] Total Bins 109398
[LightGBM] [Info] Number of data points in the train set: 29812, number of used features: 510
[LightGBM] [Info] Start training from score 0.000653
[LightGBM] [Info] Total Bins 111626
[LightGBM] [Info] Number of data points in the train set: 31499, number of used features: 510
[LightGBM] [Info] Start training from score 0.000835
[LightGBM] [Info] Total Bins 110028
[LightGBM] [Info] Number of data points in the t

In [None]:
import pandas as pd
df_final = pd.read_parquet("final_topk_with_all_base.parquet")
print(df_final.shape)
print(df_final.columns.tolist())

(650939, 194)
['ticker', 'date', 'y_ret_next', 'market_cap', 'open', 'high', 'low', 'close', 'adj_close', 'volume', 'unadjusted_volume', 'change', 'change_percent', 'vwap', 'change_over_time', 'overall_score', 'discounted_cashflow_score', 'return_on_equity_score', 'return_on_assets_score', 'debt_to_equity_score', 'price_to_earnings_score', 'price_to_book_score', 'sector_average_change', 'sector_pe', 'industry_average_change', 'industry_pe', 'price_bzusd', 'price_esusd', 'price_eurusd', 'price_gcusd', 'price_siusd', 'price_dji', 'price_ftse', 'price_gspc', 'price_hsi', 'price_ixic', 'price_n225', 'price_rut', 'price_stoxx50_e', 'price_vix', 'volume_bzusd', 'volume_esusd', 'volume_eurusd', 'volume_gcusd', 'volume_siusd', 'volume_dji', 'volume_gspc', 'volume_hsi', 'volume_ixic', 'volume_n225', 'volume_rut', 'rating_num', 'sector_code', 'industry_code', 'sector_key_code', 'industry_key_code', 'any_war', 'any_pandemic', 'ret_1d', 'ret_log_1d', 'mom_5', 'mom_10', 'mom_21', 'mom_63', 'mom_126