## Y, X corr + collinearity

In [None]:
import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def _add_months(d: datetime, months: int) -> datetime:
    year = d.year + (d.month - 1 + months) // 12
    month = (d.month - 1 + months) % 12 + 1
    day = min(d.day, [31,
                      29 if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0) else 28,
                      31, 30, 31, 30, 31, 31, 30, 31, 30, 31][month - 1])
    return datetime(year, month, day)


def generate_rolling_month_windows(
    start_month: str,
    window_months: int = 4,
    step_months: int = 2,
    n_windows: int = 5
):
    start = datetime.strptime(start_month + "-01", "%Y-%m-%d")
    windows = []
    for i in range(n_windows):
        w_start = _add_months(start, i * step_months)
        w_end = _add_months(w_start, window_months) - timedelta(days=1)
        windows.append((w_start.date(), w_end.date()))
    return windows


def list_xy_files(dataset_dir: str, top: int):
    file_infos = []
    pattern = re.compile(r"(\d{4}-\d{2}-\d{2})_xy_top(\d+)\.h5$")
    for fname in os.listdir(dataset_dir):
        m = pattern.match(fname)
        if not m:
            continue
        date_str, top_str = m.groups()
        if int(top_str) != top:
            continue
        date_obj = datetime.strptime(date_str, "%Y-%m-%d").date()
        path = os.path.join(dataset_dir, fname)
        file_infos.append((date_obj, path))
    file_infos.sort(key=lambda x: x[0])
    return file_infos


def load_xy_for_window(file_infos, start_date, end_date, y_col="y_60m"):
    paths = [p for (d, p) in file_infos if start_date <= d <= end_date]
    if not paths:
        raise ValueError(f"No xy files in window {start_date} ~ {end_date}")

    dfs = []
    for p in sorted(paths):
        df = pd.read_hdf(p, key="xy")
        if y_col not in df.columns:
            raise KeyError(f"{y_col} not found in {p}")
        dfs.append(df)
    full = pd.concat(dfs, ignore_index=True)
    return full


def select_features_for_window(
    df: pd.DataFrame,
    y_col: str = "y_60m",
    x_prefix: str = "x_",
    min_abs_corr_y: float = 0.02,
    max_xx_corr: float = 0.9,
):

    x_cols = [c for c in df.columns if c.startswith(x_prefix)]
    if y_col not in df.columns:
        raise KeyError(f"{y_col} not in DataFrame")

    sub = df[[y_col] + x_cols].dropna()

    nunique = sub[x_cols].nunique()
    x_cols = [c for c in x_cols if nunique[c] > 1]
    sub = sub[[y_col] + x_cols]

    if sub.empty or len(x_cols) == 0:
        raise ValueError("No valid data or features after dropna / constant filtering")

    corr_matrix = sub.corr()
    corr_xy = corr_matrix[y_col].drop(labels=[y_col])

    candidate = corr_xy[ corr_xy.abs() >= min_abs_corr_y ].sort_values(key=np.abs, ascending=False)

    if candidate.empty:
        print("  [WARN] No features satisfy min_abs_corr_y threshold")
        return [], corr_xy

    candidate_features = list(candidate.index)

    selected = []
    selected_set = set()

    corr_xx = corr_matrix.loc[candidate_features, candidate_features]

    for f in candidate_features:
        if not selected:
            selected.append(f)
            selected_set.add(f)
            continue

        max_corr_with_selected = corr_xx.loc[f, selected].abs().max()
        if max_corr_with_selected <= max_xx_corr:
            selected.append(f)
            selected_set.add(f)

    return selected, corr_xy


def run_feature_selection_rolling(
    dataset_dir: str = "data/xy",
    top: int = 30,
    y_col: str = "y_60m",
    start_month: str = "2024-06",
    window_months: int = 4,
    step_months: int = 2,
    n_windows: int = 5,
    min_abs_corr_y: float = 0.02,
    max_xx_corr: float = 0.9,
):
    file_infos = list_xy_files(dataset_dir, top=top)
    if not file_infos:
        raise ValueError(f"No xy files found in {dataset_dir} for top{top}")

    print(f"Found {len(file_infos)} xy files for top{top} in {dataset_dir}")

    windows = generate_rolling_month_windows(
        start_month=start_month,
        window_months=window_months,
        step_months=step_months,
        n_windows=n_windows,
    )

    windows_info = []
    all_corr_xy = {}
    feature_counts = {}

    for idx, (w_start, w_end) in enumerate(windows):
        print(f"\n=== Window {idx+1}/{len(windows)}: {w_start} ~ {w_end} ===")

        try:
            df_window = load_xy_for_window(file_infos, w_start, w_end, y_col=y_col)
        except ValueError as e:
            print("  [SKIP] ", e)
            windows_info.append({
                "window_index": idx,
                "start_date": w_start,
                "end_date": w_end,
                "selected_features": [],
                "corr_xy": pd.Series(dtype=float),
            })
            continue

        print(f"  Loaded {len(df_window):,} rows for this window.")

        selected, corr_xy = select_features_for_window(
            df_window,
            y_col=y_col,
            x_prefix="x_",
            min_abs_corr_y=min_abs_corr_y,
            max_xx_corr=max_xx_corr,
        )
        print(f"  Selected {len(selected)} features (after corr & colinearity).")

        for f, c in corr_xy.items():
            all_corr_xy.setdefault(f, []).append(abs(float(c)))

        for f in selected:
            feature_counts[f] = feature_counts.get(f, 0) + 1

        windows_info.append({
            "window_index": idx,
            "start_date": w_start,
            "end_date": w_end,
            "selected_features": selected,
            "corr_xy": corr_xy,
        })

    all_features = sorted(all_corr_xy.keys())
    records = []
    for f in all_features:
        counts = feature_counts.get(f, 0)
        corr_list = all_corr_xy.get(f, [])
        mean_abs_corr = float(np.mean(corr_list)) if corr_list else 0.0
        records.append({
            "feature": f,
            "selected_windows": counts,
            "selected_ratio": counts / max(1, n_windows),
            "mean_abs_corr": mean_abs_corr,
        })

    summary_df = pd.DataFrame(records).sort_values(
        ["selected_windows", "mean_abs_corr"], ascending=[False, False]
    ).reset_index(drop=True)

    return windows_info, summary_df


windows_info, summary_df = run_feature_selection_rolling(
    dataset_dir="data/xy",
    top=30,
    y_col="y_60m",
    start_month="2024-06",
    window_months=4,
    step_months=2,
    n_windows=5,
    min_abs_corr_y=0.02,
    max_xx_corr=0.9,
)

summary_df.head(30)


In [None]:
import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def _add_months(d: datetime, months: int) -> datetime:
    year = d.year + (d.month - 1 + months) // 12
    month = (d.month - 1 + months) % 12 + 1
    day = min(d.day, [31,
                      29 if year % 4 == 0 and (year % 100 != 0 or year % 400 != 0) else 28,
                      31, 30, 31, 30, 31, 31, 30, 31, 30, 31][month - 1])
    return datetime(year, month, day)


def generate_rolling_month_windows(
    start_month: str,
    window_months: int = 4,
    step_months: int = 2,
    n_windows: int = 5
):
    start = datetime.strptime(start_month + "-01", "%Y-%m-%d")
    windows = []
    for i in range(n_windows):
        w_start = _add_months(start, i * step_months)
        w_end = _add_months(w_start, window_months) - timedelta(days=1)
        windows.append((w_start.date(), w_end.date()))
    return windows


def list_xy_files(dataset_dir: str, top: int):
    file_infos = []
    pattern = re.compile(r"(\d{4}-\d{2}-\d{2})_xy_top(\d+)\.h5$")
    for fname in os.listdir(dataset_dir):
        m = pattern.match(fname)
        if not m:
            continue
        date_str, top_str = m.groups()
        if int(top_str) != top:
            continue
        date_obj = datetime.strptime(date_str, "%Y-%m-%d").date()
        path = os.path.join(dataset_dir, fname)
        file_infos.append((date_obj, path))
    file_infos.sort(key=lambda x: x[0])
    return file_infos


def load_xy_for_window(file_infos, start_date, end_date, y_col="y_60m"):
    paths = [p for (d, p) in file_infos if start_date <= d <= end_date]
    if not paths:
        raise ValueError(f"No xy files in window {start_date} ~ {end_date}")

    dfs = []
    for p in sorted(paths):
        df = pd.read_hdf(p, key="xy")
        if y_col not in df.columns:
            raise KeyError(f"{y_col} not found in {p}")
        dfs.append(df)
    full = pd.concat(dfs, ignore_index=True)
    return full


def select_features_for_window(
    df: pd.DataFrame,
    y_col: str = "y_60m",
    x_prefix: str = "x_",
    min_abs_corr_y: float = 0.02,
    max_xx_corr: float = 0.9,
):


    x_cols = [c for c in df.columns if c.startswith(x_prefix)]
    if y_col not in df.columns:
        raise KeyError(f"{y_col} not in DataFrame")

    sub = df[[y_col] + x_cols].dropna()

    nunique = sub[x_cols].nunique()
    x_cols = [c for c in x_cols if nunique[c] > 1]
    sub = sub[[y_col] + x_cols]

    if sub.empty or len(x_cols) == 0:
        raise ValueError("No valid data or features after dropna / constant filtering")

    corr_matrix = sub.corr()
    corr_xy = corr_matrix[y_col].drop(labels=[y_col])

    candidate = corr_xy[corr_xy.abs() >= min_abs_corr_y].sort_values(
        key=np.abs, ascending=False
    )

    if candidate.empty:
        print("  [WARN] No features satisfy min_abs_corr_y threshold")
        return [], corr_xy

    candidate_features = list(candidate.index)

    selected = []
    selected_set = set()

    corr_xx = corr_matrix.loc[candidate_features, candidate_features]

    for f in candidate_features:
        if not selected:
            selected.append(f)
            selected_set.add(f)
            continue

        max_corr_with_selected = corr_xx.loc[f, selected].abs().max()
        if max_corr_with_selected <= max_xx_corr:
            selected.append(f)
            selected_set.add(f)

    return selected, corr_xy


def run_feature_selection_rolling(
    dataset_dir: str = "data/xy",
    top: int = 30,
    y_col: str = "y_60m",
    start_month: str = "2024-06",
    window_months: int = 4,
    step_months: int = 2,
    n_windows: int = 5,
    min_abs_corr_y: float = 0.02,
    max_xx_corr: float = 0.9,
    sample_frac: float | None = None,
    sample_random_state: int = 42,
):
    file_infos = list_xy_files(dataset_dir, top=top)
    if not file_infos:
        raise ValueError(f"No xy files found in {dataset_dir} for top{top}")

    print(f"Found {len(file_infos)} xy files for top{top} in {dataset_dir}")

    windows = generate_rolling_month_windows(
        start_month=start_month,
        window_months=window_months,
        step_months=step_months,
        n_windows=n_windows,
    )

    windows_info = []
    all_corr_xy = {}
    feature_counts = {}

    for idx, (w_start, w_end) in enumerate(windows):
        print(f"\n=== Window {idx+1}/{len(windows)}: {w_start} ~ {w_end} ===")

        try:
            df_window = load_xy_for_window(file_infos, w_start, w_end, y_col=y_col)
        except ValueError as e:
            print("  [SKIP] ", e)
            windows_info.append({
                "window_index": idx,
                "start_date": w_start,
                "end_date": w_end,
                "selected_features": [],
                "corr_xy": pd.Series(dtype=float),
            })
            continue

        print(f"  Loaded {len(df_window):,} rows for this window.")

        if sample_frac is not None and 0 < sample_frac < 1:
            df_window = df_window.sample(
                frac=sample_frac,
                random_state=sample_random_state
            )
            print(f"  -> Sampled {len(df_window):,} rows (frac={sample_frac})")

        selected, corr_xy = select_features_for_window(
            df_window,
            y_col=y_col,
            x_prefix="x_",
            min_abs_corr_y=min_abs_corr_y,
            max_xx_corr=max_xx_corr,
        )
        print(f"  Selected {len(selected)} features (after corr & colinearity).")

        for f, c in corr_xy.items():
            all_corr_xy.setdefault(f, []).append(abs(float(c)))

        for f in selected:
            feature_counts[f] = feature_counts.get(f, 0) + 1

        windows_info.append({
            "window_index": idx,
            "start_date": w_start,
            "end_date": w_end,
            "selected_features": selected,
            "corr_xy": corr_xy,
        })

    all_features = sorted(all_corr_xy.keys())
    records = []
    for f in all_features:
        counts = feature_counts.get(f, 0)
        corr_list = all_corr_xy.get(f, [])
        mean_abs_corr = float(np.mean(corr_list)) if corr_list else 0.0
        records.append({
            "feature": f,
            "selected_windows": counts,
            "selected_ratio": counts / max(1, n_windows),
            "mean_abs_corr": mean_abs_corr,
        })

    summary_df = pd.DataFrame(records).sort_values(
        ["selected_windows", "mean_abs_corr"], ascending=[False, False]
    ).reset_index(drop=True)

    return windows_info, summary_df

y_col = "y_30m"
universe=50
windows_info, summary_df = run_feature_selection_rolling(
    dataset_dir="data/xy",
    top=universe,
    y_col=y_col,
    start_month="2024-03",
    window_months=4,
    step_months=2,
    n_windows=5,
    min_abs_corr_y=0.005,
    max_xx_corr=0.8,
    sample_frac=0.05,
    sample_random_state=42,
)

summary_df.head(30)


Found 629 xy files for top50 in data/xy

=== Window 1/5: 2024-06-01 ~ 2024-09-30 ===
  Loaded 8,640,000 rows for this window.
  -> Sampled 432,000 rows (frac=0.05)
  Selected 132 features (after corr & colinearity).

=== Window 2/5: 2024-08-01 ~ 2024-11-30 ===
  Loaded 8,784,000 rows for this window.
  -> Sampled 439,200 rows (frac=0.05)
  Selected 146 features (after corr & colinearity).

=== Window 3/5: 2024-10-01 ~ 2025-01-31 ===
  Loaded 8,856,000 rows for this window.
  -> Sampled 442,800 rows (frac=0.05)
  Selected 91 features (after corr & colinearity).

=== Window 4/5: 2024-12-01 ~ 2025-03-31 ===
  Loaded 8,712,000 rows for this window.
  -> Sampled 435,600 rows (frac=0.05)
  Selected 97 features (after corr & colinearity).

=== Window 5/5: 2025-02-01 ~ 2025-05-31 ===
  Loaded 8,640,000 rows for this window.
  -> Sampled 432,000 rows (frac=0.05)
  Selected 116 features (after corr & colinearity).


Unnamed: 0,feature,selected_windows,selected_ratio,mean_abs_corr
0,x_60m_PremiumDiff,5,1.0,0.018628
1,x_30m_PremiumDiff_neut,5,1.0,0.017008
2,x_120m_PremiumDiff_neut,5,1.0,0.017004
3,x_30m_PremiumDiff,5,1.0,0.016968
4,x_240m_PremiumDiff,5,1.0,0.016441
5,x_1m_Premium_neut,5,1.0,0.016146
6,x_60m_PremiumDiff_neut,5,1.0,0.016065
7,x_240m_PremiumDiff_neut,5,1.0,0.015834
8,x_1440m_PremiumDiff_neut,5,1.0,0.014656
9,x_15m_PremiumDiff_neut,5,1.0,0.013004


In [15]:
selected_features = list(summary_df[summary_df['selected_windows'] >= 2]['feature'][:])

In [None]:
out_path = f"{y_col}/top{universe}_example_features_{len(selected_features)}.json"

with open(os.path.join('feature_list',out_path), "w") as f:
    json.dump(f"{selected_features}", f, indent=2)

print(f"Saved {len(selected_features)} features to {out_path}")


Saved 152 features to y_60m/top50_example_features_152.json


In [None]:
import json
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler


def lasso_select_features_for_window(
    df: pd.DataFrame,
    y_col: str,
    feature_list: list[str],
    alphas: np.ndarray | None = None,
):
    cols = [y_col] + feature_list

    sub = df[cols].replace([np.inf, -np.inf], np.nan).dropna()
    if sub.empty:
        raise ValueError("No data left after dropna in lasso_select_features_for_window")

    X = sub[feature_list].to_numpy()
    y = sub[y_col].to_numpy()

    print("  [DEBUG] X has NaN?", np.isnan(X).any())
    print("  [DEBUG] X has non-finite?", ~np.isfinite(X).all())

    if ~np.isfinite(X).all():
        col_bad = ~np.isfinite(X).all(axis=0)
        bad_cols = [f for f, bad in zip(feature_list, col_bad) if bad]
        print("  [DEBUG] Non-finite values in columns:", bad_cols)

        row_bad = ~np.isfinite(X).all(axis=1)
        print("  [DEBUG] Example bad rows (head):")
        print(sub.loc[row_bad].head())

        raise ValueError("X contains non-finite values (inf or NaN)")

    max_abs = np.max(np.abs(X))
    print("  [DEBUG] max |X| =", max_abs)

    if max_abs > 1e6:
        print("  [WARN] X has very large values (>|1e6|). Consider clipping or log-scaling.")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if alphas is None:
        alphas = np.logspace(-4, 0, 30)

    lasso_cv = LassoCV(alphas=alphas, cv=3, random_state=0, n_jobs=-1)
    lasso_cv.fit(X_scaled, y)

    coefs = pd.Series(lasso_cv.coef_, index=feature_list)
    selected = coefs[coefs != 0].sort_values(key=np.abs, ascending=False)

    return selected, coefs, float(lasso_cv.alpha_)


def run_lasso_feature_selection_rolling(
    dataset_dir: str,
    top: int,
    y_col: str,
    candidate_features: list[str],
    start_month: str = "2024-06",
    window_months: int = 4,
    step_months: int = 2,
    n_windows: int = 5,
    sample_frac: float | None = None,
    sample_random_state: int = 42,
    alphas: np.ndarray | None = None,
):
    file_infos = list_xy_files(dataset_dir, top=top)
    if not file_infos:
        raise ValueError(f"No xy files found in {dataset_dir} for top{top}")

    print(f"[LASSO] Found {len(file_infos)} xy files for top{top} in {dataset_dir}")

    windows = generate_rolling_month_windows(
        start_month=start_month,
        window_months=window_months,
        step_months=step_months,
        n_windows=n_windows,
    )

    lasso_windows_info = []
    feature_counts = {}
    feature_abs_coefs = {}

    for idx, (w_start, w_end) in enumerate(windows):
        print(f"\n[LASSO] === Window {idx+1}/{len(windows)}: {w_start} ~ {w_end} ===")

        try:
            df_window = load_xy_for_window(file_infos, w_start, w_end, y_col=y_col)
        except ValueError as e:
            print("  [SKIP] ", e)
            lasso_windows_info.append({
                "window_index": idx,
                "start_date": w_start,
                "end_date": w_end,
                "selected_features": pd.Series(dtype=float),
                "coefs": pd.Series(dtype=float),
                "alpha": None,
            })
            continue

        print(f"  Loaded {len(df_window):,} rows for this window.")

        if sample_frac is not None and 0 < sample_frac < 1:
            df_window = df_window.sample(
                frac=sample_frac,
                random_state=sample_random_state,
            )
            print(f"  -> Sampled {len(df_window):,} rows (frac={sample_frac})")

        try:
            selected, coefs, best_alpha = lasso_select_features_for_window(
                df_window,
                y_col=y_col,
                feature_list=candidate_features,
                alphas=alphas,
            )
        except ValueError as e:
            print("  [SKIP LASSO] ", e)
            lasso_windows_info.append({
                "window_index": idx,
                "start_date": w_start,
                "end_date": w_end,
                "selected_features": pd.Series(dtype=float),
                "coefs": pd.Series(dtype=float),
                "alpha": None,
            })
            continue

        print(f"  LASSO selected {len(selected)} features, alpha={best_alpha:.4g}")

        for f, coef in selected.items():
            feature_counts[f] = feature_counts.get(f, 0) + 1
            feature_abs_coefs.setdefault(f, []).append(abs(float(coef)))

        lasso_windows_info.append({
            "window_index": idx,
            "start_date": w_start,
            "end_date": w_end,
            "selected_features": selected,
            "coefs": coefs,
            "alpha": best_alpha,
        })

    all_features = sorted(set(candidate_features))
    records = []
    for f in all_features:
        counts = feature_counts.get(f, 0)
        abs_list = feature_abs_coefs.get(f, [])
        mean_abs_coef = float(np.mean(abs_list)) if abs_list else 0.0
        records.append({
            "feature": f,
            "selected_windows": counts,
            "selected_ratio": counts / max(1, n_windows),
            "mean_abs_coef": mean_abs_coef,
        })

    lasso_summary_df = pd.DataFrame(records).sort_values(
        ["selected_windows", "mean_abs_coef"], ascending=[False, False]
    ).reset_index(drop=True)

    return lasso_windows_info, lasso_summary_df


In [None]:
candidate_features = summary_df[summary_df["selected_windows"] >= 3]["feature"].tolist()
print(len(candidate_features), "candidate features for LASSO")

lasso_windows_info, lasso_summary_df = run_lasso_feature_selection_rolling(
    candidate_features=candidate_features,
    dataset_dir="data/xy",
    top=30,
    y_col="y_60m",
    start_month="2024-06",
    window_months=4,
    step_months=2,
    n_windows=5,
    sample_frac=0.1,
    sample_random_state=42,
    alphas=None,
)

lasso_summary_df.head(30)


76 candidate features for LASSO
[LASSO] Found 631 xy files for top30 in data/xy

[LASSO] === Window 1/5: 2024-06-01 ~ 2024-09-30 ===
  Loaded 5,184,000 rows for this window.
  -> Sampled 518,400 rows (frac=0.1)
  [DEBUG] X has NaN? False
  [DEBUG] X has non-finite? False
  [DEBUG] max |X| = 92528.445725514
  LASSO selected 48 features, alpha=0.1083

[LASSO] === Window 2/5: 2024-08-01 ~ 2024-11-30 ===
  Loaded 5,270,400 rows for this window.
  -> Sampled 527,040 rows (frac=0.1)
  [DEBUG] X has NaN? False
  [DEBUG] X has non-finite? False
  [DEBUG] max |X| = 107427.55453547179
  LASSO selected 52 features, alpha=0.1083

[LASSO] === Window 3/5: 2024-10-01 ~ 2025-01-31 ===
  Loaded 5,313,600 rows for this window.
  -> Sampled 531,360 rows (frac=0.1)
  [DEBUG] X has NaN? False
  [DEBUG] X has non-finite? False
  [DEBUG] max |X| = 107427.55453547179
  LASSO selected 62 features, alpha=0.0788

[LASSO] === Window 4/5: 2024-12-01 ~ 2025-03-31 ===
  Loaded 5,227,200 rows for this window.
  -> Sa

Unnamed: 0,feature,selected_windows,selected_ratio,mean_abs_coef
0,x_1440m_L2C,5,1.0,2.693279
1,x_720m_O2C_neut,5,1.0,2.465108
2,x_480m_C2VWAP_neut,5,1.0,1.815757
3,x_1440m_L2C_neut,5,1.0,1.081914
4,x_240m_PremiumDiff,5,1.0,0.97337
5,x_240m_EffRatio_neut,5,1.0,0.497688
6,x_480m_EffRatio_neut,5,1.0,0.342031
7,x_480m_OI_Chg,4,0.8,4.720581
8,x_30m_H2L_Vol,4,0.8,3.594052
9,x_120m_PremiumDiff,4,0.8,2.369883


In [69]:
list(lasso_summary_df['feature'][:30])

['x_1440m_Force',
 'x_30m_Force',
 'x_1440m_Force_neut',
 'x_480m_Force',
 'x_720m_Force',
 'x_240m_Force',
 'x_5m_Force',
 'x_60m_Force_neut',
 'x_30m_Force_neut',
 'x_1m_Force',
 'x_240m_Force_neut',
 'x_5m_AvgTrade_neut',
 'x_15m_AvgTrade',
 'x_15m_Force',
 'x_120m_Force',
 'x_120m_Force_neut',
 'x_720m_Force_neut',
 'x_5m_Force_neut',
 'x_1440m_OI_P_Corr_neut',
 'x_120m_Vol_neut',
 'x_1440m_OI_Z',
 'x_120m_NetTaker_neut',
 'x_240m_OI_P_Corr_neut',
 'x_240m_OI_XSkew_neut',
 'x_240m_NetTaker_neut',
 'x_720m_CrowdingPressure_neut',
 'x_240m_CrowdingPressure',
 'x_1440m_AvgTrade',
 'x_60m_Force',
 'x_15m_Force_neut',
 'x_1440m_AvgTrade_neut',
 'x_1m_Force_neut',
 'x_120m_OI_Z_neut',
 'x_1440m_NetTaker',
 'x_720m_NetTaker',
 'x_480m_Force_neut',
 'x_480m_OI_P_Corr_neut',
 'x_60m_VolDiff',
 'x_120m_OI_P_Corr_neut',
 'x_60m_Vol_neut',
 'x_120m_H2C_neut',
 'x_120m_O2C',
 'x_720m_OI_P_Corr_neut',
 'x_720m_RatioSkewDiff_neut',
 'x_60m_H2L_Vol',
 'x_120m_VolDiff',
 'x_720m_OI_Price_Ratio_neut

In [None]:
selected_features = list(lasso_summary_df['feature'][:30])
out_path = f"top{universe}_example_features_lasso_{len(selected_features)}.json"

with open(os.path.join(f'feature_list/{y_col}',out_path), "w") as f:
    json.dump(f"{selected_features}", f, indent=2)

print(f"Saved {len(selected_features)} features to {out_path}")


Saved 30 features to top50_example_features_lasso_30.json
