In [None]:
# orchestrator.py
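# requirements: pip install openai langchain langchain-openai langgraph pandas numpy scikit-learn pyarrow
# (importlib ships with Python and needs no install; pyarrow is one possible engine for pd.read_parquet)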

import os
import re
import ast
import pandas as pd
import numpy as np
from importlib.machinery import SourceFileLoader
import importlib.util
from typing import TypedDict, Annotated, List
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import create_react_agent
from langgraph.pregel import Pregel

from feature_agent import feature_generator, correlation_inspector, load_data
# your existing backtest functions
import sys
sys.path.append(os.path.abspath("../.."))
from backtest import evaluate_regression, evaluate_classification


In [None]:
# --- Loop configuration ---

# The key is read from the environment (empty string when unset) and written
# back, so OPENAI_API_KEY is always defined in os.environ after this cell runs.
API_KEY = os.getenv("OPENAI_API_KEY", "")
os.environ["OPENAI_API_KEY"] = API_KEY

# Chat model plus the ReAct agent that is given the two feature_agent tools.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)
react_agent = create_react_agent(llm, tools=[correlation_inspector, feature_generator])

MAX_ITERATIONS = 5                               # upper bound checked by decision_node
TARGET_COL = "reg_target_lookahead6"             # regression target column
DATA_PATH = "parquet/labeled_data_6NQ.parquet"   # relative path read with pd.read_parquet
CLASS_TARGET_COL = "clf_target_numba_6"          # classification target column

# baseline models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [None]:
def dynamic_import_and_apply(df: pd.DataFrame, iteration: int) -> pd.DataFrame:
    """
    Execute `generated_features_iter_<iteration>.py` from the current working
    directory and run its `apply_features` on a copy of `df`.

    Raises AttributeError when the generated file does not define
    `apply_features`.
    """
    name = f"generated_features_iter_{iteration}"
    source_path = os.path.abspath(f"{name}.py")

    # Forget any previously registered module of the same name.
    sys.modules.pop(name, None)

    # Build the module straight from its source file and execute it.
    module_spec = importlib.util.spec_from_file_location(name, source_path)
    generated = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(generated)

    # The caller's frame is never handed to generated code directly.
    if hasattr(generated, "apply_features"):
        return generated.apply_features(df.copy())
    raise AttributeError(f"{name}.py is missing `apply_features(df)`")

In [None]:
def select_features_by_correlation(
    df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str,
    top_k: int | None = None,
    corr_thresh: float = 0.9
) -> tuple[list[str], pd.Series, list[str]]:
    """
    1) Drop zero‐variance features
    2) Compute absolute Pearson correlation of each feature with target.
    3) Take top_k, then drop pairwise correlations > corr_thresh
    4) Fill any NaNs with 0 so no invalid values remain.
    """
    # --- 1) Drop zero‐variance features ---
    std = df[feature_cols].std()
    non_const = std[std > 0].index.tolist()
    
    # --- 2) Corr with target (abs) ---
    corr = df[non_const].corrwith(df[target_col]).abs()
    corr = corr.dropna().sort_values(ascending=False)
    if top_k is not None:
        corr = corr.head(top_k)

    ranked = list(corr.index)

    # --- 3) Pairwise correlation matrix on the ranked features ---
    #    Use numpy.errstate to suppress warnings just in case
    with np.errstate(divide='ignore', invalid='ignore'):
        mat = df[ranked].corr().abs()
    mat = mat.fillna(0)  # turn any NaNs into 0

    upper = mat.where(np.triu(np.ones(mat.shape), k=1).astype(bool))

    # --- 4) Drop features too correlated with a higher‐ranked one ---
    dropped = [col for col in upper.columns if any(upper[col] > corr_thresh)]
    final  = [f for f in ranked if f not in dropped]

    return final, corr, dropped

def evaluate_performance(df: pd.DataFrame, top_k: int = 20) -> dict:
    """
    Select features per task and score baseline random-forest models.

    1) Pick the top_k features most correlated with the regression target.
    2) Pick the top_k features most predictive of the classification target via ANOVA F-test.
    3) Train separate models on each feature set and report metrics.

    The classification steps are skipped when CLASS_TARGET_COL is not a
    column of `df`; the regression target (TARGET_COL) is required.

    Returns a dict with "regression_features" and "mse", plus
    "classification_features", "f1_score" and "accuracy" when the
    classification target is available.
    """
    metrics = {}
    has_cls_target = CLASS_TARGET_COL in df.columns

    # --- Prepare base X/y ---
    # Numeric candidates (both targets excluded)
    feat_cols = [
        c for c in df.columns
        if pd.api.types.is_numeric_dtype(df[c])
        and c not in [TARGET_COL, CLASS_TARGET_COL]
    ]
    complete_features = df[feat_cols].notna().all(axis=1)

    # Regression target: rows with every feature and the target present
    mask_reg = complete_features & df[TARGET_COL].notna()
    X_reg_full = df.loc[mask_reg, feat_cols]
    y_reg      = df.loc[mask_reg, TARGET_COL]

    # --- 1) Regression feature selection by Pearson correlation ---
    corr = X_reg_full.corrwith(y_reg).abs().sort_values(ascending=False)
    reg_feats = corr.head(top_k).index.tolist()
    metrics["regression_features"] = reg_feats

    # --- 2) Classification feature selection by ANOVA F-test (if present) ---
    if has_cls_target:
        mask_cls   = complete_features & df[CLASS_TARGET_COL].notna()
        X_cls_full = df.loc[mask_cls, feat_cols]
        y_cls      = df.loc[mask_cls, CLASS_TARGET_COL]

        # Never ask SelectKBest for more features than exist.
        k_best = min(top_k, X_cls_full.shape[1])
        skb = SelectKBest(score_func=f_classif, k=k_best)
        # Rows with missing features were masked out above; fillna(0) is a
        # safety net only.
        skb.fit(X_cls_full.fillna(0), y_cls)
        cls_feats = X_cls_full.columns[skb.get_support()].tolist()
        metrics["classification_features"] = cls_feats

    # --- 3) Train & evaluate regression ---
    # NOTE(review): train_test_split shuffles rows by default — confirm that
    # is acceptable for a look-ahead target.
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(
        X_reg_full[reg_feats], y_reg, test_size=0.3, random_state=42
    )
    reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-4, verbose=2)
    reg.fit(Xr_train, yr_train)
    preds_r = reg.predict(Xr_test)
    metrics["mse"] = mean_squared_error(yr_test, preds_r)

    # --- 4) Train & evaluate classification ---
    if has_cls_target:
        Xc_train, Xc_test, yc_train, yc_test = train_test_split(
            X_cls_full[cls_feats].fillna(0), y_cls,
            test_size=0.3, random_state=42, stratify=y_cls
        )
        clf = RandomForestClassifier(
            n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-4, verbose=2
        )
        clf.fit(Xc_train, yc_train)
        preds_c = clf.predict(Xc_test)
        metrics["f1_score"] = f1_score(yc_test, preds_c, average="weighted")
        metrics["accuracy"] = accuracy_score(yc_test, preds_c)

    return metrics

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from typing import Dict, List, Optional, Tuple


def _clean_and_split(
    df: pd.DataFrame,
    target_col: str,
    drop_cols: List[str]
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Select numeric columns, drop infinities/NaNs, then split into X (features) and y (target).
    drop_cols are removed from X.
    """
    # 1) Numeric only
    num = df.select_dtypes(include="number").copy()
    # 2) Replace infinities → NaN
    num.replace([np.inf, -np.inf], np.nan, inplace=True)
    # 3) Drop rows with any NaNs in num or target
    if target_col not in num:
        raise ValueError(f"Target column '{target_col}' not in DataFrame")
    subset = num.dropna(subset=[target_col])
    subset = subset.dropna(axis=1, how="all")
    # 4) Split X/y
    y = subset[target_col]
    X = subset.drop(columns=[target_col] + drop_cols, errors="ignore")
    return X, y

def evaluate_two_pipelines(
    df_reg: pd.DataFrame,
    df_cls: pd.DataFrame,
    target_reg: str,
    target_cls: Optional[str] = None
) -> Dict[str, any]:
    """
    Fit and score one random forest per task.

    * Regression: trained on `df_reg` to predict `target_reg`; the
      classification target, when given, is excluded from the features.
    * Classification: trained on `df_cls` to predict `target_cls`, only when
      `target_cls` is given and is a column of `df_cls`; the regression
      target is excluded from the features.

    Both use a 70/30 split with a fixed seed. The returned dict holds
    "mse", "r2_score", "regression_features" and, when the classifier ran,
    "f1_score", "accuracy", "classification_features".
    """
    results: Dict[str, any] = {}

    # ---- Regression pipeline ----
    reg_exclusions = [] if not target_cls else [target_cls]
    reg_X, reg_y = _clean_and_split(df_reg, target_col=target_reg, drop_cols=reg_exclusions)
    reg_X_train, reg_X_test, reg_y_train, reg_y_test = train_test_split(
        reg_X, reg_y, test_size=0.3, random_state=42
    )
    regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-4, verbose=2)
    regressor.fit(reg_X_train, reg_y_train)
    reg_pred = regressor.predict(reg_X_test)

    results["mse"] = mean_squared_error(reg_y_test, reg_pred)
    results["r2_score"] = r2_score(reg_y_test, reg_pred)
    results["regression_features"] = reg_X.columns.tolist()

    # ---- Classification pipeline (optional) ----
    if not target_cls or target_cls not in df_cls.columns:
        return results

    cls_X, cls_y = _clean_and_split(df_cls, target_col=target_cls, drop_cols=[target_reg])
    cls_X_train, cls_X_test, cls_y_train, cls_y_test = train_test_split(
        cls_X, cls_y, test_size=0.3, random_state=42, stratify=cls_y
    )
    classifier = RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42, verbose=2, n_jobs=-4
    )
    classifier.fit(cls_X_train, cls_y_train)
    cls_pred = classifier.predict(cls_X_test)

    results["f1_score"] = f1_score(cls_y_test, cls_pred, average="weighted")
    results["accuracy"] = accuracy_score(cls_y_test, cls_pred)
    results["classification_features"] = cls_X.columns.tolist()
    return results


In [None]:
def prune_features(
    df: pd.DataFrame,
    missing_thresh: float = 0.2,
    variance_thresh: float = 0.01,
    corr_thresh: float = 0.9,
    preserve: list[str] | None = None
) -> tuple[pd.DataFrame, list[str]]:
    """
    Drop bad columns—but never drop anything in `preserve`.
    """
    if preserve is None:
        preserve = []
    preserve = set(preserve)

    # 1) Missing‐value drop
    miss_frac   = df.isna().mean()
    drop_missing = {c for c, f in miss_frac.items() if f > missing_thresh}

    # 2) Low‐variance drop
    numeric     = df.select_dtypes(include="number")
    selector    = VarianceThreshold(variance_thresh)
    selector.fit(numeric.dropna())
    low_var     = set(numeric.columns[~selector.get_support()])

    # 3) High‐correlation drop
    corr        = numeric.corr().abs().fillna(0)
    upper       = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    drop_corr   = {c for c in upper.columns if any(upper[c] > corr_thresh)}

    # 4) Combine, but never drop anything in preserve
    to_drop = list((drop_missing | low_var | drop_corr) - preserve)
    pruned  = df.drop(columns=to_drop)
    return pruned, to_drop

def prune_for_task(
    df: pd.DataFrame,
    preserve_cols: list[str],
    missing_thresh: float = 0.2,
    variance_thresh: float = 0.01,
    corr_thresh: float = 0.9
) -> tuple[pd.DataFrame, list[str]]:
    """
    Task-level entry point for pruning: forwards every threshold to
    `prune_features` and protects the columns listed in `preserve_cols`.

    Returns the pruned DataFrame together with the list of dropped columns.
    """
    thresholds = {
        "missing_thresh": missing_thresh,
        "variance_thresh": variance_thresh,
        "corr_thresh": corr_thresh,
    }
    kept_frame, removed = prune_features(df, preserve=preserve_cols, **thresholds)
    return kept_frame, removed

In [None]:
class AgentState(TypedDict):
    """
    Shared state passed between the loop's node functions.

    Every key that the nodes in this file read or return is declared here,
    so the schema matches what the nodes actually exchange.
    """
    iteration: int            # 1-based pass counter, advanced by decision_node
    task: str                 # free-text task description set by main_loop
    feedback: str | None      # message from the last validation, None on success
    generated_code: str       # source text produced by code_generation_node
    metrics: dict             # latest evaluation metrics
    df: pd.DataFrame          # working frame, replaced after each pruning step
    recent_kept: List[str]    # columns that survived pruning in the last pass
    df_reg: pd.DataFrame      # frame pruned for the regression pipeline
    df_cls: pd.DataFrame | None  # frame pruned for classification, None when unused
    next_step: str            # routing hint returned by decision_node
    # The 'agent_outcome' field is used by create_react_agent
    agent_outcome: Annotated[List[BaseMessage], lambda x, y: x + y]

# --- LangGraph Nodes ---

def code_generation_node(state: AgentState):
    """
    Ask the ReAct agent for a new `apply_features(df)` implementation.

    The prompt lists the current columns, the features kept on the previous
    pass (`recent_kept`) and, when present, the feedback from the last
    validation. If `state` carries no DataFrame it is loaded from DATA_PATH
    and stored back into `state`.

    Returns:
        {"generated_code": <python source with markdown fences removed>}
    """
    # ─── Guard: pull df in if Pregel didn’t pass it ───────────────────────
    if "df" not in state or state["df"] is None:
        print("ℹ️ [generate] no df in state, loading from DATA_PATH")
        state["df"] = pd.read_parquet(DATA_PATH)
    # ─────────────────────────────────────────────────────────────────────

    print(f"\n--- Iteration {state['iteration']}: Generating Code ---")
    print("   state keys:", list(state.keys()))
    existing = list(state["df"].columns)
    previous = state.get("recent_kept", [])
    # NOTE(review): state["task"] is not used when building this prompt.
    prompt = (
        f"Your DataFrame currently has {len(existing)} columns: {existing} …\n"
        # The trailing newline keeps this sentence from running into the next one.
        f"On the last iteration you _kept_ these {len(previous)} features: {previous}\n"
        "We prune any feature with variance ≤0.01 or correlation ≥0.9.\n"
        "Generate 5 brand-new features with variance >0.01, corr <0.9 vs every existing column,\n"
        "don't just name them features_n, have a name for it to be easy to recognize, \n"
        "and with names not already in the DataFrame.\n"
        "Return only the raw Python code for `def apply_features(df):` (no markdown fences)."
    )
    if state.get("feedback"):
        prompt += "\n\nPrevious feedback:\n" + state["feedback"]

    agent_outcome = react_agent.invoke({"messages": [HumanMessage(content=prompt)]})
    raw = agent_outcome["messages"][-1].content
    if not isinstance(raw, str):
        # Non-string message content would make the regex calls below raise;
        # stringify it and let the syntax check downstream report the problem.
        raw = str(raw)

    # Prefer the body of a complete fenced block, which discards any prose
    # the model wrapped around it; otherwise just strip stray fence markers.
    fenced = re.search(r"```(?:python|py)?[ \t]*\n(.*?)```", raw, flags=re.DOTALL)
    if fenced:
        code = fenced.group(1).strip()
    else:
        code = re.sub(r"^```(?:python)?\s*|\s*```$", "", raw.strip(), flags=re.MULTILINE)
    return {"generated_code": code}

In [None]:
def code_validation_and_application_node(state: AgentState):
    """
    Validate the generated code, run it on the current frame and prune the
    result.

    Flow:
      1. load the DataFrame from DATA_PATH if `state` has none;
      2. syntax-check the code with `ast.parse`;
      3. write it to `generated_features_iter_<iteration>.py`;
      4. import that file and call its `apply_features` on a copy of the frame;
      5. prune the new frame while preserving every pre-existing column;
      6. report which new columns survived.

    Any failure in steps 2 or 4 (syntax error, missing `apply_features`,
    exception raised by the generated code, non-DataFrame result) is
    returned as {"feedback": <message>} so the next generation can react to
    it, instead of aborting the loop.

    On completion returns {"feedback", "df", "recent_kept"}; `feedback` is
    None when at least one new column was kept.
    """
    print("--- Validating and Applying Code ---")

    # 1) Bootstrap df if missing
    if "df" not in state or state["df"] is None:
        state["df"] = pd.read_parquet(DATA_PATH)
    df_orig   = state["df"]
    code      = state.get("generated_code") or ""
    iteration = state["iteration"]
    module_name = f"generated_features_iter_{iteration}"
    file_path   = os.path.abspath(f"{module_name}.py")

    # 2) Syntax-check the generated code
    try:
        ast.parse(code)
    except SyntaxError as e:
        return {"feedback": f"Syntax error: {e.msg} (line {e.lineno})"}

    # 3) Write the code to disk so dynamic_import_and_apply can find it
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("# Auto-generated by code_generation_node\n")
        f.write("import pandas as pd, numpy as np\n\n")
        f.write(code)

    print(f"✏️  Wrote {len(code)} characters to {file_path}")

    # 4) Load & apply
    # SECURITY: this executes model-generated code in the current process.
    # The broad except is intentional: whatever the generated code raises is
    # turned into feedback for the next attempt.
    try:
        df_new = dynamic_import_and_apply(df_orig, iteration)
    except Exception as exc:
        fb = f"Generated code failed at runtime: {type(exc).__name__}: {exc}"
        print("⚠️ ", fb)
        return {"feedback": fb}
    if not isinstance(df_new, pd.DataFrame):
        fb = (
            "apply_features(df) must return a pandas DataFrame, "
            f"got {type(df_new).__name__}."
        )
        print("⚠️ ", fb)
        return {"feedback": fb}

    added_raw = set(df_new.columns) - set(df_orig.columns)
    print("🔍 Raw added columns:", added_raw)

    # 5) Prune — only columns introduced by this pass may be dropped
    base_cols   = ["open","high","low","close","volume", TARGET_COL]
    kept_so_far = list(df_orig.columns)  # after previous iteration
    df_pruned, dropped = prune_features(
        df_new,
        missing_thresh=0.2,
        variance_thresh=0.01,
        corr_thresh=0.9,
        preserve=base_cols + kept_so_far
    )
    print("🗑️  Dropped features:", dropped)

    # Persist on the caller's state object as well as in the returned update
    state["df"] = df_pruned

    # 6) Decide what to return
    added_final = set(df_pruned.columns) - set(df_orig.columns)
    if added_final:
        print("🎉 Kept new columns:", added_final)
        return {
            "feedback": None,
            "df":       df_pruned,
            "recent_kept": list(added_final)
        }
    else:
        fb = "No net new columns after pruning."
        print("⚠️ ", fb)
        return {
            "feedback": fb,
            "df":       df_pruned,
            "recent_kept": list(added_final)
        }

def model_evaluation_node(state: AgentState):
    """
    Prune the working frame once per task and score both pipelines.

    The classification pipeline only takes part when CLASS_TARGET_COL is a
    string naming a column of the frame. Returns the metrics along with the
    task-specific frames ("df_reg", and "df_cls" or None).
    """
    master = state["df"]

    # Resolve the classification target, if the frame actually has it.
    cls_available = isinstance(CLASS_TARGET_COL, str) and CLASS_TARGET_COL in master.columns
    cls_target = CLASS_TARGET_COL if cls_available else None

    # Columns the pruning step must never remove.
    protected = ["open", "high", "low", "close", "volume", TARGET_COL]
    if cls_target:
        protected.append(cls_target)

    # Regression frame
    df_reg, dropped_reg = prune_for_task(master.copy(), preserve_cols=protected)
    print("🗑️  Dropped from regression pipeline:", dropped_reg)

    # Classification frame (an empty placeholder when there is no target)
    if cls_target:
        df_cls, dropped_cls = prune_for_task(master.copy(), preserve_cols=protected)
        print("🗑️  Dropped from classification pipeline:", dropped_cls)
    else:
        df_cls, dropped_cls = pd.DataFrame(), []

    # Score both pipelines
    metrics = evaluate_two_pipelines(df_reg, df_cls, target_reg=TARGET_COL, target_cls=cls_target)
    print("Evaluation Metrics:", metrics)

    result = {"metrics": metrics, "df_reg": df_reg}
    result["df_cls"] = df_cls if cls_target else None
    return result

def decision_node(state: AgentState):
    """
    Decide whether the loop runs another pass.

    Returns END once `state["iteration"]` has reached MAX_ITERATIONS;
    otherwise returns an update that advances the iteration counter and
    routes back to code generation.

    The iteration cap is checked first, so a run that keeps producing
    feedback is bounded as well (previously the feedback branch returned
    before the cap was examined).
    """
    print("--- Making a Decision ---")
    current = state["iteration"]

    # Max-iters check applies whether or not feedback is pending
    if current >= MAX_ITERATIONS:
        print("✅ Reached max iterations → done")
        return END

    next_i = current + 1
    if state.get("feedback"):
        # Feedback ⇒ retry on the next iteration
        print(f"⚠️  Feedback present, moving to iteration {next_i}")
    else:
        print(f"🔄 No feedback → iteration {next_i}")
    return {"iteration": next_i, "next_step": "generate_code"}

In [None]:
from typing import Any, Dict

def main_loop(max_iters: int = 10):
    """
    Drive the generate → validate → evaluate → decide cycle with a plain dict
    as state.

    Each pass records `mse_improvement` (previous MSE minus current MSE, or
    None on the first evaluated pass) in `state["metrics"]`. The loop ends
    when `max_iters` passes have run, when `decision_node` returns END, or
    when — from the third iteration on — a pass that produced no feedback
    fails to lower the MSE.

    Returns the final state dict.
    """
    # 1) Bootstrap once
    state: Dict[str, Any] = {
        "iteration": 1,
        "task":     "Generate 5 new features to predict the target variable. \n"
                    "don't just name them features_n, have a name for it to be easy to recognize, \n"
                    "Focus on interaction terms and rolling statistics.",
        "feedback":  None,
        "df":        pd.read_parquet(DATA_PATH),
        "metrics":   {},
    }

    prev_mse = None

    for _ in range(max_iters):
        print(f"Initial feature count {len(state['df'].columns)}")
        print(f"\n🚀 Iteration {state['iteration']}")

        # 2) Generate code
        out = code_generation_node(state)
        state.update(out)
        print("  ✔ code_generation_node")

        # 3) Validate & apply
        out = code_validation_and_application_node(state)
        state.update(out)
        print("  ✔ code_validation_and_application_node "
              f"(features now {len(state['df'].columns)})")

        # 4) Evaluate
        out = model_evaluation_node(state)
        state.update(out)
        curr_mse = state["metrics"].get("mse")
        print("  ✔ model_evaluation_node "
              f"(metrics={state['metrics']})")

        if prev_mse is not None and curr_mse is not None:
            imp = prev_mse - curr_mse
            print(f"    ↳ mse_improvement = {imp:.3e}")
        else:
            imp = None
            print("    ↳ first pass, no mse_improvement")
        state["metrics"]["mse_improvement"] = imp

        # Remember this pass's score so the next pass has a baseline.
        if curr_mse is not None:
            prev_mse = curr_mse

        # 5) Decide next step
        out = decision_node(state)
        if out is END:
            print("🛑 decision_node returned END. Exiting.")
            break
        state.update(out)
        print(f"  ✔ decision_node → next iteration={state['iteration']}")

        # 6) Early-stop if no improvement. `imp` is None on the first
        #    evaluated pass, and a pass that ended with feedback added no
        #    usable features, so neither counts as a stall.
        stalled = imp is not None and imp <= 0 and not state.get("feedback")
        if state["iteration"] > 2 and stalled:
            print("⚠️  No improvement after 2+ iterations; stopping.")
            break

    print("\n✅ Final feature count:", len(state["df"].columns))
    print("Columns:", state["df"].columns.tolist())
    return state

# Entry point: run the loop for at most five passes when this file is executed directly.
if __name__ == "__main__":
    main_loop(max_iters=5)