In [2]:
# REQUIREMENTS:
# pip install pandas numpy scikit-learn streamlit plotly

import pandas as pd
import numpy as np
import pickle
from pathlib import Path

# -----------------------
# Config (tweak as needed)
# -----------------------
# Input files. Both CSVs must contain a "month" column (parsed as dates below);
# the prices file must also provide ticker/open/close/volume columns.
FEATURES_CSV = "step5_lagged_data.csv"     # lagged features & label by month (t row built from t-1 features)
PRICES_CSV   = "prices_monthly.csv"        # unlagged execution prices month t open/close
MODEL_PKL    = "ensemble_model_0p5_0p5.pkl"  # pickled classifier exposing predict_proba

# Both bounds are compared against monthly periods with >= / <=.
START_MONTH  = "2020-01"                   # backtest window (inclusive)
END_MONTH    = "2025-10"                   # backtest window (inclusive)

# Composite weights: composite = PROB_W*proba + VOLUME_W*volume_score + VOL_W*volatility_score
PROB_W   = 0.60
VOLUME_W = 0.30
VOL_W    = 0.10

# Filters and caps
PROBA_MIN        = 0.50      # rows with proba below this are dropped
VOLUME_THRESHOLD = 50_000    # rows with volume below this are dropped (when a volume column exists)
TOP_N            = 50        # number of top-composite names kept each month
MAX_ALLOC        = 0.20      # upper bound applied when clipping raw weights

# Costs (bps, per side) - override in the UI later
FEE_BPS_PER_SIDE      = 10.0
SLIPPAGE_BPS_PER_SIDE = 20.0

# Outputs (written at the end of the run)
ALLOC_CSV    = Path("allocations.csv")       # one row per month/ticker position
BACKTEST_CSV = Path("backtest_monthly.csv")  # one row per backtest month

# -----------------------
# Load data & model
# -----------------------
# Both frames get a monthly Period column so rows can be matched by calendar
# month regardless of the day-of-month stored in the CSV.
df = pd.read_csv(FEATURES_CSV, parse_dates=["month"])
df["month_period"] = df["month"].dt.to_period("M")

prices = pd.read_csv(PRICES_CSV, parse_dates=["month"])
prices["month_period"] = prices["month"].dt.to_period("M")

# SECURITY: unpickling executes arbitrary code embedded in the file. Only load
# model files that were produced by a trusted training run.
with open(MODEL_PKL, "rb") as f:
    model = pickle.load(f)

# Determine feature set the model expects
if hasattr(model, "feature_names_in_"):
    feat_cols = list(model.feature_names_in_)
    # Fail fast instead of silently dropping columns: scoring with a subset of
    # the training features either errors later with a less helpful message or
    # feeds the model a misaligned matrix.
    missing_feats = [c for c in feat_cols if c not in df.columns]
    if missing_feats:
        raise KeyError(
            f"{FEATURES_CSV} is missing {len(missing_feats)} feature column(s) "
            f"expected by the model: {missing_feats}"
        )
else:
    # Fallback: all numeric columns except the target
    numeric = df.select_dtypes(include="number").columns.tolist()
    feat_cols = [c for c in numeric if c != "direction"]

# -----------------------
# Helper: month list
# -----------------------
# Distinct monthly periods present in the feature data that fall inside the
# inclusive [START_MONTH, END_MONTH] window, in ascending order.
_period_col = df["month_period"]
_in_window = (_period_col >= START_MONTH) & (_period_col <= END_MONTH)
months = sorted(_period_col[_in_window].unique())

# -----------------------
# Backtest
# -----------------------
def _inverse_risk_score(frame, col):
    """Return 1 / (1 + frame[col]) per row, so a lower level scores higher.

    Missing values are replaced by the in-month median of the column before
    scoring; a single NaN would otherwise flow into the composite, the
    weights and finally the equity curve.  When the column is absent (or is
    entirely NaN) a neutral level of 1.0 is used, i.e. a score of 0.5.
    """
    if col not in frame.columns:
        return pd.Series(0.5, index=frame.index)
    level = frame[col]
    fallback = level.median()
    if pd.isna(fallback):
        fallback = 1.0
    return 1.0 / (1.0 + level.fillna(fallback))


def _capped_weights(scores, cap):
    """Turn non-negative scores into weights that sum to 1 and respect `cap`.

    Clipping once and renormalising pushes clipped names back above the cap,
    so the excess is instead redistributed pro rata to the names still below
    the cap, repeating until nobody exceeds it.

    When there are too few names for the cap to be satisfiable while fully
    invested (len(scores) * cap < 1), the result falls back to a single
    clip-and-renormalise pass, which keeps the portfolio fully invested.

    Returns an all-zero Series when there is nothing to allocate (no rows,
    non-positive total score, or non-positive cap).
    """
    tol = 1e-12
    n = len(scores)
    total = float(scores.sum()) if n else 0.0
    if n == 0 or cap <= 0 or not total > 0:
        return pd.Series(0.0, index=scores.index)

    w = scores / total
    if n * cap < 1.0 - tol:
        # Cap is infeasible for a fully invested book: clip once, rescale.
        clipped = w.clip(upper=cap)
        return clipped / clipped.sum()

    capped = pd.Series(False, index=scores.index)
    for _ in range(n):  # each pass caps at least one more name
        over = (w > cap + tol) & ~capped
        if not over.any():
            break
        capped = capped | over
        w = w.where(~capped, cap)
        free = ~capped
        free_total = float(scores[free].sum()) if free.any() else 0.0
        if free_total <= 0:
            break
        # Whatever the capped names do not use is shared by the rest.
        budget = max(1.0 - cap * float(capped.sum()), 0.0)
        w = w.where(capped, scores / free_total * budget)
    return w


alloc_rows = []
bt_rows    = []

portfolio_value = 1_000_000.0
prev_weights = {}  # ticker -> weight

# For quick lookups
px = prices.set_index(["ticker","month_period"])[["open","close","volume"]]

# Costs: round-trip bps applied to traded notional (turnover); loop-invariant.
# NOTE(review): turnover already counts buys and sells separately, so applying
# a round-trip rate may double-count - kept as in the original cost model.
roundtrip_bps = (FEE_BPS_PER_SIDE + SLIPPAGE_BPS_PER_SIDE) * 2.0

for m in months:
    df_m = df[df["month_period"] == m].copy()
    if df_m.empty:
        continue

    # Score with ensemble model
    df_m["proba"] = model.predict_proba(df_m[feat_cols])[:, 1]

    # Vol/ATR scores (lower vol => higher score)
    df_m["vol_score"] = _inverse_risk_score(df_m, "vol_3m")
    df_m["atr_score"] = _inverse_risk_score(df_m, "atr_3m")
    df_m["volatility_score"] = (df_m["vol_score"] + df_m["atr_score"]) / 2

    # Volume score normalized in-month
    has_volume = "volume" in df_m.columns
    if has_volume:
        max_vol = df_m["volume"].max()
        if not max_vol > 0:  # also true for NaN
            max_vol = 1.0
        df_m["volume_score"] = df_m["volume"] / max_vol
    else:
        df_m["volume_score"] = 0.0

    # Composite
    df_m["composite"] = PROB_W*df_m["proba"] + VOLUME_W*df_m["volume_score"] + VOL_W*df_m["volatility_score"]

    # Filters
    keep = df_m["proba"] >= PROBA_MIN
    if has_volume:
        keep &= df_m["volume"] >= VOLUME_THRESHOLD

    # Rank & cap
    cand = (
        df_m[keep]
        .sort_values("composite", ascending=False)
        .head(TOP_N)
        .reset_index(drop=True)
    )
    cand["w"] = _capped_weights(cand["composite"], MAX_ALLOC)

    # An empty target book means "go to cash".  It runs through the same
    # turnover/cost path as any other month so that selling last month's
    # holdings is charged and prev_weights is reset; otherwise the next
    # re-entry would be measured against positions that no longer exist.
    if float(cand["w"].sum()) > 0:
        new_weights = dict(zip(cand["ticker"], cand["w"]))
    else:
        new_weights = {}

    # Compute turnover vs previous month
    all_tickers = set(prev_weights) | set(new_weights)
    turnover = float(sum(
        abs(new_weights.get(t, 0.0) - prev_weights.get(t, 0.0)) for t in all_tickers
    ))

    # Execution: month t open -> month t close (from prices CSV)
    rets = []
    if new_weights:
        score_cols = ["ticker", "proba", "volume_score", "volatility_score", "composite", "w"]
        for ticker, proba, volume_score, volatility_score, composite, w in cand[score_cols].itertuples(index=False, name=None):
            key = (ticker, m)
            if key not in px.index:
                # No execution prices for this name-month; it earns nothing
                # and is left out of the allocation output.
                continue
            o, c = px.loc[key, ["open","close"]]
            if o <= 0 or pd.isna(o) or pd.isna(c):
                continue
            rets.append(w * ((c - o) / o))

            alloc_rows.append({
                "month": str(m),
                "ticker": ticker,
                "proba": proba,
                "volume_score": volume_score,
                "volatility_score": volatility_score,
                "composite": composite,
                "weight": w,
                "exec_open": float(o),
                "exec_close": float(c)
            })

    gross_portfolio_ret = float(np.sum(rets)) if rets else 0.0

    cost_fraction = (roundtrip_bps / 10_000.0) * turnover
    net_portfolio_ret = gross_portfolio_ret - cost_fraction

    # Update equity
    portfolio_value *= (1.0 + net_portfolio_ret)

    bt_rows.append({
        "month": str(m),
        "equity": portfolio_value,
        "monthly_ret": net_portfolio_ret,
        "gross_ret": gross_portfolio_ret,
        "net_ret": net_portfolio_ret,
        "turnover": turnover,
        # A month that stays in cash with nothing to sell reports no cost rate.
        "costs_bps_rt": roundtrip_bps if (new_weights or turnover > 0) else 0.0
    })

    prev_weights = new_weights

# Save outputs
# Column lists are pinned so that a run producing no rows still writes a CSV
# with a header; an entirely empty file cannot be read back with read_csv.
_alloc_columns = [
    "month", "ticker", "proba", "volume_score", "volatility_score",
    "composite", "weight", "exec_open", "exec_close",
]
_backtest_columns = [
    "month", "equity", "monthly_ret", "gross_ret", "net_ret",
    "turnover", "costs_bps_rt",
]
pd.DataFrame(alloc_rows, columns=_alloc_columns).to_csv(ALLOC_CSV, index=False)
pd.DataFrame(bt_rows, columns=_backtest_columns).to_csv(BACKTEST_CSV, index=False)

print(f"Wrote {ALLOC_CSV.resolve()}")
print(f"Wrote {BACKTEST_CSV.resolve()}")

If you are loading a serialized model (like pickle in Python, RDS in R) or
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

  model = pickle.load(f)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Wrote C:\Users\HP\Desktop\Deployment Folder\allocations.csv
Wrote C:\Users\HP\Desktop\Deployment Folder\backtest_monthly.csv
