In [None]:
# ======================
# Model 2 - Stock Return Regressor (with Train/Val/Test preds)
# ======================

import inspect
import io
import os
import time

import pandas as pd
from azure.storage.blob import BlobServiceClient
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# ---------- CONFIG ----------
import os
from dotenv import load_dotenv

load_dotenv()  # loads .env file if present (for local dev)

# The storage connection string is read from the environment (optionally
# populated from .env by load_dotenv above) rather than hard-coded here.
AZURE_CONNECTION_STRING = os.getenv("AZURE_CONN_STR")
if not AZURE_CONNECTION_STRING:
    # os.getenv returns None when the variable is unset; stop here with an
    # actionable message instead of failing later inside the Azure SDK.
    raise RuntimeError(
        "AZURE_CONN_STR is not set. Export it in the environment or add it "
        "to a .env file before running this cell."
    )
CONTAINER_NAME = "stock-data"

FEATURES_PATH = "training_data"   # where train/val/test features are stored
PREDICTIONS_PATH = "predictions"  # where model1 predictions are stored

MODEL2_THRESHOLD = 0.2            # min model1_prob to be candidate
N_JOBS = 12                       # for 12 CPU cores
SEED = 42

# ---------- CONNECT ----------
# Module-level container client shared by every download/upload below.
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container = blob_service.get_container_client(CONTAINER_NAME)

def load_parquet_from_blob(blob_name):
    """Fetch a blob from the module-level ``container`` and parse it as Parquet.

    Args:
        blob_name: Name (path) of the blob inside the container.

    Returns:
        The DataFrame that ``pd.read_parquet`` builds from the blob's bytes.
    """
    # Read the whole blob into memory, then give pandas a file-like view of it.
    raw_bytes = container.download_blob(blob_name).readall()
    buffer = io.BytesIO(raw_bytes)
    return pd.read_parquet(buffer)

# ---------- LOAD DATA ----------
# Download the feature table and the Model 1 prediction table for every
# split and keep them in two dicts keyed by split name.
print("📥 Loading features and Model 1 predictions...")
splits = ["train", "val", "test"]
features, preds = {}, {}
for split in splits:
    feature_blob = f"{FEATURES_PATH}/{split}.parquet"
    prediction_blob = f"{PREDICTIONS_PATH}/model1_predictions_{split}.parquet"

    features[split] = load_parquet_from_blob(feature_blob)
    preds[split] = load_parquet_from_blob(prediction_blob)

    feature_shape, pred_shape = features[split].shape, preds[split].shape
    print(f"✅ {split.upper()} features: {feature_shape}, preds: {pred_shape}")

# ---------- MERGE + FILTER ----------
# Attach Model 1's outputs to each feature row (default inner join on
# Date/Ticker), then keep only rows whose model1_prob reaches the threshold.
candidates = {}
for split in splits:
    merged = pd.merge(features[split], preds[split], on=["Date", "Ticker"])
    before = len(merged)

    passes_threshold = merged["model1_prob"] >= MODEL2_THRESHOLD
    df = merged.loc[passes_threshold].reset_index(drop=True)
    after = len(df)

    print(f"🎯 {split.upper()} candidates >= {MODEL2_THRESHOLD}: {after:,} / {before:,}")
    candidates[split] = df

# ---------- TRAIN MODEL 2 ----------
# Columns that are never fed to the regressor: the Date/Ticker merge keys,
# the regression target (future_return) and Model 1's outputs.
# NOTE(review): "high_growth_label" looks like Model 1's classification
# target — confirm against the feature-building notebook.
exclude_cols = {"Date", "Ticker", "future_return", "high_growth_label", "model1_prob", "model1_pred"}
X_train = candidates["train"].drop(columns=list(exclude_cols))
y_train = candidates["train"]["future_return"]
X_val = candidates["val"].drop(columns=list(exclude_cols))
y_val = candidates["val"]["future_return"]

print("🚀 Training Model 2 (XGBRegressor)...")
start_time = time.time()

# Stop adding trees once the validation metric has not improved for this
# many consecutive boosting rounds.
EARLY_STOPPING_ROUNDS = 30

model2_params = dict(
    objective="reg:squarederror",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.5,
    reg_alpha=0.1,
    n_jobs=N_JOBS,
    random_state=SEED,
    tree_method="hist",
)
fit_kwargs = dict(eval_set=[(X_val, y_val)], verbose=False)

# xgboost deprecated fit(early_stopping_rounds=...) in 1.6 in favour of the
# constructor argument and dropped it from fit() in newer releases. Pick
# whichever form the installed version accepts so the cell runs on both.
if "early_stopping_rounds" in inspect.signature(XGBRegressor.fit).parameters:
    fit_kwargs["early_stopping_rounds"] = EARLY_STOPPING_ROUNDS
else:
    model2_params["early_stopping_rounds"] = EARLY_STOPPING_ROUNDS

model2 = XGBRegressor(**model2_params)
model2.fit(X_train, y_train, **fit_kwargs)
print(f"⏱ Training done in {time.time()-start_time:.1f} sec")

# ---------- EVALUATE ----------
# Report RMSE and R² of Model 2 on the validation and test candidate sets.
for split in ["val", "test"]:
    X_split = candidates[split].drop(columns=list(exclude_cols))
    y_split = candidates[split]["future_return"]
    preds_split = model2.predict(X_split)
    # Take the square root explicitly: the `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6, whereas plain MSE
    # is available in every version.
    rmse = mean_squared_error(y_split, preds_split) ** 0.5
    r2   = r2_score(y_split, preds_split)
    print(f"📊 {split.upper()} RMSE: {rmse:.4f}, R²: {r2:.4f}")

# ---------- PREDICT & SAVE FOR ALL SPLITS ----------
# Score every split's candidates, store the prediction as a new column, and
# persist the result both to the working directory and to Blob storage.
for split in splits:
    split_df = candidates[split]
    local_out = f"model2_predictions_{split}.parquet"
    blob_out = f"{PREDICTIONS_PATH}/model2_predictions_{split}.parquet"

    # Build the feature matrix before attaching the prediction column so the
    # new column is not part of the model input.
    X_split = split_df.drop(columns=list(exclude_cols))
    preds_split = model2.predict(X_split)
    split_df["model2_pred_return"] = preds_split

    # Write the Parquet file locally, then stream that same file to Blob.
    split_df.to_parquet(local_out, index=False)
    with open(local_out, "rb") as parquet_file:
        container.upload_blob(name=blob_out, data=parquet_file, overwrite=True)

    print(
        f"💾 Saved local: {local_out}",
        f"☁️ Uploaded to Blob: {blob_out}",
        sep="\n",
    )

print("✅ Model 2 pipeline complete.")


📥 Loading features and Model 1 predictions...
✅ TRAIN features: (2748516, 56), preds: (2748516, 4)
✅ VAL features: (589040, 56), preds: (589040, 4)
✅ TEST features: (589192, 56), preds: (589192, 4)
🎯 TRAIN candidates >= 0.2: 2,504,470 / 2,748,516
🎯 VAL candidates >= 0.2: 481,855 / 589,040
🎯 TEST candidates >= 0.2: 491,957 / 589,192
🚀 Training Model 2 (XGBRegressor)...


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


⏱ Training done in 23.3 sec


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


📊 VAL RMSE: 0.3114, R²: 0.0435




📊 TEST RMSE: 0.3611, R²: -0.0252


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


💾 Saved local: model2_predictions_train.parquet
☁️ Uploaded to Blob: predictions/model2_predictions_train.parquet


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


💾 Saved local: model2_predictions_val.parquet
☁️ Uploaded to Blob: predictions/model2_predictions_val.parquet


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


💾 Saved local: model2_predictions_test.parquet
☁️ Uploaded to Blob: predictions/model2_predictions_test.parquet
✅ Model 2 pipeline complete.
