In [None]:
# ======================
# Model 1 - High Growth Stock Classifier
# (CPU Optimized + Threshold Tuning)
# ======================

import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from xgboost import XGBClassifier
import time

# ===== CONFIG =====
# Parquet locations of the three pre-split datasets.
TRAIN_PATH = "training_data/train.parquet"
VAL_PATH   = "training_data/val.parquet"
TEST_PATH  = "training_data/test.parquet"

# Hyper-parameter search settings.
SAMPLE_FRAC = 0.10   # fraction of the training rows used for tuning (small = faster)
N_ITER      = 25     # number of random-search candidates
OPTIMIZE_FOR = "f1"  # metric the decision threshold is tuned for: "f1", "recall" or "precision"

# ===== LOAD DATA =====
# All three splits are read eagerly, in train / val / test order.
print("📥 Loading training & validation data...")
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Downcast numeric types to save RAM
def downcast_df(df):
    """Shrink numeric columns of *df* to 32-bit types to save RAM.

    Float columns are converted to ``float32``. Integer columns are
    converted to ``int32`` only when every value fits in the int32 range;
    a blind ``astype(np.int32)`` would silently wrap larger values, so such
    columns are left at their original width.

    The frame is modified in place and the same object is returned, so both
    ``downcast_df(df)`` and ``df = downcast_df(df)`` work.
    """
    float_cols = df.select_dtypes(include=['float']).columns
    if len(float_cols):
        df[float_cols] = df[float_cols].astype(np.float32)

    int32_bounds = np.iinfo(np.int32)
    for col in df.select_dtypes(include=['int']).columns:
        series = df[col]
        # An empty column has no min/max (NaN); narrowing it is always safe.
        fits_int32 = series.empty or (
            series.min() >= int32_bounds.min and series.max() <= int32_bounds.max
        )
        if fits_int32:
            df[col] = series.astype(np.int32)
    return df

# Shrink every split before any further processing.
train_df, val_df, test_df = (
    downcast_df(frame) for frame in (train_df, val_df, test_df)
)

# ===== SAMPLE TRAINING DATA =====
# Hyper-parameter tuning only sees a random SAMPLE_FRAC slice of the training rows.
train_sample = train_df.sample(frac=SAMPLE_FRAC, random_state=42)

# Features & Target
# Identifier columns and target-related columns are dropped from the feature matrix.
exclude_cols = {"Date", "Ticker", "future_return", "high_growth_label"}

y_train = train_sample["high_growth_label"]
X_train = train_sample.drop(columns=list(exclude_cols))

y_val   = val_df["high_growth_label"]
X_val   = val_df.drop(columns=list(exclude_cols))

# ===== PARAMETER SEARCH SPACE =====
# Negative/positive ratio in the tuning sample, offered to the search as an
# alternative to the neutral scale_pos_weight of 1.
n_negative = np.sum(y_train == 0)
n_positive = np.sum(y_train == 1)

param_dist = dict(
    n_estimators=[200, 400, 600],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.85, 1.0],
    colsample_bytree=[0.7, 0.85, 1.0],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.3],
    reg_lambda=[1, 1.5, 2],
    reg_alpha=[0, 0.1, 0.3],
    scale_pos_weight=[1, float(n_negative / n_positive)],
)

# ===== RANDOMIZED SEARCH =====
# Base estimator; the searched hyper-parameters are filled in per candidate.
xgb = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric="auc",
    tree_method="hist",  # fast CPU
    objective="binary:logistic",
)

print(f"🔍 RandomizedSearchCV on {SAMPLE_FRAC*100:.0f}% sample ({len(train_sample):,} rows)...")
start_time = time.time()

# NOTE(review): both the estimator and the search request n_jobs=-1 — confirm
# this does not oversubscribe the CPU on the target machine.
search_settings = {
    "n_iter": N_ITER,
    "scoring": "roc_auc",
    "n_jobs": -1,
    "cv": 3,
    "verbose": 1,
    "random_state": 42,
}
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    **search_settings,
)

# Extra keyword arguments are forwarded to the estimator's fit() for every
# candidate/fold; the held-out validation split drives early stopping.
# NOTE(review): passing early_stopping_rounds to fit() depends on the installed
# xgboost version — confirm it is still accepted before upgrading.
fit_options = {
    "eval_set": [(X_val, y_val)],
    "early_stopping_rounds": 25,
    "verbose": False,
}
random_search.fit(X_train, y_train, **fit_options)

elapsed_sec = time.time() - start_time
print(f"⏱ Tuning completed in {elapsed_sec:.1f} sec")
print("✅ Best Params:", random_search.best_params_)
print("📊 Best CV Score:", random_search.best_score_)

# ===== MODEL CONFIGURATION (BEST PARAMS FROM SEARCH) =====
best_params = random_search.best_params_
# One shared set of constructor arguments so the threshold-tuning model and the
# final model are configured identically.
model_kwargs = dict(
    **best_params,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="auc",
    n_jobs=-1,
    random_state=42,
)

# ===== THRESHOLD TUNING ON VALIDATION =====
# The decision threshold is chosen with a model fitted on the training split
# only, so the validation rows are genuinely out-of-sample. Tuning against a
# model that has already been trained on those rows would score in-sample
# probabilities and bias the threshold.
train_features = train_df.drop(columns=list(exclude_cols))
threshold_model = XGBClassifier(**model_kwargs)
threshold_model.fit(train_features, train_df["high_growth_label"])

val_probs = threshold_model.predict_proba(X_val)[:, 1]
prec, rec, thresh = precision_recall_curve(y_val, val_probs)

# precision_recall_curve returns one more precision/recall point than
# thresholds: the trailing (precision=1, recall=0) point has no threshold.
# Drop it so every candidate index is a valid index into `thresh`.
prec_at_thresh = prec[:-1]
rec_at_thresh = rec[:-1]
f1_scores = (2 * prec_at_thresh * rec_at_thresh) / (prec_at_thresh + rec_at_thresh + 1e-8)

if OPTIMIZE_FOR == "recall":
    best_idx = np.argmax(rec_at_thresh)
elif OPTIMIZE_FOR == "precision":
    best_idx = np.argmax(prec_at_thresh)
else:
    # "f1" and any unrecognised setting both select the best-F1 threshold.
    best_idx = np.argmax(f1_scores)

best_thresh = thresh[best_idx]
print(f"🎯 Best threshold for {OPTIMIZE_FOR.upper()}: {best_thresh:.3f}")

# ===== FINAL TRAIN ON FULL TRAIN+VAL =====
full_X_train = pd.concat([train_features,
                          val_df.drop(columns=list(exclude_cols))], axis=0)
full_y_train = pd.concat([train_df["high_growth_label"],
                          val_df["high_growth_label"]], axis=0)
# The train-only feature copy is no longer needed once it has been concatenated.
del train_features

final_model = XGBClassifier(**model_kwargs)

print("🚀 Training final model...")
final_model.fit(full_X_train, full_y_train)

# ===== TEST EVALUATION =====
y_test = test_df["high_growth_label"]
X_test = test_df.drop(columns=list(exclude_cols))

# Positive-class probability for every test row, then hard 0/1 labels at the
# tuned decision threshold.
test_probs = final_model.predict_proba(X_test)[:, 1]
test_preds = np.greater_equal(test_probs, best_thresh).astype(int)

print("\n📈 TEST RESULTS")
test_report = classification_report(y_test, test_preds)
print(test_report)
test_auc = roc_auc_score(y_test, test_probs)
print("ROC AUC:", test_auc)


  from pandas import MultiIndex, Int64Index


📥 Loading training & validation data...
🔍 RandomizedSearchCV on 10% sample (274,852 rows)...
Fitting 3 folds for each of 25 candidates, totalling 75 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Ind

⏱ Tuning completed in 1594.7 sec
✅ Best Params: {'subsample': 0.85, 'scale_pos_weight': 1, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}
📊 Best CV Score: 0.6696355076396355
🚀 Training final model...


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


🎯 Best threshold for F1: 0.269


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):



📈 TEST RESULTS
              precision    recall  f1-score   support

           0       0.80      0.39      0.52    412924
           1       0.35      0.77      0.48    176268

    accuracy                           0.50    589192
   macro avg       0.57      0.58      0.50    589192
weighted avg       0.66      0.50      0.51    589192

ROC AUC: 0.615384601919299


In [None]:
import os
import joblib
from azure.storage.blob import BlobServiceClient

# ===== CONFIG =====
import os
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file when one exists (local development)

AZURE_CONNECTION_STRING = os.environ.get("AZURE_CONN_STR")
CONTAINER_NAME = "stock-data"
BLOB_PREFIX = "model1"  # virtual folder inside the container

# ===== LOCAL FILES =====
# (local path, blob name) pairs: the pickled model first, then the dataset
# files that live under training_data/.
local_files = [("model1_xgb.pkl", "model1_xgb.pkl")] + [
    (f"training_data/{file_name}", file_name)
    for file_name in (
        "train.parquet",
        "val.parquet",
        "test.parquet",
        "feature_columns.csv",
    )
]

# The model has to exist on disk before the upload loop tries to open it.
joblib.dump(final_model, "model1_xgb.pkl")

# ===== UPLOAD =====
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container = blob_service.get_container_client(CONTAINER_NAME)

for local_path, blob_name in local_files:
    blob_path = "/".join((BLOB_PREFIX, blob_name))
    # Stream each file from disk; existing blobs with the same name are replaced.
    with open(local_path, "rb") as handle:
        container.upload_blob(name=blob_path, data=handle, overwrite=True)
    print(f"☁️ Uploaded: {blob_path}")

print("✅ All Model 1 files uploaded to Azure Blob Storage.")


☁️ Uploaded: model1/model1_xgb.pkl
☁️ Uploaded: model1/train.parquet
☁️ Uploaded: model1/val.parquet
☁️ Uploaded: model1/test.parquet
☁️ Uploaded: model1/feature_columns.csv
✅ All Model 1 files uploaded to Azure Blob Storage.


In [None]:
from azure.storage.blob import BlobServiceClient

# ===== SAVE MODEL 1 PREDICTIONS TO BLOB =====
import os
from dotenv import load_dotenv

load_dotenv()  # loads .env file if present (for local dev)

AZURE_CONNECTION_STRING = os.getenv("AZURE_CONN_STR")
CONTAINER_NAME = "stock-data"

# Create DataFrame with predictions: row identifiers plus the model's
# positive-class probability and thresholded label for the test split.
model1_results = test_df[["Date", "Ticker"]].copy()
model1_results["model1_prob"] = test_probs
model1_results["model1_pred"] = test_preds

# Save locally
predictions_local_path = "model1_predictions.parquet"
model1_results.to_parquet(predictions_local_path, index=False)

# Upload to blob
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container_client = blob_service.get_container_client(CONTAINER_NAME)
# Open the file in a context manager so the handle is closed even when the
# upload raises; passing a bare open(...) call would leak the descriptor.
with open(predictions_local_path, "rb") as predictions_file:
    container_client.upload_blob(
        name="predictions/model1_predictions.parquet",
        data=predictions_file,
        overwrite=True
    )

print("☁️ Model 1 predictions saved to Blob at: predictions/model1_predictions.parquet")


☁️ Model 1 predictions saved to Blob at: predictions/model1_predictions.parquet


In [None]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

# Azure Blob credentials
import os
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file when one exists (local development)

BLOB_CONN_STR = os.environ.get("AZURE_CONN_STR")
CONTAINER_NAME = "stock-data"
BLOB_NAME = "predictions/model1_predictions.parquet"

# Connect to Blob storage and address the predictions blob directly.
blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONN_STR)
blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=BLOB_NAME)

# Pull the whole blob into memory and parse it as parquet.
data = blob_client.download_blob().readall()
df_preds = pd.read_parquet(io.BytesIO(data))

# Show basic info: first rows, then summary statistics.
for summary in (df_preds.head(), df_preds.describe()):
    print(summary)

# Check how many pass your current filter threshold
THRESHOLD = 0.2
n_candidates = (df_preds["model1_prob"] >= THRESHOLD).sum()
print(f"Candidates above {THRESHOLD}: {n_candidates}")


                       Date Ticker  model1_prob  model1_pred
0 2020-11-25 05:00:00+00:00      A     0.326960            1
1 2020-11-27 05:00:00+00:00      A     0.358348            1
2 2020-11-30 05:00:00+00:00      A     0.333080            1
3 2020-12-01 05:00:00+00:00      A     0.338309            1
4 2020-12-02 05:00:00+00:00      A     0.339183            1
         model1_prob    model1_pred
count  589192.000000  589192.000000
mean        0.303728       0.660243
std         0.112145       0.473627
min         0.005215       0.000000
25%         0.240137       0.000000
50%         0.307728       1.000000
75%         0.366064       1.000000
max         0.958054       1.000000
Candidates above 0.2: 491957


In [None]:
import io
from azure.storage.blob import BlobServiceClient

# ===== CONFIG =====
import os
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file when one exists (local development)

AZURE_CONNECTION_STRING = os.environ.get("AZURE_CONN_STR")
CONTAINER_NAME = "stock-data"
PREDICTIONS_BLOB_DIR = "predictions"  # virtual folder that receives the per-split files

# Connect to Blob
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container = blob_service.get_container_client(CONTAINER_NAME)

# Identifier and target-related columns that must not be fed to the model.
exclude_cols = {
    "Date",
    "Ticker",
    "future_return",
    "high_growth_label",
}

def save_preds_to_blob(df, split_name):
    """Score *df* with the final model and upload the predictions as parquet.

    The uploaded frame holds ``Date``, ``Ticker``, the positive-class
    probability (``model1_prob``) and the label obtained by thresholding that
    probability at ``best_thresh`` (``model1_pred``).

    Relies on the notebook-level ``final_model``, ``best_thresh``,
    ``exclude_cols``, ``container`` and ``PREDICTIONS_BLOB_DIR``.

    Args:
        df: Split to score; must contain ``Date``, ``Ticker`` and every
            column listed in ``exclude_cols``.
        split_name: Suffix used in the blob name (e.g. ``"train"``).
    """
    feature_frame = df.drop(columns=list(exclude_cols))
    positive_probs = final_model.predict_proba(feature_frame)[:, 1]

    preds_df = df[["Date", "Ticker"]].assign(
        model1_prob=positive_probs,
        model1_pred=(positive_probs >= best_thresh).astype(int),
    )

    # Serialise in memory so no temporary file is needed.
    parquet_buffer = io.BytesIO()
    preds_df.to_parquet(parquet_buffer, index=False)
    payload = parquet_buffer.getvalue()

    # Existing blobs with the same name are replaced.
    blob_name = "{}/model1_predictions_{}.parquet".format(PREDICTIONS_BLOB_DIR, split_name)
    container.upload_blob(name=blob_name, data=payload, overwrite=True)
    print(f"☁️ Uploaded: {blob_name} ({len(preds_df):,} rows)")

# Score and upload each split in turn: train, then val, then test.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    save_preds_to_blob(split_df, split_name)

print("✅ All predictions uploaded to Blob.")


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


☁️ Uploaded: predictions/model1_predictions_train.parquet (2,748,516 rows)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


☁️ Uploaded: predictions/model1_predictions_val.parquet (589,040 rows)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


☁️ Uploaded: predictions/model1_predictions_test.parquet (589,192 rows)
✅ All predictions uploaded to Blob.


In [None]:
import io
import pandas as pd
from azure.storage.blob import BlobServiceClient

# ===== CONFIG =====
import os
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file when one exists (local development)

AZURE_CONNECTION_STRING = os.environ.get("AZURE_CONN_STR")
CONTAINER_NAME = "stock-data"
PREDICTIONS_BLOB_DIR = "predictions"  # virtual folder holding the per-split prediction files

# Connect to Blob
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container = blob_service.get_container_client(CONTAINER_NAME)

def preview_blob(blob_name, n=5, threshold=0.2):
    """Download a predictions parquet blob and print a short summary of it.

    Prints the row count, the first *n* rows, ``describe()`` statistics and
    the number of rows whose ``model1_prob`` is at least *threshold*.

    Reads from the notebook-level ``container`` client.

    Args:
        blob_name: Full blob path inside the container.
        n: Number of leading rows to display.
        threshold: Probability cutoff used for the candidate count. Defaults
            to 0.2, the value that was previously hard-coded here.
    """
    print(f"📥 Downloading {blob_name} ...")
    # The whole blob is read into memory before parsing.
    data = container.download_blob(blob_name).readall()
    df = pd.read_parquet(io.BytesIO(data))

    print(f"✅ Loaded {len(df):,} rows")
    print(df.head(n))
    print(df.describe())
    print(f"Candidates above {threshold}: {(df['model1_prob'] >= threshold).sum():,}")
    print("="*60)

# Preview the uploaded predictions for every split: train, then val, then test.
for split_name in ("train", "val", "test"):
    preview_blob(f"{PREDICTIONS_BLOB_DIR}/model1_predictions_{split_name}.parquet")


📥 Downloading predictions/model1_predictions_train.parquet ...
✅ Loaded 2,748,516 rows
                       Date Ticker  model1_prob  model1_pred
0 2000-01-06 05:00:00+00:00      A     0.363375            1
1 2000-01-07 05:00:00+00:00      A     0.343405            1
2 2000-01-10 05:00:00+00:00      A     0.304278            1
3 2000-01-11 05:00:00+00:00      A     0.299840            1
4 2000-01-12 05:00:00+00:00      A     0.327700            1
        model1_prob   model1_pred
count  2.748516e+06  2.748516e+06
mean   3.658482e-01  8.020728e-01
std    1.201886e-01  3.984370e-01
min    1.085060e-03  0.000000e+00
25%    2.929601e-01  1.000000e+00
50%    3.680962e-01  1.000000e+00
75%    4.377828e-01  1.000000e+00
max    9.725748e-01  1.000000e+00
Candidates above 0.2: 2,504,470
📥 Downloading predictions/model1_predictions_val.parquet ...
✅ Loaded 589,040 rows
                       Date Ticker  model1_prob  model1_pred
0 2017-03-22 04:00:00+00:00      A     0.345263            1
1 20