In [8]:
import datetime as dt

import lightgbm as lgb
import numpy as np
import polars as pl

from stocksense.config import config
from stocksense.database import DatabaseHandler
from stocksense.pipeline import clean, engineer_features

features = config.model.features
date_col = config.model.date_col
aux_cols = ["tic", "datadate", "rdq"]

features = config.model.features
targets = config.model.targets
prediction_horizon = config.processing.prediction_horizon
min_train_years = config.model.min_train_years

trade_date = dt.datetime(2022, 6, 1)

In [9]:
def prepare_data():
    """Prepare data for model operations."""
    data = engineer_features()
    data = clean(data)
    return data


data = prepare_data()
constituents = DatabaseHandler().fetch_constituents(trade_date)

[32m2025-02-07 19:21:21.525[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m20[0m - [1mSTART processing stock data[0m
[32m2025-02-07 19:21:21.526[0m | [32m[1mSUCCESS [0m | [36mstocksense.database.schema[0m:[36mcreate_tables[0m:[36m121[0m - [32m[1mTables created successfully[0m
[32m2025-02-07 19:21:24.577[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m33[0m - [1mSTART feature engineering[0m
[32m2025-02-07 19:21:30.119[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mcompute_performance_targets[0m:[36m1258[0m - [1mTarget hit rates: shape: (1, 3)
┌─────────────────┬───────────────┬──────────────┐
│ aggressive_rate ┆ balanced_rate ┆ relaxed_rate │
│ ---             ┆ ---           ┆ ---          │
│ f64             ┆ f64           ┆ f64          │
╞═════════════════╪═══════════════╪══════════════╡
│ 0.277383        ┆ 0.291025      ┆ 0.300353    

In [10]:
def get_dataset_imbalance_scale(train: pl.DataFrame, target: str):
    """
    Compute dataset class imbalance scale.

    Parameters
    ----------
    train : pl.DataFrame
        Training dataset.

    Returns
    -------
    float
        Class imbalance scale.
    """
    neg_count = len(train.filter(pl.col(target) == 0))
    pos_count = len(train.filter(pl.col(target) == 1))
    pos_ratio = pos_count / (neg_count + pos_count)

    if pos_ratio >= 0.4:
        return 1.0

    scale = neg_count / pos_count
    return round(scale, 2)


def train_model(data, trade_date, targets, features):
    models = {}
    for target in targets:
        print(f"START training model for {target}, {trade_date}")

        train = data.filter(
            (pl.col("tdq") < trade_date - dt.timedelta(days=360))
            & ~pl.all_horizontal(pl.col(target).is_null())
        ).select(["tdq", "tic"] + features + [target])

        scale = get_dataset_imbalance_scale(train, target)

        X_train = train.select(features).to_pandas()
        y_train = train.select(target).to_pandas().values.ravel()

        model = lgb.LGBMClassifier(
            objective='binary',
            n_estimators=500,
            scale_pos_weight=scale
        )
        model.fit(X_train, y_train)
        models[target] = model
    return models


models = train_model(data, trade_date, targets, features)

START training model for aggressive_hit, 2022-06-01 00:00:00
[LightGBM] [Info] Number of positive: 7600, number of negative: 21868
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19551
[LightGBM] [Info] Number of data points in the train set: 29468, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.257907 -> initscore=-1.056876
[LightGBM] [Info] Start training from score -1.056876
START training model for balanced_hit, 2022-06-01 00:00:00
[LightGBM] [Info] Number of positive: 8751, number of negative: 20657
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19551
[LightGBM] [Info] Number of data points in the train set: 29408, number of used features: 111
[LightGBM] [Info] [binar

In [11]:
def score(data, models, trade_date, targets, features, stocks):
    """
    Score stocks using rank-based ensemble of target-specific models.

    Parameters
    ----------
    data : pl.DataFrame
        Preprocessed financial data.
    stocks : list[str]
        List of stocks to score.

    Returns
    -------
    pl.DataFrame
        Dataframe with stock ranks.
    """

    test = data.filter((pl.col("tdq") == trade_date) & pl.col("tic").is_in(stocks))
    final_ranks = test.clone()
    pred_cols = []
    perc_cols = []

    # Get predictions for each target
    for target in targets:

        model = models[target]

        test_df = test.select(features).to_pandas()
        prob_scores = model.predict_proba(test_df)[:, 1]
        n_bins = 100
        n_elements = len(prob_scores)
        final_ranks = final_ranks.with_columns([
            pl.Series(f"pred_{target}", prob_scores),
            (
                pl.Series(f"pred_{target}", prob_scores)
                .rank(method="ordinal", descending=False)
                .map_elements(
                    lambda x, n=n_bins, total=n_elements: int(np.ceil(x * n / total))
                )
            ).alias(f"perc_{target}")
        ])
        pred_cols.append(f"pred_{target}")
        perc_cols.append(f"perc_{target}")


    final_ranks = final_ranks.with_columns(
        pl.mean_horizontal([pl.col(col) for col in perc_cols]).round(2).alias("avg_score")
    ).sort("avg_score", descending=True)

    return final_ranks.select(
        "tic", "perc_aggressive_hit", "perc_balanced_hit", "perc_relaxed_hit", "avg_score",
        "max_return_4Q", "fwd_return_4Q"
    )


ranks = score(data, models, trade_date, targets, features, constituents)
ranks.head(20)

  .map_elements(
  .map_elements(
  .map_elements(


tic,perc_aggressive_hit,perc_balanced_hit,perc_relaxed_hit,avg_score,max_return_4Q,fwd_return_4Q
str,i64,i64,i64,f64,f64,f64
"""SBAC""",99,99,96,98.0,7.26412,-29.635108
"""CCI""",83,100,99,94.0,-1.225592,-36.639401
"""FRT""",88,100,93,93.67,3.746036,-16.944197
"""CPT""",87,98,95,93.33,2.91194,-22.840902
"""AVB""",82,98,98,92.67,6.95732,-11.639978
…,…,…,…,…,…,…
"""AMT""",81,95,87,87.67,12.130806,-22.294006
"""VTR""",84,96,82,87.33,-3.378919,-16.581553
"""ORCL""",64,99,98,87.0,50.645793,42.680976
"""ALLE""",65,96,97,86.0,10.84038,-2.408305


In [12]:
ranks

tic,perc_aggressive_hit,perc_balanced_hit,perc_relaxed_hit,avg_score,max_return_4Q,fwd_return_4Q
str,i64,i64,i64,f64,f64,f64
"""SBAC""",99,99,96,98.0,7.26412,-29.635108
"""CCI""",83,100,99,94.0,-1.225592,-36.639401
"""FRT""",88,100,93,93.67,3.746036,-16.944197
"""CPT""",87,98,95,93.33,2.91194,-22.840902
"""AVB""",82,98,98,92.67,6.95732,-11.639978
…,…,…,…,…,…,…
"""PSX""",21,3,3,9.0,10.992492,-5.37655
"""PXD""",19,1,4,8.0,0.152197,-18.526358
"""OKE""",11,6,5,7.33,9.486536,-5.258943
"""MPC""",16,3,1,6.67,32.928546,7.295805


In [13]:
top = ranks.head(100)
bottom = ranks.tail(100)

top_freturn = top.select(pl.col("risk_return_4Q")).mean().item()
bottom_freturn = bottom.select(pl.col("risk_return_4Q")).mean().item()

# Calculate hit rates (% of stocks with positive returns)
top_hits = top.select(pl.col("risk_return_4Q") > 0).sum().item()
bottom_hits = bottom.select(pl.col("risk_return_4Q") > 0).sum().item()

top_hitrate = (top_hits / len(top)) * 100
bottom_hitrate = (bottom_hits / len(bottom)) * 100


print(f"\nDATE {trade_date}")
print(f"Average top return: {top_freturn:.2f}% ({top_hitrate:.1f})")
print(f"Average bottom return: {bottom_freturn:.2f}% ({bottom_hitrate:.1f})")

ColumnNotFoundError: risk_return_4Q

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'select' <---
DF ["tic", "perc_aggressive_hit", "perc_balanced_hit", "perc_relaxed_hit"]; PROJECT */7 COLUMNS; SELECTION: None