In [1]:
import datetime as dt

import polars as pl

from stocksense.config import config
from stocksense.database import DatabaseHandler
from stocksense.model import XGBoostRegressor
from stocksense.pipeline import clean, engineer_features

features = config.model.features
date_col = config.model.date_col
aux_cols = ["tic", "datadate", "rdq"]

features = config.model.features
targets = ["risk_return_3Q_hit", "risk_return_4Q_hit", "fwd_return_4Q_hit", "risk_return_4Q_hit"]
prediction_horizon = config.processing.prediction_horizon
min_train_years = config.model.min_train_years

trade_date = dt.datetime(2023, 6, 1)

In [2]:
def prepare_data():
    """Prepare data for model operations."""
    data = engineer_features()
    data = clean(data)
    return data


data = prepare_data()
constituents = DatabaseHandler().fetch_constituents(trade_date)

[32m2024-12-28 19:18:29.286[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m20[0m - [1mSTART processing stock data[0m
[32m2024-12-28 19:18:29.288[0m | [32m[1mSUCCESS [0m | [36mstocksense.database.schema[0m:[36mcreate_tables[0m:[36m121[0m - [32m[1mTables created successfully[0m
[32m2024-12-28 19:18:33.720[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m33[0m - [1mSTART feature engineering[0m
[32m2024-12-28 19:18:46.905[0m | [32m[1mSUCCESS [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m47[0m - [32m[1mEND 58960 rows PROCESSED[0m
[32m2024-12-28 19:18:46.909[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mclean[0m:[36m69[0m - [1mSTART cleaning data[0m
[32m2024-12-28 19:18:47.043[0m | [32m[1mSUCCESS [0m | [36mstocksense.pipeline.preprocess[0m:[36mclean[0m:[36m106[0m - [32m[1m37806 rows retained

In [4]:
def format_parameters(solution, scale: float) -> dict:
    return {
        "objective": "binary:logistic",
        "learning_rate": solution[0],
        "n_estimators": round(solution[1]),
        "max_depth": round(solution[2]),
        "min_child_weight": solution[3],
        "gamma": solution[4],
        "subsample": solution[5],
        "colsample_bytree": solution[6],
        "reg_alpha": solution[7],
        "reg_lambda": solution[8],
        "scale_pos_weight": scale,
        "eval_metric": "logloss",
        "tree_method": "hist",
        "nthread": -1,
        "random_state": 100,
    }

In [5]:
def get_dataset_imbalance_scale(train: pl.DataFrame, target: str):
    """
    Compute dataset class imbalance scale.

    Parameters
    ----------
    train : pl.DataFrame
        Training dataset.

    Returns
    -------
    float
        Class imbalance scale.
    """
    min_year = pl.col("tdq").dt.year().min()
    filtered_data = train.filter(pl.col("tdq").dt.year() < min_year + min_train_years)
    neg_count = len(filtered_data.filter(pl.col(target) == 0))
    pos_count = len(filtered_data.filter(pl.col(target) == 1))
    return round(neg_count / pos_count, 2)


def train_model(data, trade_date, targets, features):
    models = {}
    for target in targets:
        print(f"START training model for {target}, {trade_date}")

        train = data.filter(
            (pl.col("tdq") < trade_date - dt.timedelta(days=360))
            & ~pl.all_horizontal(pl.col(target).is_null())
        ).select(["tdq", "tic"] + features + [target])

        scale = get_dataset_imbalance_scale(train, target)
        print(scale)

        params = [0.10, 192.50, 7.85, 6.65, 0.47, 0.52, 0.87, 6.89, 8.68]
        params = format_parameters(params, scale)

        X_train = train.select(features).to_pandas()
        y_train = train.select(target).to_pandas().values.ravel()

        model = XGBoostRegressor(params)
        model.train(X_train, y_train)
        models[target] = model
    return models


models = train_model(data, trade_date, targets, features)

START training model for risk_return_3Q_hit, 2023-06-01 00:00:00
1.83
START training model for risk_return_4Q_hit, 2023-06-01 00:00:00
2.17
START training model for fwd_return_4Q_hit, 2023-06-01 00:00:00
3.64
START training model for risk_return_4Q_hit, 2023-06-01 00:00:00
2.17


In [6]:
def score_models(data, models, trade_date, targets, features, stocks):
    final_ranks = data.filter(
        (pl.col("tdq") == trade_date) & pl.col("tic").is_in(constituents)
    ).select(
        [
            "tic",
            "adj_close",
            "f_score",
            "pe",
            "pb",
            "saleq_yoy",
            "price_mom",
            "index_mom",
            "risk_return_4Q",
        ]
    )

    rank_cols = []
    pred_cols = []
    for target in targets:
        test_df = (
            data.filter((pl.col("tdq") == trade_date) & pl.col("tic").is_in(stocks))
            .select(features)
            .to_pandas()
        )

        model = models[target]
        print(f"loaded model with params: {model.params}")

        prob_scores = model.predict(test_df)
        final_ranks = final_ranks.with_columns(
            [pl.Series(prob_scores).rank("dense", descending=True).alias(f"rank_{target}")]
        )
        rank_cols.append(f"rank_{target}")
        pred_cols.append(f"pred_{target}")

    # Calculate average rank
    return (
        final_ranks.with_columns(pl.mean_horizontal(rank_cols).alias("avg_score"))
        .sort("avg_score", descending=False)
        .with_columns(pl.col("avg_score").round(3).alias("avg_score"))
    )


ranks = score_models(data, models, trade_date, targets, features, constituents)
ranks.head(20)

loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 1.83, 'eval_metric': 'logloss', 'tree_method': 'hist', 'nthread': -1, 'random_state': 100}
loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 2.17, 'eval_metric': 'logloss', 'tree_method': 'hist', 'nthread': -1, 'random_state': 100}
loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 3.64, 'eval_metric': 'logloss

tic,adj_close,f_score,pe,pb,saleq_yoy,price_mom,index_mom,risk_return_4Q,rank_risk_return_3Q_hit,rank_risk_return_4Q_hit,rank_fwd_return_4Q_hit,avg_score
str,f64,i8,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64
"""GL""",102.404884,5,13.999426,2.641819,1.434948,-3.433519,2.462385,-1.114624,1,5,79,22.5
"""LKQ""",50.051731,8,12.192901,2.449266,0.029869,-8.982872,2.462385,-3.596411,3,45,30,30.75
"""AMP""",297.257599,5,15.532451,8.062331,3.227586,3.239292,2.462385,37.020242,93,6,20,31.25
"""AOS""",62.996155,7,40.402664,5.479704,-1.155774,-7.451988,2.462385,25.215617,26,25,78,38.5
"""FDX""",210.729767,4,18.373786,2.222713,-6.226471,-4.864274,2.462385,14.244502,32,37,58,41.0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""LOW""",198.007385,4,19.186799,-8.276019,-5.545458,-0.995292,2.462385,13.168511,172,4,100,70.0
"""KEYS""",163.139999,6,24.747508,6.206601,2.886751,14.54852,2.462385,-4.196431,125,61,64,77.75
"""JKHY""",148.719849,3,31.625135,7.181,6.333793,-5.495129,2.462385,10.380574,224,9,72,78.5
"""WRB""",36.645657,3,9.701942,1.510597,-0.699902,-2.551376,2.462385,38.026687,128,13,162,79.0


In [7]:
ranks

tic,adj_close,f_score,pe,pb,saleq_yoy,price_mom,index_mom,risk_return_4Q,rank_risk_return_3Q_hit,rank_risk_return_4Q_hit,rank_fwd_return_4Q_hit,avg_score
str,f64,i8,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64
"""GL""",102.404884,5,13.999426,2.641819,1.434948,-3.433519,2.462385,-1.114624,1,5,79,22.5
"""LKQ""",50.051731,8,12.192901,2.449266,0.029869,-8.982872,2.462385,-3.596411,3,45,30,30.75
"""AMP""",297.257599,5,15.532451,8.062331,3.227586,3.239292,2.462385,37.020242,93,6,20,31.25
"""AOS""",62.996155,7,40.402664,5.479704,-1.155774,-7.451988,2.462385,25.215617,26,25,78,38.5
"""FDX""",210.729767,4,18.373786,2.222713,-6.226471,-4.864274,2.462385,14.244502,32,37,58,41.0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""APH""",37.403244,7,24.533259,6.381125,0.74867,0.610402,2.462385,52.409538,405,415,432,416.75
"""GE""",82.949905,6,10.801327,2.791898,14.287968,3.112997,2.462385,49.846659,424,446,351,416.75
"""RCL""",83.03595,5,-20.509055,7.027901,172.38119,22.740018,2.462385,31.61142,475,461,355,438.0
"""VZ""",32.211952,3,6.968586,1.596461,-1.913334,-5.176525,2.462385,13.291541,374,465,451,438.75


In [8]:
top = ranks.head(100)
bottom = ranks.tail(100)

top_freturn = top.select(pl.col("risk_return_4Q")).mean().item()
bottom_freturn = bottom.select(pl.col("risk_return_4Q")).mean().item()

# Calculate hit rates (% of stocks with positive returns)
top_hits = top.select(pl.col("risk_return_4Q") > 0).sum().item()
bottom_hits = bottom.select(pl.col("risk_return_4Q") > 0).sum().item()

top_hitrate = (top_hits / len(top)) * 100
bottom_hitrate = (bottom_hits / len(bottom)) * 100


print(f"\nDATE {trade_date}")
print(f"Average top return: {top_freturn:.2f}% ({top_hitrate:.1f})")
print(f"Average bottom return: {bottom_freturn:.2f}% ({bottom_hitrate:.1f})")


DATE 2023-06-01 00:00:00
Average top return: 16.87% (79.0)
Average bottom return: 13.49% (84.0)
