In [1]:
import datetime as dt

import polars as pl

from stocksense.config import config
from stocksense.database import DatabaseHandler
from stocksense.model import XGBoostRegressor
from stocksense.pipeline import clean, engineer_features

# Model inputs and column names pulled from the project configuration.
features = config.model.features
date_col = config.model.date_col
aux_cols = ["tic", "datadate", "rdq"]

# Binary hit targets; one model is trained per entry.
targets = ["risk_return_3Q_hit", "fwd_return_4Q_hit", "risk_return_4Q_hit"]
prediction_horizon = config.processing.prediction_horizon
# Number of initial calendar years used when measuring class imbalance.
min_train_years = config.model.min_train_years

# Date for which the models are trained and the stocks are ranked.
trade_date = dt.datetime(2023, 6, 1)

In [2]:
def prepare_data():
    """Run feature engineering, then cleaning, and return the cleaned result."""
    return clean(engineer_features())


# Build the engineered and cleaned table once; later cells read it via `data`.
data = prepare_data()
# Presumably the index constituents as of `trade_date` — confirm against
# DatabaseHandler.fetch_constituents.
constituents = DatabaseHandler().fetch_constituents(trade_date)

[32m2024-12-25 12:04:05.841[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m20[0m - [1mSTART processing stock data[0m
[32m2024-12-25 12:04:05.842[0m | [32m[1mSUCCESS [0m | [36mstocksense.database.schema[0m:[36mcreate_tables[0m:[36m121[0m - [32m[1mTables created successfully[0m
[32m2024-12-25 12:04:09.281[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m33[0m - [1mSTART feature engineering[0m
[32m2024-12-25 12:04:25.943[0m | [32m[1mSUCCESS [0m | [36mstocksense.pipeline.preprocess[0m:[36mengineer_features[0m:[36m47[0m - [32m[1mEND 58960 rows PROCESSED[0m
[32m2024-12-25 12:04:25.948[0m | [1mINFO    [0m | [36mstocksense.pipeline.preprocess[0m:[36mclean[0m:[36m69[0m - [1mSTART cleaning data[0m
[32m2024-12-25 12:04:26.355[0m | [32m[1mSUCCESS [0m | [36mstocksense.pipeline.preprocess[0m:[36mclean[0m:[36m110[0m - [32m[1m37806 rows retained

In [5]:
# Spot-check: display the rows of the prepared table for a single ticker.
data.filter(pl.col("tic") == "AAPL")

tdq,tic,datadate,rdq,saleq,cogsq,xsgaq,niq,ebitdaq,cshoq,actq,atq,cheq,rectq,invtq,ppentq,lctq,dlttq,ltq,req,seqq,oancfq,ivncfq,fincfq,dvq,capxq,icaptq,surprise_pct,stock_split,n_purch,val_purch,n_sales,val_sales,insider_balance,roa,roi,roe,…,eps_2y,ev_ebitda_yoy,ltcr_yoy,itr_yoy,rtr_yoy,atr_yoy,size_yoy,roa_sec_qoq,roa_sec_yoy,f_score,forward_vol_yoy,forward_vol_sos,forward_vol_qoq,excess_return_1Q,sharpe_ratio_1Q,risk_return_1Q,fwd_return_1Q_hit,excess_return_1Q_hit,risk_return_1Q_hit,excess_return_2Q,sharpe_ratio_2Q,risk_return_2Q,fwd_return_2Q_hit,excess_return_2Q_hit,risk_return_2Q_hit,excess_return_3Q,sharpe_ratio_3Q,risk_return_3Q,fwd_return_3Q_hit,excess_return_3Q_hit,risk_return_3Q_hit,excess_return_4Q,sharpe_ratio_4Q,risk_return_4Q,fwd_return_4Q_hit,excess_return_4Q_hit,risk_return_4Q_hit
date,str,date,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,u32,f64,u32,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,f64,f64,f64,f64,f64,i8,i8,i8,f64,f64,f64,i8,i8,i8,f64,f64,f64,i8,i8,i8,f64,f64,f64,i8,i8,i8
2007-03-01,"""AAPL""",2006-12-31,2007-01-17,7115.0,4821.0,898.0,1004.0,1396.0,24086.16,16664.0,19461.0,11869.0,3113.0,303.0,1362.0,7337.0,0.0,8233.0,6634.0,11228.0,1813.0,-1234.0,188.0,0.0,142.0,11228.0,0.453,0,0,0.0,9,83.8,83.8,12.476235,8.941931,0.216245,…,,-14.607498,,-11.661243,-22.623626,-13.145444,3.310886,31.40398,171.993547,4,2.613724,2.198614,1.445804,9.625824,6.657767,10.682745,1,1,1,40.845842,28.251306,32.906736,1,1,1,77.23883,35.130699,38.510469,1,1,1,84.161474,32.199834,32.355575,1,1,1
2007-06-01,"""AAPL""",2007-03-31,2007-04-25,5264.0,3346.0,863.0,770.0,1055.0,24211.404,16029.0,18711.0,12577.0,1667.0,208.0,1409.0,5485.0,0.0,6450.0,7413.0,12261.0,734.0,-978.0,180.0,0.0,105.0,12261.0,0.359,0,0,0.0,2,2.08,2.08,14.900326,6.280075,0.227388,…,,20.089345,,5.873221,11.085553,-11.127631,3.107109,59.90405,97.834928,5,2.792059,2.766874,2.792311,15.141031,5.4224,4.288529,1,1,1,39.002772,13.967916,13.097882,1,1,1,45.582556,16.474386,13.492304,1,1,1,42.675714,15.284672,11.282641,1,1,0
2007-09-01,"""AAPL""",2007-06-30,2007-07-25,5410.0,3334.0,954.0,818.0,1122.0,24336.536,18745.0,21647.0,13767.0,2901.0,251.0,1626.0,6992.0,0.0,8243.0,8255.0,13404.0,1227.0,-1433.0,229.0,0.0,283.0,13404.0,0.271,0,0,0.0,8,38.399,38.399,14.477757,6.102656,0.233811,…,,20.110764,,12.470648,10.519002,-10.96548,3.733049,-17.000355,42.806873,5,2.702126,2.963765,2.710762,22.986262,8.47963,9.219309,1,1,1,19.298171,7.119094,5.386367,1,1,0,20.906667,7.054091,4.512087,0,1,0,36.943247,13.67192,9.291562,0,1,0
2007-12-01,"""AAPL""",2007-09-30,2007-10-22,6217.0,4034.0,1030.0,904.0,1153.0,24425.212,21956.0,25347.0,15386.0,4029.0,346.0,1832.0,9299.0,0.0,10815.0,9164.0,14532.0,1696.0,396.0,142.0,0.0,205.0,14532.0,0.179,0,1,0.696,3,131.879,131.183,13.792559,6.220754,0.240573,…,,15.326081,,3.986648,-8.396816,-11.60651,3.972748,-12.499977,52.599244,5,3.635551,2.812971,3.086454,-13.717511,-4.444424,-6.938818,0,0,0,-5.03582,-1.631588,-4.185152,0,0,0,7.192317,2.55684,-1.977352,0,0,0,-6.577482,-1.809212,-9.914149,0,0,0
2008-03-01,"""AAPL""",2007-12-31,2008-01-22,9608.0,6170.0,1206.0,1581.0,2232.0,24601.612,26189.0,30039.0,18448.0,4422.0,459.0,1870.0,10535.0,0.0,13235.0,10758.0,16804.0,2787.0,-3462.0,485.0,0.0,224.0,16804.0,0.089,0,0,0.0,2,6.884,6.884,13.55904,9.408474,0.242383,…,145.304665,-13.714245,,-3.879321,-4.797032,-10.603272,4.395268,-4.206943,11.244333,5,3.64166,2.404911,2.40588,32.114818,13.348468,15.072915,1,1,1,40.92954,17.012291,15.982153,1,1,1,13.604471,5.656953,-2.870685,0,1,0,9.154933,2.513945,-7.455066,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2023-12-01,"""AAPL""",2023-09-30,2023-11-02,89498.0,49071.0,6151.0,22956.0,30653.0,15550.061,143566.0,352583.0,61555.0,60985.0,6331.0,43715.0,145308.0,95281.0,290437.0,-214.0,62146.0,21598.0,2394.0,-23153.0,3758.0,2163.0,173234.0,0.049,0,0,0.0,7,111.955,111.955,27.509835,13.25144,1.56076,…,11.107477,26.39167,-16.337621,-24.686506,2.25749,-0.510488,-0.003818,-5.737774,-0.656043,6,1.415201,1.343185,1.106571,-9.601592,-8.676889,-2.369563,0,0,0,-20.033258,-18.10391,-6.6633,0,0,0,-4.960896,-3.693383,10.558652,0,0,0,-6.181044,-4.367609,13.699963,0,0,0
2024-03-01,"""AAPL""",2023-12-31,2024-02-01,119575.0,64720.0,6786.0,33916.0,43221.0,15460.223,143692.0,353514.0,73100.0,50102.0,6511.0,43666.0,133973.0,95088.0,279414.0,8242.0,74100.0,39895.0,1927.0,-30585.0,3825.0,2392.0,182140.0,0.039,0,0,0.0,1,1.058,1.058,28.545687,18.620841,1.361849,…,7.230683,14.453625,15.551229,-6.485394,5.764673,1.113202,0.151514,5.112562,6.896212,8,,1.615673,1.491581,-1.781402,-1.194305,-0.78917,0,0,0,14.397554,9.652546,13.99829,1,1,1,14.486879,8.966466,16.441985,1,1,1,13.434717,,,0,0,
2024-06-01,"""AAPL""",2024-03-31,2024-05-02,90753.0,48482.0,6468.0,23636.0,30736.0,15337.686,128416.0,337411.0,67150.0,41150.0,6232.0,43546.0,123822.0,91831.0,263217.0,4339.0,74194.0,22690.0,-310.0,-30433.0,3710.0,1996.0,178784.0,0.02,0,0,0.0,7,90.244,90.244,29.752735,13.220422,1.353061,…,5.325788,-2.113215,-23.431138,7.40212,-5.535435,-5.969847,0.123374,6.398108,4.326248,7,,1.485033,1.706893,11.739695,6.877816,9.385973,1,1,1,9.040261,5.296326,10.807127,1,1,1,9.423971,6.345968,15.97776,0,0,1,,,,,,
2024-09-01,"""AAPL""",2024-06-30,2024-08-01,85777.0,46099.0,6320.0,21448.0,28202.0,15222.259,125435.0,331612.0,61801.0,43172.0,6165.0,44502.0,131624.0,86196.0,264904.0,-4726.0,66708.0,28858.0,-127.0,-36017.0,3895.0,2151.0,168012.0,0.0399,0,0,0.0,4,38.092,38.092,30.74557,12.765755,1.528392,…,9.322555,9.742691,25.113659,25.471726,-6.621743,4.579645,-0.080792,0.61492,6.069032,7,,,1.223949,-3.655292,-2.986473,-0.091351,0,0,0,-3.465013,-2.83101,2.509138,0,0,0,,,,,,,,,,,,


In [3]:
def format_parameters(solution, scale: float) -> dict:
    """
    Turn a flat hyperparameter vector into a named parameter dictionary.

    Parameters
    ----------
    solution : sequence
        Values read by position (indices 0 to 8): learning rate, number of
        estimators, max depth, min child weight, gamma, subsample,
        colsample by tree, reg alpha and reg lambda. Indices 1 and 2 are
        rounded to integers with the built-in ``round``.
    scale : float
        Value stored under ``scale_pos_weight``.

    Returns
    -------
    dict
        Parameter dictionary combining the tuned values with the fixed
        settings (objective, eval metric, tree method, threads, seed).
    """
    params = {"objective": "binary:logistic"}

    # Tuned values, inserted in the same order as their positions in `solution`.
    params["learning_rate"] = solution[0]
    params["n_estimators"] = round(solution[1])
    params["max_depth"] = round(solution[2])
    params["min_child_weight"] = solution[3]
    params["gamma"] = solution[4]
    params["subsample"] = solution[5]
    params["colsample_bytree"] = solution[6]
    params["reg_alpha"] = solution[7]
    params["reg_lambda"] = solution[8]

    # Settings that do not depend on the hyperparameter vector.
    params.update(
        scale_pos_weight=scale,
        eval_metric="logloss",
        tree_method="hist",
        nthread=-1,
        random_state=100,
    )
    return params

In [4]:
def get_dataset_imbalance_scale(train: pl.DataFrame, target: str) -> float:
    """
    Compute dataset class imbalance scale (negative count / positive count).

    Only rows whose ``tdq`` year is below the earliest ``tdq`` year plus the
    module-level ``min_train_years`` are counted.

    Parameters
    ----------
    train : pl.DataFrame
        Training dataset; must contain a ``tdq`` date column and ``target``.
    target : str
        Name of the target column; rows equal to 0 count as negatives and
        rows equal to 1 as positives.

    Returns
    -------
    float
        Class imbalance scale rounded to two decimals, or 1.0 when the
        window contains no positive rows.
    """
    year = pl.col("tdq").dt.year()
    # `year.min()` is evaluated over the whole frame and broadcast in the filter.
    window = train.filter(year < year.min() + min_train_years)

    neg_count = len(window.filter(pl.col(target) == 0))
    pos_count = len(window.filter(pl.col(target) == 1))

    if pos_count == 0:
        # The ratio is undefined without positives; return the neutral
        # weight instead of failing with ZeroDivisionError.
        return 1.0
    return round(neg_count / pos_count, 2)


def train_model(data, trade_date, targets, features):
    """
    Train one model per target on rows dated before the trade date.

    Parameters
    ----------
    data : pl.DataFrame
        Prepared dataset containing ``tdq``, ``tic``, the feature columns
        and every target column.
    trade_date : datetime.datetime
        Reference date; only rows with ``tdq`` more than 360 days earlier
        are used for training.
    targets : list of str
        Target column names; a separate model is fitted for each.
    features : list of str
        Feature column names used as model inputs.

    Returns
    -------
    dict
        Mapping of target name to its trained model.
    """
    # Loop-invariant settings, computed once.
    # NOTE(review): the 360-day gap presumably keeps forward-looking targets
    # from reaching past the trade date — confirm against the target horizon.
    train_cutoff = trade_date - dt.timedelta(days=360)
    base_solution = [0.10, 192.50, 7.85, 6.65, 0.47, 0.52, 0.87, 6.89, 8.68]

    models = {}
    for target in targets:
        print(f"START training model for {target}, {trade_date}")

        # Keep rows before the cutoff that actually have a label.
        train = data.filter(
            (pl.col("tdq") < train_cutoff) & pl.col(target).is_not_null()
        ).select(["tdq", "tic"] + features + [target])

        scale = get_dataset_imbalance_scale(train, target)
        print(scale)

        params = format_parameters(base_solution, scale)

        X_train = train.select(features).to_pandas()
        y_train = train.select(target).to_pandas().values.ravel()

        model = XGBoostRegressor(params)
        model.train(X_train, y_train)
        models[target] = model
    return models


# Fit one model per target; `models` maps each target name to its trained model.
models = train_model(data, trade_date, targets, features)

START training model for risk_return_3Q_hit, 2023-06-01 00:00:00
2.2
START training model for fwd_return_4Q_hit, 2023-06-01 00:00:00
2.84
START training model for risk_return_4Q_hit, 2023-06-01 00:00:00
1.92


In [5]:
def score_models(data, models, trade_date, targets, features, stocks):
    """
    Rank stocks on the trade date by the average of per-target model ranks.

    Parameters
    ----------
    data : pl.DataFrame
        Prepared dataset containing ``tdq``, ``tic``, the feature columns
        and the descriptive columns selected below.
    models : dict
        Mapping of target name to a trained model exposing ``params`` and
        ``predict``.
    trade_date : datetime.datetime
        Only rows with ``tdq`` equal to this date are scored.
    targets : list of str
        Target names; each must be a key of ``models``.
    features : list of str
        Feature column names passed to ``predict``.
    stocks : collection of str
        Tickers to score.

    Returns
    -------
    pl.DataFrame
        One row per scored stock with descriptive columns, one dense
        ``rank_<target>`` column per target (rank 1 = highest score) and
        ``avg_score``, the mean rank rounded to 3 decimals, sorted ascending.
    """
    # Filter once, using the `stocks` argument, so that the descriptive rows
    # and the feature matrix always cover the same stocks in the same order.
    universe = data.filter((pl.col("tdq") == trade_date) & pl.col("tic").is_in(stocks))

    final_ranks = universe.select(
        [
            "tic",
            "adj_close",
            "f_score",
            "pe",
            "pb",
            "saleq_yoy",
            "price_mom",
            "index_mom",
            "risk_return_4Q",
        ]
    )
    # The feature matrix is identical for every target; build it once.
    test_df = universe.select(features).to_pandas()

    rank_cols = []
    for target in targets:
        model = models[target]
        print(f"loaded model with params: {model.params}")

        prob_scores = model.predict(test_df)
        rank_col = f"rank_{target}"
        final_ranks = final_ranks.with_columns(
            [pl.Series(prob_scores).rank("dense", descending=True).alias(rank_col)]
        )
        rank_cols.append(rank_col)

    # Calculate average rank; sort before rounding so ties created by
    # rounding do not affect the order.
    return (
        final_ranks.with_columns(pl.mean_horizontal(rank_cols).alias("avg_score"))
        .sort("avg_score", descending=False)
        .with_columns(pl.col("avg_score").round(3))
    )


# Rank the constituents for the trade date and preview the 20 rows with the
# lowest (best) average rank.
ranks = score_models(data, models, trade_date, targets, features, constituents)
ranks.head(20)

loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 2.2, 'eval_metric': 'logloss', 'tree_method': 'hist', 'nthread': -1, 'random_state': 100}
loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 2.84, 'eval_metric': 'logloss', 'tree_method': 'hist', 'nthread': -1, 'random_state': 100}
loaded model with params: {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 192, 'max_depth': 8, 'min_child_weight': 6.65, 'gamma': 0.47, 'subsample': 0.52, 'colsample_bytree': 0.87, 'reg_alpha': 6.89, 'reg_lambda': 8.68, 'scale_pos_weight': 1.92, 'eval_metric': 'logloss'

tic,adj_close,f_score,pe,pb,saleq_yoy,price_mom,index_mom,risk_return_4Q,rank_risk_return_3Q_hit,rank_fwd_return_4Q_hit,rank_risk_return_4Q_hit,avg_score
str,f64,i8,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64
"""DPZ""",290.792206,7,22.730088,-2.55163,1.310292,-4.788338,2.462385,40.475988,14,50,8,24.0
"""TECH""",81.149132,5,48.662854,6.98652,1.298317,1.857862,2.462385,-4.873225,17,41,16,24.667
"""PH""",320.088196,4,28.232592,4.332433,23.866511,0.298444,2.462385,46.037079,33,29,40,34.0
"""TSCO""",200.908157,5,21.074042,11.938254,9.096594,-14.726179,2.462385,20.285739,24,71,20,38.333
"""IT""",340.23999,6,29.337466,57.766067,11.572374,12.665983,2.462385,21.359485,15,52,66,44.333
…,…,…,…,…,…,…,…,…,…,…,…,…
"""GL""",102.404884,5,13.999426,2.641819,1.434948,-3.433519,2.462385,-1.114624,12,188,3,67.667
"""KEYS""",163.139999,6,24.747508,6.206601,2.886751,14.54852,2.462385,-4.196431,25,165,33,74.333
"""CTVA""",53.034714,4,32.745783,1.49288,6.150837,-11.180737,2.462385,2.267512,97,61,80,79.333
"""ZBH""",125.822914,7,59.561112,2.204098,10.088985,-10.943287,2.462385,-1.861312,61,117,62,80.0


In [6]:
BUCKET_SIZE = 100  # number of stocks in the top and bottom buckets
RETURN_COL = "risk_return_4Q"  # realized return column used for evaluation


def summarize_bucket(bucket: pl.DataFrame, return_col: str = RETURN_COL) -> tuple[float, float]:
    """
    Summarize the realized returns of a bucket of ranked stocks.

    Rows with a null return are excluded from both statistics so the mean
    and the hit rate share the same denominator.

    Parameters
    ----------
    bucket : pl.DataFrame
        Slice of the ranked table containing ``return_col``.
    return_col : str
        Name of the realized return column.

    Returns
    -------
    tuple of float
        Mean return and hit rate (% of stocks with a positive return);
        both are NaN when the bucket has no non-null returns.
    """
    realized = bucket.get_column(return_col).drop_nulls()
    if len(realized) == 0:
        return float("nan"), float("nan")
    hits = (realized > 0).sum()
    return realized.mean(), hits / len(realized) * 100


# Cap the bucket size so the top and bottom buckets never overlap when fewer
# than 2 * BUCKET_SIZE stocks were ranked.
bucket_size = min(BUCKET_SIZE, len(ranks) // 2)
top = ranks.head(bucket_size)
bottom = ranks.tail(bucket_size)

top_freturn, top_hitrate = summarize_bucket(top)
bottom_freturn, bottom_hitrate = summarize_bucket(bottom)


print(f"\nDATE {trade_date}")
print(f"Average top return: {top_freturn:.2f}% (hit rate {top_hitrate:.1f}%)")
print(f"Average bottom return: {bottom_freturn:.2f}% (hit rate {bottom_hitrate:.1f}%)")


DATE 2023-06-01 00:00:00
Average top return: 15.76% (79.0)
Average bottom return: 11.29% (80.0)
