In [9]:
from xgboost import XGBRegressor
from config import TRAIN_END_DATE, VALID_END_DATE
import pandas as pd
import numpy as np

In [10]:
def rmse(y_true, y_pred):
    """
    Root Mean Squared Error: measures magnitude error sensitive to outliers 
    rmse computes the square root of the mean of the squared differences between 
    predicted and actual values
    """
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def spearman_rank_corr(y_true, y_pred):
    """
    Spearman Rank Correlation: measures ranking accuracy
    """
    # if no samples, return NaN
    if len(y_true) == 0:
        return np.nan
    return pd.Series(y_true).corr(pd.Series(y_pred), method="spearman")

df = pd.read_csv("model_data_scored.csv")
# print(df.columns)
df["period_end"] = pd.to_datetime(df["period_end"])
target_col = "next_excess_ret"

In [11]:
biasedcols = [col for col in df.columns if col.startswith("next_")]
id_cols = ["SP Identifier", "PERMNO"]
features = [col for col in df.columns if col not in biasedcols + id_cols + ["period_end"]]
df = df.dropna(subset = [target_col])

In [15]:
# keep numeric features only to avoid object dtype issues
X_full = df[features]
# keep only the numeric columns
numeric_cols = X_full.select_dtypes(include=[np.number]).columns.tolist()
# X is matrix of numeric factor features
X = X_full[numeric_cols].fillna(0.0)
y = df[target_col].values
train_mask = df["period_end"] <= pd.to_datetime(TRAIN_END_DATE)
valid_mask = (df["period_end"] > pd.to_datetime(TRAIN_END_DATE)) & (df["period_end"] <= pd.to_datetime(VALID_END_DATE))
test_mask = df["period_end"] > pd.to_datetime(VALID_END_DATE)

y_train = df.loc[train_mask, "next_excess_ret"]
y_valid = df.loc[valid_mask, "next_excess_ret"]

In [16]:
XGBoost_model = XGBRegressor(
    n_estimators = 3000,
    learning_rate = 0.02,
    max_depth = 3,
    subsample = 0.8,
    colsample_bytree = 0.8,
    random_state = 66,
    eval_metric="rmse",
    early_stopping_rounds = 200
)

In [17]:
# Train the XGBoost model before running validation metrics
XGBoost_model.fit(
    X[train_mask],
    y[train_mask],
    eval_set=[(X[train_mask], y[train_mask]), (X[valid_mask], y[valid_mask])],
    verbose=True,
)

[0]	validation_0-rmse:1.15672	validation_1-rmse:1.40298
[1]	validation_0-rmse:1.14496	validation_1-rmse:1.38701
[2]	validation_0-rmse:1.13346	validation_1-rmse:1.37005
[3]	validation_0-rmse:1.12230	validation_1-rmse:1.35475
[4]	validation_0-rmse:1.11161	validation_1-rmse:1.33894
[5]	validation_0-rmse:1.10112	validation_1-rmse:1.32351
[6]	validation_0-rmse:1.09080	validation_1-rmse:1.30882
[7]	validation_0-rmse:1.08092	validation_1-rmse:1.29448
[8]	validation_0-rmse:1.07132	validation_1-rmse:1.28028
[9]	validation_0-rmse:1.06181	validation_1-rmse:1.26714
[10]	validation_0-rmse:1.05260	validation_1-rmse:1.25436
[11]	validation_0-rmse:1.04360	validation_1-rmse:1.24209
[12]	validation_0-rmse:1.03496	validation_1-rmse:1.23034
[13]	validation_0-rmse:1.02669	validation_1-rmse:1.21834
[14]	validation_0-rmse:1.01865	validation_1-rmse:1.20636
[15]	validation_0-rmse:1.01069	validation_1-rmse:1.19518
[16]	validation_0-rmse:1.00301	validation_1-rmse:1.18401
[17]	validation_0-rmse:0.99555	validation

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,200
,enable_categorical,False


In [19]:
# Model Validation
# validation set 
# y_valid is the actual next_excess_ret for the validation subset
y_valid = y[valid_mask]
# valid_pred is the model’s predicted next_excess_ret for that same validation subset.
valid_pred = XGBoost_model.predict(X[valid_mask])
valid_rmse = rmse(y_valid, valid_pred)
# Spearman rank correlation (IC): how well the model ranks stocks by future excess return.
valid_spearman = spearman_rank_corr(y_valid, valid_pred)
directional_accuracy = np.mean(np.sign(valid_pred) == np.sign(y_valid))

print(f"\nValidation RMSE: {valid_rmse:.4f}")
print(f"Validation Spearman IC: {valid_spearman:.4f}")
print(f"Validation direction hit-rate: {directional_accuracy:.4f}")


Validation RMSE: 0.8576
Validation Spearman IC: 0.7833
Validation direction hit-rate: 0.8886


In [20]:
# predict next_excess_ret for every row in X (all stock–date combinations)
df["predicted_next_excess_ret"] = XGBoost_model.predict(X)
predicted_df = df[["SP Identifier", "PERMNO", "period_end", "predicted_next_excess_ret"]].copy()
# for each period, Rank stocks within that period by their predicted excess return.
# creating a cross-sectional ranking
predicted_df["rank"] = predicted_df.groupby("period_end")["predicted_next_excess_ret"].rank(method="first", ascending=False)
top10 = predicted_df[predicted_df["rank"] <= 10].copy()
# save the top 10 stocks for each period
#top10.to_csv("top10_long_only.csv", index=False)