In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import os
import joblib
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Add
from tensorflow.keras.layers import (
    Input, LSTM, Bidirectional, Dense, Dropout, BatchNormalization,
    Embedding, Concatenate, RepeatVector, TimeDistributed, Lambda, Layer,
    MaxPooling1D, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    Flatten
)
from sklearn.utils import class_weight
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score, f1_score
import xgboost as xgb
from itertools import product
from pathlib import Path


In [None]:
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Setting up the subfolder to store general data

project_root = Path.cwd().parent
data_path = project_root / "data"
data_path.mkdir(exist_ok=True)

In [None]:
balanced_train_df = pd.read_csv(data_path / "all_train_df.csv")
val_df = pd.read_csv(data_path / "all_val_df.csv")
test_df = pd.read_csv(data_path / "all_test_df.csv")

In [31]:
lstm_scaled_features = [
    'Close', 'Open', 'High', 'Low', 'Volume',
    'SMA_5', 'SMA_10', 'SMA_20', 'SMA_30',
    'RSI_14', 'BB_Width',
    'MACD', 'MACD_Signal', 'MACD_Diff',
    'Price Range', 'Volatility_5', 'Volatility_10', 'Return_5', 'Price Change'
]

lstm_unscaled_features = []

In [32]:
lstm_features = lstm_scaled_features + lstm_unscaled_features

In [33]:
xgb_scaled_features = [
    'Weighted_Sentiment', 'Mean_Sentiment', 'Headline_Count',
    'SMA_5', 'RSI_14', 'MACD', 'MACD_Diff', 'BB_Width',
]

xgb_unscaled_features = [
    'Month_sin', 'Month_cos', 'DayOfWeek_sin', 'DayOfWeek_cos',
    'Has_Sentiment'
]


In [34]:
xgb_features = xgb_scaled_features + xgb_unscaled_features

In [35]:
all_scaled_features = list(dict.fromkeys(lstm_scaled_features + xgb_scaled_features))

In [None]:
def fit_scaler(df, feature_cols):
    df = df.copy()
    feature_cols = list(dict.fromkeys(feature_cols))
    df = df[feature_cols]
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled, columns=feature_cols, index=df.index)
    return df_scaled, scaler

def transform_scaler(df, scaler, feature_cols):
    df = df.copy()
    for col in feature_cols:
        if col not in df.columns:
            df[col] = 0
    df = df[feature_cols]
    df_scaled = scaler.transform(df)
    df_scaled = pd.DataFrame(df_scaled, columns=feature_cols, index=df.index)
    return df_scaled


In [37]:
# Scale
scaled_train_features, scaler = fit_scaler(balanced_train_df, all_scaled_features)
scaled_val_features = transform_scaler(val_df, scaler, all_scaled_features)
scaled_test_features = transform_scaler(test_df, scaler, all_scaled_features)

balanced_train_df = balanced_train_df.copy()
val_df = val_df.copy()
test_df = test_df.copy()

# Replaced only scaled features in original dfs
for df, scaled in zip(
    [balanced_train_df, val_df, test_df],
    [scaled_train_features, scaled_val_features, scaled_test_features]
):
    df[all_scaled_features] = scaled

joblib.dump(scaler, 'lsxg_scaler.pkl')


['lsxg_scaler.pkl']

In [38]:
balanced_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9993 entries, 0 to 9992
Data columns (total 74 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Stock Type          9993 non-null   int64         
 1   Date                9993 non-null   datetime64[ns]
 2   Close               9993 non-null   float64       
 3   High                9993 non-null   float64       
 4   Low                 9993 non-null   float64       
 5   Open                9993 non-null   float64       
 6   Volume              9993 non-null   float64       
 7   Year                9993 non-null   int64         
 8   MonthLength         9993 non-null   int64         
 9   DayOfYear_sin       9993 non-null   float64       
 10  DayOfYear_cos       9993 non-null   float64       
 11  Month_sin           9993 non-null   float64       
 12  Month_cos           9993 non-null   float64       
 13  Day_sin             9993 non-null   float64     

In [39]:
balanced_train_df = balanced_train_df.sort_values(['Stock Type', 'Date']).reset_index(drop=True)
val_df = val_df.sort_values(['Stock Type', 'Date']).reset_index(drop=True)
test_df = test_df.sort_values(['Stock Type', 'Date']).reset_index(drop=True)


In [40]:
def create_lstm_sequences(df, seq_len, feature_cols, target_col='Target', stock_col='Stock Type'):
    X, y = [], []
    for stock_id, group in df.groupby(stock_col):
        group = group.sort_values('Date')
        for i in range(seq_len, len(group)):
            X.append(group[feature_cols].iloc[i - seq_len:i].values)
            y.append(group[target_col].iloc[i])
    return np.array(X), np.array(y)

SEQ_LEN = 30

X_train_lstm, y_train_lstm = create_lstm_sequences(balanced_train_df, SEQ_LEN, lstm_features)
X_val_lstm, y_val_lstm = create_lstm_sequences(val_df, SEQ_LEN, lstm_features)
X_test_lstm, y_test_lstm = create_lstm_sequences(test_df, SEQ_LEN, lstm_features)

In [41]:
def extract_stock_ids(df, seq_len, stock_col='Stock Type'):
    stock_ids = []
    for _, group in df.groupby(stock_col):
        group = group.sort_values('Date')
        for i in range(seq_len, len(group)):
            stock_ids.append(group[stock_col].iloc[i])
    return np.array(stock_ids).reshape(-1, 1)

stock_train_ids = extract_stock_ids(balanced_train_df, SEQ_LEN)
stock_val_ids   = extract_stock_ids(val_df, SEQ_LEN)
stock_test_ids  = extract_stock_ids(test_df, SEQ_LEN)


In [42]:
def create_xgb_inputs(df, seq_len, xgb_features, stock_col='Stock Type'):
    xgb_inputs = []
    for stock_id, group in df.groupby(stock_col):
        group = group.sort_values('Date')
        for i in range(seq_len, len(group)):
            xgb_inputs.append(group[xgb_features].iloc[i].values)
    return np.array(xgb_inputs)

X_train_xgb = create_xgb_inputs(balanced_train_df, SEQ_LEN, xgb_features)
X_val_xgb = create_xgb_inputs(val_df, SEQ_LEN, xgb_features)
X_test_xgb = create_xgb_inputs(test_df, SEQ_LEN, xgb_features)

In [43]:
assert X_train_lstm.shape[0] == X_train_xgb.shape[0] == y_train_lstm.shape[0]
assert X_val_lstm.shape[0] == X_val_xgb.shape[0] == y_val_lstm.shape[0]
assert X_test_lstm.shape[0] == X_test_xgb.shape[0] == y_test_lstm.shape[0]

In [44]:
print("LSTM Input:", X_train_lstm.shape)
print("XGB Input:", X_train_xgb.shape)
print("Target:", y_train_lstm.shape)

LSTM Input: (9843, 30, 19)
XGB Input: (9843, 13)
Target: (9843,)


In [None]:
unique, counts = np.unique(y_train_lstm, return_counts=True)
print("Class distribution:", dict(zip(unique, counts)))

Class distribution: {np.int64(0): np.int64(3277), np.int64(1): np.int64(3284), np.int64(2): np.int64(3282)}


In [47]:
X_seq_train = X_train_lstm
X_seq_val = X_val_lstm
X_seq_test = X_test_lstm

X_stock_train = balanced_train_df['Stock Type'].values[-len(X_seq_train):]
X_stock_val = val_df['Stock Type'].values[-len(X_seq_val):]
X_stock_test = test_df['Stock Type'].values[-len(X_seq_test):]

In [None]:
class_weights = dict(enumerate(class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_lstm),
    y=y_train_lstm
)))

print("Class Weights:", class_weights)

Class Weights: {0: np.float64(1.0012206286237413), 1: np.float64(0.9990864799025578), 2: np.float64(0.9996953077391835)}


In [None]:
def transformer_block(x, heads=4, key_dim=32):
    attn_output = MultiHeadAttention(num_heads=heads, key_dim=key_dim)(x, x)
    x = Add()([x, attn_output])
    x = LayerNormalization()(x)
    return x

In [50]:
def build_lstm(seq_len, n_lstm_features, num_stocks=5, embed_dim=3):
    seq_input = Input(shape=(seq_len, n_lstm_features), name="seq_input")
    stock_input = Input(shape=(1,), dtype='int32', name="stock_input")

    stock_embed = Embedding(input_dim=num_stocks, output_dim=embed_dim)(stock_input)
    stock_embed = Lambda(lambda x: tf.squeeze(x, axis=1))(stock_embed)
    stock_embed = RepeatVector(seq_len)(stock_embed)
    stock_embed = TimeDistributed(Dense(embed_dim))(stock_embed)

    x = Concatenate(axis=-1)([seq_input, stock_embed])
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = BatchNormalization()(x)

    x = transformer_block(x)

    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = Concatenate()([avg_pool, max_pool])
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu', name="lstm_embedding")(x)
    x = Dropout(0.3)(x)
    out = Dense(3, activation='softmax')(x)

    return Model(inputs=[seq_input, stock_input], outputs=out)

In [51]:
def focal_loss(gamma=2., alpha = [1.2, 0.6, 1.4]):
    alpha = tf.constant(alpha, dtype=tf.float32)

    def loss(y_true, y_pred):
        y_true = tf.one_hot(tf.cast(y_true, tf.int32), depth=3)
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)

        cross_entropy = -y_true * tf.math.log(y_pred)
        weight = alpha * tf.pow(1 - y_pred, gamma)
        loss = weight * cross_entropy
        return tf.reduce_mean(tf.reduce_sum(loss, axis=-1))
    return loss

In [52]:
class_weights = {
    0: 1.1,
    1: 1.0,
    2: 1.3
}


In [53]:
loss = tf.keras.losses.SparseCategoricalCrossentropy()
# loss = focal_loss(gamma=2., alpha=[1.4, 0.7, 1.6])

lstm_model = build_lstm(SEQ_LEN, X_seq_train.shape[2], num_stocks=5, embed_dim=3)
lstm_model.compile(optimizer=Adam(1e-3), loss=loss, metrics=['accuracy'])
lstm_model.fit([X_seq_train, X_stock_train], y_train_lstm,
               validation_data=([X_seq_val, X_stock_val], y_val_lstm),
               epochs=30, batch_size=64, class_weight=class_weights,
               callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)])

Epoch 1/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 260ms/step - accuracy: 0.3431 - loss: 1.6148 - val_accuracy: 0.3638 - val_loss: 1.1220
Epoch 2/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 260ms/step - accuracy: 0.3387 - loss: 1.3492 - val_accuracy: 0.3448 - val_loss: 1.0932
Epoch 3/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 251ms/step - accuracy: 0.3493 - loss: 1.2817 - val_accuracy: 0.3783 - val_loss: 1.0872
Epoch 4/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 252ms/step - accuracy: 0.3442 - loss: 1.2584 - val_accuracy: 0.3339 - val_loss: 1.0887
Epoch 5/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 246ms/step - accuracy: 0.3439 - loss: 1.2529 - val_accuracy: 0.3629 - val_loss: 1.0908
Epoch 6/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 252ms/step - accuracy: 0.3448 - loss: 1.2450 - val_accuracy: 0.3629 - val_loss: 1.0904
Epoch 7/30

<keras.src.callbacks.history.History at 0x7d1396aa6d10>

In [54]:
encoder_model = Model(inputs=lstm_model.input, outputs=lstm_model.get_layer("lstm_embedding").output)
X_train_lstm_encoded = encoder_model.predict([X_seq_train, X_stock_train])
X_val_lstm_encoded   = encoder_model.predict([X_seq_val, X_stock_val])
X_test_lstm_encoded  = encoder_model.predict([X_seq_test, X_stock_test])

[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 51ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step


In [55]:
# lstm_encoder.save("supervised_lstm_encoder.keras")

In [56]:
X_train_fused = np.concatenate([X_train_lstm_encoded, X_train_xgb], axis=1)
X_val_fused   = np.concatenate([X_val_lstm_encoded, X_val_xgb], axis=1)
X_test_fused  = np.concatenate([X_test_lstm_encoded, X_test_xgb], axis=1)

In [None]:

y_pred_lstm = lstm_model.predict([X_seq_test, X_stock_test])
y_pred_labels = np.argmax(y_pred_lstm, axis=1)

print("Test Accuracy:", accuracy_score(y_test_lstm, y_pred_labels))
print(classification_report(y_test_lstm, y_pred_labels))

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 58ms/step
Test Accuracy: 0.5313559322033898
              precision    recall  f1-score   support

           0       0.34      0.12      0.18       522
           1       0.55      0.95      0.70      1227
           2       0.36      0.04      0.08       611

    accuracy                           0.53      2360
   macro avg       0.42      0.37      0.32      2360
weighted avg       0.46      0.53      0.42      2360



In [None]:
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_estimators=400,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    # scale_pos_weight=class_weights[0] / class_weights[1],
    min_child_weight=3,
    # class_weight='balanced',
    early_stopping_rounds=15,
    random_state=SEED
)

xgb_model.fit(X_train_fused, y_train_lstm, eval_set=[(X_val_fused, y_val_lstm)], verbose=True)


[0]	validation_0-mlogloss:1.09744


Parameters: { "use_label_encoder" } are not used.



[1]	validation_0-mlogloss:1.09695
[2]	validation_0-mlogloss:1.09583
[3]	validation_0-mlogloss:1.09488
[4]	validation_0-mlogloss:1.09423
[5]	validation_0-mlogloss:1.09374
[6]	validation_0-mlogloss:1.09303
[7]	validation_0-mlogloss:1.09301
[8]	validation_0-mlogloss:1.09182
[9]	validation_0-mlogloss:1.09126
[10]	validation_0-mlogloss:1.09114
[11]	validation_0-mlogloss:1.09068
[12]	validation_0-mlogloss:1.09007
[13]	validation_0-mlogloss:1.09007
[14]	validation_0-mlogloss:1.09043
[15]	validation_0-mlogloss:1.08975
[16]	validation_0-mlogloss:1.08912
[17]	validation_0-mlogloss:1.08880
[18]	validation_0-mlogloss:1.08843
[19]	validation_0-mlogloss:1.08795
[20]	validation_0-mlogloss:1.08735
[21]	validation_0-mlogloss:1.08729
[22]	validation_0-mlogloss:1.08715
[23]	validation_0-mlogloss:1.08692
[24]	validation_0-mlogloss:1.08674
[25]	validation_0-mlogloss:1.08698
[26]	validation_0-mlogloss:1.08685
[27]	validation_0-mlogloss:1.08666
[28]	validation_0-mlogloss:1.08665
[29]	validation_0-mlogloss:1.

In [59]:
y_probs_xgb = xgb_model.predict_proba(X_test_fused)
y_pred = np.argmax(y_probs_xgb, axis=1)
print("\nTest Accuracy:", accuracy_score(y_test_lstm, y_pred))
print(classification_report(y_test_lstm, y_pred))


Test Accuracy: 0.451271186440678
              precision    recall  f1-score   support

           0       0.28      0.34      0.31       522
           1       0.61      0.59      0.60      1227
           2       0.31      0.27      0.29       611

    accuracy                           0.45      2360
   macro avg       0.40      0.40      0.40      2360
weighted avg       0.46      0.45      0.45      2360



In [None]:


def thresholded_prediction(probs, thresholds):
    probs = np.array(probs)
    thresholds = np.array(thresholds)
    normalized = probs / thresholds
    return np.argmax(normalized, axis=1)


In [61]:
grid = np.linspace(0.2, 0.6, 9)
best_f1 = 0
best_thresholds = (1.0, 1.0, 1.0)

y_proba_val = xgb_model.predict_proba(X_val_fused)

for t0, t1, t2 in product(grid, grid, grid):
    thresholds = (t0, t1, t2)
    y_pred_val = thresholded_prediction(y_proba_val, thresholds)
    f1 = f1_score(y_val_lstm, y_pred_val, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_thresholds = thresholds

print("Best Thresholds:", best_thresholds)
print("Best Macro F1 Score:", best_f1)


Best Thresholds: (np.float64(0.44999999999999996), np.float64(0.44999999999999996), np.float64(0.5))
Best Macro F1 Score: 0.3927688016274031


In [62]:
t0, t1, t2 = best_thresholds
y_proba_test = xgb_model.predict_proba(X_test_fused)
y_pred_test = thresholded_prediction(y_proba_test, best_thresholds)

print("\n Thresholded Test Accuracy:", accuracy_score(y_test_lstm, y_pred_test))
print(classification_report(y_test_lstm, y_pred_test))


 Thresholded Test Accuracy: 0.45
              precision    recall  f1-score   support

           0       0.27      0.39      0.32       522
           1       0.60      0.62      0.61      1227
           2       0.29      0.15      0.20       611

    accuracy                           0.45      2360
   macro avg       0.39      0.39      0.38      2360
weighted avg       0.45      0.45      0.44      2360



In [None]:
alphas = np.linspace(0.2, 1.0, 50)

best_alpha = None
best_acc = 0
best_report = None

for alpha in alphas:
    combined_probs = alpha * y_probs_xgb + (1 - alpha) * y_pred_lstm
    combined_preds = np.argmax(combined_probs, axis=1)

    acc = accuracy_score(y_test_lstm, combined_preds)

    if acc > best_acc:
        best_acc = acc
        best_alpha = alpha
        best_report = classification_report(y_test_lstm, combined_preds, digits=4)

print(f"Best ensemble weight for XGBoost: {best_alpha}")
print(f"Best ensemble test accuracy: {best_acc:.6f}")
print("Classification report:\n", best_report)


Best ensemble weight for XGBoost: 0.2163265306122449
Best ensemble test accuracy: 0.533898
Classification report:
               precision    recall  f1-score   support

           0     0.3542    0.1628    0.2231       522
           1     0.5736    0.8989    0.7003      1227
           2     0.3655    0.1178    0.1782       611

    accuracy                         0.5339      2360
   macro avg     0.4311    0.3932    0.3672      2360
weighted avg     0.4712    0.5339    0.4596      2360



In [64]:
y_probs_xgb
y_pred_lstm
final_probs = best_alpha * y_probs_xgb + (1- best_alpha) * y_pred_lstm
y_ensemble = np.argmax(final_probs, axis=1)
print(accuracy_score(y_test_lstm, y_ensemble))
print(classification_report(y_test_lstm, y_ensemble))

0.5338983050847458
              precision    recall  f1-score   support

           0       0.35      0.16      0.22       522
           1       0.57      0.90      0.70      1227
           2       0.37      0.12      0.18       611

    accuracy                           0.53      2360
   macro avg       0.43      0.39      0.37      2360
weighted avg       0.47      0.53      0.46      2360

