In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc

import tensorflow as tf
import keras
from keras import layers, models, callbacks, losses, optimizers, metrics

2024-12-31 21:11:43.655074: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-31 21:11:43.671580: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735650703.691277  387003 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735650703.696247  387003 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 21:11:43.714414: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPU Devices: ", tf.config.list_physical_devices('GPU'))

Num GPUs Available:  1
GPU Devices:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
class CONFIG:
    seed = 2025
    target_col = "responder_6"
    # data_id is not included as it's not relavant
    feature_cols = [f"feature_{idx:02d}" for idx in range(79)] \
        + [f"responder_{idx}_lag_1" for idx in range(9)]
    categorical_cols = []
    batch_size = 4096

In [4]:
train = pl.scan_parquet("/root/autodl-tmp/jane-street-2024/training.parquet").collect().to_pandas()
valid = pl.scan_parquet("/root/autodl-tmp/jane-street-2024/validation.parquet").collect().to_pandas()
train.shape, valid.shape

((31646824, 103), (1643664, 103))

In [5]:
# Trick of boosting LB score, data leakage on the validation set
train = pd.concat([train, valid]).reset_index(drop=True)
train.shape

(33290488, 103)

In [6]:
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,...,partition_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
0,21489858,1000,0,0,3.324375,-0.276179,-0.655325,-0.40481,-0.349785,-2.882722,...,5,-0.504762,-0.758082,-0.795381,0.124352,0.036644,0.296034,0.321345,0.207008,0.598205
1,21489859,1000,0,1,4.711303,-0.418316,-0.762019,-0.43368,-0.616798,-2.57797,...,5,-0.390356,0.187457,-0.609749,-0.128713,-0.070782,-0.449838,-0.532821,-0.17038,-0.582633
2,21489860,1000,0,2,3.028847,-0.724897,-1.223187,-0.452174,-0.523907,-2.61743,...,5,-0.420631,0.208989,-0.563919,-0.031235,-0.015218,0.298194,0.166585,0.105961,0.160067
3,21489861,1000,0,3,2.099438,-0.717159,-0.259479,-0.522695,-0.066547,-2.712632,...,5,5.0,0.456872,1.149381,0.179879,0.108469,0.089928,-0.113625,-0.033634,-0.246281
4,21489862,1000,0,4,3.166049,-0.377845,-0.360645,-0.641121,-0.508439,-2.661481,...,5,-0.074267,0.026688,0.697218,-0.044805,-0.018261,0.148812,-0.173818,-0.044245,-0.285043


In [7]:
X_train = train[ CONFIG.feature_cols ]
X_train = X_train.ffill().fillna(0).values
y_train = train[ CONFIG.target_col ].values
w_train = train["weight"].values

X_valid = valid[ CONFIG.feature_cols ]
X_valid = X_valid.ffill().fillna(0).values
y_valid = valid[ CONFIG.target_col ].values
w_valid = valid["weight"].values

X_train.shape, y_train.shape, w_train.shape, X_valid.shape, y_valid.shape, w_valid.shape

((33290488, 88),
 (33290488,),
 (33290488,),
 (1643664, 88),
 (1643664,),
 (1643664,))

In [8]:
means = X_train.mean()
stds = X_train.std()

X_train = (X_train - means) / stds
X_valid = (X_valid - means) / stds

In [19]:
np.savez('scaler_params.npz', means=means, stds=stds)

In [9]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, X, y, w, batch_size, **kwargs):
        super().__init__(**kwargs)
        self.X = X
        self.y = y
        self.w = w
        self.batch_size = batch_size
        
    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))
    
    def __getitem__(self, idx):
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.X))
        batch_X = self.X[start_idx:end_idx]
        batch_y = self.y[start_idx:end_idx]
        batch_w = self.w[start_idx:end_idx]
        
        return (
            batch_X,
            {
                'decoder_output': batch_X,
                'prediction': tf.expand_dims(batch_y, -1)
            },
            batch_w
        )

## Define the model

In [10]:
def dense_autoencoder(input_shape):
    
    # Stage 1: Encoder Input
    encoder_input = layers.Input(shape=input_shape)
    # prevent overfitting
    x0 = layers.BatchNormalization()(encoder_input)

    # for data augmentation and to prevent overfitting
    encoded = layers.GaussianNoise(0.05)(x0)
    # Stage 2: Dense => ReLU
    encoded = layers.Dense(512)(encoded)
    encoded = layers.Activation("swish")(encoded)
    encoded = layers.BatchNormalization()(encoded)
    encoded = layers.Dropout(0.3)(encoded)
    
    # Stage 3: Dense => ReLU
    encoded = layers.Dense(256)(encoded)
    encoded = layers.Activation("swish")(encoded)
    encoded = layers.BatchNormalization()(encoded)
    encoded = layers.Dropout(0.3)(encoded)
    
    # Stage 4: Dense => ReLU
    encoded = layers.Dense(64)(encoded)
    encoded = layers.Activation("swish")(encoded)
    encoded = layers.BatchNormalization()(encoded)
    encoded = layers.Dropout(0.3)(encoded)

    # Stage 4: Dense => ReLU
    encoded = layers.Dense(32)(encoded)
    encoded = layers.Activation("swish")(encoded)
    encoded = layers.BatchNormalization()(encoded)
    encoded = layers.Dropout(0.3)(encoded)

    # Stage 5: 32 features as the final output, same as pca
    encoded = layers.Dense(16)(encoded)
    encoded = layers.Activation("swish", name="encoder_output")(encoded)
    
    ##################### Encoder model ########################
    encoder = models.Model(inputs=encoder_input, outputs=encoded)
    ############################################################
    
    # # Stage 6: Decoder Input
    # decoder_input = layers.Input(shape=(32,))

    # Stage 7: Dense => ReLU
    decoded = layers.Dense(32)(encoded)
    decoded = layers.Activation("swish")(decoded)
    decoded = layers.BatchNormalization()(decoded)
    decoded = layers.Dropout(0.3)(decoded)

    # Stage 7: Dense => ReLU
    decoded = layers.Dense(64)(encoded)
    decoded = layers.Activation("swish")(decoded)
    decoded = layers.BatchNormalization()(decoded)
    decoded = layers.Dropout(0.3)(decoded)
    
    # Stage 8: Dense => ReLU
    decoded = layers.Dense(256)(decoded)
    decoded = layers.Activation("swish")(decoded)
    decoded = layers.BatchNormalization()(decoded)
    decoded = layers.Dropout(0.3)(decoded)
    
    # Stage 9: Dense => ReLU
    decoded = layers.Dense(512)(decoded)
    decoded = layers.Activation("swish")(decoded)
    decoded = layers.BatchNormalization()(decoded)
    decoded = layers.Dropout(0.3)(decoded)
    
    # Stage 10: Dense => ReLU
    decoded = layers.Dense(input_shape[0], name="decoder_output")(decoded)
    # decoded = layers.Activation("swish", name="decoder_output")(decoded)
    
    ###################### Decoder Model #######################
    decoder = models.Model(inputs=encoded, outputs=decoded)
    ############################################################
    
    ##################### autoencoder model workflow #####################
    # autoencoder_input = layers.Input(input_shape)
    # encoded_autoencoder = encoder(autoencoder_input) # encoded
    # decoded_autoencoder = decoder(encoded_autoencoder) # decoded

    ################# Prediction branch from bottleneck ####################
    prediction = layers.Dense(64, activation="swish")(encoded)
    prediction = layers.BatchNormalization()(prediction)
    prediction = layers.Dropout(0.3)(prediction)
    prediction = layers.Dense(1)(prediction)
    prediction = layers.Activation('tanh')(prediction)
    prediction_output = layers.Lambda(lambda x: 5 * x, name='prediction')(prediction)

    autoencoder = models.Model(inputs=encoder_input, outputs=[decoded, prediction_output])
    
    autoencoder.compile(optimizer='adam',
                loss={'decoder_output': 'mse',
                      'prediction': 'mse'})
    
    return autoencoder, encoder

In [11]:
# 3. Create generators for training and validation
train_generator = DataGenerator(
    X_train,
    y_train,
    w_train,
    CONFIG.batch_size,
    # workers=8,
    # use_multiprocessing=True
)
valid_generator = DataGenerator(
    X_valid,
    y_valid,
    w_valid,
    CONFIG.batch_size,
    # workers=8,
    # use_multiprocessing=True
)

In [12]:
autoencoder, encoder = dense_autoencoder((X_train.shape[1], ))

I0000 00:00:1735651020.445922  387003 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22456 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:4f:00.0, compute capability: 8.6


In [13]:
ckp = callbacks.ModelCheckpoint(
    "best-js-autoen.weights.h5", 
    monitor='val_decoder_output_loss', 
    verbose=0, 
    save_best_only=True, 
    save_weights_only=True, 
    mode='min'
)

# Add learning rate scheduling
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_decoder_output_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1
)

# Update early stopping
es = callbacks.EarlyStopping(
    monitor='val_decoder_output_loss',
    min_delta=1e-4,
    patience=15,  # Increased patience
    mode='min',
    restore_best_weights=True,
    verbose=1
)

# Train with updated callbacks
history = autoencoder.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=100,
    callbacks=[ckp, es, lr_scheduler],
    verbose=1
)

Epoch 1/100


I0000 00:00:1735651032.989537  387590 service.cc:148] XLA service 0x7f664c002b20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735651032.989627  387590 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-12-31 21:17:13.225840: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1735651033.857985  387590 cuda_dnn.cc:529] Loaded cuDNN version 90300









[1m  18/8128[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:17[0m 10ms/step - decoder_output_loss: 5.5749 - loss: 26.3826 - prediction_loss: 20.8077

I0000 00:00:1735651043.275547  387590 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7200/8128[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m7s[0m 8ms/step - decoder_output_loss: 0.3481 - loss: 2.5713 - prediction_loss: 2.2231












[1m8128/8128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 10ms/step - decoder_output_loss: 0.3207 - loss: 2.4788 - prediction_loss: 2.1580 - val_decoder_output_loss: 0.0142 - val_loss: 1.2021 - val_prediction_loss: 1.1884 - learning_rate: 0.0010
Epoch 2/100
[1m8128/8128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 8ms/step - decoder_output_loss: 0.0219 - loss: 1.5418 - prediction_loss: 1.5199 - val_decoder_output_loss: 0.0084 - val_loss: 1.1815 - val_prediction_loss: 1.1737 - learning_rate: 0.0010
Epoch 3/100
[1m8128/8128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - decoder_output_loss: 0.0137 - loss: 1.4833 - prediction_loss: 1.4696 - val_decoder_output_loss: 0.0045 - val_loss: 1.1762 - val_prediction_loss: 1.1724 - learning_rate: 0.0010
Epoch 4/100
[1m8128/8128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 8ms/step - decoder_output_loss: 0.0121 - loss: 1.5109 - prediction_loss: 1.4989 - val_decoder_output_loss: 0.0065 - val_loss: 

In [11]:
autoencoder, encoder = dense_autoencoder((X_train.shape[1],))
autoencoder.load_weights("/root/autodl-tmp/jane-street-2024/best-js-autoen.weights.h5")

2024-12-31 20:21:06.999121: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-31 20:21:07.100098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22456 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:4f:00.0, compute capability: 8.6


In [14]:
encoded_train = encoder.predict(X_train, batch_size=4096)
encoded_valid = encoder.predict(X_valid, batch_size=4096)

[1m8128/8128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [15]:
# Convert to DataFrame with meaningful column names
encoded_train_df = pd.DataFrame(
    encoded_train,
    columns=[f'encoded_feature_{i}' for i in range(encoded_train.shape[1])]
)
encoded_valid_df = pd.DataFrame(
    encoded_valid,
    columns=[f'encoded_feature_{i}' for i in range(encoded_valid.shape[1])]
)

In [16]:
# Add target column and weights
encoded_train_df[CONFIG.target_col] = train[CONFIG.target_col]
encoded_train_df['weight'] = train['weight']
encoded_train_df['symbol_id'] = train['symbol_id']
encoded_train_df['date_id'] = train['date_id']
encoded_train_df['time_id'] = train['time_id']

encoded_valid_df[CONFIG.target_col] = valid[CONFIG.target_col]
encoded_valid_df['weight'] = valid['weight']
encoded_valid_df['symbol_id'] = valid['symbol_id']
encoded_valid_df['date_id'] = valid['date_id']
encoded_valid_df['time_id'] = valid['time_id']

In [17]:
encoded_train_df.head()

Unnamed: 0,encoded_feature_0,encoded_feature_1,encoded_feature_2,encoded_feature_3,encoded_feature_4,encoded_feature_5,encoded_feature_6,encoded_feature_7,encoded_feature_8,encoded_feature_9,...,encoded_feature_11,encoded_feature_12,encoded_feature_13,encoded_feature_14,encoded_feature_15,responder_6,weight,symbol_id,date_id,time_id
0,-0.168524,-0.057372,0.102818,-0.243955,-0.202454,-0.104967,0.321823,-0.222371,-0.26224,-0.18043,...,-0.258119,-0.242541,-0.157287,-0.14576,0.0056,0.496563,3.324375,0,1000,0
1,-0.162565,-0.060451,0.094373,-0.243832,-0.207173,-0.107619,0.317791,-0.22277,-0.263774,-0.184074,...,-0.257707,-0.245912,-0.156141,-0.148324,-0.013658,0.529877,4.711303,1,1000,0
2,-0.164049,-0.259337,-0.201851,-0.278407,-0.252687,-0.257245,0.225601,-0.24953,-0.273658,-0.24442,...,-0.042617,-0.223356,-0.231683,-0.271294,0.407031,0.746983,3.028847,2,1000,0
3,-0.271798,-0.055786,-0.239978,-0.229345,-0.060237,-0.220229,0.733118,-0.208833,-0.147414,-0.217443,...,-0.012005,-0.107234,-0.278464,-0.162567,-0.258254,0.941218,2.099438,3,1000,0
4,-0.269069,-0.106385,-0.235731,-0.240414,-0.079836,-0.237433,0.665825,-0.209488,-0.180263,-0.203104,...,-0.017777,-0.128438,-0.277465,-0.186881,-0.27737,0.204584,3.166049,4,1000,0


In [18]:
# Save to parquet for efficient storage
encoded_train_df.to_parquet('encoded_train.parquet')
encoded_valid_df.to_parquet('encoded_valid.parquet')

In [12]:
# def create_ae_mlp(num_columns, hidden_units, dropout_rates, lr = 1e-3):
#     # Input layer and initial normalization
#     inp = layers.Input(shape = (num_columns, ))
#     x0 = layers.BatchNormalization()(inp)
    
#     # Encoder
#     encoder = layers.GaussianNoise(dropout_rates[0])(x0)
#     encoder = layers.Dense(hidden_units[0])(encoder)
#     encoder = layers.BatchNormalization()(encoder)
#     encoder = layers.Activation('swish')(encoder)
    
#     # Decoder
#     decoder = layers.Dropout(dropout_rates[1])(encoder)
#     decoder = layers.Dense(num_columns, name = 'decoder')(decoder)

#     # Takes decoder output and makes predictions
#     x_ae = layers.Dense(hidden_units[1])(decoder)
#     x_ae = layers.BatchNormalization()(x_ae)
#     x_ae = layers.Activation('swish')(x_ae)
#     x_ae = layers.Dropout(dropout_rates[2])(x_ae)

#     # out_ae = layers.Dense(num_labels, activation = 'sigmoid', name = 'ae_action')(x_ae)
#     x_ae = layers.Dense(1)(x_ae)
#     x_ae = layers.Activation('tanh')(x_ae)
#     out_ae = layers.Lambda(lambda x: 5 * x, name='ae_reg')(x_ae)
    
#     # # Combines original normalized input with encoded representation
#     # x = layers.Concatenate()([x0, encoder])
#     # x = layers.BatchNormalization()(x)
#     # x = layers.Dropout(dropout_rates[3])(x)
    
#     # # Deep network for main classification
#     # for i in range(2, len(hidden_units)):
#     #     x = layers.Dense(hidden_units[i])(x)
#     #     x = layers.BatchNormalization()(x)
#     #     x = layers.Activation('swish')(x)
#     #     x = layers.Dropout(dropout_rates[i + 2])(x)
        
#     # # out = layers.Dense(num_labels, activation = 'sigmoid', name = 'action')(x)
#     # x = layers.Dense(1)(x)
#     # x = layers.Activation('tanh')(x)
#     # out = layers.Lambda(lambda x: 5 * x, name='reg')(x)
    
#     # model definition and compile
#     # model = models.Model(inputs = inp, outputs = [decoder, out_ae, out])
#     model = models.Model(inputs = inp, outputs = [decoder, out_ae])
#     model.compile(optimizer = optimizers.Adam(learning_rate = lr),
#                   loss = {'decoder': losses.MeanSquaredError(), 
#                           'ae_reg': losses.MeanSquaredError(),
#                           # 'action': losses.BinaryCrossentropy(label_smoothing = ls), 
#                          },
#                   metrics = {'decoder': metrics.MeanAbsoluteError(name = 'MAE'), 
#                              'ae_reg': metrics.MeanSquaredError(name='MSE'), 
#                              # 'action': metrics.AUC(name = 'AUC'), 
#                             }, 
#                  )
    
#     return model

In [13]:
# def create_ae_mlp(num_columns, hidden_units, dropout_rates, lr=1e-3):
#     # Input layer and initial normalization
#     inp = layers.Input(shape=(num_columns,))
#     x0 = layers.BatchNormalization()(inp)
    
#     # Encoder - make it deeper and add regularization
#     encoder = layers.GaussianNoise(dropout_rates[0])(x0)
#     encoder = layers.Dense(hidden_units[0], kernel_regularizer=keras.regularizers.l2(1e-5))(encoder)
#     encoder = layers.BatchNormalization()(encoder)
#     encoder = layers.Activation('swish')(encoder)
#     encoder = layers.Dropout(dropout_rates[1])(encoder)
    
#     # Add another encoder layer
#     encoder = layers.Dense(hidden_units[0]//2, kernel_regularizer=keras.regularizers.l2(1e-5))(encoder)
#     encoder = layers.BatchNormalization()(encoder)
#     encoder = layers.Activation('swish')(encoder)
    
#     # Decoder with skip connection
#     decoder = layers.Dense(num_columns)(encoder)
#     decoder = layers.Add(name='decoder')([decoder, x0])  # Skip connection to help reconstruction
    
#     # Regression path - make it more focused on the regression task
#     x_ae = layers.Dense(hidden_units[1])(decoder)
#     x_ae = layers.BatchNormalization()(x_ae)
#     x_ae = layers.Activation('swish')(x_ae)
#     x_ae = layers.Dropout(dropout_rates[2])(x_ae)
    
#     # Additional layer for regression
#     x_ae = layers.Dense(hidden_units[1]//2)(x_ae)
#     x_ae = layers.BatchNormalization()(x_ae)
#     x_ae = layers.Activation('swish')(x_ae)
    
#     # Output with tanh and scaling
#     x_ae = layers.Dense(1)(x_ae)
#     x_ae = layers.Activation('tanh')(x_ae)
#     out_ae = layers.Lambda(lambda x: 5 * x, name='ae_reg')(x_ae)
    
#     model = models.Model(inputs=inp, outputs=[decoder, out_ae])
    
#     # Compile with adjusted loss weights
#     model.compile(
#         optimizer=optimizers.Adam(learning_rate=lr),
#         loss={
#             'decoder': losses.MeanSquaredError(),
#             'ae_reg': losses.MeanSquaredError()
#         },
#         # loss_weights={
#         #     'decoder': 0.1,  # Reduce reconstruction loss weight
#         #     'ae_reg': 1.0    # Focus more on regression task
#         # },
#         metrics={
#             'decoder': metrics.MeanAbsoluteError(name='MAE'),
#             'ae_reg': metrics.MeanSquaredError(name='MSE')
#         }
#     )
    
#     return model

In [14]:
# # params = {'num_columns': len(CONFIG.feature_cols), 
# #           # 'num_labels': 5, 
# #           'hidden_units': [96, 96, 896, 448, 448, 256], 
# #           'dropout_rates': [0.035, 0.038, 0.424, 0.104, 0.492, 0.320, 0.271, 0.437],
# #           # 'ls': 0, 
# #           'lr':1e-3, 
# #          }

# # Update parameters
# params = {
#     'num_columns': len(CONFIG.feature_cols),
#     'hidden_units': [128, 64],  # Simplified architecture
#     'dropout_rates': [0.01, 0.1, 0.2],  # Adjusted dropout
#     'lr': 1e-4  # Lower learning rate
# }

In [15]:
# model = create_ae_mlp(**params)

# ckp = callbacks.ModelCheckpoint(
#     "best-js-autoen.weights.h5", 
#     monitor='val_ae_reg_MSE', 
#     verbose=0, 
#     save_best_only=True, 
#     save_weights_only=True, 
#     mode='min'
# )

# # Add learning rate scheduling
# lr_scheduler = callbacks.ReduceLROnPlateau(
#     monitor='val_ae_reg_MSE',
#     factor=0.5,
#     patience=5,
#     min_lr=1e-6,
#     verbose=1
# )

# # Update early stopping
# es = callbacks.EarlyStopping(
#     monitor='val_ae_reg_MSE',
#     min_delta=1e-4,
#     patience=15,  # Increased patience
#     mode='min',
#     restore_best_weights=True,
#     verbose=1
# )

# # Train with updated callbacks
# history = model.fit(
#     train_generator,
#     validation_data=valid_generator,
#     epochs=100,
#     callbacks=[ckp, es, lr_scheduler],
#     verbose=1
# )

In [16]:
# model = create_ae_mlp(**params)
# ckp = callbacks.ModelCheckpoint("best-js-autoen.weights.h5", monitor = 'val_ae_reg_MSE', verbose = 0, 
#                               save_best_only = True, save_weights_only = True, mode = 'min')
# es = callbacks.EarlyStopping(monitor = 'val_ae_reg_MSE', min_delta = 1e-4, patience = 10, mode = 'min', 
#                            baseline = None, restore_best_weights = True, verbose = 0)

# history = model.fit(
#     train_generator,
#     validation_data=valid_generator,
#     epochs=100,
#     callbacks=[ckp, es],
#     verbose=1
# )

#         keras.backend.clear_session()
#         del model
#         rubbish = gc.collect()

In [17]:
# if not TEST:
#     scores = []
#     batch_size = 4096
#     gkf = PurgedGroupTimeSeriesSplit(n_splits = n_splits, group_gap = group_gap)
#     for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
#         ckp_path = f'JSModel_{fold}.hdf5'
#         model = create_ae_mlp(**params)
#         ckp = callbacks.ModelCheckpoint(ckp_path, monitor = 'val_action_AUC', verbose = 0, 
#                               save_best_only = True, save_weights_only = True, mode = 'max')
#         es = callbacks.EarlyStopping(monitor = 'val_action_AUC', min_delta = 1e-4, patience = 10, mode = 'max', 
#                            baseline = None, restore_best_weights = True, verbose = 0)
#         history = model.fit(X[tr], [X[tr], y[tr], y[tr]], validation_data = (X[te], [X[te], y[te], y[te]]), 
#                             sample_weight = sw[tr], 
#                             epochs = 100, batch_size = batch_size, callbacks = [ckp, es], verbose = 0)
#         hist = pd.DataFrame(history.history)
#         score = hist['val_action_AUC'].max()
#         print(f'Fold {fold} ROC AUC:\t', score)
#         scores.append(score)

#         keras.backend.clear_session()
#         del model
#         rubbish = gc.collect()
    
#     print('Weighted Average CV Score:', weighted_average(scores))