In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, GRU, Concatenate, Flatten, Reshape
from tensorflow.keras.models import Model


# ----------------------------------
# 1️⃣ Load & Cache Data Efficiently
# ----------------------------------
# Use the raw CSV exports unless BOTH pickle caches are already on disk.
if not (os.path.exists('X_train.pkl') and os.path.exists('y_train.pkl')):
    # Cache miss: parse the CSV files, then persist both frames as pickles so
    # that a later run can take the other branch and skip the CSV parse.
    X_train = pd.read_csv('X_train_N1UvY30.csv')
    y_train = pd.read_csv('y_train_or6m3Ta.csv')
    X_train.to_pickle('X_train.pkl')
    y_train.to_pickle('y_train.pkl')
else:
    # Cache hit: restore the frames exactly as they were pickled.
    X_train = pd.read_pickle('X_train.pkl')
    y_train = pd.read_pickle('y_train.pkl')

In [None]:
# ----------------------------------
# 2️⃣ Custom Transformer: Time Step Reshaper
# ----------------------------------
class TimeStepReshaper(BaseEstimator, TransformerMixin):
    """
    Group consecutive rows into fixed-length windows of time steps.

    A 2D input of shape (rows, features) becomes a 3D array of shape
    (rows // time_steps, time_steps, features). Trailing rows that do not
    fill a complete window are dropped.

    Parameters
    ----------
    time_steps : int, default=100
        Number of consecutive rows that form one window.
    """
    def __init__(self, time_steps=100):
        self.time_steps = time_steps

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """
        Reshape ``X`` into windows of ``time_steps`` rows.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Row-major data; rows that belong to the same window must be
            consecutive.

        Returns
        -------
        numpy.ndarray
            Array of shape (num_windows, time_steps, features). It has zero
            windows when ``X`` holds fewer than ``time_steps`` rows.

        Raises
        ------
        ValueError
            If ``time_steps`` is not a positive integer.
        """
        if self.time_steps <= 0:
            raise ValueError(
                f"time_steps must be a positive integer, got {self.time_steps!r}"
            )

        # np.asarray accepts both DataFrames and the plain ndarrays that
        # upstream scikit-learn transformers return.
        values = np.asarray(X)

        # Spell out the feature width instead of using -1: numpy cannot infer
        # an axis when there are zero complete windows (size-0 data).
        features_per_row = int(np.prod(values.shape[1:]))

        num_samples = values.shape[0] // self.time_steps
        usable_rows = num_samples * self.time_steps
        return values[:usable_rows].reshape(
            num_samples, self.time_steps, features_per_row
        )


# ----------------------------------
# 3️⃣ Custom Transformer: OrdinalEncoderTransformer
# ----------------------------------
class OrdinalEncoderTransformer(BaseEstimator, TransformerMixin):
    """
    Ordinal-encode every column of a DataFrame with its own encoder.

    After ``fit``:
      * ``encoders`` maps column name -> fitted ``OrdinalEncoder``
      * ``vocabulary_sizes`` maps column name -> number of categories the
        encoder learned for that column
    """
    def __init__(self):
        self.encoders = {}
        self.vocabulary_sizes = {}

    def fit(self, X, y=None):
        """Fit one ``OrdinalEncoder`` per column of ``X``; ``y`` is ignored."""
        fitted_encoders = {}
        category_counts = {}
        for column in X.columns:
            column_encoder = OrdinalEncoder()
            # Double brackets keep the column 2D, as OrdinalEncoder requires.
            column_encoder.fit(X[[column]])
            fitted_encoders[column] = column_encoder
            category_counts[column] = len(column_encoder.categories_[0])

        self.encoders = fitted_encoders
        self.vocabulary_sizes = category_counts
        return self

    def transform(self, X):
        """
        Encode each column of ``X`` with the encoder fitted under its name.

        Returns a 2D integer array whose columns follow the column order
        of ``X``.
        """
        encoded_columns = []
        for column in X.columns:
            codes = self.encoders[column].transform(X[[column]])
            encoded_columns.append(codes.astype(int))
        return np.column_stack(encoded_columns)


# ----------------------------------
# 4️⃣ Custom Transformer: TensorFlowStockClassifier
# ----------------------------------
class TensorFlowStockClassifier(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style wrapper around a Keras sequence classifier.

    The input is a 3D array (observations, time_steps, features). Along the
    last axis the ordinal-encoded categorical features come first, one column
    per entry of ``cat_feature_names`` and in that order, followed by one
    column per entry of ``num_feature_names``. Any further trailing columns
    are ignored.

    Each categorical feature gets its own embedding; the embeddings and the
    numerical features are concatenated per time step, fed to a forward and
    a backward GRU, flattened and classified by a dense softmax head.

    Parameters
    ----------
    embedding_dim : int, default=8
        Output width of every categorical embedding.
    lstm_units : int, default=64
        Units of each GRU layer (the name is kept for compatibility).
    epochs : int, default=10
        Training epochs passed to ``Model.fit``.
    batch_size : int, default=128
        Batch size passed to ``Model.fit``.
    cat_feature_names : list of str
        Names of the categorical features, in column order.
    num_feature_names : list of str
        Names of the numerical features, in column order.
    num_classes : int, default=24
        Width of the softmax output layer.
    """
    def __init__(self, embedding_dim=8, lstm_units=64, epochs=10, batch_size=128,
                 cat_feature_names=None, num_feature_names=None, num_classes=24):
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.epochs = epochs
        self.batch_size = batch_size
        self.cat_feature_names = cat_feature_names
        self.num_feature_names = num_feature_names
        self.num_classes = num_classes
        self.vocabulary_sizes = None  # Filled in by fit()
        self.model = None  # Keras model, built by fit()

    def _split_inputs(self, X):
        """Split ``X`` into (categorical int32, numerical float32) arrays."""
        if self.cat_feature_names is None or self.num_feature_names is None:
            raise ValueError(
                "cat_feature_names and num_feature_names must both be provided"
            )
        num_cats = len(self.cat_feature_names)
        num_numerical = len(self.num_feature_names)

        X = np.asarray(X)
        if X.ndim != 3 or X.shape[2] < num_cats + num_numerical:
            raise ValueError(
                "X must have shape (observations, time_steps, features) with at "
                f"least {num_cats + num_numerical} features, got shape {X.shape}"
            )

        # Embedding lookups need integer indices; upstream transformers
        # typically hand over a single float (or object) array.
        X_cat = X[:, :, :num_cats].astype(np.int32)
        # Take exactly the declared numerical columns so that extra trailing
        # columns cannot break the fixed input width of the model.
        X_num = X[:, :, num_cats:num_cats + num_numerical].astype(np.float32)
        return X_cat, X_num

    def _resolve_vocabulary_sizes(self, X_cat):
        """Return a dict mapping categorical feature name -> vocabulary size."""
        # Backward compatibility: when the caller attached the enclosing
        # pipeline as `pipeline_`, read the sizes from its fitted encoder.
        pipeline = getattr(self, 'pipeline_', None)
        if pipeline is not None:
            encoder = (
                pipeline.named_steps['preprocessor']
                .named_transformers_['cat']
                .named_steps['encoder']
            )
            return dict(encoder.vocabulary_sizes)

        # Otherwise infer them from the data: ordinal codes start at 0, so
        # the largest code plus one is the number of embedding rows needed.
        return {
            name: int(X_cat[:, :, index].max()) + 1
            for index, name in enumerate(self.cat_feature_names)
        }

    def _build_model(self, time_steps):
        """Create and compile the Keras model for windows of ``time_steps``."""
        num_cats = len(self.cat_feature_names)
        num_numerical = len(self.num_feature_names)

        categorical_input = Input(
            shape=(time_steps, num_cats), dtype='int32', name='categorical_input'
        )
        numerical_input = Input(
            shape=(time_steps, num_numerical), name='numerical_input'
        )

        # One embedding per categorical feature. Embedding maps the
        # (batch, time) index slice straight to (batch, time, embedding_dim).
        embedded_features = [
            Embedding(self.vocabulary_sizes[name], self.embedding_dim)(
                categorical_input[:, :, index]
            )
            for index, name in enumerate(self.cat_feature_names)
        ]

        merged_inputs = Concatenate(axis=-1)(embedded_features + [numerical_input])

        # Forward and backward passes over the sequence. go_backwards=True
        # emits its outputs in reversed time order; no realignment is needed
        # because everything is flattened before the dense head.
        gru_forward = GRU(self.lstm_units, return_sequences=True)(merged_inputs)
        gru_backward = GRU(
            self.lstm_units, return_sequences=True, go_backwards=True
        )(merged_inputs)
        gru_output = Concatenate()([gru_forward, gru_backward])
        flattened_gru_output = Flatten()(gru_output)

        # Dense layers for classification
        dense1 = Dense(64, activation='selu')(flattened_gru_output)
        output = Dense(self.num_classes, activation='softmax')(dense1)

        model = Model(inputs=[categorical_input, numerical_input], outputs=output)
        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
        return model

    def fit(self, X, y):
        """
        Build the Keras model for the shape of ``X`` and train it.

        Parameters
        ----------
        X : array-like of shape (observations, time_steps, features)
        y : array-like
            Integer class label per observation (the model is compiled with
            sparse categorical cross-entropy).

        Returns
        -------
        self
        """
        X_cat_train, X_num_train = self._split_inputs(X)
        self.vocabulary_sizes = self._resolve_vocabulary_sizes(X_cat_train)
        self.model = self._build_model(time_steps=X_cat_train.shape[1])

        self.model.fit(
            [X_cat_train, X_num_train], y,
            epochs=self.epochs, batch_size=self.batch_size, verbose=1,
        )
        return self

    def transform(self, X):
        """Return the class probabilities predicted for each observation."""
        if self.model is None:
            raise RuntimeError(
                "TensorFlowStockClassifier must be fitted before it is used"
            )
        X_cat_test, X_num_test = self._split_inputs(X)
        return self.model.predict([X_cat_test, X_num_test])

    def predict(self, X):
        """Return the most probable class index for each observation."""
        return np.argmax(self.transform(X), axis=-1)


# ----------------------------------
# 5️⃣ Define Preprocessing Pipeline
# ----------------------------------
# Column groups; the order fixes the column layout of the preprocessed
# matrix: categorical columns first, numerical columns after them.
cat_features = ['venue', 'action', 'trade']
num_features = ['bid', 'ask', 'price', 'bid_size', 'ask_size', 'flux']

# Categorical columns are ordinal-encoded, one encoder per column.
categorical_preprocessor = Pipeline([
    ('encoder', OrdinalEncoderTransformer())
])

# Numerical columns are forwarded unchanged.
numerical_preprocessor = Pipeline([
    ('passthrough', 'passthrough')
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_preprocessor, cat_features),
        ('num', numerical_preprocessor, num_features)
    ],
    # Drop every column that is not listed above: the classifier builds its
    # inputs for exactly len(cat_features) + len(num_features) columns, so
    # passing unlisted columns through would widen the matrix beyond that.
    remainder='drop'
)


# ----------------------------------
# 6️⃣ Full Pipeline (Clean and Integrated Fit)
# ----------------------------------
# Number of consecutive rows of X_train that form one observation.
TIME_STEPS = 100

# Step order matters: the ColumnTransformer selects columns by name, so it
# has to see the flat 2D DataFrame BEFORE the rows are grouped into 3D
# windows of time steps.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ColumnTransformer returns a plain ndarray; wrap it in a DataFrame again
    # because TimeStepReshaper slices its input with `.iloc`.
    ('to_frame', FunctionTransformer(pd.DataFrame)),
    ('reshape', TimeStepReshaper(time_steps=TIME_STEPS)),
    ('classifier', TensorFlowStockClassifier(
        embedding_dim=8, lstm_units=64, epochs=10, batch_size=128,
        cat_feature_names=cat_features,
        num_feature_names=num_features
    ))
])

# The classifier reads the vocabulary sizes of the fitted encoder through its
# `pipeline_` attribute. The preprocessor step is fitted before the classifier
# step, so the sizes are available by the time the classifier looks them up.
pipeline.named_steps['classifier'].pipeline_ = pipeline

# One label per complete window of TIME_STEPS rows.
num_observations = len(X_train) // TIME_STEPS

# NOTE(review): assumes y_train holds one row per observation, in the same
# order as the windows of X_train, and that it contains only what the loss
# expects as label -- confirm against the CSV schema.
pipeline.fit(X_train, y_train.iloc[:num_observations])


# To predict (example); X_test must be in the same flat format as X_train.
# probabilities = pipeline.transform(X_test)  # class probabilities per observation