In [1]:
import os

# Locations of the pickled training frames. The defaults point at the Kaggle
# dataset mount; set the X_TRAIN_PICKLE_PATH / Y_TRAIN_PICKLE_PATH environment
# variables to run the notebook elsewhere without editing code.
X_TRAIN_PICKLE_PATH = os.getenv('X_TRAIN_PICKLE_PATH', '/kaggle/input/cfm-gts/X_train.pkl')
Y_TRAIN_PICKLE_PATH = os.getenv('Y_TRAIN_PICKLE_PATH', '/kaggle/input/cfm-gts/y_train.pkl')

In [2]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, GRU, Concatenate, Flatten, Reshape, TimeDistributed
from tensorflow.keras.models import Model


# ----------------------------------
# 1️⃣ Load & Cache Data Efficiently
# ----------------------------------

# Fail fast with a clear message when an input pickle is missing; silently
# skipping the load would only surface later as a NameError on X_train / y_train.
_missing_inputs = [
    path for path in (X_TRAIN_PICKLE_PATH, Y_TRAIN_PICKLE_PATH)
    if not os.path.exists(path)
]
if _missing_inputs:
    raise FileNotFoundError(f"Training pickle(s) not found: {', '.join(_missing_inputs)}")

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
X_train = pd.read_pickle(X_TRAIN_PICKLE_PATH)
y_train = pd.read_pickle(Y_TRAIN_PICKLE_PATH)

In [3]:
# Preview the first rows of the raw event table to check the available columns.
X_train.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,4,0,A,A,0.3,0.0,0.01,100,1,False,100
1,0,4,1,A,B,-0.17,0.0,0.01,100,1,False,100
2,0,4,2,D,A,0.28,0.0,0.01,100,1,False,-100
3,0,4,3,A,A,0.3,0.0,0.01,100,1,False,100
4,0,4,4,D,A,0.37,0.0,0.01,100,1,False,-100


In [4]:
import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
import random
from sklearn.model_selection import train_test_split

# Constants for the embedding dimensions
# NOTE(review): none of the three *_EMBED_DIM values is referenced by the code
# visible in this notebook (the vectorizer hard-codes a slot width of 8) —
# confirm whether they are still needed before changing them.
VENUE_EMBED_DIM = 8
ACTION_EMBED_DIM = 8
TRADE_EMBED_DIM = 8
# Number of events per observation; used as the default seq_length of the
# preprocessing steps and as the time dimension of the model input.
SEQ_LENGTH = 100  # Define sequence length as a global constant

class OrderBookSequenceReshaper(BaseEstimator, TransformerMixin):
    """
    Splits a flat table of order book events into one frame per observation.

    Input: DataFrame of events carrying an 'obs_id' column
    Output: dict mapping obs_id -> DataFrame of exactly `seq_length` events,
            re-indexed from 0. Observations of any other length are left out
            and reported with a printed warning.
    """
    def __init__(self, seq_length=SEQ_LENGTH):
        self.seq_length = seq_length

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X):
        kept = {}

        for key, events in X.groupby('obs_id'):
            n_events = len(events)

            # Skip (but report) observations that do not have the expected length
            if n_events != self.seq_length:
                print(f"Warning: Observation {key} has {n_events} events, expected {self.seq_length}")
                continue

            kept[key] = events.reset_index(drop=True)

        return kept

class FeatureVectorizer(BaseEstimator, TransformerMixin):
    """
    Transforms sequences of order book events into tensors of shape (seq_length, 30).

    Column layout of every event row:
        0-7    one-hot venue   (mapping index taken modulo 8)
        8-15   one-hot action  (mapping index taken modulo 8)
        16-17  one-hot trade flag (columns 18-23 are left at zero)
        24-26  bid, ask and price, each minus the first bid of the sequence
        27-28  log1p(bid_size), log1p(ask_size)
        29     sign(flux) * log1p(|flux|)

    NOTE: because of the modulo, more than 8 distinct venues (or actions)
    share one-hot slots.
    """
    # Width of the output rows and of each one-hot slot. The model input layer
    # is built with a literal 30, so these are deliberately not tunable here.
    _N_FEATURES = 30
    _ONE_HOT_WIDTH = 8

    def __init__(self, seq_length=SEQ_LENGTH):
        self.venue_mapping = None
        self.action_mapping = None
        self.seq_length = seq_length

    def fit(self, sequences, y=None):
        """Learn the venue -> index and action -> index mappings.

        `sequences` is a dict of obs_id -> DataFrame with 'venue' and 'action'
        columns. Indices follow the sorted order of the distinct values.
        """
        all_venues = set()
        all_actions = set()

        for seq in sequences.values():
            all_venues.update(seq['venue'].unique())
            all_actions.update(seq['action'].unique())

        self.venue_mapping = {v: i for i, v in enumerate(sorted(all_venues))}
        self.action_mapping = {a: i for i, a in enumerate(sorted(all_actions))}

        return self

    @staticmethod
    def _encode(column, mapping, label):
        """Map a categorical column to integer codes; KeyError on values not seen in fit."""
        codes = column.map(mapping)
        unseen = codes.isna()
        if unseen.any():
            unknown = sorted({str(value) for value in column[unseen]})
            raise KeyError(
                f"Unseen {label} value(s) {unknown}; fit the vectorizer on data that contains them"
            )
        return codes.to_numpy().astype(int)

    def transform(self, sequences):
        """Vectorize every sequence.

        Returns a tuple (tensor, obs_ids): `tensor` has shape
        (n_sequences, seq_length, 30) and `obs_ids` lists the dict keys in the
        same order. Rows are taken by position, so the frames need not carry a
        0..n-1 index; a frame shorter than seq_length is zero-padded at the end
        and one longer than seq_length is truncated.

        Raises KeyError when a venue or action was not seen during fit.
        """
        width = self._ONE_HOT_WIDTH
        result = []
        obs_ids = []

        for obs_id, seq in sequences.items():
            events = seq.iloc[:self.seq_length]
            n_rows = len(events)
            seq_tensor = np.zeros((self.seq_length, self._N_FEATURES))

            if n_rows:
                rows = np.arange(n_rows)

                # One-hot blocks (venue, action, trade flag)
                venue_idx = self._encode(events['venue'], self.venue_mapping, 'venue')
                action_idx = self._encode(events['action'], self.action_mapping, 'action')
                trade_idx = events['trade'].to_numpy().astype(bool).astype(int)
                seq_tensor[rows, venue_idx % width] = 1
                seq_tensor[rows, width + action_idx % width] = 1
                seq_tensor[rows, 2 * width + trade_idx] = 1

                # Prices are expressed relative to the first bid of the sequence
                first_bid = events['bid'].iloc[0]
                seq_tensor[:n_rows, 24] = events['bid'].to_numpy(dtype=float) - first_bid
                seq_tensor[:n_rows, 25] = events['ask'].to_numpy(dtype=float) - first_bid
                seq_tensor[:n_rows, 26] = events['price'].to_numpy(dtype=float) - first_bid

                # Sizes are log-compressed
                seq_tensor[:n_rows, 27] = np.log1p(events['bid_size'].to_numpy(dtype=float))
                seq_tensor[:n_rows, 28] = np.log1p(events['ask_size'].to_numpy(dtype=float))

                # Flux can be negative, so compress its magnitude and keep the sign
                flux = events['flux'].to_numpy(dtype=float)
                seq_tensor[:n_rows, 29] = np.sign(flux) * np.log1p(np.abs(flux))

            result.append(seq_tensor)
            obs_ids.append(obs_id)

        if not result:
            # Keep a well-formed 3-D shape even when there is nothing to vectorize
            return np.zeros((0, self.seq_length, self._N_FEATURES)), obs_ids

        return np.array(result), obs_ids

class OrderBookEmbeddingModel(BaseEstimator, TransformerMixin):
    """
    GRU-based classifier over vectorized order book sequences.

    The network reads each (SEQ_LENGTH, 30) sequence with one forward and one
    reversed GRU, concatenates their final states and classifies them into
    `n_categories` classes.

    NOTE(review): despite the class name, the network built here contains no
    Embedding layer, and n_venues / n_actions are stored but not read by any
    method of this class.
    """
    def __init__(self, n_venues, n_actions, n_categories=24, batch_size=128, learning_rate=3e-3, n_batches=10000):
        self.n_venues = n_venues
        self.n_actions = n_actions
        self.n_categories = n_categories
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.n_batches = n_batches
        self.model = None

    def create_model(self):
        """Build and compile the Keras network; returns the compiled Model."""
        sequence_in = Input(shape=(SEQ_LENGTH, 30))

        # Two GRUs over the same input: one in order, one reading it reversed
        gru_fwd = GRU(64, return_sequences=False)(sequence_in)
        gru_bwd = GRU(64, return_sequences=False, go_backwards=True)(sequence_in)
        merged = Concatenate()([gru_fwd, gru_bwd])

        # Classification head
        hidden = Dense(64, activation='selu')(merged)
        class_probs = Dense(self.n_categories, activation='softmax')(hidden)

        network = Model(inputs=sequence_in, outputs=class_probs)
        network.compile(
            optimizer=Adam(learning_rate=self.learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return network

    def fit(self, X, y):
        """Train for `n_batches` randomly drawn mini-batches.

        X is the tensor array produced by the vectorizer and y the matching
        integer labels; both are indexed with a list of positions.
        """
        self.model = self.create_model()
        self.model.summary()

        n_samples = len(X)
        # With too little data every step simply trains on the full set
        use_full_set = n_samples <= self.batch_size

        for step in range(self.n_batches):
            if use_full_set:
                X_batch, y_batch = X, y
            else:
                picked = random.sample(range(n_samples), self.batch_size)
                X_batch, y_batch = X[picked], y[picked]

            loss, acc = self.model.train_on_batch(X_batch, y_batch)

            # Progress line on every tenth step
            if step % 10 == 0:
                print(f"Batch {step}/{self.n_batches} completed - Loss: {loss:.4f}, Accuracy: {acc:.4f}")

        return self

    def transform(self, X):
        """Return the per-class probabilities predicted for X."""
        class_probabilities = self.model.predict(X)
        return class_probabilities

    def predict(self, X):
        """Return the index of the most probable class for each sample of X."""
        class_probabilities = self.model.predict(X)
        return np.argmax(class_probabilities, axis=1)

# Define the full pipeline
def create_order_book_pipeline():
    """Build the preprocessing pipeline: a 'reshaper' step followed by a 'vectorizer' step.

    The classifier is not part of the pipeline; the training code creates it
    separately once the preprocessing steps have been fitted.
    """
    preprocessing_steps = [
        ('reshaper', OrderBookSequenceReshaper()),
        ('vectorizer', FeatureVectorizer()),
    ]
    return Pipeline(preprocessing_steps)

# Training script with subset selection
def train_order_book_model(X_train, y_train, n_observations=200, n_batches=100):
    """
    Train the model on a random subset of observations.

    Parameters:
    - X_train: DataFrame with order book events (must contain an 'obs_id' column)
    - y_train: DataFrame with labels; the first column is read as the
      observation ID and the second as the label
    - n_observations: Number of observations to use for training
    - n_batches: Number of batches to train for

    Returns:
    - (model, pipeline). model is None when no selected observation has the
      sequence length the reshaper expects.
    """
    print(f"Selecting {n_observations} random observations for training...")

    # Get unique observation IDs
    all_obs_ids = X_train['obs_id'].unique()

    # Select a random subset of observation IDs
    if len(all_obs_ids) > n_observations:
        selected_obs_ids = np.random.choice(all_obs_ids, size=n_observations, replace=False)
    else:
        selected_obs_ids = all_obs_ids
        print(f"Warning: Only {len(all_obs_ids)} observations available")

    # Filter events to the selected observations (labels are looked up by ID below)
    X_train_subset = X_train[X_train['obs_id'].isin(selected_obs_ids)]

    print(f"Selected data shape: {X_train_subset.shape}")

    # Step 1: Data preprocessing
    pipeline = create_order_book_pipeline()
    sequences = pipeline.named_steps['reshaper'].fit_transform(X_train_subset)
    print(f"Generated {len(sequences)} valid sequences")

    # Nothing to train on: bail out the same way main() does
    if len(sequences) == 0:
        print("No valid training sequences found. Check your data.")
        return None, pipeline

    X_tensors, sequence_obs_ids = pipeline.named_steps['vectorizer'].fit_transform(sequences)
    print(f"Tensor shape: {X_tensors.shape}")

    # Step 2: Map observation IDs to labels, in the order of the tensors
    y_dict = dict(zip(y_train.iloc[:, 0], y_train.iloc[:, 1]))
    y_values = np.array([y_dict[obs_id] for obs_id in sequence_obs_ids])
    print(f"Label shape: {y_values.shape}")

    # Step 3: Create and train the model
    n_venues = len(pipeline.named_steps['vectorizer'].venue_mapping)
    n_actions = len(pipeline.named_steps['vectorizer'].action_mapping)

    print(f"Unique venues: {n_venues}, Unique actions: {n_actions}")

    model = OrderBookEmbeddingModel(
        n_venues=n_venues,
        n_actions=n_actions,
        n_batches=n_batches
    )

    print(f"Starting training for {n_batches} batches...")
    model.fit(X_tensors, y_values)

    return model, pipeline

# Main function with small subset training
def main(n_observations=500, n_batches=100, test_size=0.2):
    """
    Run the pipeline on a small subset of data with proper train/val splitting.

    Reads the notebook-level `X_train` (events) and `y_train` (labels; first
    column observation ID, second column label) frames.

    Parameters:
    - n_observations: maximum number of observations drawn before splitting
    - n_batches: number of training batches
    - test_size: fraction of the drawn observations held out for validation

    Returns:
    - (model, pipeline). model is None when no training observation has the
      sequence length the reshaper expects.
    """
    print("Starting training pipeline with small subset...")

    # Get all unique observation IDs first
    all_obs_ids = X_train['obs_id'].unique()

    # Limit to n_observations if specified
    if len(all_obs_ids) > n_observations:
        all_selected_ids = np.random.choice(all_obs_ids, size=n_observations, replace=False)
    else:
        all_selected_ids = all_obs_ids

    # Split IDs into train and validation BEFORE any processing
    train_ids, val_ids = train_test_split(all_selected_ids, test_size=test_size, random_state=42)

    print(f"Selected {len(train_ids)} observations for training and {len(val_ids)} for validation")

    # Filter events for training (labels are looked up by ID further down)
    X_train_subset = X_train[X_train['obs_id'].isin(train_ids)]

    # Create pipeline for preprocessing
    pipeline = create_order_book_pipeline()

    # Process training data
    train_sequences = pipeline.named_steps['reshaper'].fit_transform(X_train_subset)
    print(f"Generated {len(train_sequences)} valid sequences for training from {len(train_ids)} observations")

    # If no valid sequences, return early
    if len(train_sequences) == 0:
        print("No valid training sequences found. Check your data.")
        return None, pipeline

    X_train_tensors, train_obs_ids = pipeline.named_steps['vectorizer'].fit_transform(train_sequences)

    # Map observation IDs to labels
    y_dict = dict(zip(y_train.iloc[:, 0], y_train.iloc[:, 1]))
    y_train_values = np.array([y_dict[obs_id] for obs_id in train_obs_ids])

    # Create and train model
    n_venues = len(pipeline.named_steps['vectorizer'].venue_mapping)
    n_actions = len(pipeline.named_steps['vectorizer'].action_mapping)

    print(f"Unique venues: {n_venues}, Unique actions: {n_actions}")
    print(f"Training tensor shape: {X_train_tensors.shape}")
    print(f"Training labels shape: {y_train_values.shape}")

    model = OrderBookEmbeddingModel(
        n_venues=n_venues,
        n_actions=n_actions,
        n_batches=n_batches
    )

    print(f"Starting training for {n_batches} batches...")
    model.fit(X_train_tensors, y_train_values)

    # Process validation data if available
    if len(val_ids) > 0:
        print(f"\nValidating on {len(val_ids)} observations...")

        X_val_subset = X_train[X_train['obs_id'].isin(val_ids)]

        # Validation data goes through the steps fitted on the training split only
        val_sequences = pipeline.named_steps['reshaper'].transform(X_val_subset)
        print(f"Generated {len(val_sequences)} valid sequences for validation from {len(val_ids)} observations")

        if len(val_sequences) == 0:
            print("No valid validation sequences found. Skipping validation.")
            return model, pipeline

        X_val_tensors, val_obs_ids = pipeline.named_steps['vectorizer'].transform(val_sequences)

        # Map observation IDs to labels
        y_val_values = np.array([y_dict[obs_id] for obs_id in val_obs_ids])

        # Make predictions
        print(f"Making predictions on {len(val_obs_ids)} validation samples...")
        val_predictions = model.predict(X_val_tensors)

        # Calculate accuracy
        accuracy = (val_predictions == y_val_values).mean()
        print(f"Validation accuracy: {accuracy:.4f}")

    return model, pipeline

In [5]:
model, pipeline = main(n_observations=1000, n_batches=100)

# main() signals failure by returning None for the model, so test for None
# explicitly instead of relying on the truthiness of the estimator objects.
if model is not None and pipeline is not None:
    with open('order_book_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    with open('order_book_pipeline.pkl', 'wb') as f:
        pickle.dump(pipeline, f)

Starting training pipeline with small subset...
Selected 800 observations for training and 200 for validation
Generated 800 valid sequences for training from 800 observations
Unique venues: 6, Unique actions: 3
Training tensor shape: (800, 100, 30)
Training labels shape: (800,)
Starting training for 100 batches...


Batch 0/100 completed - Loss: 3.7044, Accuracy: 0.0234
Batch 10/100 completed - Loss: 3.3034, Accuracy: 0.0597
Batch 20/100 completed - Loss: 3.1776, Accuracy: 0.0785
Batch 30/100 completed - Loss: 3.1058, Accuracy: 0.0950
Batch 40/100 completed - Loss: 3.0350, Accuracy: 0.1183
Batch 50/100 completed - Loss: 2.9643, Accuracy: 0.1368
Batch 60/100 completed - Loss: 2.8901, Accuracy: 0.1507
Batch 70/100 completed - Loss: 2.8243, Accuracy: 0.1655
Batch 80/100 completed - Loss: 2.7608, Accuracy: 0.1810
Batch 90/100 completed - Loss: 2.6986, Accuracy: 0.1963

Validating on 200 observations...
Generated 200 valid sequences for validation from 200 observations
Making predictions on 200 validation samples...
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Validation accuracy: 0.1500


In [12]:
!pip install fastapi uvicorn nest-asyncio pyngrok

from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel, Field
from typing import List, Dict, Union, Optional
import numpy as np
import pandas as pd
import pickle
import os
from datetime import datetime
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# FastAPI application
app = FastAPI(
    title="Order Book Prediction API",
    description="API for order book sequence classification",
    version="1.0.0"
)

# Path to model and pipeline files
# Both can be overridden through environment variables; the defaults are the
# file names written by the training cell.
MODEL_PATH = os.getenv("MODEL_PATH", "order_book_model.pkl")
PIPELINE_PATH = os.getenv("PIPELINE_PATH", "order_book_pipeline.pkl")

# Load model and pipeline at startup
# NOTE(review): these two assignments rebind the notebook-level `model` and
# `pipeline` names (set by the training cell) to None. The API code visible
# here reads the artifacts through ModelLoader instead, so these globals look
# unused — confirm before removing them.
model = None
pipeline = None

class ModelLoader:
    """Loads the pickled model and preprocessing pipeline and caches them.

    The two artifacts are always replaced together: a failed (re)load leaves
    the previously loaded pair, and its load time, untouched.
    """
    def __init__(self):
        self._model = None
        self._pipeline = None
        self._load_time = None

    def load_model(self):
        """Load model and pipeline from MODEL_PATH / PIPELINE_PATH.

        Returns True on success. On any error the exception is logged, the
        cached objects are left unchanged and False is returned.
        """
        try:
            # SECURITY: pickle.load executes arbitrary code from the file;
            # only point MODEL_PATH / PIPELINE_PATH at trusted artifacts.
            with open(MODEL_PATH, 'rb') as f:
                loaded_model = pickle.load(f)

            with open(PIPELINE_PATH, 'rb') as f:
                loaded_pipeline = pickle.load(f)
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return False

        # Swap both artifacts only after both were read successfully, so a
        # half-finished reload can never pair a new model with an old pipeline.
        self._model = loaded_model
        self._pipeline = loaded_pipeline
        self._load_time = datetime.now()
        logger.info(f"Model loaded successfully at {self._load_time}")
        return True

    @property
    def model(self):
        """The cached model; attempts a load first if nothing is cached (may return None)."""
        if self._model is None:
            self.load_model()
        return self._model

    @property
    def pipeline(self):
        """The cached pipeline; attempts a load first if nothing is cached (may return None)."""
        if self._pipeline is None:
            self.load_model()
        return self._pipeline

    @property
    def load_time(self):
        """Datetime of the last successful load, or None if there was none."""
        return self._load_time

# Initialize model loader
model_loader = ModelLoader()

# Pydantic models for request/response
# One order book event as submitted to /predict. The field names become the
# DataFrame columns handed to the preprocessing steps.
class OrderBookEvent(BaseModel):
    # NOTE(review): venue is declared as str, while the training frame preview
    # shows it as a bare number — confirm the fitted venue mapping uses keys
    # that match what clients send.
    venue: str
    action: str
    trade: bool
    bid: float
    ask: float
    price: float
    bid_size: float
    ask_size: float
    flux: float

# Request body of /predict: exactly 100 events (min_items == max_items == 100).
# The 100 is a literal here; it is not derived from SEQ_LENGTH or from the
# seq_length of the loaded pipeline.
class OrderBookSequence(BaseModel):
    events: List[OrderBookEvent] = Field(..., min_items=100, max_items=100,
                                        description="Sequence of 100 order book events")

# Response body of /predict.
class PredictionResponse(BaseModel):
    prediction: int  # index of the most probable class
    prediction_probability: float  # model probability of that class
    processing_time_ms: float  # time spent handling the request, in milliseconds
    timestamp: str  # ISO-8601 time at which the response was built

# Response body of the root status endpoint.
class StatusResponse(BaseModel):
    status: str
    uptime: str  # str() of the timedelta since the app start time was recorded
    model_loaded: bool
    model_load_time: Optional[str] = None  # ISO-8601; absent until a load succeeded

# Response body of /health. `status` carries an HTTP-style code (200 or 503)
# inside the JSON payload.
class HealthResponse(BaseModel):
    status: int
    message: str

# Dependency to ensure model is loaded
def get_model():
    """FastAPI dependency returning the loaded model; answers 503 when none is available."""
    loaded_model = model_loader.model
    if loaded_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return loaded_model

def get_pipeline():
    """FastAPI dependency returning the loaded pipeline; answers 503 when none is available."""
    loaded_pipeline = model_loader.pipeline
    if loaded_pipeline is None:
        raise HTTPException(status_code=503, detail="Pipeline not loaded")
    return loaded_pipeline

app_start_time = datetime.now()

@app.on_event("startup")
async def startup_event():
    """Try to load the model when the server starts; a failure is only logged."""
    if model_loader.load_model():
        return
    logger.warning("Failed to load model at startup. Will attempt to load on first request.")

@app.get("/", response_model=StatusResponse)
def read_root():
    """Get API status.

    Reports the uptime since `app_start_time`, whether a model is available
    (reading `model_loader.model` triggers a load attempt if none is cached)
    and, once a load has succeeded, its ISO-8601 load time.
    """
    response = {
        "status": "running",
        "uptime": str(datetime.now() - app_start_time),
        "model_loaded": model_loader.model is not None
    }

    # Only present after at least one successful load
    if model_loader.load_time:
        response["model_load_time"] = model_loader.load_time.isoformat()

    return response

@app.get("/health", response_model=HealthResponse)
def health_check():
    """Report whether a model is available.

    The 200 / 503 code is carried in the `status` field of the body; this
    handler does not set the HTTP status code of the response itself.
    """
    model_ready = model_loader.model is not None
    if model_ready:
        return HealthResponse(status=200, message="OK")
    return HealthResponse(status=503, message="Model not loaded")

@app.post("/predict", response_model=PredictionResponse)
def predict(sequence: OrderBookSequence,
            model=Depends(get_model),
            pipeline=Depends(get_pipeline)):
    """Make prediction for a new observation.

    Runs the submitted events through the loaded reshaper and vectorizer and
    returns the most probable class with its probability.

    Raises:
    - HTTPException 422 when the number of events does not match the sequence
      length the loaded reshaper expects
    - HTTPException 500 for any other failure during preprocessing or inference
    """
    start_time = datetime.now()

    try:
        # Convert pydantic model to DataFrame
        df = pd.DataFrame([event.dict() for event in sequence.events])

        # Add observation ID column required by the pipeline
        df['obs_id'] = 'new_observation'

        reshaper = pipeline.named_steps['reshaper']
        vectorizer = pipeline.named_steps['vectorizer']

        # The API receives venue/action as strings, while the vectorizer may
        # have been fitted on other key types (e.g. numeric venue ids). Map each
        # incoming value onto the fitted key with the same text form; values
        # without a match are left as-is and rejected by the vectorizer.
        for column, mapping in (('venue', vectorizer.venue_mapping),
                                ('action', vectorizer.action_mapping)):
            fitted_by_text = {str(key): key for key in (mapping or {})}
            df[column] = df[column].map(lambda value: fitted_by_text.get(value, value))

        # The reshaper groups a DataFrame by obs_id, so it must be given the
        # frame itself (not a dict) and returns {'new_observation': frame}.
        sequence_dict = reshaper.transform(df)
        if not sequence_dict:
            raise HTTPException(
                status_code=422,
                detail=f"Expected {reshaper.seq_length} events, got {len(df)}"
            )
        X_tensor, _ = vectorizer.transform(sequence_dict)

        # Make prediction
        probabilities = model.model.predict(X_tensor)[0]
        prediction = np.argmax(probabilities)
        prediction_prob = float(probabilities[prediction])

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds() * 1000

        return {
            "prediction": int(prediction),
            "prediction_probability": prediction_prob,
            "processing_time_ms": processing_time,
            "timestamp": datetime.now().isoformat()
        }

    except HTTPException:
        # Deliberate HTTP errors must not be re-wrapped as a 500 below
        raise
    except Exception as e:
        logger.error(f"Error making prediction: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

@app.post("/reload-model")
def reload_model():
    """Re-read the model and pipeline files; answers 500 when loading fails."""
    if model_loader.load_model():
        return {"message": "Model reloaded successfully", "load_time": model_loader.load_time.isoformat()}
    raise HTTPException(status_code=500, detail="Failed to reload model")

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-18' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    s



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


In [None]:
from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel, Field
from typing import List, Dict, Union, Optional
import numpy as np
import pandas as pd
import pickle
import os
from datetime import datetime
import logging
import nest_asyncio
from pyngrok import ngrok
import uvicorn
from google.colab import userdata

# Read the ngrok token from the Colab secret store (never hard-code it)
ngrok_auth_token = userdata.get('NGROK_AUTH')

# Only configure ngrok when a token was actually found; passing an empty
# value on would just hide the problem until the tunnel is opened.
if ngrok_auth_token:
    print("Ngrok Auth Token loaded successfully")
    ngrok.set_auth_token(ngrok_auth_token)
else:
    print("Ngrok Auth Token not found")

# The FastAPI `app` object served below is defined in the previous cell.

def run_api(port=8000):
    """Expose the FastAPI app through an ngrok tunnel and serve it until stopped.

    Parameters:
    - port: local port used both for the tunnel target and for uvicorn

    The tunnel is closed again when the server stops, including when the cell
    is interrupted.
    """
    # Let uvicorn start its event loop inside the notebook's running loop
    nest_asyncio.apply()

    # Create a tunnel to the localhost
    tunnel = ngrok.connect(port)
    print(f"Public URL: {tunnel}")

    try:
        # NOTE: this publishes every endpoint, including /reload-model, on a public URL
        uvicorn.run(app, host="0.0.0.0", port=port)
    finally:
        # Do not leave the public tunnel open after the server is gone
        ngrok.disconnect(tunnel.public_url)

# Launch the API
run_api()

Ngrok Auth Token loaded successfully
Public URL: NgrokTunnel: "https://f53f-34-16-205-53.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [19633]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     2a02:8424:61e0:4e01:e566:84a7:6d49:8acb:0 - "GET / HTTP/1.1" 200 OK
INFO:     2a02:8424:61e0:4e01:e566:84a7:6d49:8acb:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
