In [1]:
import sys
import os
import pickle  # Added this since you're using pickle.dump

# Add the parent directory to the path
sys.path.append(os.path.abspath('..'))

# Import your modularized code
from order_book.data import OrderBookDataGenerator
from order_book.preprocessing import create_order_book_pipeline
from order_book.models import OrderBookEmbeddingModel
from order_book.utils import main

# Training inputs, given relative to the notebook's working directory.
X_train_path = "../data/X_train.parquet"
y_train_path = "../data/y_train.csv"

# Keyword arguments forwarded unchanged to order_book.utils.main.
training_options = dict(
    X_path=X_train_path,
    y_path=y_train_path,
    batch_size=1000,
    chunk_size=50000,
    n_epochs=3,
    visualize=False,
    n_samples_to_visualize=0,
    model_save_dir="../model_checkpoints",
)

# main returns two values, bound here as the model and the preprocessing pipeline.
model, pipeline = main(**training_options)

# Persist both objects together as one (model, pipeline) tuple.
# NOTE(review): the inference cell reads "../model_checkpoints/final_model.pkl"
# as a dict with 'model'/'pipeline' keys, not this file -- confirm which
# artifact is the intended one.
trained_artifacts = (model, pipeline)
with open("order_book_model.pkl", "wb") as artifact_file:
    pickle.dump(trained_artifacts, artifact_file)

Starting training with progressive parquet/CSV data loading...
Model checkpoints will be saved to ../model_checkpoints
Loading all data for preprocessing...
Loaded all data: 16080000 timesteps
Fitting preprocessing pipeline...
Filtered out 1 observations with non-positive bid_size values
Creating model with 6 venues and 3 actions
Setting up training with chunked data...

--- Epoch 1/3 ---
Processing chunk 1 with 50000 observations
Training on 50000 sequences from chunk 1


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 691ms/step - accuracy: 0.0624 - loss: 3.1619
Processing chunk 2 with 50000 observations
Training on 50000 sequences from chunk 2
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 752ms/step - accuracy: 0.1612 - loss: 2.5651
Processing chunk 3 with 50000 observations
Filtered out 1 observations with non-positive bid_size values
Training on 49999 sequences from chunk 3
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 830ms/step - accuracy: 0.2277 - loss: 2.3302
Processing chunk 4 with 10800 observations
Training on 10800 sequences from chunk 4
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 715ms/step - accuracy: 0.2588 - loss: 2.2155

--- Epoch 2/3 ---
Processing chunk 1 with 50000 observations
Training on 50000 sequences from chunk 1
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 719ms/step - accuracy: 0.2687 - loss: 2.1870
Processing chunk 2 with 50000 obse

KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
import pickle
import os
import sys

# Add the project root directory to Python's path so the pickled objects'
# classes can be imported during unpickling.
sys.path.append(os.path.abspath(".."))

# Load the saved model and pipeline from the checkpoint file.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load checkpoints that come from a trusted training run.
with open("../model_checkpoints/final_model.pkl", "rb") as f:
    checkpoint_data = pickle.load(f)
model = checkpoint_data['model']
pipeline = checkpoint_data['pipeline']

# Define paths
X_test_path = "../data/X_test.parquet"  # Path to test data
output_path = "../data/y_submission.csv"  # Path to save predictions

# Load test data
X_test = pd.read_parquet(X_test_path)

# Get unique observation IDs
observation_ids = X_test['obs_id'].unique()
predictions = []

# Number of observations handled per pipeline/model call
chunk_size = 10000  # Adjust based on your memory constraints and model

print(f"Processing {len(observation_ids)} observations in chunks of {chunk_size}...")

# Split the observation IDs into consecutive chunks
chunks = [observation_ids[i:i + chunk_size] for i in range(0, len(observation_ids), chunk_size)]

for chunk_idx, chunk_obs_ids in enumerate(chunks):
    print(f"Processing chunk {chunk_idx+1}/{len(chunks)}")

    # Rows belonging to the observations of this chunk
    chunk_data = X_test[X_test['obs_id'].isin(chunk_obs_ids)]

    # The pipeline returns a tuple: (data_dict, obs_ids)
    X_dict, processed_obs_ids = pipeline.transform(chunk_data)

    # Make predictions using the model
    chunk_predictions = model.predict(X_dict)

    # Set membership is O(1); testing `in` against the raw array would rescan
    # the whole chunk for every single prediction.
    expected_obs_ids = set(chunk_obs_ids)

    # Predictions are positionally aligned with processed_obs_ids
    for i, obs_id in enumerate(processed_obs_ids):
        if obs_id in expected_obs_ids:  # Just a safety check
            predictions.append((obs_id, int(chunk_predictions[i])))

# Create submission DataFrame
submission_df = pd.DataFrame(predictions, columns=['obs_id', 'eqt_code_cat'])

# Observations that never came back from the pipeline get no row in the
# submission; report that instead of dropping them silently.
missing_count = len(observation_ids) - len({obs_id for obs_id, _ in predictions})
if missing_count > 0:
    print(f"Warning: {missing_count} observations have no prediction and are missing from the submission")

# Save to CSV
submission_df.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

Processing 81600 observations in chunks of 10000...
Processing chunk 1/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 217ms/step
Processing chunk 2/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 277ms/step
Processing chunk 3/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 273ms/step
Processing chunk 4/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 289ms/step
Processing chunk 5/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 261ms/step
Processing chunk 6/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 271ms/step
Processing chunk 7/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 267ms/step
Processing chunk 8/9
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 289ms/step
Processing chunk 9/9
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 713ms/step
Predictions saved to ../data/y_submission.csv


In [None]:
# Notebook cell (project.ipynb): serve the FastAPI app from a background thread
import sys
import os
import nest_asyncio
import uvicorn
from threading import Thread
import time
from pyngrok import ngrok

# Add the parent directory (project root) to sys.path, but only once so that
# re-running this cell does not pile up duplicate entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Import your FastAPI app
from api.main import app

# Apply nest_asyncio to allow running asyncio in notebook
nest_asyncio.apply()

# Function to run the server in a separate thread
def run_server_in_thread(port=8000):
    """Serve ``app`` with uvicorn, listening on all interfaces (0.0.0.0).

    ``uvicorn.run`` blocks until the server exits, so this function is meant
    to be used as the target of a background thread.

    Args:
        port: TCP port to listen on.
    """
    server_options = {"host": "0.0.0.0", "port": port}
    uvicorn.run(app, **server_options)

# Create a function to start/stop server with optional ngrok
def start_api_server(port=8000, use_ngrok=False, ngrok_token=None):
    """Start the API server in a background daemon thread.

    When ``use_ngrok`` is true, an ngrok tunnel to ``port`` is opened first
    and stored in the global ``public_url``. The started thread is stored in
    the global ``server_thread`` and also returned.

    Args:
        port: TCP port passed to ``run_server_in_thread`` and to ngrok.
        use_ngrok: Whether to open an ngrok tunnel to ``port``.
        ngrok_token: Optional ngrok auth token; applied only when
            ``use_ngrok`` is true and the token is non-empty.

    Returns:
        The ``Thread`` running the server. It may already have exited if the
        server failed to start; a warning is printed in that case.
    """
    global server_thread, public_url

    # Configure ngrok if needed
    if use_ngrok:
        if ngrok_token:
            ngrok.set_auth_token(ngrok_token)
        public_url = ngrok.connect(port)
        print(f"📡 Public URL: {public_url}")

    # Daemon thread: it does not keep the process alive on shutdown
    server_thread = Thread(target=run_server_in_thread, args=(port,), daemon=True)
    server_thread.start()

    # Give the server a moment to start, then only claim success if the
    # thread is still running (it exits early when startup fails, e.g. when
    # the port is already in use).
    time.sleep(1)
    if server_thread.is_alive():
        print(f"🚀 API server running at http://localhost:{port}")
        print(f"📚 API documentation at http://localhost:{port}/docs")
    else:
        print(f"⚠️ API server thread exited during startup - port {port} may already be in use")

    return server_thread

# Function to stop the server
def stop_api_server():
    """Close all ngrok tunnels and tell the user how to stop the server.

    This does not terminate the server thread itself; as the printed message
    says, the notebook kernel has to be restarted for that. Closing the
    tunnels is best effort: a failure is reported but never raised.
    """
    # Disconnect ngrok tunnels
    try:
        ngrok.disconnect_all()
        print("📡 Ngrok tunnel closed")
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and say what failed.
        print(f"⚠️ Could not close ngrok tunnels: {exc}")

    print("🛑 API server stopping - notebook kernel will need to be restarted")
    # To fully stop the server, you'll need to restart the kernel

# Launch the API locally without an ngrok tunnel and keep the returned
# thread handle in server_thread.
API_PORT = 8000
server_thread = start_api_server(port=API_PORT, use_ngrok=False)