# Retrieve run data from MongoDB

In [29]:
from pymongo import MongoClient
import polars as pl

client = MongoClient("localhost", 27017)
db = client["trader"]

# Sort explicitly by _id so that "the last document" is well defined: without
# a sort MongoDB returns documents in natural order, which is not guaranteed
# to match insertion order.
# NOTE(review): assumes default ObjectId _ids (which increase with creation
# time) -- confirm for this collection.
documents = list(db["message"].find({}).sort("_id", 1))
if not documents:
    # Fail with a clear message instead of a missing-column error further down.
    raise RuntimeError("No documents found in trader.message; nothing to analyse")
run_data = pl.DataFrame(documents)

# Keep only the messages that belong to the most recently written session.
latest_session_id = run_data["trading_session_id"][-1]
filtered_data = run_data.filter(pl.col("trading_session_id") == latest_session_id)

# Flatten each message into the fields used for the order-book analysis.
# trader_id is deliberately excluded from the incoming message.
# `or {}` (rather than a .get default) is required: polars unifies struct
# fields across rows, so a message without an incoming_message still has the
# key present with a None value, and the .get default would never be used.
preprocessed_data = [{
    'message_type': row['content']['type'],
    'order_book': row['content'].get('order_book'),
    'incoming': {k: v for k, v in (row['content'].get('incoming_message') or {}).items() if k != 'trader_id'},
    'timestamp': row['timestamp']
} for row in filtered_data.to_dicts()]

# Chronological order (rows without a timestamp last) plus a stable row index.
new_order_books = pl.DataFrame(preprocessed_data).sort("timestamp", nulls_last=True).with_row_index("row_number")
print(new_order_books.head(10))

shape: (7, 5)
┌────────────┬──────────────┬─────────────────────┬──────────────┬─────────────────────────┐
│ row_number ┆ message_type ┆ order_book          ┆ incoming     ┆ timestamp               │
│ ---        ┆ ---          ┆ ---                 ┆ ---          ┆ ---                     │
│ u32        ┆ str          ┆ struct[2]           ┆ struct[3]    ┆ datetime[μs]            │
╞════════════╪══════════════╪═════════════════════╪══════════════╪═════════════════════════╡
│ 0          ┆ ADD_ORDER    ┆ {[{2000.0,1.0}],[]} ┆ {1.0,2000,1} ┆ 2024-07-08 23:34:40.207 │
│ 1          ┆ ADD_ORDER    ┆ {[{2000.0,2.0}],[]} ┆ {1.0,2000,1} ┆ 2024-07-08 23:34:40.299 │
│ 2          ┆ ADD_ORDER    ┆ {[{2000.0,2.0},     ┆ {1.0,1999,1} ┆ 2024-07-08 23:34:40.365 │
│            ┆              ┆ {1999.0,1.0}],…     ┆              ┆                         │
│ 3          ┆ ADD_ORDER    ┆ {[{2000.0,2.0},     ┆ {1.0,1999,1} ┆ 2024-07-08 23:34:40.468 │
│            ┆              ┆ {1999.0,2.0}],…     ┆     

In [3]:
from pymongo import MongoClient
import polars as pl

client = MongoClient("localhost", 27017)
db = client["trader"]
collection = db["message"]

# Sanity check: report the collection size and the layout of one document.
total_documents = collection.count_documents({})
print(f"Total documents in collection: {total_documents}")

sample_doc = collection.find_one()
if sample_doc:
    print("Sample document keys:", sample_doc.keys())
    if 'trading_session_id' in sample_doc:
        print("Sample trading_session_id:", sample_doc['trading_session_id'])
    else:
        print("Warning: 'trading_session_id' not found in sample document")
else:
    print("No documents found in the collection")

# Look up the messages of one specific trading session (filtered server-side).
specific_session_id = '92b3dbc3-da86-4740-9dfc-382d200c3e95'
session_documents = list(collection.find({'trading_session_id': specific_session_id}))
print(f"Found {len(session_documents)} documents for session ID: {specific_session_id}")

if session_documents:
    run_data = pl.DataFrame(session_documents)
    print("Columns in run_data:", run_data.columns)
    print("Shape of run_data:", run_data.shape)
    print("First few rows of run_data:")
    print(run_data.head())

    # Flatten each message into the fields used for the order-book analysis;
    # trader_id is deliberately excluded from the incoming message.
    # `or {}` (rather than a .get default) is required: polars unifies struct
    # fields across rows, so a message without an incoming_message still has
    # the key present with a None value, and the .get default is never used.
    preprocessed_data = [{
        'message_type': row['content']['type'],
        'order_book': row['content'].get('order_book'),
        'incoming': {k: v for k, v in (row['content'].get('incoming_message') or {}).items() if k != 'trader_id'},
        'timestamp': row['timestamp']
    } for row in run_data.to_dicts()]

    # Chronological order (rows without a timestamp last) plus a row index.
    new_order_books = pl.DataFrame(preprocessed_data).sort("timestamp", nulls_last=True).with_row_index("row_number")
    print("Preprocessed data:")
    print(new_order_books.head(10))
else:
    print(f"No documents found for the session ID: {specific_session_id}")

# List the session IDs that actually exist, to explain a failed lookup above.
unique_session_ids = collection.distinct('trading_session_id')
print(f"\nFound {len(unique_session_ids)} unique trading session IDs")
print("First 5 session IDs:", unique_session_ids[:5])
print(f"Is '{specific_session_id}' in the list of unique session IDs? {specific_session_id in unique_session_ids}")

Total documents in collection: 15871
Sample document keys: dict_keys(['_id', 'trading_session_id', 'content', 'timestamp'])
Sample trading_session_id: 715dadb3-3a1c-4de9-bb7c-e8088bd0988d
Found 0 documents for session ID: 92b3dbc3-da86-4740-9dfc-382d200c3e95
No documents found for the session ID: 92b3dbc3-da86-4740-9dfc-382d200c3e95

Found 60 unique trading session IDs
First 5 session IDs: ['05fc09c2-da4b-452f-ad6f-3ceab1b46522', '0bb5f63e-37c4-4f21-84b0-83d7544d175c', '0e33b375-3cba-4f23-8431-9ce366a64c8f', '14d9ccc8-fe05-4dd4-bcff-7be3e28590b5', '16178e8b-4f06-4ea6-81c0-95c81d0abb0b']
Is '92b3dbc3-da86-4740-9dfc-382d200c3e95' in the list of unique session IDs? False
