# retrieve run data from mongo

In [29]:
from pymongo import MongoClient
import polars as pl

# Connect to the local MongoDB instance and pull every stored message into a polars frame.
client = MongoClient("localhost", 27017)
db = client["trader"]
documents = list(db["message"].find({}))
run_data = pl.DataFrame(documents)

# Keep only the rows whose session id matches that of the last document returned by the query.
filtered_data = run_data.filter(pl.col("trading_session_id") == run_data['trading_session_id'][-1])

# Reduce each message to the fields used downstream; trader_id is dropped from the incoming order.
# `or {}` matters: after the polars round-trip the 'incoming_message' key can be present with a
# None value (messages that carry no order), and `.get(key, {})` alone would hand None to `.items()`.
preprocessed_data = [{
    'message_type': row['content']['type'],
    'order_book': row['content'].get('order_book'),
    'incoming': {k: v for k, v in (row['content'].get('incoming_message') or {}).items() if k != 'trader_id'},
    'timestamp': row['timestamp']
} for row in filtered_data.to_dicts()]

# Order chronologically (rows without a timestamp go last) and number the rows.
new_order_books = pl.DataFrame(preprocessed_data).sort("timestamp", nulls_last=True).with_row_index("row_number")
print(new_order_books.head(10))

shape: (7, 5)
┌────────────┬──────────────┬─────────────────────┬──────────────┬─────────────────────────┐
│ row_number ┆ message_type ┆ order_book          ┆ incoming     ┆ timestamp               │
│ ---        ┆ ---          ┆ ---                 ┆ ---          ┆ ---                     │
│ u32        ┆ str          ┆ struct[2]           ┆ struct[3]    ┆ datetime[μs]            │
╞════════════╪══════════════╪═════════════════════╪══════════════╪═════════════════════════╡
│ 0          ┆ ADD_ORDER    ┆ {[{2000.0,1.0}],[]} ┆ {1.0,2000,1} ┆ 2024-07-08 23:34:40.207 │
│ 1          ┆ ADD_ORDER    ┆ {[{2000.0,2.0}],[]} ┆ {1.0,2000,1} ┆ 2024-07-08 23:34:40.299 │
│ 2          ┆ ADD_ORDER    ┆ {[{2000.0,2.0},     ┆ {1.0,1999,1} ┆ 2024-07-08 23:34:40.365 │
│            ┆              ┆ {1999.0,1.0}],…     ┆              ┆                         │
│ 3          ┆ ADD_ORDER    ┆ {[{2000.0,2.0},     ┆ {1.0,1999,1} ┆ 2024-07-08 23:34:40.468 │
│            ┆              ┆ {1999.0,2.0}],…     ┆     

In [3]:
from pymongo import MongoClient
import polars as pl

# Diagnostic cell: checks whether a given trading session id exists in the "message" collection
# and, if it does, builds the same preprocessed order-book frame as the loading cell.
client = MongoClient("localhost", 27017)
db = client["trader"]
collection = db["message"]

# First, let's check the total number of documents and print a sample document
total_documents = collection.count_documents({})
print(f"Total documents in collection: {total_documents}")

sample_doc = collection.find_one()
if sample_doc:
    print("Sample document keys:", sample_doc.keys())
    if 'trading_session_id' in sample_doc:
        print("Sample trading_session_id:", sample_doc['trading_session_id'])
    else:
        print("Warning: 'trading_session_id' not found in sample document")
else:
    print("No documents found in the collection")

# Now, let's check for the specific session ID
specific_session_id = '92b3dbc3-da86-4740-9dfc-382d200c3e95'
session_documents = list(collection.find({'trading_session_id': specific_session_id}))
print(f"Found {len(session_documents)} documents for session ID: {specific_session_id}")

if session_documents:
    run_data = pl.DataFrame(session_documents)
    print("Columns in run_data:", run_data.columns)
    print("Shape of run_data:", run_data.shape)
    print("First few rows of run_data:")
    print(run_data.head())

    # Reduce each message to the fields used downstream; trader_id is dropped from the incoming order.
    # `or {}` guards against 'incoming_message' being present but None after the polars round-trip,
    # which `.get(key, {})` alone does not cover.
    preprocessed_data = [{
        'message_type': row['content']['type'],
        'order_book': row['content'].get('order_book'),
        'incoming': {k: v for k, v in (row['content'].get('incoming_message') or {}).items() if k != 'trader_id'},
        'timestamp': row['timestamp']
    } for row in run_data.to_dicts()]

    # Order chronologically (rows without a timestamp go last) and number the rows.
    new_order_books = pl.DataFrame(preprocessed_data).sort("timestamp", nulls_last=True).with_row_index("row_number")
    print("Preprocessed data:")
    print(new_order_books.head(10))
else:
    print(f"No documents found for the session ID: {specific_session_id}")

# Let's also check all unique session IDs
unique_session_ids = collection.distinct('trading_session_id')
print(f"\nFound {len(unique_session_ids)} unique trading session IDs")
print("First 5 session IDs:", unique_session_ids[:5])
print(f"Is '{specific_session_id}' in the list of unique session IDs? {specific_session_id in unique_session_ids}")

Total documents in collection: 15871
Sample document keys: dict_keys(['_id', 'trading_session_id', 'content', 'timestamp'])
Sample trading_session_id: 715dadb3-3a1c-4de9-bb7c-e8088bd0988d
Found 0 documents for session ID: 92b3dbc3-da86-4740-9dfc-382d200c3e95
No documents found for the session ID: 92b3dbc3-da86-4740-9dfc-382d200c3e95

Found 60 unique trading session IDs
First 5 session IDs: ['05fc09c2-da4b-452f-ad6f-3ceab1b46522', '0bb5f63e-37c4-4f21-84b0-83d7544d175c', '0e33b375-3cba-4f23-8431-9ce366a64c8f', '14d9ccc8-fe05-4dd4-bcff-7be3e28590b5', '16178e8b-4f06-4ea6-81c0-95c81d0abb0b']
Is '92b3dbc3-da86-4740-9dfc-382d200c3e95' in the list of unique session IDs? False


In [130]:
# Display the raw `content` payload of the first row of run_data to see which fields a message carries.
# NOTE(review): relies on run_data left in the kernel by an earlier cell — confirm which one on a fresh run.
run_data['content'][0]

{'text': 'add_order update processed',
 'type': 'ADD_ORDER',
 'current_time': '2024-07-08T16:05:33.451911+00:00',
 'start_time': '2024-07-08T16:05:32.955265+00:00',
 'duration': 1,
 'order_book': {'bids': [{'x': 1998.0, 'y': 1.0}], 'asks': []},
 'active_orders': [{'id': b'+\xbe\xbf\x07\xdeBA\xd1\x97\x18\xc8\xaa9\xb2\x0e\xd3',
   'trader_id': 'INITIAL_ORDER_BOOK_d135508e-8776-41af-8b7f-d00704df76d4',
   'order_type': 1,
   'amount': 1.0,
   'price': 1998.0,
   'timestamp': datetime.datetime(2024, 7, 8, 16, 5, 33, 451000)}],
 'history': [],
 'spread': None,
 'midpoint': None,
 'transaction_price': None,
 'incoming_message': {'amount': 1.0,
  'price': 1998,
  'order_type': 1,
  'trader_id': 'INITIAL_ORDER_BOOK_d135508e-8776-41af-8b7f-d00704df76d4'},
 'test_field': 'test'}

In [21]:
# Display the last 20 rows of the preprocessed, timestamp-sorted frame.
new_order_books.tail(20)

row_number,message_type,order_book,incoming,timestamp
u32,str,struct[2],struct[3],datetime[μs]
264,"""ADD_ORDER""","{[{1999.0,15.0}, {1998.0,16.0}, … {1995.0,2.0}],[{2000.0,13.0}, {2001.0,21.0}, … {2004.0,7.0}]}","{1.0,1999,1}",2024-07-08 17:07:45.129
265,"""ADD_ORDER""","{[{1999.0,15.0}, {1998.0,16.0}, … {1995.0,2.0}],[{2000.0,13.0}, {2001.0,22.0}, … {2004.0,7.0}]}","{1.0,2001,-1}",2024-07-08 17:07:45.541
266,"""ADD_ORDER""","{[{1999.0,15.0}, {1998.0,16.0}, … {1995.0,2.0}],[{2000.0,13.0}, {2001.0,22.0}, … {2004.0,7.0}]}","{1.0,2002,-1}",2024-07-08 17:07:45.860
267,"""ADD_ORDER""","{[{1999.0,11.0}, {1998.0,8.0}, … {1996.0,12.0}],[{2000.0,13.0}, {2001.0,11.0}, … {2003.0,19.0}]}","{1.0,1997,1}",2024-07-08 17:07:46.406
268,"""ADD_ORDER""","{[{1999.0,10.0}, {1998.0,7.0}, … {1996.0,12.0}],[{2000.0,13.0}, {2001.0,11.0}, … {2003.0,19.0}]}","{1.0,1999,-1}",2024-07-08 17:07:46.677
…,…,…,…,…
279,"""ADD_ORDER""","{[{1999.0,5.0}, {1998.0,5.0}, … {1996.0,4.0}],[{2000.0,9.0}, {2001.0,4.0}, … {2003.0,4.0}]}","{1.0,1998,1}",2024-07-08 17:07:49.566
280,"""BOOK_UPDATED""","{[{1999.0,2.0}, {1998.0,2.0}, {1997.0,3.0}],[{2000.0,3.0}, {2001.0,2.0}, {2002.0,1.0}]}","{null,null,null}",2024-07-08 17:07:50.040
281,"""ADD_ORDER""","{[{1999.0,2.0}, {1998.0,2.0}, … {1996.0,1.0}],[{2000.0,3.0}, {2001.0,2.0}, {2002.0,1.0}]}","{1.0,1996,1}",2024-07-08 17:07:50.187
282,"""stop_trading""","{[{1999.0,2.0}, {1998.0,2.0}, … {1996.0,1.0}],[{2000.0,3.0}, {2001.0,2.0}, {2002.0,1.0}]}","{null,null,null}",2024-07-08 17:07:50.326


In [143]:
# Display the order_imbalance column.
# NOTE(review): session_data_pd is not defined in any cell visible here; it presumably comes from
# earlier kernel state — confirm where it is built so this cell survives Restart & Run All.
session_data_pd['order_imbalance']

0      1.000000
1      1.000000
2      1.000000
3      1.000000
4      1.000000
         ...   
288   -0.783784
289   -0.736842
290   -0.703704
291   -0.555556
292    0.142857
Name: order_imbalance, Length: 293, dtype: float64

In [34]:
from analysis.utilities import delete_all_tables

# Destructive cleanup: the recorded output of this cell reports that the DuckDB tables and the
# MongoDB collection were deleted. Presumably the MongoDB-reading cells need fresh data after
# this runs — confirm before executing as part of Run All.
delete_all_tables()

DuckDB tables deleted successfully.
MongoDB collection deleted successfully.


In [39]:
import numpy as np
from SALib.analyze import sobol

def _zero_sobol_indices(problem):
    """Build the all-zero fallback result: one S1 and one ST entry per variable in `problem`."""
    num_vars = problem['num_vars']
    return {'S1': np.zeros(num_vars), 'ST': np.zeros(num_vars)}


def perform_sobol_analysis(problem, Y):
    """Compute first- and total-order Sobol indices, falling back to zeros when that is not possible.

    Args:
        problem: SALib-style problem dict. `problem['num_vars']` sizes the fallback arrays;
            the whole dict is forwarded to `sobol.analyze`.
        Y: Model outputs, as an array or any array-like (a list or scalar is coerced to an
            array). See the SALib documentation for the length `sobol.analyze` requires.

    Returns:
        The object returned by `sobol.analyze(..., calc_second_order=False)` when the call
        succeeds and its 'S1' and 'ST' contain no NaN. Otherwise a dict
        `{'S1': zeros, 'ST': zeros}` with `problem['num_vars']` entries each. Every fallback
        prints a one-line warning or error message; no exception from the analysis propagates.
    """
    # Coerce up front so list/scalar input gets element-wise comparison and indexing below.
    Y = np.atleast_1d(Y)

    # Empty input has no Y[0]; without this guard the check below raises IndexError
    # before the try block is ever reached.
    if Y.size == 0:
        print("Warning: Y is empty.")
        return _zero_sobol_indices(problem)

    # Constant (or nearly constant) output: return zeros instead of running the analysis.
    if np.all(Y == Y[0]) or np.var(Y) < 1e-10:
        print("Warning: All Y values are the same or variance is very small.")
        return _zero_sobol_indices(problem)

    try:
        Si = sobol.analyze(problem, Y, calc_second_order=False, print_to_console=False)
        if np.any(np.isnan(Si['S1'])) or np.any(np.isnan(Si['ST'])):
            print("Warning: NaN values in Sobol indices.")
            return _zero_sobol_indices(problem)
        return Si
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the caller's loop running.
        print(f"Error in Sobol analysis: {str(e)}")
        return _zero_sobol_indices(problem)

# Toy problem in SALib's dict format: two inputs, each bounded to [0, 1].
problem = dict(
    num_vars=2,
    names=['X1', 'X2'],
    bounds=[[0, 1], [0, 1]],
)

# Synthetic outputs: an evenly spaced ramp of N values, not real model evaluations.
N = 1000
Y = np.linspace(0, 1, N)

# Run the identical analysis five times and print the indices after each run.
for _ in range(5):
    result = perform_sobol_analysis(problem, Y)
    print(f"S1: {result['S1']}", f"ST: {result['ST']}", "", sep="\n")

S1: [1.7999802e-05 3.5999604e-05]
ST: [5.9999340e-06 2.3999736e-05]

S1: [1.7999802e-05 3.5999604e-05]
ST: [5.9999340e-06 2.3999736e-05]

S1: [1.7999802e-05 3.5999604e-05]
ST: [5.9999340e-06 2.3999736e-05]

S1: [1.7999802e-05 3.5999604e-05]
ST: [5.9999340e-06 2.3999736e-05]

S1: [1.7999802e-05 3.5999604e-05]
ST: [5.9999340e-06 2.3999736e-05]

