In [37]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = "C:/Users/HP/Desktop/Spring 2025/Data Science/first_25000_rows.csv"
data = pd.read_csv(file_path)

# Convert both timestamp columns to datetime
for ts_col in ('ts_recv', 'ts_event'):
    data[ts_col] = pd.to_datetime(data[ts_col], unit='ns')

# Sort by receive timestamp. A stable algorithm is requested explicitly so
# that rows sharing the same ts_recv keep their original file order; the OFI
# functions below difference consecutive rows, so tie order affects results.
data = data.sort_values('ts_recv', kind='mergesort')

# Get unique instrument IDs (in order of first appearance after sorting)
instrument_ids = data['instrument_id'].unique()

# Initialize dictionary to store per-instrument results
ofi_results = {}

# 1. Best-Level OFI Implementation
def calculate_best_level_ofi(group):
    """Calculate Best-Level OFI according to equation (1) in the paper.

    Each row is compared with the row immediately before it (in the frame's
    current row order), separately for the bid and the ask side of level 00:

    * bid: price up -> +size, price unchanged -> size change,
      otherwise -> -size
    * ask: price up -> -size, price unchanged -> size change,
      otherwise -> +size

    The first row has no predecessor and gets 0 on both sides.

    Args:
        group: DataFrame with columns ``bid_px_00``, ``bid_sz_00``,
            ``ask_px_00`` and ``ask_sz_00``. It is not modified.

    Returns:
        A copy of ``group`` with ``OFI_bid_00``, ``OFI_ask_00`` and
        ``OFI_00`` (= ``OFI_bid_00 - OFI_ask_00``) added.
    """
    group = group.copy()
    n_rows = len(group)

    # up_sign is the sign applied to the current size when the price rises;
    # the opposite sign is used in the fall-through (price not up, not equal)
    # branch, mirroring the original if/elif/else chain.
    for side, up_sign in (('bid', 1), ('ask', -1)):
        px = group[f'{side}_px_00'].to_numpy()
        sz = group[f'{side}_sz_00'].to_numpy()
        prev_px, curr_px = px[:-1], px[1:]
        prev_sz, curr_sz = sz[:-1], sz[1:]

        flow = np.select(
            [curr_px > prev_px, curr_px == prev_px],
            [up_sign * curr_sz, curr_sz - prev_sz],
            default=-up_sign * curr_sz,
        )

        # Assign positionally (not via index labels) so duplicate index
        # labels cannot overwrite unrelated rows.
        ofi = np.zeros(n_rows, dtype=flow.dtype)
        ofi[1:] = flow
        group[f'OFI_{side}_00'] = ofi

    group['OFI_00'] = group['OFI_bid_00'] - group['OFI_ask_00']
    return group

# 2. Multi-Level OFI Implementation
def calculate_multi_level_ofi(group, levels=10):
    """Calculate Multi-Level OFI according to equation (2) in the paper.

    For every book level ``m`` in ``range(levels)`` each row is compared with
    the row immediately before it (in the frame's current row order):

    * bid: price up -> +size, price unchanged -> size change,
      otherwise -> -size
    * ask: price up -> -size, price unchanged -> size change,
      otherwise -> +size

    The first row has no predecessor and gets 0 at every level.

    Args:
        group: DataFrame with columns ``bid_px_MM``, ``bid_sz_MM``,
            ``ask_px_MM`` and ``ask_sz_MM`` for each two-digit level ``MM``
            below ``levels``. It is not modified.
        levels: Number of book levels to process, starting at level 00.

    Returns:
        A copy of ``group`` with ``OFI_bid_MM``, ``OFI_ask_MM`` and
        ``OFI_MM`` (= ``OFI_bid_MM - OFI_ask_MM``) added for each level.
    """
    group = group.copy()
    n_rows = len(group)

    for m in range(levels):
        tag = f'{m:02d}'

        # up_sign is the sign applied to the current size when the price
        # rises; the opposite sign is used in the fall-through branch.
        for side, up_sign in (('bid', 1), ('ask', -1)):
            px = group[f'{side}_px_{tag}'].to_numpy()
            sz = group[f'{side}_sz_{tag}'].to_numpy()
            prev_px, curr_px = px[:-1], px[1:]
            prev_sz, curr_sz = sz[:-1], sz[1:]

            flow = np.select(
                [curr_px > prev_px, curr_px == prev_px],
                [up_sign * curr_sz, curr_sz - prev_sz],
                default=-up_sign * curr_sz,
            )

            # Assign positionally (not via index labels) so duplicate index
            # labels cannot overwrite unrelated rows.
            ofi = np.zeros(n_rows, dtype=flow.dtype)
            ofi[1:] = flow
            group[f'OFI_{side}_{tag}'] = ofi

        group[f'OFI_{tag}'] = group[f'OFI_bid_{tag}'] - group[f'OFI_ask_{tag}']

    return group

# 3. Integrated OFI Implementation
def calculate_integrated_ofi(group, levels=10):
    """Calculate Integrated OFI according to equation (4) in the paper.

    Steps:
      1. Compute the multi-level OFIs if ``OFI_{levels-1}`` is missing.
      2. Compute ``avg_depth`` as the summed bid and ask sizes over all
         levels divided by ``2 * levels``.
      3. Scale each level's OFI by ``avg_depth``.
      4. Fit a one-component PCA on the scaled OFIs and store the first
         principal-component scores divided by the L1 norm of the component
         weights as ``integrated_OFI``.

    Rows whose scaled OFI would be non-finite (zero average depth, or missing
    values in the inputs) get a scaled OFI of 0 so that PCA, which rejects
    NaN/inf input, can still be fitted. An empty ``group`` yields an empty
    ``integrated_OFI`` column instead of a PCA error.

    Args:
        group: DataFrame with ``bid_sz_MM`` / ``ask_sz_MM`` columns (and the
            price columns needed by ``calculate_multi_level_ofi`` when the
            ``OFI_MM`` columns are not present yet). It is not modified.
        levels: Number of book levels to combine.

    Returns:
        A copy of ``group`` with ``avg_depth``, ``scaled_OFI_MM`` for each
        level and ``integrated_OFI`` added.
    """
    group = group.copy()

    # First calculate multi-level OFI if not already done
    if f'OFI_{levels-1:02d}' not in group.columns:
        group = calculate_multi_level_ofi(group, levels)

    # Calculate average order book depth (Q) across levels
    bid_sz_cols = [f'bid_sz_{m:02d}' for m in range(levels)]
    ask_sz_cols = [f'ask_sz_{m:02d}' for m in range(levels)]
    group['avg_depth'] = (
        group[bid_sz_cols].sum(axis=1) + group[ask_sz_cols].sum(axis=1)
    ) / (2 * levels)

    # Scale OFIs by average depth. A zero depth is mapped to NaN before the
    # division and every non-finite result is then replaced by 0, so the
    # matrix passed to PCA is always finite.
    safe_depth = group['avg_depth'].replace(0, np.nan)
    scaled_ofi_cols = [f'scaled_OFI_{m:02d}' for m in range(levels)]
    for m, scaled_col in enumerate(scaled_ofi_cols):
        scaled = group[f'OFI_{m:02d}'] / safe_depth
        group[scaled_col] = scaled.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Prepare matrix of scaled OFIs for PCA
    X = group[scaled_ofi_cols].values

    # PCA cannot be fitted without samples; return an empty result column.
    if X.shape[0] == 0:
        group['integrated_OFI'] = np.array([], dtype=float)
        return group

    # Perform PCA and get first principal component.
    # NOTE(review): PCA.fit_transform centers X before projecting, so pc1 is
    # the projection of the de-meaned scaled OFIs — confirm this matches the
    # intended form of equation (4).
    pca = PCA(n_components=1)
    pc1 = pca.fit_transform(X)

    # Normalize by L1 norm of weights
    weights = pca.components_[0]
    l1_norm = np.sum(np.abs(weights))
    group['integrated_OFI'] = pc1.flatten() / l1_norm

    return group

# Run the three OFI calculations instrument by instrument (explicit filtering
# instead of groupby), collecting each enriched frame in ofi_results.
for instrument_id in instrument_ids:
    instrument_mask = data['instrument_id'] == instrument_id
    instrument_data = data[instrument_mask].copy()

    # Best-level, then multi-level, then integrated OFI, each step feeding
    # its output frame into the next one.
    for ofi_step in (
        calculate_best_level_ofi,
        calculate_multi_level_ofi,
        calculate_integrated_ofi,
    ):
        instrument_data = ofi_step(instrument_data)

    ofi_results[instrument_id] = instrument_data

# Stack the per-instrument frames back into one DataFrame
all_data = pd.concat(ofi_results.values())

# 4. Cross-Asset OFI Implementation
def calculate_cross_asset_ofi(data, time_window='1min'):
    """
    Build a time-bucketed table of best-level OFI with one column per instrument.

    For each ``instrument_id`` group, ``OFI_00`` is indexed by ``ts_recv``,
    resampled to ``time_window`` and summed. The per-instrument series are
    then combined into a single DataFrame keyed by instrument id. Groups are
    skipped when the ``OFI_00`` column is absent.
    """
    summed_by_instrument = {
        inst_id: frame.set_index('ts_recv')['OFI_00'].resample(time_window).sum()
        for inst_id, frame in data.groupby('instrument_id')
        if 'OFI_00' in frame.columns
    }
    return pd.DataFrame(summed_by_instrument)

# Aggregate best-level OFI across instruments into per-window columns
cross_asset_ofi = calculate_cross_asset_ofi(all_data)

# Show the head of each per-row OFI measure next to its identifying columns
id_cols = ['ts_recv', 'instrument_id']
per_row_samples = [
    ("Best-Level OFI sample:", ['OFI_00']),
    ("\nMulti-Level OFI sample (first 3 levels):", ['OFI_00', 'OFI_01', 'OFI_02']),
    ("\nIntegrated OFI sample:", ['integrated_OFI']),
]
for heading, value_cols in per_row_samples:
    print(heading)
    print(all_data[id_cols + value_cols].head())

# The cross-asset table has its own layout (time index, one column per instrument)
print("\nCross-Asset OFI sample:")
print(cross_asset_ofi.head())

Best-Level OFI sample:
                              ts_recv  instrument_id  OFI_00
0 2024-10-21 11:54:29.221230963+00:00             38       0
1 2024-10-21 11:54:29.223936626+00:00             38       2
2 2024-10-21 11:54:29.225196809+00:00             38       3
3 2024-10-21 11:54:29.712600612+00:00             38       0
4 2024-10-21 11:54:29.764839221+00:00             38       0

Multi-Level OFI sample (first 3 levels):
                              ts_recv  instrument_id  OFI_00  OFI_01  OFI_02
0 2024-10-21 11:54:29.221230963+00:00             38       0       0       0
1 2024-10-21 11:54:29.223936626+00:00             38       2       0       0
2 2024-10-21 11:54:29.225196809+00:00             38       3       0       0
3 2024-10-21 11:54:29.712600612+00:00             38       0       0     200
4 2024-10-21 11:54:29.764839221+00:00             38       0       0    -200

Integrated OFI sample:
                              ts_recv  instrument_id  integrated_OFI
0 2024-10-21 1