In [1]:
# Switch the kernel's working directory to /app; later cells read experiment
# outputs from /app/data_saved.
# NOTE(review): presumably this is also what makes the local `lob`, `preproc`
# and `filtration_utils` imports resolvable — confirm.
%cd /app

/app


In [2]:
import argparse
import os
import sys

os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

import torch
# force=True keeps this cell re-runnable: without it, executing the cell a
# second time in the same kernel raises
# "RuntimeError: context has already been set".
torch.multiprocessing.set_start_method('spawn', force=True)

import jax
from lob.encoding import Vocab, Message_Tokenizer

from lob import inference_no_errcorr as inference
from lob.init_train import init_train_state, load_checkpoint, load_metadata, load_args_from_checkpoint

from lob import inference_no_errcorr as inference
import lob.encoding as encoding
import preproc as preproc

import jax.numpy as jnp
import numpy as np

from pathlib import Path
import os

import pandas as pd
import plotly.graph_objs as go
import yaml

from filtration_utils import summary_table, build_zero_padded_series, plot_midprice_series_with_insertions, prepare_volatility_filtered_series, plot_midprice_series_with_mean_std

2025-08-08 08:01:07.606681: W external/xla/xla/service/gpu/nvptx_compiler.cc:718] The NVIDIA driver's CUDA version is 12.8 which is older than the ptxas CUDA version (12.9.41). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


In [None]:
# ======= CONFIGURATION =======
# Display label -> experiment run name. The cells below look each run up under
# /app/data_saved/<run name>/ and use the label for legend entries and colors.
experiments = dict(
    Plain="exp_21_20250804_215633_hist_plain_whole_352",
    Heuristic="exp_22_20250804_215639_hist_heur_whole_360",
    GenAI="exp_39_20250806_095303_gen_buy_05_248",
)

In [4]:
import yaml

# Overlay, for every run in `experiments` (defined in the configuration cell),
# the mean midprice path and a ±1 std band.

# Line color for each experiment
colors = {
    "Plain": 'black',
    "Heuristic": 'blue',
    "GenAI": 'red',
}

# Translucent fill for each experiment's ±1 std band. It matches the line color
# so that overlapping bands can be attributed to their experiment.
band_colors = {
    "Plain": 'rgba(0,0,0,0.08)',
    "Heuristic": 'rgba(0,0,255,0.08)',
    "GenAI": 'rgba(255,0,0,0.08)',
}

fig = go.Figure()

for label, exp_name in experiments.items():
    # === Load config ===
    CONFIG_PATH = f"/app/data_saved/{exp_name}/used_config.yaml"
    with open(CONFIG_PATH, 'r') as f:
        config = yaml.safe_load(f)

    # Only hist_msgs, n_gen_msgs and midprice_step_size feed the plot below; the
    # remaining values are read but not used in this cell.
    num_insertions      = config["num_insertions"]
    num_coolings        = config["num_coolings"]
    midprice_step_size  = config["midprice_step_size"]
    hist_msgs           = config["n_messages"]
    n_gen_msgs          = config["n_gen_msgs"]

    # === Build series ===
    merged = summary_table(exp_name)
    # NOTE(review): all_series is presumably a 2-D array with one row per sample
    # (mean/std are taken over axis 0 below) — confirm against filtration_utils.
    x, all_series = build_zero_padded_series(hist_msgs, n_gen_msgs, midprice_step_size, merged)
    hist_steps = hist_msgs // midprice_step_size
    gen_steps  = n_gen_msgs // midprice_step_size
    gen_block  = gen_steps + 1

    # === Mean & Std ===
    mean_series = all_series.mean(axis=0)
    std_series = all_series.std(axis=0)

    # ±1 std band: a closed polygon that runs forward along mean+std and back
    # along mean-std. It shares the legend group of the mean line so toggling
    # the legend entry hides both.
    fig.add_trace(go.Scatter(
        x=np.concatenate([x, x[::-1]]),
        y=np.concatenate([mean_series + std_series, (mean_series - std_series)[::-1]]),
        fill='toself', fillcolor=band_colors.get(label, 'rgba(0,0,0,0.05)'),
        line=dict(color='rgba(0,0,0,0)'), hoverinfo='skip',
        legendgroup=label,
        showlegend=False
    ))

    # Mean line (labels without a configured color fall back to gray, as in the
    # β-distribution plot)
    fig.add_trace(go.Scatter(
        x=x, y=mean_series, mode='lines',
        name=f"{label} Mean", line=dict(color=colors.get(label, 'gray'), width=3),
        legendgroup=label
    ))

# Add reference lines
fig.add_hline(y=0, line=dict(color='gray', dash='dash'), name="Zero line")

fig.update_layout(
    title="Midprice Mean ±1 Std Comparison",
    xaxis_title="Steps (sampled midprice points)",
    yaxis_title="Price – first price",
    template="plotly_white",
    hovermode="x unified",
    height=800,
    width=1200,
    legend=dict(x=0.01, y=0.99),
)
fig.show()

In [5]:
import os, glob, re, yaml
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

# ======= HELPERS =======
def build_and_merge(folder, batch_prefix, inp_prefix, num_insertions,
                    insertion_keep=51, default_keep=50):
    """Stitch per-iteration ``.npy`` batches into one array per sample id.

    Two file-name shapes inside ``folder`` are recognised; anything else is
    ignored:

    * ``{batch_prefix}_[<ids>]_iter_<k>.npy`` -- batch for iteration ``k``
    * ``{inp_prefix}_[<ids>].npy``            -- treated as iteration 0

    ``<ids>`` is a comma-separated list of integer sample ids; entry ``i`` of
    the list labels ``batch[i]`` of the loaded array.

    For iteration 0 the whole per-sample slice is kept. For later iterations
    only the trailing rows are kept: ``insertion_keep`` rows while
    ``iteration <= num_insertions`` and ``default_keep`` rows afterwards.
    The kept pieces of each sample are concatenated along axis 0 in
    increasing iteration order.

    Parameters
    ----------
    folder : str
        Directory to scan (non-recursive) for ``*.npy`` files.
    batch_prefix, inp_prefix : str
        Literal file-name prefixes (regex-escaped internally).
    num_insertions : int
        Last iteration index that uses ``insertion_keep``.
    insertion_keep, default_keep : int, optional
        Number of trailing rows kept per iteration; must be positive.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``merged_data``, one row per sample id sorted by
        id. Empty (but with both columns) when no file matches.

    Raises
    ------
    ValueError
        If ``insertion_keep`` or ``default_keep`` is not positive.
    """
    if insertion_keep <= 0 or default_keep <= 0:
        # A keep length of 0 would turn ``arr[-0:]`` into "keep everything".
        raise ValueError("insertion_keep and default_keep must be positive")

    rx_iter = re.compile(rf"{re.escape(batch_prefix)}_\[(.+)\]_iter_(\d+)\.npy$")
    rx_inp = re.compile(rf"{re.escape(inp_prefix)}_\[(.+)\]\.npy$")

    # Escape the folder so glob metacharacters in the path are taken literally.
    batches = []
    for path in glob.glob(os.path.join(glob.escape(folder), "*.npy")):
        fname = os.path.basename(path)
        m_iter = rx_iter.match(fname)
        if m_iter:
            id_list, iteration = m_iter.group(1), int(m_iter.group(2))
        else:
            m_inp = rx_inp.match(fname)
            if not m_inp:
                continue
            id_list, iteration = m_inp.group(1), 0
        batches.append((id_list.replace(" ", ""), iteration, np.load(path)))

    if not batches:
        # Previously this surfaced as a KeyError from sorting an empty frame.
        return pd.DataFrame(columns=["id", "merged_data"])

    # glob order is arbitrary; fix a deterministic processing order first.
    batches.sort(key=lambda item: (item[0], item[1]))

    pieces = []
    for id_list, iteration, batch in batches:
        sample_ids = [int(token) for token in id_list.split(",")]
        for row_idx, sample_id in enumerate(sample_ids):
            sample = batch[row_idx]
            if iteration > 0:
                n_keep = insertion_keep if iteration <= num_insertions else default_keep
                sample = sample[-n_keep:, :]
            pieces.append((sample_id, iteration, sample))

    # Stable sort: pieces sharing (id, iteration) keep the order fixed above.
    pieces.sort(key=lambda item: (item[0], item[1]))

    chunks_by_id = {}
    for sample_id, _, sample in pieces:
        chunks_by_id.setdefault(sample_id, []).append(sample)

    return pd.DataFrame([
        {"id": sample_id, "merged_data": np.concatenate(chunks, axis=0)}
        for sample_id, chunks in chunks_by_id.items()
    ])

def calculate_betas_across_samples(b_dict, m_dict, hist_steps=550, gen_block=50, num_insertions=20, daily_volume_estimate=250_000):
    """Fit one log-log impact slope (β) per sample and return them as a list.

    For every id in ``b_dict`` that also appears in ``m_dict`` (``b_dict`` is
    used only for its keys), the rows
    ``hist_steps + k * gen_block`` for ``k = 1..num_insertions`` that lie
    inside the message array are taken as insertion rows. Column 3 is read as
    the order price and column 5 as the order size.
    # NOTE(review): column meanings are inferred from the local variable names
    # of the original code — confirm against the message schema.

    Walking the insertion rows in order, the running VWAP is compared with the
    price of the first insertion row; each step with positive relative size
    (cumulative size / ``daily_volume_estimate``) and positive absolute VWAP
    deviation contributes a point ``(log size, log deviation)``. When a sample
    yields more than one point, the slope of a ``LinearRegression`` through
    them is appended to the result.

    Returns
    -------
    list
        One slope per sample that produced at least two points, in increasing
        id order.
    """
    slopes = []
    for sid in sorted(b_dict.keys()):
        if sid not in m_dict:
            continue
        msgs = m_dict[sid]

        candidate_rows = hist_steps + np.arange(1, num_insertions + 1) * gen_block
        rows_in_range = [row for row in candidate_rows if row < len(msgs)]
        if len(rows_in_range) < 2:
            continue

        ref_price = float(msgs[rows_in_range[0], 3])
        volume_so_far = 0.0
        notional_so_far = 0.0
        points = []  # (log relative size, log impact) pairs

        for row in rows_in_range:
            size = float(msgs[row, 5])
            price = float(msgs[row, 3])
            volume_so_far += size
            notional_so_far += size * price
            if volume_so_far == 0:
                continue
            impact = abs(notional_so_far / volume_so_far - ref_price)
            rel = volume_so_far / daily_volume_estimate
            # Kept in "skip" form so NaN values are treated exactly as before.
            skip = impact <= 0 or rel <= 0
            if not skip:
                points.append((np.log(rel), np.log(impact)))

        if len(points) > 1:
            log_sizes, log_impacts = zip(*points)
            X = np.array(log_sizes).reshape(-1, 1)
            y = np.array(log_impacts)
            slopes.append(LinearRegression().fit(X, y).coef_[0])
    return slopes

def plot_beta_distributions_plotly_with_stats(all_betas):
    """Print summary statistics and show overlaid histograms of β estimates.

    Parameters
    ----------
    all_betas : dict
        Maps a label to a sequence of β values. Labels "Plain", "Heuristic"
        and "GenAI" have dedicated colors; any other label is drawn in gray.

    For each label one line with count, mean and standard deviation is
    printed, a histogram (fixed bins spanning -0.2 to 1.5) is added, and a
    dashed vertical line marks the mean. A dotted green line marks β = 0.5.
    All vertical lines are drawn from y = 0 to y = 25. Nothing is returned;
    the figure is displayed with ``fig.show()``.
    """
    fill_by_label = {
        "Plain": "rgba(0,0,0,0.5)",
        "Heuristic": "rgba(0,0,255,0.4)",
        "GenAI": "rgba(255,0,0,0.4)",
    }
    stroke_by_label = {
        "Plain": "black",
        "Heuristic": "blue",
        "GenAI": "red",
    }

    edges = np.linspace(-0.2, 1.5, 40)
    bin_width = edges[1] - edges[0]

    def vertical_marker(x_pos, trace_name, line_style):
        # A two-point line trace (rather than a layout shape) so that the
        # marker gets its own legend entry.
        return go.Scatter(
            x=[x_pos] * 2,
            y=[0, 25],
            mode="lines",
            name=trace_name,
            line=line_style,
        )

    fig = go.Figure()

    for label, raw_values in all_betas.items():
        values = np.array(raw_values)
        count = len(values)
        mean_beta = np.mean(values)
        std_beta = np.std(values)
        print(f"{label:10} | n = {count:<3} | mean β = {mean_beta:.4f} | std = {std_beta:.4f}")

        histogram = go.Histogram(
            x=values,
            xbins=dict(start=edges[0], end=edges[-1], size=bin_width),
            name=f"{label} (n={count}, mean={mean_beta:.3f})",
            opacity=0.6,
            marker_color=fill_by_label.get(label, 'gray'),
            histnorm=None,
        )
        fig.add_trace(histogram)

        mean_style = dict(color=stroke_by_label.get(label, 'gray'), dash="dash", width=3)
        fig.add_trace(vertical_marker(mean_beta, f"{label} mean", mean_style))

    theory_style = dict(color="green", dash="dot", width=3)
    fig.add_trace(vertical_marker(0.5, "β = 0.5 (theoretical)", theory_style))

    fig.update_layout(
        title="Distribution of Estimated Market Impact Slopes (β)",
        xaxis_title="β",
        yaxis_title="Frequency",
        barmode='overlay',
        template="plotly_white",
        legend=dict(x=0.7, y=0.95),
        width=900,
        height=550,
    )

    fig.show()

# ======= RUN ALL =======
# For every experiment: read num_insertions from its saved config, merge the
# book and message batches per sample, prepend one all-zero row to every merged
# array, estimate the per-sample betas, and finally plot all distributions.
# NOTE(review): only num_insertions is forwarded to
# calculate_betas_across_samples, so hist_steps and gen_block use that
# function's defaults (550 and 50) for every experiment — confirm this matches
# each run's config.
all_betas = {}
for label, exp_name in experiments.items():
    config_path = f"/app/data_saved/{exp_name}/used_config.yaml"
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    num_insertions = config["num_insertions"]

    # Book-state batches
    b_folder = f"/app/data_saved/{exp_name}/b_seq_gen_doubled"
    b_batch_pref = "b_seq_gen_doubled_batch"
    b_inp_pref   = "b_seq_inp"
    b_merged = build_and_merge(b_folder, b_batch_pref, b_inp_pref, num_insertions)

    # Decoded-message batches
    m_folder = f"/app/data_saved/{exp_name}/msgs_decoded_doubled"
    m_batch_pref = "msgs_decoded_doubled_batch"
    m_inp_pref   = "m_seq_raw_inp"
    m_merged = build_and_merge(m_folder, m_batch_pref, m_inp_pref, num_insertions)

    b_dict = {
        int(sample_id): np.array(data)
        for sample_id, data in zip(b_merged["id"], b_merged["merged_data"])
    }
    m_dict = {
        int(sample_id): np.array(data)
        for sample_id, data in zip(m_merged["id"], m_merged["merged_data"])
    }

    # Prepend a single zero row (same width and dtype) to every array.
    b_dict = {
        sample_id: np.vstack([np.zeros((1, data.shape[1]), dtype=data.dtype), data])
        for sample_id, data in b_dict.items()
    }
    m_dict = {
        sample_id: np.vstack([np.zeros((1, data.shape[1]), dtype=data.dtype), data])
        for sample_id, data in m_dict.items()
    }

    betas = calculate_betas_across_samples(b_dict, m_dict, num_insertions=num_insertions)
    all_betas[label] = betas

plot_beta_distributions_plotly_with_stats(all_betas)

Plain      | n = 352 | mean β = 0.1048 | std = 0.6002
Heuristic  | n = 360 | mean β = 0.4163 | std = 0.6738
GenAI      | n = 156 | mean β = 0.8265 | std = 0.8519
