# Stress test, kappa only

In [1]:
import sys
import pandas as pd
import numpy as np
import random
import tensorflow as tf
import plotly.express as px
import plotly.graph_objects as go
import os
os.environ["PYTHONHASHSEED"] = "42"
os.environ["TF_DETERMINISTIC_OPS"] = "1"
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path

project_root = Path.cwd()
while project_root != project_root.parent and not (project_root / "src").exists():
    project_root = project_root.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.simulation.dgp0 import Tier0Config, simulate_panel
from src.simulation.validation import plot_market_plotly, separation_auc_like
from src.data.feature_eng import feature_eng_syn
from src.model.autoencoder import PriceAutoencoder


#set seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
df_kappa = pd.read_parquet("../data/processed_syn/synth_dgp0_kappa_only_L18.parquet")
df_kappa.head()

Unnamed: 0,market_id,window_start,window_end,window_length,Price 1,Price 2,Price 3,Price 4,Price 5,Price 6,...,Price 14,Price 15,Price 16,Price 17,Price 18,share_C,share_T,share_K,state_mode,is_pure_80
0,0,0,17,18,0.089382,0.088709,0.081665,0.07181,0.076316,0.076378,...,0.01105,-0.002371,-0.012381,-0.024323,-0.020946,0.111111,0.888889,0.0,1,1.0
1,0,1,18,18,0.088709,0.081665,0.07181,0.076316,0.076378,0.079745,...,-0.002371,-0.012381,-0.024323,-0.020946,-0.07832,0.166667,0.833333,0.0,1,1.0
2,0,2,19,18,0.081665,0.07181,0.076316,0.076378,0.079745,0.052573,...,-0.012381,-0.024323,-0.020946,-0.07832,-0.095198,0.222222,0.777778,0.0,1,0.0
3,0,3,20,18,0.07181,0.076316,0.076378,0.079745,0.052573,0.078968,...,-0.024323,-0.020946,-0.07832,-0.095198,-0.105134,0.277778,0.722222,0.0,1,0.0
4,0,4,21,18,0.076316,0.076378,0.079745,0.052573,0.078968,0.089319,...,-0.020946,-0.07832,-0.095198,-0.105134,-0.100654,0.333333,0.666667,0.0,1,0.0


In [3]:
feature_df = feature_eng_syn(df_kappa)
feature_df.head()

Unnamed: 0,market_id,window_start,window_end,window_length,Price 1,Price 2,Price 3,Price 4,Price 5,Price 6,...,CoV_change,zero_change_fraction,AR_1,AR_2,kurtosis_change,max_abs_ret,pos_vol,neg_vol,level_vol,price_range
0,0,0,17,18,0.089382,0.088709,0.081665,0.07181,0.076316,0.076378,...,2.276474,0.117647,-0.01549,0.053722,1.73708,0.042639,0.009612,0.011403,0.041615,0.113704
1,0,1,18,18,0.088709,0.081665,0.07181,0.076316,0.076378,0.079745,...,1.924273,0.058824,-0.121106,0.094237,1.619953,0.057374,0.009612,0.016329,0.050188,0.167639
2,0,2,19,18,0.081665,0.07181,0.076316,0.076378,0.079745,0.052573,...,1.822711,0.058824,-0.040472,0.079848,1.44483,0.057374,0.009612,0.015838,0.05797,0.184517
3,0,3,20,18,0.07181,0.076316,0.076378,0.079745,0.052573,0.078968,...,1.821877,0.058824,-0.04312,0.050544,1.444103,0.057374,0.009612,0.015833,0.064619,0.194453
4,0,4,21,18,0.076316,0.076378,0.079745,0.052573,0.078968,0.089319,...,1.821487,0.058824,-0.070229,-0.006613,1.444913,0.057374,0.009614,0.015833,0.069139,0.194453


In [4]:
FEATURES_5 = [
    "volatility", "zero_change_fraction","max_abs_ret",
    "AR_1","price_range"]

pure_kappa = feature_df[feature_df["is_pure_80"] == 1]

pure_kappa.groupby("state_mode")[FEATURES_5].mean()

Unnamed: 0_level_0,volatility,zero_change_fraction,max_abs_ret,AR_1,price_range
state_mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.034483,0.028019,0.079287,0.070697,0.182906
1,0.025519,0.034179,0.058372,0.151225,0.150671
2,0.018176,0.052667,0.040757,0.162817,0.116705


## Training AE

In [5]:
X = feature_df[FEATURES_5].to_numpy().astype(np.float32)

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X).astype(np.float32)

#train test split
X_train, X_val = train_test_split(X_scaled, test_size=0.2, random_state=42)

In [6]:
ae = PriceAutoencoder(input_dim=5, latent_dim=2, hidden_dims=(16,8), latent_activation=None)
ae.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")

history = ae.fit(
    X_train, X_train,
    validation_data=(X_val, X_val),
    epochs=200,
    batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],
    verbose=1
)

Epoch 1/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.8302 - val_loss: 0.5640
Epoch 2/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3006 - val_loss: 0.2036
Epoch 3/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1829 - val_loss: 0.1661
Epoch 4/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1580 - val_loss: 0.1514
Epoch 5/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1468 - val_loss: 0.1441
Epoch 6/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1404 - val_loss: 0.1396
Epoch 7/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1363 - val_loss: 0.1364
Epoch 8/200
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1330 - val_loss: 0.1333
Epoch 9/200
[1m102/102[0m [32

## Checking Embedding

In [7]:
Z = ae.encoder(X_scaled).numpy()

validation_df = feature_df.copy()

validation_df["z1"] = Z[:, 0]
validation_df["z2"] = Z[:, 1]

In [8]:
#plotting embedding
# Focus on pure windows only for clarity
pure = validation_df[validation_df["is_pure_80"] == 1].copy()

# Map state labels for readability
state_map = {0: "Competitive", 1: "Tacit", 2: "Cartel"}
pure["state_label"] = pure["state_mode"].map(state_map)

fig = px.scatter(
    pure,
    x="z1",
    y="z2",
    color="state_label",
    color_discrete_map={
        "Competitive": "green",
        "Tacit": "orange",
        "Cartel": "red",
    },
    title="Latent Space (Pure Windows Only) Kappa Only",
    opacity=0.6
)

fig.update_layout(template="plotly_white")
fig.show()

## Calculating centroids

In [9]:
Z_pure = pure[["z1","z2"]].to_numpy()


#cacluating centroids
mu_C = pure[pure["state_mode"] == 0][["z1","z2"]].mean().to_numpy()
mu_K = pure[pure["state_mode"] == 2][["z1","z2"]].mean().to_numpy()
mu_T = pure[pure["state_mode"] == 1][["z1","z2"]].mean().to_numpy()

#calculating competition direction and scalling 
v = mu_K - mu_C
v = v / np.linalg.norm(v)

In [10]:
#projecting each coordinate on competitoin axis
Z_all = validation_df[["z1","z2"]].to_numpy()
validation_df["conduct_score"] = Z_all @ v
validation_df["conduct_score_centred"] = (Z_all - mu_C) @ v

In [11]:
validation_df.groupby("state_mode")["conduct_score"].mean()

state_mode
0   -0.123068
1    0.450820
2    1.382638
Name: conduct_score, dtype: float32

In [12]:
validation_df.groupby("state_mode")["conduct_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
state_mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,9572.0,-0.123068,1.655169,-4.815728,-0.99365,-0.328058,0.191251,13.707209
1,9494.0,0.45082,1.95677,-3.9071,-0.416771,0.051065,0.835331,14.949136
2,13534.0,1.382638,2.501534,-3.695632,-0.052977,0.388965,1.416593,17.638617


## Understanding classification

In [13]:
state_map = {0: "Competitive", 1: "Tacit", 2: "Cartel"}

pure["state_label"] = pure["state_mode"].map(state_map)
validation_df["state_label"] = validation_df["state_mode"].map(state_map)

In [14]:
fig = px.scatter(
    pure,
    x="z1",
    y="z2",
    color="state_label",
    opacity=0,
    title="Latent Space (Kappa only) with Centroids and Conduct Axis",
    template="plotly_white"
)

# Add centroid markers
centroids = np.vstack([mu_C, mu_T, mu_K])
centroid_labels = ["Competitive centroid", "Tacit centroid", "Cartel centroid"]

fig.add_trace(go.Scatter(
    x=centroids[:,0],
    y=centroids[:,1],
    mode="markers+text",
    text=centroid_labels,
    textposition="top center",
    marker=dict(size=14, symbol="x"),
    name="Centroids"
))

# Add arrow from mu_C to mu_K
fig.add_trace(go.Scatter(
    x=[mu_C[0], mu_K[0]],
    y=[mu_C[1], mu_K[1]],
    mode="lines",
    line=dict(width=4, dash="dash"),
    name="Conduct axis (C → K)"
))

# Optional: annotate arrow direction
fig.add_annotation(
    x=mu_K[0], y=mu_K[1],
    ax=mu_C[0], ay=mu_C[1],
    xref="x", yref="y", axref="x", ayref="y",
    showarrow=True, arrowhead=3, arrowsize=1.2, arrowwidth=2,
    text="C→K"
)

fig.show()

In [15]:
fig = px.histogram(
    validation_df,
    x="conduct_score_centred",
    color="state_label",
    nbins=60,
    opacity=0.5,
    barmode="overlay",
    title="Centered Conduct Score Distribution by Regime (All Windows) kappa only",
    template="plotly_white"
)
fig.show()

### Probabilty of getting classification correct

In [16]:
scores_C = validation_df[validation_df["state_mode"] == 0]["conduct_score_centred"].values
scores_K = validation_df[validation_df["state_mode"] == 2]["conduct_score_centred"].values

sep = separation_auc_like(scores_C, scores_K)
print(sep)

0.7526
