In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers

# Read the history CSV (semicolon-separated, double-quote quoting, comma as the
# decimal mark), parse the "date" column as datetimes and order the rows by date.
data_path = Path("../data/crobex_history.csv")
df = pd.read_csv(
    data_path,
    sep=";",
    quotechar='"',
    decimal=",",
    parse_dates=["date"],
).sort_values("date")

# Modelling series: the non-null last_value entries of df as a float32 column
# vector of shape (n, 1), rescaled by a MinMaxScaler (default settings) that is
# fitted on that same series.
last_values = df["last_value"].dropna()
ts = last_values.values.astype(np.float32).reshape(-1, 1)
scaler = MinMaxScaler().fit(ts)
ts_scaled = scaler.transform(ts)

# Sliding windows over the scaled series: each sample in X holds seq_len
# consecutive values, and the matching entry of y is the value right after it.
seq_len = 20
window_starts = range(len(ts_scaled) - seq_len)
X = np.array([ts_scaled[start : start + seq_len] for start in window_starts])
y = np.array([ts_scaled[start + seq_len, 0] for start in window_starts])

# Minimal RNN: learn to predict the next scaled value from the previous seq_len values.
#
# Seed the global RNGs before building the model. Weight initialisation and the
# per-epoch batch shuffling in fit() are random, so without a seed every
# Restart & Run All trains a different model, which changes the prediction
# errors, the percentile threshold and the set of points flagged as anomalies.
# NOTE(review): some GPU kernels may still be non-deterministic; confirm on the
# target hardware if bit-exact reruns are required.
SEED = 42
keras.utils.set_random_seed(SEED)  # seeds Python's random, NumPy and TensorFlow

model = keras.Sequential([
    layers.Input(shape=(seq_len, 1)),  # one feature per time step
    layers.SimpleRNN(16, return_sequences=False),  # keep only the final hidden state
    layers.Dense(1),  # single regression output: the next value
])
model.compile(optimizer="adam", loss="mse")
# Trained on every window (no hold-out split); verbose=0 keeps the cell output clean.
model.fit(X, y, epochs=15, batch_size=32, verbose=0)

# Score each window by its absolute one-step prediction error (scaled units);
# errors at or above the 95th percentile of all errors count as anomalies.
pred = model.predict(X, verbose=0).flatten()
err = np.abs(y - pred)
threshold = np.percentile(err, 95)

# The mask spans the full series. Only the positions that have a prediction
# (from index seq_len onwards) can be True; the leading seq_len stay False.
scored = slice(seq_len, seq_len + len(err))
anomaly_mask = np.zeros(len(ts_scaled), dtype=bool)
anomaly_mask[scored] = err >= threshold

# Dates of the rows whose last_value is not null, i.e. the rows kept by the
# dropna() that produced the modelling series.
has_value = df["last_value"].notna()
dates = df.loc[has_value, "date"].values

# Series and predictions mapped back through the scaler; pred_full is NaN for
# the first seq_len positions, which have no prediction.
orig = scaler.inverse_transform(ts_scaled).flatten()
pred_rescaled = scaler.inverse_transform(pred.reshape(-1, 1)).flatten()
pred_full = np.full(len(ts_scaled), np.nan)
pred_full[seq_len : seq_len + len(pred)] = pred_rescaled

In [27]:
# One figure with three layers: the observed series, the RNN's one-step
# predictions, and red markers on the points flagged by anomaly_mask.
series_trace = go.Scatter(
    x=dates,
    y=orig,
    name="crobex",
    mode="lines",
    line={"color": "royalblue"},
)
prediction_trace = go.Scatter(
    x=dates,
    y=pred_full,
    name="rnn prediction",
    mode="lines",
    line={"color": "orange", "dash": "dash"},
)
anomaly_trace = go.Scatter(
    x=dates[anomaly_mask],
    y=orig[anomaly_mask],
    name="anomalies",
    mode="markers",
    marker={"size": 10, "color": "red", "symbol": "x"},
)

fig = go.Figure()
# Add in this order so the anomaly markers are drawn on top of both lines.
for trace in (series_trace, prediction_trace, anomaly_trace):
    fig.add_trace(trace)

fig.update_layout(
    title="high prediction error = anomaly",
    xaxis_title="date",
    yaxis_title="last_value",
    hovermode="x unified",
    height=400,
)
fig.show()

In [28]:
# Error metrics on the original scale: predictions and targets are both mapped
# back through the scaler before the absolute error is taken.
pred_orig, y_orig = (
    scaler.inverse_transform(scaled.reshape(-1, 1)).flatten() for scaled in (pred, y)
)
err_orig = np.abs(y_orig - pred_orig)

mae = err_orig.mean()
rmse = np.sqrt(np.square(err_orig).mean())
# The anomaly count uses the scaled error and scaled threshold, as the mask does.
n_anomalies = (err >= threshold).sum()
# Same 95th percentile, recomputed on the original-scale errors for reporting.
threshold_orig = np.percentile(err_orig, 95)

print(f"MAE (original scale): {mae:.2f}")
print(f"RMSE (original scale): {rmse:.2f}")
print(f"Threshold (95th percentile, scaled): {threshold:.4f}")
print(f"Threshold (95th percentile, original scale): {threshold_orig:.2f}")
print(f"Number of points flagged as anomalies: {n_anomalies} ({100*n_anomalies/len(err):.1f}%)")

MAE (original scale): 14.73
RMSE (original scale): 18.45
Threshold (95th percentile, scaled): 0.0189
Threshold (95th percentile, original scale): 33.68
Number of points flagged as anomalies: 199 (5.0%)
