# Test DataTransformer

Interactive notebook for testing the `DataTransformer` pipeline and its component classes.

In [6]:
import sys
from pathlib import Path

sys.path.append('src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from config import ModelConfig, TargetSpec, VariableSpec, ModelSpec, BacktestSpec
from transform import (
    DataLoader,
    FrequencyInferrer,
    Transformer,
    TrendRemover,
    LagFeatureBuilder,
    DataTransformer,
    TransformedData,
)

## 1. Explore raw data

In [None]:
# First look at the raw parquet file, read directly with pandas.
# NOTE(review): DATA_PATH is not assigned in any cell visible in this notebook —
# confirm a configuration cell defines it before this cell runs.
raw_df = pd.read_parquet(DATA_PATH)

print(f"Shape: {raw_df.shape}")
print(f"\nColumns: {raw_df.columns.tolist()}")

# Number of distinct series, followed by the row count of each one.
series_names = raw_df['internal_series_name']
print(f"\nSeries: {series_names.nunique()} unique")
print(series_names.value_counts())

raw_df.head()

In [None]:
# Show the dtype pandas assigned to each column of the raw frame.
raw_df.dtypes

## 2. Test DataLoader

In [None]:
# Load the data through DataLoader, then deduplicate it, reporting the
# frame shape after each step and how many rows deduplication dropped.
loader = DataLoader(DATA_PATH)

df = loader.load()
print(f"Loaded: {df.shape}")

df_dedup = loader.deduplicate(df)
print(f"After dedup: {df_dedup.shape}")

rows_before, rows_after = len(df), len(df_dedup)
print(f"Rows removed: {rows_before - rows_after}")

In [None]:
# Vintage creation: build a vintage of the loaded (non-deduplicated) frame
# for a fixed as-of date and report its shape.
as_of = pd.Timestamp("2023-06-01")
df_vintage = loader.create_vintage(df, as_of)
print(f"Vintage as of {as_of.date()}: {df_vintage.shape}")

# Latest value_date present for each series in the vintage, sorted ascending.
print(f"\nMax value_date per series:")
latest_value_date = (
    df_vintage
    .groupby("internal_series_name")["value_date"]
    .max()
    .sort_values()
)
print(latest_value_date)

## 3. Test FrequencyInferrer

In [None]:
# Infer the frequency of every series in the deduplicated data and print it
# together with the number of observations.
inferrer = FrequencyInferrer()

# A single groupby pass replaces re-filtering the whole frame once per series.
# sort=False yields the series in order of first appearance (the same order
# `unique()` gives); observed=True skips unused categories if the column is
# categorical. Rows with a missing series name are skipped by groupby.
dates_by_series = df_dedup.groupby(
    "internal_series_name", sort=False, observed=True
)["value_date"]

for series_name, dates in dates_by_series:
    freq = inferrer.infer_frequency(dates)
    print(f"{series_name}: {freq} ({len(dates)} obs)")

## 4. Test Transformer

In [None]:
# Select one series to exercise the transformation classes on.
test_series_name = "hicp_dk_dst"
is_test_series = df_dedup["internal_series_name"] == test_series_name
test_df = df_dedup.loc[is_test_series].copy()

# Values of that series indexed by value_date, sorted by the index.
test_series = test_df.set_index("value_date")["value"].sort_index()

print(f"Test series: {test_series_name}")
first_date, last_date = test_series.index.min(), test_series.index.max()
print(f"Length: {len(test_series)}, Range: {first_date} to {last_date}")

# Trailing semicolon suppresses the Axes repr in the cell output.
test_series.plot(title=f"{test_series_name} (raw)");

In [None]:
# Apply each transformation name listed below to the test series and print
# the resulting length, NaN count and mean of the non-missing values.
t = Transformer()
transformation_names = ("none", "log", "diff", "12m_diff", "log_diff")

for tfm in transformation_names:
    transformed = t.apply_transformation(test_series, tfm)
    n_missing = transformed.isna().sum()
    mean_value = transformed.dropna().mean()
    print(f"{tfm:12s} -> len={len(transformed)}, NaN={n_missing}, mean={mean_value:.4f}")

## 5. Test TrendRemover

In [None]:
# Plot the trend estimated by each TrendRemover method against the original
# series, and check that the fitted trend can be extrapolated 12 periods.
# `plt` comes from the notebook's top import cell.
trend_methods = ["linear", "moving_average"]

# One panel per method, so no spare axis has to be created and then hidden.
# squeeze=False keeps `axes` two-dimensional even if only one method is listed.
fig, axes = plt.subplots(
    1, len(trend_methods), figsize=(5 * len(trend_methods), 4), squeeze=False
)

for ax, method in zip(axes[0], trend_methods):
    tr = TrendRemover(method=method)
    detrended, trend = tr.remove_trend(test_series.dropna())
    ax.plot(test_series.dropna(), label="original", alpha=0.7)
    ax.plot(trend, label="trend", linewidth=2)
    ax.set_title(f"Trend: {method}")
    ax.legend()

    # Test extrapolation
    extrap = tr.extrapolate_trend(12)
    print(f"{method}: extrapolated 12 periods -> shape={extrap.shape}")

# Trailing semicolon suppresses the return value in the cell output.
plt.tight_layout();

## 6. Test LagFeatureBuilder

In [None]:
# Build lag features from the test series with a fixed set of settings.
lag_settings = dict(n_lags=3, horizon=6, publication_lag=1, feature_name="hicp")
lag_df = LagFeatureBuilder.create_lag_features(series=test_series, **lag_settings)

print(f"Columns: {lag_df.columns.tolist()}")
print(f"Shape: {lag_df.shape}")

# Display the first rows that contain no missing values.
lag_df.dropna().head()

## 7. Test full DataTransformer pipeline

In [None]:
# Load the model configuration from YAML and validate it before use.
# NOTE(review): CONFIG_PATH is not assigned in any cell visible in this
# notebook — confirm a configuration cell defines it before this cell runs.
config = ModelConfig.from_yaml(CONFIG_PATH)
config.validate()

# Summarise the target, the number of features and the configured horizons.
target = config.target
print(f"Target: {target.internal_series_name} ({target.transformation})")
print(f"Features: {len(config.features)}")
print(f"Horizons: {config.model.horizons}")

In [None]:
# Run the full pipeline for a single horizon and summarise what it returns.
dt = DataTransformer(config, DATA_PATH)

horizon = 6
result = dt.prepare_data(horizon=horizon)

print(f"Horizon: {horizon}")
print(f"X shape: {result.X.shape}")
print(f"y shape: {result.y.shape}")
print(f"Features: {result.feature_names}")

result_dates = result.dates
print(f"Date range: {result_dates.min()} to {result_dates.max()}")
print(f"Metadata: {result.metadata}")

In [None]:
# Count NaN and infinite entries in the prepared feature matrix and target.
X, y = result.X, result.y

print(f"X NaN: {np.isnan(X).sum()}")
print(f"X inf: {np.isinf(X).sum()}")
print(f"y NaN: {np.isnan(y).sum()}")
print(f"y inf: {np.isinf(y).sum()}")

In [None]:
# Same pipeline call, restricted to a vintage via as_of_date.
vintage_as_of = pd.Timestamp("2023-01-01")
result_vintage = dt.prepare_data(horizon=6, as_of_date=vintage_as_of)

print(f"Vintage X shape: {result_vintage.X.shape}")
vintage_dates = result_vintage.dates
print(f"Vintage date range: {vintage_dates.min()} to {vintage_dates.max()}")

In [None]:
# Run the pipeline once per configured horizon and print one aligned
# summary line (shapes and date span) for each.
for h in config.model.horizons:
    r = dt.prepare_data(horizon=h)
    x_shape, y_shape = str(r.X.shape), str(r.y.shape)
    start, end = r.dates.min().date(), r.dates.max().date()
    print(f"h={h:2d}: X={x_shape:15s} y={y_shape:10s} dates={start} to {end}")

In [None]:
# Wrap the feature matrix in a labelled DataFrame (dates as the index,
# feature names as the columns) and show summary statistics per feature.
feature_df = pd.DataFrame(
    data=result.X,
    index=result.dates,
    columns=result.feature_names,
)
feature_df.describe()