# Transformer Workflow Walkthrough

This notebook demonstrates how to scaffold data ingestion, tokenization, and experiment configuration using the `trading_transformers` toolkit. It builds a synthetic dataset, registers it in a catalog, and prepares both PatchTST and fusion model configurations.


In [None]:
from pathlib import Path
import pandas as pd

from trading_transformers.data import DataCatalog, DataSource
from trading_transformers.features import ContinuousFeatureBuilder, BrooksTokenizer
from trading_transformers.tokenizers import BrooksTokenVocabulary

# Scratch directory for every artifact this notebook writes.
TMP_ROOT = Path('notebooks/_tmp')
TMP_ROOT.mkdir(parents=True, exist_ok=True)
CSV_PATH = TMP_ROOT.joinpath('synthetic.csv')
VOCAB_PATH = TMP_ROOT.joinpath('brooks_vocab.json')

# Synthetic OHLCV data: 400 daily bars whose prices follow a linear ramp of
# 0.05 per bar; the ramp is built once and shared by all price columns.
bar_count = 400
ramp = pd.Series(range(bar_count)).mul(0.05)
synthetic = pd.DataFrame({
    'timestamp': pd.date_range('2024-01-01', periods=bar_count, freq='D'),
    'open': 100 + ramp,
    'high': 101 + ramp,
    'low': 99 + ramp,
    'close': 100 + ramp + 0.3,  # close sits 0.3 above the open on every bar
    'volume': 1_000_000,
})

# Brooks-style tokens + vocabulary: the tokenizer output is stored as a new
# column, and the vocabulary built from that column is written to VOCAB_PATH.
brooks = BrooksTokenizer()
brooks_tokens = brooks.transform(synthetic)
synthetic['brooks_token'] = brooks_tokens
vocab = BrooksTokenVocabulary.from_sequences(synthetic['brooks_token'])
vocab.to_json(VOCAB_PATH)

# Persist the frame (token column included) so the catalog source points at a real file.
synthetic.to_csv(CSV_PATH, index=False)

catalog = DataCatalog(root=TMP_ROOT)
synthetic_source = DataSource(name='synthetic', path=CSV_PATH, fmt='csv')
catalog.register_source(synthetic_source)

# Cell output: the two artifact paths.
(CSV_PATH, VOCAB_PATH)


In [None]:
# Continuous feature inspection (optional)
# Transform the synthetic frame and preview a few of the resulting columns.
builder = ContinuousFeatureBuilder()
features = builder.transform(synthetic)
preview_columns = ['close', 'log_return', 'hl_range']
features[preview_columns].head()


In [None]:
from trading_transformers.training import DataConfig, ExperimentConfig, OptimizerConfig, TrainerConfig

# PatchTST baseline configuration
# The feature list is named once so the model's `input_dim` is derived from it
# (five columns -> input_dim 5) instead of being typed separately.
patch_feature_columns = ['open', 'high', 'low', 'close', 'volume']
patch_data_cfg = DataConfig(
    source='synthetic',
    features=patch_feature_columns,
    target='close',
    lookback=64,
    horizon=8,
    batch_size=64,
)
patch_model_spec = {'type': 'patchtst', 'input_dim': len(patch_feature_columns)}
patch_optimizer_cfg = OptimizerConfig(lr=1e-3)
patch_trainer_cfg = TrainerConfig(max_epochs=5, accelerator='cpu', precision='32')
patch_experiment = ExperimentConfig(
    name='notebook_patchtst',
    data=patch_data_cfg,
    model=patch_model_spec,
    optimizer=patch_optimizer_cfg,
    trainer=patch_trainer_cfg,
)
patch_experiment


In [None]:
# Fusion configuration leveraging Brooks tokens
# Same windowing as a plain OHLCV setup, plus the token column and the
# vocabulary file that describe the Brooks tokens.
fusion_feature_columns = ['open', 'high', 'low', 'close', 'volume']
fusion_data_cfg = DataConfig(
    source='synthetic',
    features=fusion_feature_columns,
    target='close',
    lookback=64,
    horizon=8,
    batch_size=64,
    token_column='brooks_token',
    vocab_path=str(VOCAB_PATH),  # passed as a plain string rather than a Path
)
fusion_model_spec = {
    'type': 'fusion',
    'd_model': 128,
    'nheads': 4,
    'depth': 2,
}
fusion_optimizer_cfg = OptimizerConfig(lr=1e-3)
fusion_trainer_cfg = TrainerConfig(max_epochs=5, accelerator='cpu', precision='32')
fusion_experiment = ExperimentConfig(
    name='notebook_fusion',
    data=fusion_data_cfg,
    model=fusion_model_spec,
    optimizer=fusion_optimizer_cfg,
    trainer=fusion_trainer_cfg,
)
fusion_experiment


In [None]:
# Optionally auto-register real archives from ../data (SHARADAR bundles, etc.)
from trading_transformers.data import auto_register_archives

# NOTE(review): DATA_DIR is resolved against the kernel's working directory,
# just like TMP_ROOT ('notebooks/_tmp') -- confirm both assume the same cwd.
DATA_DIR = Path('../data')

# `is_dir()` rather than `exists()`: a stray *file* named `data` must not be
# handed to the archive scanner.
if DATA_DIR.is_dir():
    auto_register_archives(catalog, DATA_DIR)
    print('Available archives (first five):', catalog.list_archives()[:5])
else:
    print('No external data directory found; using synthetic catalog only.')

# Persist the catalog in both cases: the CLI command under "Next Steps" reads
# notebooks/_tmp/catalog.json, so the file has to exist (and be current) even
# when only the synthetic source is registered.
catalog_path = TMP_ROOT / 'catalog.json'
catalog.to_json(catalog_path)
print('Catalog saved to', catalog_path)


In [None]:
# Peek into the first registered archive if present (non-destructive)
# The archive list is fetched once so the emptiness check and the lookup of
# the first entry operate on the same result.
registered_archives = catalog.list_archives()
if registered_archives:
    archive_name = registered_archives[0]
    folder = catalog.extract_archive(archive_name)
    print('Extracted to', folder)
    # Show at most five entry names, sorted so the listing is stable across runs.
    sample_files = sorted(f.name for f in folder.glob('*'))[:5]
    print('Sample files:', sample_files)


In [None]:
# Diagnostics for fusion tokens
# Runs `fusion_token_report` on the synthetic frame together with the fusion
# data configuration (both defined in earlier cells); the resulting report is
# kept in `fusion_report` and displayed as the cell output.
from trading_transformers.evaluation.diagnostics import fusion_token_report
fusion_report = fusion_token_report(synthetic, fusion_data_cfg)
fusion_report


In [None]:
# Training requires PyTorch Lightning and torch.
# Uncomment once dependencies are installed.
# from trading_transformers.training import ExperimentRunner
# runner = ExperimentRunner(config=fusion_experiment, catalog=catalog)
# trainer = runner.run()
# trainer.test()
# print('Diagnostics from runner:', runner.report)


## Next Steps
- Swap the synthetic dataset with your catalog source and rerun diagnostics (`fusion_token_report`).
- Use the CLI: `python -m trading_transformers.training --config transformers/configs/fusion.yaml --catalog notebooks/_tmp/catalog.json --diagnostics fusion_report.json`.
- Feed model forecasts into `python -m trading_transformers.backtest` for P&L assessment.
