# Transformer Workflow Walkthrough

This notebook demonstrates how to scaffold data ingestion, tokenization, and experiment configuration using the `trading_transformers` toolkit. It builds a synthetic dataset, registers it in a catalog, and prepares both PatchTST and fusion model configurations.


In [1]:
from pathlib import Path
import pandas as pd

from trading_transformers.data import DataCatalog, DataSource
from trading_transformers.features import ContinuousFeatureBuilder, BrooksTokenizer
from trading_transformers.tokenizers import BrooksTokenVocabulary

# Scratch locations for every artifact this notebook writes.
TMP_ROOT = Path('notebooks/_tmp')
TMP_ROOT.mkdir(parents=True, exist_ok=True)
CSV_PATH = TMP_ROOT.joinpath('synthetic.csv')
VOCAB_PATH = TMP_ROOT.joinpath('brooks_vocab.json')

# Synthetic OHLCV data: 400 daily bars that climb by 0.05 per bar.
# NOTE: the diagnostics output recorded later in this notebook shows every row
# mapping to one and the same Brooks token, so treat this series as a
# pipeline smoke test rather than a realistic token distribution.
drift = pd.Series(range(400)).mul(0.05)
synthetic = pd.DataFrame({
    'timestamp': pd.date_range('2024-01-01', periods=400, freq='D'),
    'open': 100 + drift,
    'high': 101 + drift,
    'low': 99 + drift,
    'close': 100 + drift + 0.3,
    'volume': 1_000_000,
})

# Brooks-style tokens: tokenize the raw frame, then attach the result as a column.
brooks = BrooksTokenizer()
brooks_tokens = brooks.transform(synthetic)
synthetic['brooks_token'] = brooks_tokens

# Build the vocabulary from the token column and persist it next to the CSV.
vocab = BrooksTokenVocabulary.from_sequences(synthetic['brooks_token'])
vocab.to_json(VOCAB_PATH)

synthetic.to_csv(CSV_PATH, index=False)

# Register the CSV under the name the experiment configs refer to.
catalog = DataCatalog(root=TMP_ROOT)
synthetic_source = DataSource(name='synthetic', path=CSV_PATH, fmt='csv')
catalog.register_source(synthetic_source)

(CSV_PATH, VOCAB_PATH)


(PosixPath('notebooks/_tmp/synthetic.csv'),
 PosixPath('notebooks/_tmp/brooks_vocab.json'))

In [2]:
# Optional: preview a few engineered continuous features for the synthetic frame.
builder = ContinuousFeatureBuilder()
features = builder.transform(synthetic)
features.loc[:, ['close', 'log_return', 'hl_range']].head(5)


Unnamed: 0,close,log_return,hl_range
0,100.3,,
1,100.35,0.000498,0.01994
2,100.4,0.000498,0.01993
3,100.45,0.000498,0.01992
4,100.5,0.000498,0.01991


In [3]:
from trading_transformers.training import DataConfig, ExperimentConfig, OptimizerConfig, TrainerConfig

# PatchTST baseline: data window settings first, then the experiment wrapper.
patch_data_cfg = DataConfig(
    source='synthetic',
    features=['open', 'high', 'low', 'close', 'volume'],
    target='close',
    lookback=64,
    horizon=8,
    batch_size=64,
)

# Optimizer and trainer settings are named separately so they are easy to tweak.
patch_optimizer_cfg = OptimizerConfig(lr=1e-3)
patch_trainer_cfg = TrainerConfig(max_epochs=5, accelerator='cpu', precision='32')

patch_experiment = ExperimentConfig(
    name='notebook_patchtst',
    data=patch_data_cfg,
    # NOTE(review): input_dim=5 matches the five feature columns above;
    # presumably it must track that list — confirm against the model code.
    model=dict(type='patchtst', input_dim=5),
    optimizer=patch_optimizer_cfg,
    trainer=patch_trainer_cfg,
)
patch_experiment


ExperimentConfig(name='notebook_patchtst', data=DataConfig(source='synthetic', features=['open', 'high', 'low', 'close', 'volume'], target='close', lookback=64, horizon=8, batch_size=64, val_fraction=0.1, test_fraction=0.1, token_column=None, vocab_path=None), model={'type': 'patchtst', 'input_dim': 5}, optimizer=OptimizerConfig(lr=0.001, weight_decay=0.0001), trainer=TrainerConfig(max_epochs=5, accelerator='cpu', precision='32', gradient_clip_val=1.0, devices=None), output_dir=PosixPath('artifacts'))

In [4]:
# Fusion setup: same price window as the baseline, plus the Brooks token column
# and the vocabulary file written earlier in the notebook.
fusion_data_cfg = DataConfig(
    source='synthetic',
    features=['open', 'high', 'low', 'close', 'volume'],
    target='close',
    lookback=64,
    horizon=8,
    batch_size=64,
    token_column='brooks_token',
    vocab_path=str(VOCAB_PATH),
)

# Model hyperparameters are passed through as a plain dict.
fusion_model_spec = dict(type='fusion', d_model=128, nheads=4, depth=2)
fusion_optimizer_cfg = OptimizerConfig(lr=1e-3)
fusion_trainer_cfg = TrainerConfig(max_epochs=5, accelerator='cpu', precision='32')

fusion_experiment = ExperimentConfig(
    name='notebook_fusion',
    data=fusion_data_cfg,
    model=fusion_model_spec,
    optimizer=fusion_optimizer_cfg,
    trainer=fusion_trainer_cfg,
)
fusion_experiment


ExperimentConfig(name='notebook_fusion', data=DataConfig(source='synthetic', features=['open', 'high', 'low', 'close', 'volume'], target='close', lookback=64, horizon=8, batch_size=64, val_fraction=0.1, test_fraction=0.1, token_column='brooks_token', vocab_path='notebooks/_tmp/brooks_vocab.json'), model={'type': 'fusion', 'd_model': 128, 'nheads': 4, 'depth': 2}, optimizer=OptimizerConfig(lr=0.001, weight_decay=0.0001), trainer=TrainerConfig(max_epochs=5, accelerator='cpu', precision='32', gradient_clip_val=1.0, devices=None), output_dir=PosixPath('artifacts'))

In [5]:
# Optional step: pull real archives from ../data (SHARADAR bundles, etc.) into
# the catalog and persist the catalog as JSON. Skipped when ../data is absent.
from trading_transformers.data import auto_register_archives

DATA_DIR = Path('../data')
if not DATA_DIR.exists():
    print('No external data directory found; using synthetic catalog only.')
else:
    auto_register_archives(catalog, DATA_DIR)
    catalog_path = TMP_ROOT / 'catalog.json'
    catalog.to_json(catalog_path)
    print('Catalog saved to', catalog_path)
    first_five = catalog.list_archives()[:5]
    print('Available archives (first five):', first_five)


Catalog saved to notebooks/_tmp/catalog.json
Available archives (first five): ['SHARADAR_DAILY_3_1c00e922d0fc2ccdfae0e4c5271349a4', 'SHARADAR_SEP_2_0afbc06bfa7d2d5ebd28c43e0940ec30', 'SHARADAR_SF1_017f04a0d2ef7cc409f920be72167ada', 'SHARADAR_SF2_6ae86d850a382c2a8a24c5daa109c39b', 'SHARADAR_SF3_ce320d02f19d0b5d04c9557e0bc16680']


In [6]:
# Peek inside the first registered archive, but only when it is small enough
# to unpack comfortably inside a notebook session.
MAX_EXTRACT_MB = 250  # archives at or above this size are reported, not extracted

# Query the catalog once and reuse the result for both the emptiness check
# and the selection of the first archive.
archive_names = catalog.list_archives()
if archive_names:
    archive_name = archive_names[0]
    archive_path = Path(catalog.archives[archive_name].path)
    size_mb = archive_path.stat().st_size / (1024 * 1024)  # bytes -> MiB
    if size_mb < MAX_EXTRACT_MB:
        folder = catalog.extract_archive(archive_name)
        print('Extracted to', folder)
        sample_files = sorted(f.name for f in folder.glob('*'))[:5]
        print('Sample files:', sample_files)
    else:
        print(f'Skipping extraction of {archive_name} ({size_mb:.1f} MB)')
else:
    # Without this branch the cell is silent when nothing is registered,
    # which is indistinguishable from the cell not having been run.
    print('No archives registered; nothing to extract.')


Skipping extraction of SHARADAR_DAILY_3_1c00e922d0fc2ccdfae0e4c5271349a4 (607.1 MB)


In [7]:
# Diagnostics for fusion tokens
# Passes the synthetic frame and the fusion data config to the project's
# token report helper and displays whatever it returns.
# NOTE(review): the output recorded for this cell lists a single unique token
# across all 400 rows (entropy -0.0), so the synthetic series does not
# exercise the token vocabulary; use noisier data for a meaningful report.
from trading_transformers.evaluation.diagnostics import fusion_token_report
fusion_report = fusion_token_report(synthetic, fusion_data_cfg)
fusion_report


{'token_stats': {'total_tokens': 400,
  'unique_tokens': 1,
  'entropy': -0.0,
  'top_tokens': [('bull|bodyNA|tailNA|trend_up', 400)]},
 'sample_tokens': ['bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up',
  'bull|bodyNA|tailNA|trend_up'],
 'vocab_size': 3}

In [8]:
# Training requires PyTorch Lightning and torch.
# Uncomment once dependencies are installed.
#
# NOTE(review): every line in this cell is commented out, so the output
# recorded beneath it appears to come from an earlier run made while the code
# was active. That run reached `max_epochs=5`, restored the checkpoint, and
# then stopped with
#   MisconfigurationException: `test_dataloader` must be implemented to be
#   used with the Lightning Trainer
# Presumably the failure originates inside `runner.run()`; confirm and clear
# the stale output before relying on this cell.
# from trading_transformers.training import ExperimentRunner
# runner = ExperimentRunner(config=fusion_experiment, catalog=catalog)
# trainer = runner.run()
# if runner.report.get('token_stats', {}).get('total_tokens', 0) > 0:
#     print('Diagnostics from runner:', runner.report)


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/mohamedali/trading_project/models/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/mohamedali/trading_project/models/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip ins

Sanity Checking: |                                                                                | 0/? [00:00…

/Users/mohamedali/trading_project/models/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/mohamedali/trading_project/models/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/mohamedali/trading_project/models/.venv/lib/python3.13/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                       | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

`Trainer.fit` stopped: `max_epochs=5` reached.
Restoring states from the checkpoint path at artifacts/lightning_logs/version_1/checkpoints/epoch=4-step=25.ckpt
Loaded model weights from the checkpoint at artifacts/lightning_logs/version_1/checkpoints/epoch=4-step=25.ckpt


MisconfigurationException: `test_dataloader` must be implemented to be used with the Lightning Trainer

## Next Steps
- Swap the synthetic dataset with your catalog source and rerun diagnostics (`fusion_token_report`).
- Use the CLI: `python -m trading_transformers.training --config transformers/configs/fusion.yaml --catalog notebooks/_tmp/catalog.json --diagnostics fusion_report.json`.
- Feed model forecasts into `python -m trading_transformers.backtest` for P&L assessment.
