# HDD 3Hz Backbone Notebook
End-to-end pipeline: scan -> split -> index -> normalize -> train encoder -> evaluation stub (embedding extraction) -> latent PCA and diagnostics -> Stage 2 KMeans clustering.

This notebook orchestrates the workflow while keeping heavy logic in .py modules.


In [1]:
from pathlib import Path
import json
import os
import random
import numpy as np
import torch
from hdd_dataset import (
    scan_dataset,
    build_session_splits,
    build_window_index,
    compute_normalization,
    HDDWindowDataset,
    CHANNEL_NAMES,
    FS_HZ,
    DEFAULT_WINDOW,
    DEFAULT_HOP_TRAIN,
    DEFAULT_HOP_INFER,
)
import train_encoder
import extract_embeddings
from models import build_model

def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Covers the PYTHONHASHSEED environment variable, Python's ``random``
    module, NumPy's global RNG and the torch generators, then switches cuDNN
    to deterministic mode with autotuning disabled.

    Args:
        seed: Value handed to each seeding call (stored as a string in the
            environment variable).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: stdlib first, then NumPy, then torch (CPU generator).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        # Seed the generators of every visible CUDA device.
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Run configuration. The whole mapping is handed to train_encoder.train(), and
# individual entries are reused by the later sanity-check and extraction cells.
config = dict(
    # Which model build_model() should construct.
    model_name='tcn_ae',
    # Input artifacts and raw-data locations (relative to the working directory).
    train_index='artifacts/index_train_sparse.jsonl',
    val_index='artifacts/index_val_sparse.jsonl',
    sensor_dir='20200710_sensors/sensor',
    label_dir='20200710_labels/target',
    normalization='artifacts/normalization.json',
    # Window length, taken from the dataset module's default.
    window=DEFAULT_WINDOW,
    # Architecture settings forwarded to build_model() in the sanity-check cell.
    embedding_dim=20,
    hidden_channels=32,
    num_layers=3,
    kernel_size=3,
    dropout=0.0,
    # Training-loop settings; their exact interpretation lives in train_encoder.
    batch_size=128,
    epochs=50,
    lr=1e-3,
    weight_decay=1e-4,
    max_steps_per_epoch=0,
    num_workers=0,
    cache_size=32,
    # Reproducibility and runtime.
    seed=123,
    device='auto',
    amp=False,
    # Output location; the extraction cell looks for config.json in this directory.
    out_dir='artifacts/encoder',
)

# Seed all RNGs before any data or model work happens.
set_seed(config['seed'])

# Every artifact produced by this notebook is written below this directory.
artifacts_dir = Path('artifacts')
artifacts_dir.mkdir(exist_ok=True)

# Resolve the configured locations and window length once for the cells below.
sensor_dir, label_dir = (Path(config[key]) for key in ('sensor_dir', 'label_dir'))
window = int(config['window'])

# Hop sizes for the sparse and dense window indices built in step 3.
hop_sparse, hop_dense = DEFAULT_HOP_TRAIN, DEFAULT_HOP_INFER


In [2]:
# 1) Dataset scan and validation
# scan_dataset() is given the sensor/label directories and the window length;
# its result is persisted as JSON so it can be inspected without rescanning.
summary = scan_dataset(sensor_dir, label_dir, window=window)
scan_path = artifacts_dir / 'scan_summary.json'
# Explicit UTF-8 keeps the artifact independent of the platform default encoding
# and matches how the window-index files are written in step 3.
scan_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
print(f'Summary saved to {scan_path}')


Sessions with sensors+labels: 137
Lengths (frames): min 792, max 30495, mean 8199.96
Total duration: 6241.08 minutes
Label inventory: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Summary saved to artifacts\scan_summary.json


In [3]:
# 2) Session-level splits (by session_id only)
# Sorting the session ids gives build_session_splits() the same input order on
# every run; iterating the mapping directly already yields its keys.
session_ids = sorted(summary['per_session_label_counts'])
splits = build_session_splits(session_ids, seed=config['seed'])
split_path = artifacts_dir / 'splits.json'
# Explicit UTF-8 keeps the artifact independent of the platform default encoding
# and matches how the window-index files are written in step 3.
split_path.write_text(json.dumps(splits, indent=2), encoding='utf-8')
print(f'Splits saved to {split_path}')
# The 'seed' entry is bookkeeping rather than a list of sessions, so skip it.
print({k: len(v) for k, v in splits.items() if k != 'seed'})

Splits saved to artifacts\splits.json
{'train': 95, 'val': 20, 'test': 22}


In [4]:
# 3) Build window indices (sparse uses hop_sparse, dense uses hop_dense;
#    original note: sparse=hop=3, dense=hop=1)
def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines (one object per line)."""
    with path.open('w', encoding='utf-8') as handle:
        handle.writelines(json.dumps(record) + '\n' for record in records)


index_paths = {}
for split_name, ids in splits.items():
    # 'seed' is stored alongside the splits but is not a list of session ids.
    if split_name == 'seed':
        continue
    # Build both indices first, then persist them next to the other artifacts.
    idx_sparse = build_window_index(ids, sensor_dir, window=window, hop=hop_sparse)
    idx_dense = build_window_index(ids, sensor_dir, window=window, hop=hop_dense)
    sparse_path = artifacts_dir / f'index_{split_name}_sparse.jsonl'
    dense_path = artifacts_dir / f'index_{split_name}_dense.jsonl'
    _write_jsonl(sparse_path, idx_sparse)
    _write_jsonl(dense_path, idx_dense)
    # Remember where each index landed; the sanity-check cell reads from here.
    index_paths[split_name] = {'sparse': sparse_path, 'dense': dense_path}
    print(f"{split_name}: sparse={len(idx_sparse)} dense={len(idx_dense)}")


train: sparse=254334 dense=762894
val: sparse=76350 dense=229029
test: sparse=43055 dense=129143


In [5]:
# 4) Normalization (train sessions only)
# Only the training sessions are passed in, so validation/test data do not
# influence the normalization that is computed here.
normalization = compute_normalization(splits['train'], sensor_dir)
norm_path = artifacts_dir / 'normalization.json'
# Explicit UTF-8 keeps the artifact independent of the platform default encoding
# and matches how the window-index files are written in step 3.
norm_path.write_text(json.dumps(normalization, indent=2), encoding='utf-8')
print(f'Normalization saved to {norm_path}')
# .get() prints None instead of raising if either entry is absent.
print('binary_channels:', normalization.get('binary_channels'))
print('non_normalized_channels:', normalization.get('non_normalized_channels'))


Normalization saved to artifacts\normalization.json
binary_channels: ['lturn', 'rturn']
non_normalized_channels: ['lturn', 'rturn']


In [None]:
# 5) Train encoder (self-supervised)
# The complete config mapping goes to the training module; the returned mapping
# is kept because later cells read its 'best_path' entry to locate the weights.
train_result = train_encoder.train(config)
print(f"Best checkpoint: {train_result['best_path']}")

In [None]:
# 6) Sanity recon + embedding shape
# Use CUDA when it is available unless the config pins the run to the CPU.
use_cuda = torch.cuda.is_available() and config['device'] != 'cpu'
device = torch.device('cuda' if use_cuda else 'cpu')

# Rebuild the architecture from the same config entries used for training.
model = build_model(
    config['model_name'],
    in_channels=8,  # NOTE(review): hard-coded; presumably len(CHANNEL_NAMES) - confirm
    latent_dim=config['embedding_dim'],
    **{
        name: config[name]
        for name in ('hidden_channels', 'num_layers', 'kernel_size', 'dropout')
    },
)

# Load the best checkpoint straight onto the target device and switch to eval mode.
state = torch.load(train_result['best_path'], map_location=device)
model.load_state_dict(state)
model.to(device).eval()

# One validation window (sparse index), normalized, without labels.
ds = HDDWindowDataset(
    index_paths['val']['sparse'],
    sensor_dir=sensor_dir,
    label_dir=None,
    window=window,
    normalization=normalization,
    return_label=False,
    cache_size=8,
    to_tensor=True,
)
x, meta = ds[0]
# Add a leading batch axis before moving the window to the model's device.
x = x.unsqueeze(0).to(device)
with torch.no_grad():
    # The model call is unpacked as (reconstruction, latent code).
    recon, z = model(x)
print('x shape:', x.shape, 'recon:', recon.shape, 'z:', z.shape)


In [None]:
# 7) Plot one reconstruction (input vs recon)
import matplotlib.pyplot as plt

# Drop the batch axis and move both tensors to host memory as NumPy arrays.
x_np, r_np = (tensor.squeeze(0).cpu().numpy() for tensor in (x, recon))

# Axis 0 is treated as time (sample index divided by FS_HZ gives seconds);
# axis 1 is indexed once per subplot below.
time_axis = np.arange(x_np.shape[0]) / FS_HZ

fig, axes = plt.subplots(4, 2, figsize=(10, 8), sharex=True)
for i, ax in enumerate(axes.flat):
    ax.plot(time_axis, x_np[:, i], label='input')
    ax.plot(time_axis, r_np[:, i], label='recon', alpha=0.7)
    ax.set_title(CHANNEL_NAMES[i])
    ax.grid(True, alpha=0.3)

# The x axis is shared, so only the bottom row carries the time label.
for bottom_ax in axes[-1]:
    bottom_ax.set_xlabel('Time (s)')
# A single legend in the first panel is enough for the whole grid.
axes[0, 0].legend(loc='upper right', fontsize=8)
plt.tight_layout()
plt.show()


In [None]:
# 8) Evaluation stub (embeddings extraction)
# Disabled by default; flip the flag to write embeddings for every split.
RUN_EXTRACTION = False
if RUN_EXTRACTION:
    # Reuse the training run's locations so extraction sees the same data,
    # normalization and model configuration.
    extract_config = dict(
        splits='train,val,test',
        index_dir='artifacts',
        sensor_dir=config['sensor_dir'],
        normalization=config['normalization'],
        weights=train_result['best_path'],
        config=str(Path(config['out_dir']) / 'config.json'),
        out_dir='artifacts/embeddings',
        batch_size=256,
        num_workers=0,
        device=config['device'],
    )
    extract_embeddings.extract(extract_config)


In [36]:
# 9) Latent PCA health check (per-split)
RUN_PCA = True  # enabled for this run; intended default is False
if RUN_PCA:
    import sys
    import importlib
    import scripts.latent_pca_viz as latent_pca_viz
    # Reload so edits to the script are picked up without restarting the kernel.
    importlib.reload(latent_pca_viz)
    from scripts.latent_pca_viz import main as latent_pca_main

    # latent_pca_main() is called without arguments, so its options are supplied
    # through sys.argv. Keep the kernel's own argv and put it back afterwards
    # (even on failure) so the override does not leak into later cells.
    saved_argv = sys.argv
    try:
        for split in ["train", "val", "test"]:
            # NOTE(review): the checkpoint path and seed repeat values from
            # `config` (out_dir, seed); keep them in sync if the config changes.
            sys.argv = [
                "latent_pca_viz.py",
                "--ckpt", "artifacts/encoder/weights_best.pt",
                "--split", split,
                "--batch_size", "256",
                "--seed", "123",
                "--out_dir", "artifacts/latent_viz",
                "--out_npz", f"artifacts/latent_viz/{split}_pca.npz",
            ]
            latent_pca_main()
    finally:
        sys.argv = saved_argv


Determinism check max|z1-z2|: 0.000000e+00
Z shape: (762894, 20)
Z mean min/max: -0.4176/0.2570
Z std  min/max: 0.3988/1.0473
Fraction of near-zero std dims: 0.0000
Determinism check max|z1-z2|: 0.000000e+00
Z shape: (229029, 20)
Z mean min/max: -0.4269/0.2722
Z std  min/max: 0.3954/1.0442
Fraction of near-zero std dims: 0.0000
Determinism check max|z1-z2|: 0.000000e+00
Z shape: (129143, 20)
Z mean min/max: -0.3881/0.1698
Z std  min/max: 0.3927/0.9886
Fraction of near-zero std dims: 0.0000


In [37]:
# 9b) Align PCA basis (train -> val/test)
RUN_ALIGN_PCA = True  # enabled for this run; intended default is False
if RUN_ALIGN_PCA:
    import sys
    import runpy

    # The script is executed as __main__ and takes its options from sys.argv.
    # Keep the kernel's own argv and put it back afterwards (even on failure)
    # so the override does not leak into later cells.
    saved_argv = sys.argv
    try:
        sys.argv = [
            "scripts/align_pca_basis.py",
            "--train_npz", "artifacts/latent_viz/train_pca.npz",
            "--val_npz", "artifacts/latent_viz/val_pca.npz",
            "--test_npz", "artifacts/latent_viz/test_pca.npz",
            "--pca_dim", "6",
        ]
        runpy.run_path("scripts/align_pca_basis.py", run_name="__main__")
    finally:
        sys.argv = saved_argv


Updated PCA outputs: artifacts\latent_viz\train_pca.npz
Updated PCA outputs: artifacts\latent_viz\val_pca.npz
Updated PCA outputs: artifacts\latent_viz\test_pca.npz


In [38]:
# 10) Latent diagnostics (terminal-free)
RUN_DIAGNOSTICS = True  # enabled for this run; intended default is False
if RUN_DIAGNOSTICS:
    import sys
    import runpy

    # The script is executed as __main__ and takes its options from sys.argv.
    # Keep the kernel's own argv and put it back afterwards (even on failure)
    # so the override does not leak into later cells.
    saved_argv = sys.argv
    try:
        sys.argv = [
            "scripts/latent_diagnostics.py",
            "--pca_npz", "artifacts/latent_viz/train_pca.npz",
            "--use_space", "pc_scores",
            "--max_pc", "6",
            "--k_min", "4",
            "--k_max", "40",
            "--k_step", "4",
            "--stability_k", "8", "12", "16", "20",
        ]
        runpy.run_path("scripts/latent_diagnostics.py", run_name="__main__")
    finally:
        sys.argv = saved_argv


Hopkins statistic: 0.9686 (near 1 -> highly clusterable)

Top correlations per PC:
PC1: brake_mean=-0.819, accel_pedal_mean=0.794, speed_mean=0.745
PC2: steer_angle_mean=-0.862, yaw_mean=-0.826, rturn_frac=-0.328
PC3: speed_mean=0.497, abs_yaw=-0.293, brake_mean=0.249
PC4: steer_speed_mean=0.693, yaw_mean=-0.354, speed_mean=-0.124
PC5: steer_angle_mean=0.362, steer_speed_mean=-0.292, yaw_mean=-0.225
PC6: steer_angle_mean=-0.302, yaw_mean=0.216, rturn_frac=0.164

Silhouette (best K):
K=12, silhouette=0.2985

Stability summary (ARI):
K=8: mean=0.615, min=0.474, max=0.885
K=12: mean=0.627, min=0.501, max=0.709
K=16: mean=0.605, min=0.475, max=0.705
K=20: mean=0.555, min=0.451, max=0.723


In [24]:
# # 11) Stage 2: KMeans clustering in PCA space + smoothing (disabled draft; superseded by the active step-11 cell that follows)
# RUN_STAGE2 = True #False by default
# if RUN_STAGE2:
#     import stage2.kmeans_cluster as km
#     km.main([
#         "--pca_npz_train", "artifacts/latent_viz/train_pca.npz",
#         "--pca_npz_val", "artifacts/latent_viz/val_pca.npz",
#         "--pca_npz_test", "artifacts/latent_viz/test_pca.npz",
#         "--pca_dim", "6",
#         "--k", "16",
#         "--smooth_window", "5",
#         "--seed", "123",
#         "--out_dir", "artifacts/stage2",
#         "--splits", "train,val,test",
#         "--viz_all",
#         "--viz_label", "smooth",
#     ])

In [39]:
# 11) Stage 2: KMeans clustering in PCA space + smoothing
RUN_STAGE2 = True  # False by default
if RUN_STAGE2:
    import importlib
    import stage2.kmeans_cluster as km
    # Reload so edits to the module are picked up without restarting the kernel.
    importlib.reload(km)

    # Per-split PCA archives written by the latent PCA / alignment cells.
    input_args = [
        "--pca_npz_train", "artifacts/latent_viz/train_pca.npz",
        "--pca_npz_val", "artifacts/latent_viz/val_pca.npz",
        "--pca_npz_test", "artifacts/latent_viz/test_pca.npz",
    ]
    # Clustering options, passed through to km.main() unchanged.
    cluster_args = [
        "--pca_dim", "6",
        "--k", "16",
        "--smooth_window", "5",
        "--seed", "123",
    ]
    # Output location, splits to process and visualization options.
    output_args = [
        "--out_dir", "artifacts/stage2",
        "--splits", "train,val,test",
        "--viz_all",
        "--viz_label", "smooth",
    ]
    # main() takes the argument list directly, so sys.argv is left alone here.
    km.main(input_args + cluster_args + output_args)
