<a target="_blank" href="https://colab.research.google.com/github/echosprint/TabularTransformer/blob/main/notebooks/higgs_classification.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

---


**For more details about the [TabularTransformer](https://github.com/echosprint/TabularTransformer) model**,
check the online **[documentation](https://echosprint.github.io/TabularTransformer/)**.

---

- This notebook provides a usage example of the
  [TabularTransformer](https://github.com/echosprint/TabularTransformer)
  package.
- Hyperparameters are not tuned and may be suboptimal.

In [None]:
%pip install tabular-transformer

In [2]:
import torch
import tabular_transformer as ttf

In [None]:
# Run the package's HIGGS dataset helper and keep whatever it returns.
# NOTE(review): presumably this downloads the dataset and returns its local
# path -- confirm against the package documentation.
higgs_path = ttf.prepare_higgs_dataset()

In [4]:
# Column names for the HIGGS table, in file order: the label first, then the
# lepton / missing-energy features, four features for each of the four jets,
# and finally the seven mass features.
higgs_cols = [
    "label",
    "lepton  pT", "lepton  eta", "lepton  phi",
    "missing energy magnitude", "missing energy phi",
    # "jet 1 pt", "jet 1 eta", "jet 1 phi", "jet 1 b-tag", ... up to jet 4
    *(f"jet {jet} {quantity}"
      for jet in range(1, 5)
      for quantity in ("pt", "eta", "phi", "b-tag")),
    "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb",
]

# Only the label is categorical; every other column is numerical.
ensure_categorical_cols = ['label']
ensure_numerical_cols = [
    name for name in higgs_cols if name not in ensure_categorical_cols
]

In [5]:
# Reader for the raw HIGGS file.
#
# `file_path` uses `higgs_path` (the value returned by
# `ttf.prepare_higgs_dataset()` in the earlier cell) instead of a second,
# hard-coded copy of the location, so the reader cannot drift away from the
# file the helper actually prepared.
# NOTE(review): assumes the helper returns the path of the prepared file in a
# form `DataReader` accepts -- confirm against the package documentation.
#
# The file is read with `header=False`, so the column names are supplied
# explicitly through `column_names`; 'label' is the target column.
higgs_data_reader = ttf.DataReader(
    file_path=higgs_path,
    ensure_categorical_cols=ensure_categorical_cols,
    ensure_numerical_cols=ensure_numerical_cols,
    label='label',
    header=False,
    column_names=higgs_cols)

In [None]:
# Split the dataset into a 'train' part of size 10_500_000 and a 'test' part
# of size 500_000 (presumably row counts -- confirm), saved as parquet.
# The returned object is indexed with 'train' / 'test' by later cells to get
# the `file_path` of each part.
split = higgs_data_reader.split_data(
    split={'train': 10_500_000, 'test': 500_000},
    seed=None,        # a seed of None means the data is not shuffled
    override=False,
    save_as='parquet',
)

In [None]:
# Calling an existing reader with keyword arguments is used here to obtain a
# reader with those settings replaced.
# NOTE(review): the raw CSV was read with header=False, but the split files are
# read with header=True -- presumably because the saved parquet files carry
# column names; confirm.
higgs_data_reader = higgs_data_reader(header=True)

# Reader pointed at the training part produced by `split_data`.
train_data_reader = higgs_data_reader(file_path=split['train'])

# Load the training part and show its first three rows as a sanity check.
train_df = train_data_reader.read().to_pandas()
print(train_df.head(3))

# Reader pointed at the test part; it is only consumed later, at prediction time.
test_data_reader = higgs_data_reader(file_path=split['test'])

In [8]:
# Pick the compute device and the precision string handed to the trainer:
#   * CUDA available and bf16 supported -> 'cuda' / 'bfloat16'
#   * CUDA available, no bf16 support   -> 'cuda' / 'float16'
#   * no CUDA                           -> 'cpu'  / 'float16'
# NOTE(review): CPU runs still select 'float16' -- confirm the trainer handles
# (or ignores) half precision on CPU.
if torch.cuda.is_available():
    device = 'cuda'
    dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16'
else:
    device = 'cpu'
    dtype = 'float16'

In [9]:
# Run-level settings: the device / dtype chosen above, with `wandb_log` off.
ts = ttf.TrainSettings(
    device=device,
    dtype=dtype,
    unk_ratio_default=0,
    wandb_log=False,
)

# Training-loop parameters.
# Tip: if you run out of memory, reduce `batch_size` and adjust `max_iters`
# accordingly.
tp = ttf.TrainParameters(
    # schedule
    max_iters=50000,
    warmup_iters=1000,
    learning_rate=5e-4,
    batch_size=1024,
    # objective: one output with the 'BINCE' loss
    output_dim=1,
    loss_type='BINCE',
    # evaluation
    validate_split=0.2,
    eval_interval=1000,
    eval_iters=100,
    # checkpoint file name; the prediction cell loads 'out/higgs_r1_ckpt.pt'
    output_checkpoint='higgs_r1_ckpt.pt',
)

# Model size. These values are not tuned and may be suboptimal.
hp = ttf.HyperParameters(
    dim=768,
    n_layers=12,
    n_heads=16,
    output_forward_dim=32,
    output_hidden_dim=256,
)

In [None]:
# Build the trainer from the model hyperparameters (`hp`) and run settings (`ts`).
trainer = ttf.Trainer(hp=hp, ts=ts)

# Train on the training split using the training-loop parameters in `tp`.
trainer.train(data_reader=train_data_reader, tp=tp)

In [None]:
# Build a predictor from the checkpoint file 'out/higgs_r1_ckpt.pt'.
# NOTE(review): the training cell sets output_checkpoint='higgs_r1_ckpt.pt';
# the 'out/' prefix is presumably the trainer's default output directory --
# confirm.
predictor = ttf.Predictor(
    checkpoint='out/higgs_r1_ckpt.pt',
)

# Run the predictor over the test split. The returned object is kept in
# `prediction`; `save_as` presumably names the file the results are written
# to -- confirm.
prediction = predictor.predict(data_reader=test_data_reader,
                               save_as="prediction_higgs.csv")

In [None]:
# Show the first three rows of the prediction result.
prediction.head(3)