<a target="_blank" href="https://colab.research.google.com/github/echosprint/TabularTransformer/blob/main/notebooks/self-supervised_pre-training.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

---


**For more details about the [TabularTransformer](https://github.com/echosprint/TabularTransformer) model**,
check the online **[documentation](https://echosprint.github.io/TabularTransformer/)**.

---

- This notebook provides a usage example of the
  [TabularTransformer](https://github.com/echosprint/TabularTransformer)
  package.
- Hyperparameters are not tuned and may be suboptimal.

In [None]:
%pip install tabular-transformer

In [1]:
import tabular_transformer as ttf
import pandas as pd
import torch

In [None]:
# Obtain the income dataset through the package helper (presumably a
# download/cache step -- confirm in the tabular_transformer docs). The value
# returned here is what gets passed as `file_path` to `ttf.DataReader`.
income_dataset_path = ttf.prepare_income_dataset()

specify the categorical cols and numerical cols

In [3]:
# Columns that must be treated as categorical. 'income' is included because
# it is used as the label column of the income reader.
categorical_cols = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
    'income',
]

# Columns that must be treated as numerical.
numerical_cols = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

define the income dataset reader

In [4]:
# Reader over the full income dataset file, with 'income' as the label column.
income_reader = ttf.DataReader(
    # where to read from and how the file is laid out
    file_path=income_dataset_path,
    header=True,
    # column roles
    label='income',
    id=None,
    # column typing
    ensure_categorical_cols=categorical_cols,
    ensure_numerical_cols=numerical_cols,
)

split dataset for `pretrain`, `finetune`, `ssl_test`

In [None]:
# Partition the income data into three named parts. The sizes are forwarded
# exactly as written: 0.8, 64 and -1 (presumably a fraction, an absolute row
# count and "whatever remains" -- confirm against the `split_data` docs).
split = income_reader.split_data(
    dict(pretrain=0.8, finetune=64, ssl_test=-1)
)
# `split` is indexed by part name later on to get each part's file path.
print(split)

Drop the `income` column, then copy the `occupation` column to a new `pretext_target` column in the `pretrain` split.

In [6]:
def replace_target(file_path):
    """Rewrite a CSV split so that `pretext_target` replaces `income`.

    The file at `file_path` is read, the `income` column is removed, a
    `pretext_target` column holding a copy of `occupation` is added (or
    overwritten if it already exists), and the result is written back to
    the same path without the index.

    The operation is idempotent: running it again on a file that has
    already been rewritten (and therefore no longer has an `income` column)
    leaves the file unchanged instead of raising `KeyError`, so the
    notebook cell that calls it can safely be re-executed.

    Args:
        file_path: path of the CSV file to rewrite in place.

    Raises:
        KeyError: if the file has no `occupation` column.
    """
    pretrain_df = (
        pd.read_csv(file_path)
        # errors='ignore' keeps a second run from failing once `income`
        # has already been dropped by a previous run.
        .drop(columns=['income'], errors='ignore')
        .assign(pretext_target=lambda frame: frame['occupation'])
    )
    pretrain_df.to_csv(file_path, index=False)

In [7]:
# Rewrite the pretrain split file in place: `income` is dropped and
# `pretext_target` (a copy of `occupation`) is added.
replace_target(split['pretrain'])

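Build the categorical column list for pretraining: the same columns as before, with `pretext_target` in place of `income`.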

In [8]:
# Same categorical columns as the income reader, except that the 'income'
# entry is swapped for 'pretext_target'; order is preserved.
pretrain_categorical_cols = [
    col if col != 'income' else 'pretext_target'
    for col in categorical_cols
]

define the pretrain data reader

In [None]:
# Reader over the rewritten pretrain split; the pretext column is the label.
pretrain_reader = ttf.DataReader(
    file_path=split['pretrain'],
    header=True,
    label='pretext_target',
    ensure_categorical_cols=pretrain_categorical_cols,
    ensure_numerical_cols=numerical_cols,
)

# Load the split and convert it to a pandas frame for a quick visual check.
pdf = pretrain_reader.read().to_pandas()
pdf.head(3)

choose the `device` and `dtype`

In [10]:
if torch.cuda.is_available():
    # GPU present: prefer bfloat16 when the device supports it.
    device = 'cuda'
    dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16'
else:
    # No GPU: run on CPU; dtype falls back to 'float16' in this case.
    # NOTE(review): confirm the trainer handles 'float16' on CPU as intended.
    device = 'cpu'
    dtype = 'float16'

train settings and hyperparameters

In [11]:
# Runtime settings: selected device/dtype, with wandb logging switched off.
ts = ttf.TrainSettings(
    device=device,
    dtype=dtype,
    wandb_log=False,
)

# Model hyperparameters (not tuned).
hp = ttf.HyperParameters(n_layers=6, dim=64)

# One trainer object built from the settings and hyperparameters above.
trainer = ttf.Trainer(ts=ts, hp=hp)

pretraining

In [None]:
# Parameters for the self-supervised pretraining run.
pretrain_tp = ttf.TrainParameters(
    # objective
    loss_type='SUPCON',
    output_dim=16,
    # presumably the ratio at which 'occupation' values are replaced by an
    # unknown token during training -- confirm in the TrainParameters docs
    unk_ratio={'occupation': 0.5},
    # schedule
    max_iters=3000,
    warmup_iters=500,
    batch_size=128,
    # evaluation
    validate_split=0.2,
    eval_interval=100,
    eval_iters=20,
    # checkpoint written by this run
    output_checkpoint='income_pretrain_ckpt.pt',
)

# Start a fresh run (resume=False) on the pretrain split.
trainer.train(
    tp=pretrain_tp,
    data_reader=pretrain_reader,
    resume=False,
)

finetune

In [None]:
# Parameters for finetuning on the small labelled split, starting from the
# pretraining checkpoint.
finetune_tp = ttf.TrainParameters(
    # checkpoints: read the pretrained weights, write the finetuned ones
    input_checkpoint='income_pretrain_ckpt.pt',
    output_checkpoint='income_finetune_ckpt.pt',
    always_save_checkpoint=True,
    # objective
    loss_type='BINCE',
    output_dim=1,
    # optimisation: separate learning rates for transformer and output head
    transformer_lr=5e-6,
    output_head_lr=5e-5,
    lr_scheduler='constant',
    max_iters=250,
    warmup_iters=10,
    batch_size=64,
    # evaluation: no validation split is held out
    validate_split=0.0,
    eval_interval=249,
    eval_iters=1,
)

# Resume from the input checkpoint and swap in a new output head.
# `income_reader(file_path=...)` is called with the finetune split's path
# (presumably yielding a reader with the same column settings -- confirm).
trainer.train(
    tp=finetune_tp,
    data_reader=income_reader(file_path=split['finetune']),
    resume=True,
    replace_output_head=True,
)

prediction on test split with finetuned model

In [None]:
# Load the finetuned checkpoint (read from the 'out/' directory) and run it
# on the held-out `ssl_test` split; results are also saved to a CSV file.
predictor = ttf.Predictor(
    checkpoint='out/income_finetune_ckpt.pt',
)
prediction = predictor.predict(
    save_as="prediction_output.csv",
    data_reader=income_reader(file_path=split['ssl_test']),
)
prediction.head(3)

train from scratch on the finetune dataset (baseline without pretraining)

In [None]:
# Parameters for the baseline: same finetune split, no input checkpoint.
train_scratch_tp = ttf.TrainParameters(
    # checkpoint written by this run
    output_checkpoint='income_scratch_ckpt.pt',
    always_save_checkpoint=True,
    # objective
    loss_type='BINCE',
    output_dim=1,
    # optimisation
    learning_rate=5e-4,
    lr_scheduler='cosine',
    max_iters=50,
    warmup_iters=10,
    batch_size=64,
    # evaluation: no validation split is held out
    validate_split=0.0,
    eval_iters=1,
)

# Fresh run (resume=False) on the finetune split.
trainer.train(
    tp=train_scratch_tp,
    data_reader=income_reader(file_path=split['finetune']),
    resume=False,
)

check the result of scratch training

In [None]:
# Load the from-scratch checkpoint (read from the 'out/' directory) and run
# it on the same `ssl_test` split; results are also saved to a CSV file.
predictor = ttf.Predictor(
    checkpoint='out/income_scratch_ckpt.pt',
)
prediction = predictor.predict(
    save_as="prediction_scratch_output.csv",
    data_reader=income_reader(file_path=split['ssl_test']),
)
prediction.head(3)