This demo shows how to pretrain a `TrxEncoder` with the TabFormer pretraining task.

## Setup

In [4]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data preprocessing

In [5]:
import os
import pandas as pd

# Transactions table of the age-group-prediction dataset. The file name ends in
# `.csv.gz`, so the gzip compression is passed explicitly.
transactions_url = (
    'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true'
)
source_data = pd.read_csv(transactions_url, compression='gzip')

# Show the first two rows as the cell result.
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [6]:
from ptls.preprocessing import PandasDataPreprocessor

# Column roles handed to the preprocessor: one id column, one event-time column
# (passed through with the 'none' transformation), one categorical feature and
# one numerical feature. `return_records=True` is set because the following
# cells treat the result as a list of per-client dicts.
preprocessor_config = {
    'col_id': 'client_id',
    'col_event_time': 'trans_date',
    'event_time_transformation': 'none',
    'cols_category': ['small_group'],
    'cols_numerical': ['amount_rur'],
    'return_records': True,
}
preprocessor = PandasDataPreprocessor(**preprocessor_config)

In [7]:
%%time

# Fit the preprocessor on the raw transactions and transform them in one call.
# The cell is timed with %%time because this step takes noticeably long.
dataset = preprocessor.fit_transform(source_data)

CPU times: user 1min 25s, sys: 9.87 s, total: 1min 35s
Wall time: 1min 47s


In [8]:
import pickle

# Persist the fitted preprocessor to disk as a pickle file.
with open('preprocessor.p', mode='wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [9]:
# Order the records by their `client_id`; `sorted` builds a new list, which
# replaces the previous `dataset`.
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [10]:
# Load targets
# NOTE(review): `compression='gzip'` is kept from the original cell although the
# URL names a plain `.csv` file — confirm the hosted file is really gzipped.
df_target = (
    pd.read_csv(
        'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true',
        compression='gzip',
    )
    # Index by client so a target can be looked up directly by `client_id`.
    .set_index('client_id')
    # The label column is named "bins" in the file; expose it as "target".
    .rename(columns={"bins": "target"})
)

# Show the first five targets as the cell result.
df_target.head(5)

Unnamed: 0_level_0,target
client_id,Unnamed: 1_level_1
24662,2
1046,0
34089,2
34848,1
47076,3


In [11]:
# Attach the target label to every record of the dataset.
print(dataset[0].keys())  # keys before the label is added

# Look the label up by the record's `client_id` in the target table.
target_by_client = df_target['target']
for record in dataset:
    record['target'] = target_by_client[record['client_id']]

print(dataset[0].keys())  # keys after: 'target' is now part of each record

dict_keys(['client_id', 'trans_date', 'event_time', 'small_group', 'amount_rur'])
dict_keys(['client_id', 'trans_date', 'event_time', 'small_group', 'amount_rur', 'target'])


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing, then take a fixed 4000 records out
# of the remainder for validation. Both splits use the same fixed seed.
train_and_valid, test = train_test_split(dataset, test_size=0.2, random_state=42)
train, valid = train_test_split(train_and_valid, test_size=4000, random_state=42)

# Cell result: sizes of the three splits.
tuple(len(split) for split in (train, valid, test))

(20000, 4000, 6000)

In [13]:
import pickle

# Write each split to its own pickle file in the working directory.
for split_path, split_records in (
    ('train_list', train),
    ('valid_list', valid),
    ('test_list', test),
):
    with open(split_path, 'wb') as split_file:
        pickle.dump(split_records, split_file)

## Embedding training

Model training in our framework is organised via the pytorch-lightning (pl) framework.
The key parts of neural network training in pl are:

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For further details check https://pytorchlightning.ai/

In [14]:
import pickle

# Read the three splits back from their pickle files, in train/valid/test order.
restored_splits = []
for split_path in ('train_list', 'valid_list', 'test_list'):
    with open(split_path, 'rb') as split_file:
        restored_splits.append(pickle.load(split_file))

train, valid, test = restored_splits

### Data loader

In [15]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import FeatureFilter
from ptls.frames.bert import MlmDataset
from ptls.frames.tabformer.tabformer_dataset import TabformerDataset
from ptls.frames import PtlsDataModule

# Training split: served through TabformerDataset, with no record filters.
pretrain_train_ds = TabformerDataset(
    MemoryMapDataset(data=train),
    min_len=100,
    max_len=110,
)

# Validation split: served through MlmDataset after a default FeatureFilter.
# NOTE(review): validation uses MlmDataset with different length bounds
# (200..256) while training uses TabformerDataset (100..110) — confirm this
# asymmetry is intended.
pretrain_valid_ds = MlmDataset(
    MemoryMapDataset(data=valid, i_filters=[FeatureFilter()]),
    min_len=200,
    max_len=256,
)

# Data module used for pretraining.
mlm_dm = PtlsDataModule(
    train_data=pretrain_train_ds,
    valid_data=pretrain_valid_ds,
    train_num_workers=16,
    train_batch_size=128,
)

### Model definition

In [16]:
import torch
from ptls.nn import TrxEncoder, LongformerEncoder, TabFormerFeatureEncoder, TransformerEncoder
from ptls.frames.bert import MLMPretrainModule
from ptls.frames.tabformer.tabformer_module import TabformerPretrainModule
from ptls.nn import PBLinear, PBL2Norm, PBLayerNorm

In [17]:
# TrxEncoder configuration. It is kept in a dict because the same settings are
# used again when the encoder is rebuilt for fine-tuning.
trx_encoder_params = {
    'embeddings_noise': 0,
    # 'numeric_values': {'amount_rur': 'identity'},
    'embeddings': {
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
}

trx_encoder = TrxEncoder(**trx_encoder_params)

# NOTE(review): the arguments (2, 16) presumably mean two features of size 16,
# matching the two embeddings configured above — confirm against the ptls API.
feature_encoder = TabFormerFeatureEncoder(2, 16)

# NOTE(review): input_size=32 presumably corresponds to the two 16-dimensional
# embeddings taken together — confirm.
seq_encoder = TransformerEncoder(input_size=32, n_heads=2, n_layers=1)

In [18]:
# Pretraining module assembled from the transaction, feature and sequence
# encoders.
# NOTE(review): total_steps=30000 is larger than the trainer's max_steps=2400 —
# confirm the schedule is meant to be cut short in this demo.
mlm_module = TabformerPretrainModule(
    trx_encoder=trx_encoder,
    feature_encoder=feature_encoder,
    seq_encoder=seq_encoder,
    mask_prob=0.15,
    total_steps=30000,
)



### Trainer

In [19]:
import numpy as np
import torch
import pytorch_lightning as pl

import logging

# Callbacks: record the learning rate at every step, and write a checkpoint
# every 2000 training steps (save_top_k=-1 keeps all of them).
pretrain_callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval='step'),
    pl.callbacks.ModelCheckpoint(every_n_train_steps=2000, save_top_k=-1),
]

trainer = pl.Trainer(
    max_steps=2400,  # 120000
    # One GPU when CUDA is available, otherwise zero.
    gpus=int(torch.cuda.is_available()),
    enable_progress_bar=False,
    callbacks=pretrain_callbacks,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [20]:
# Display the hyperparameters stored on the pretraining module.
mlm_module.hparams

"inference_pooling_strategy": out
"mask_prob":                  0.15
"max_lr":                     0.001
"norm_predict":               False
"pct_start":                  0.1
"total_steps":                30000
"weight_decay":               0.0

### Training 

In [21]:
%%time
# Report the logger version of this run, run the pretraining, then print the
# metrics the trainer has logged.
print(f'logger.version = {trainer.logger.version}')
trainer.fit(mlm_module, mlm_dm)
print(trainer.logged_metrics)

logger.version = 10


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                 | Type                    | Params
-----------------------------------------------------------------
0 | trx_encoder          | TrxEncoder              | 16.8 K
1 | feature_encoder      | TabFormerFeatureEncoder | 4.3 K 
2 | head                 | ModuleList              | 34.7 K
3 | _seq_encoder         | TransformerEncoder      | 21.1 K
4 | loss                 | CrossEntropyLoss        | 0     
5 | train_tabformer_loss | MeanMetric              | 0     
6 | valid_tabformer_loss | MeanMetric              | 0     
7 | lin_proj             | Sequential              | 608   
-----------------------------------------------------------------
77.6 K    Trainable params
0         Non-trainable params
77.6 K    Total params
0.310     Total estimated model params size (MB)


{'tabformer/loss': tensor(9.2652), 'tabformer/valid_tabformer_loss': tensor(9.2595), 'tabformer/train_tabformer_loss': tensor(9.2832)}
CPU times: user 2min 17s, sys: 1min 37s, total: 3min 55s
Wall time: 10min 56s


### Save pretrained `TrxEncoder` weights for other experiments

In [22]:
# Only the weights of the module's `trx_encoder` are written to 'mlm-emb.pt';
# the fine-tuning part of the notebook loads this file into a fresh TrxEncoder.
torch.save(mlm_module.trx_encoder.state_dict(), "mlm-emb.pt")

## Finetuning 

### Data module

In [23]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.frames.supervised import SeqToTargetDataset
from ptls.frames import PtlsDataModule

def get_dataset(data):
    """Wrap a collection of records into a `SeqToTargetDataset`.

    Args:
        data: records to serve. Each record is assumed to carry a 'target'
            entry, since that column is used as the label.

    Returns:
        A `SeqToTargetDataset` built over a `MemoryMapDataset` of `data`,
        with 'target' as the target column.
    """
    # 'target' is listed explicitly as a feature name to keep in the filter.
    target_keeping_filter = FeatureFilter(keep_feature_names='target')
    records = MemoryMapDataset(data=data, i_filters=[target_keeping_filter])
    return SeqToTargetDataset(records, target_col_name='target')

# Build one supervised dataset per split, in train/valid/test order.
finetune_train_ds, finetune_valid_ds, finetune_test_ds = (
    get_dataset(split) for split in (train, valid, test)
)

# Data module used for fine-tuning and for the final test run.
finetune_dm = PtlsDataModule(
    train_data=finetune_train_ds,
    valid_data=finetune_valid_ds,
    test_data=finetune_test_ds,
    train_num_workers=4,
    train_batch_size=256,
    test_batch_size=128,
)

### Load pretrained TrxEncoder to new model

In [24]:
import torch
import torchmetrics
from functools import partial
from ptls.nn import RnnSeqEncoder, Head
from ptls.frames.supervised import SequenceToTarget

In [25]:
# Rebuild the encoder from the shared configuration, then restore the weights
# stored in 'mlm-emb.pt'.
trx_encoder = TrxEncoder(**trx_encoder_params)
pretrained_weights = torch.load('mlm-emb.pt')

# Cell result: the key-matching report returned by `load_state_dict`.
trx_encoder.load_state_dict(pretrained_weights)

<All keys matched successfully>

In [26]:
# Pretrained transaction encoder followed by a linear projection to 64
# features and two normalisation layers. The nesting of the two Sequential
# containers is kept exactly as it was.
pretrained_trx_stack = torch.nn.Sequential(
    torch.nn.Sequential(
        trx_encoder,
        PBLinear(trx_encoder.output_size, 64),
        PBL2Norm(),
    ),
    PBLayerNorm(64),
)

# GRU sequence encoder over the 64-dimensional projected transactions.
finetune_seq_encoder = RnnSeqEncoder(
    trx_encoder=pretrained_trx_stack,
    input_size=64,
    hidden_size=64,
    type='gru',
)

# Four-class classification head on top of the 64-dimensional encoder output.
finetune_head = Head(
    input_size=64,
    use_batch_norm=True,
    objective='classification',
    num_classes=4,
)

downstream_model = SequenceToTarget(
    seq_encoder=finetune_seq_encoder,
    head=finetune_head,
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    # A separate, smaller learning rate is passed for the pretrained part.
    pretrained_lr=0.005,
    optimizer_partial=partial(torch.optim.Adam, lr=0.015),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.4),
)

In [27]:
# Fine-tuning trainer; the learning rate is recorded at every step.
finetune_callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval='step'),
]
trainer_ft = pl.Trainer(
    max_epochs=3,  # 32
    # One GPU when CUDA is available, otherwise zero.
    gpus=int(torch.cuda.is_available()),
    enable_progress_bar=False,
    callbacks=finetune_callbacks,
)

# Report the logger version, fine-tune, and print the logged metrics.
print(f'logger.version = {trainer_ft.logger.version}')
trainer_ft.fit(downstream_model, finetune_dm)
print(trainer_ft.logged_metrics)

# Cell result: metrics on the test dataloader (verbose table suppressed).
trainer_ft.test(dataloaders=finetune_dm.test_dataloader(), verbose=False)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 44.1 K
1 | head          | Head          | 388   
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
44.5 K    Trainable params
0         Non-trainable params
44.5 K    Total params
0.178     Total estimated model params size (MB)


logger.version = 11


  rank_zero_warn(
Restoring states from the checkpoint path at /home/jovyan/kireev/demo_fix/pytorch-lifestream/demo/lightning_logs/version_11/checkpoints/epoch=2-step=237.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/jovyan/kireev/demo_fix/pytorch-lifestream/demo/lightning_logs/version_11/checkpoints/epoch=2-step=237.ckpt


{'loss': tensor(0.8110), 'seq_len': tensor(860.3438), 'y': tensor(1.4062), 'val_loss': tensor(0.9861), 'valid/Accuracy': tensor(0.5552), 'train/Accuracy': tensor(0.5706)}


[{'test/Accuracy': 0.5823333263397217}]

### New model without pretrain

In [28]:
# Baseline: a freshly initialised TrxEncoder (no pretrained weights) feeding a
# GRU sequence encoder.
# NOTE(review): this baseline uses use_batch_norm=False and gamma=0.5, whereas
# the pretrained run uses use_batch_norm=True and gamma=0.4 — confirm the
# comparison is meant to differ in these settings.
baseline_seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=64,
    type='gru',
)

# Four-class classification head on top of the 64-dimensional encoder output.
baseline_head = Head(
    input_size=64,
    use_batch_norm=False,
    objective='classification',
    num_classes=4,
)

downstream_model = SequenceToTarget(
    seq_encoder=baseline_seq_encoder,
    head=baseline_head,
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    optimizer_partial=partial(torch.optim.Adam, lr=0.015),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5),
)

In [29]:
# Trainer for the baseline model; no extra callbacks are registered here.
trainer_ft = pl.Trainer(
    max_epochs=3,  # 32
    # One GPU when CUDA is available, otherwise zero.
    gpus=int(torch.cuda.is_available()),
    enable_progress_bar=False,
)

# Report the logger version, train the baseline, and print the logged metrics.
print(f'logger.version = {trainer_ft.logger.version}')
trainer_ft.fit(downstream_model, finetune_dm)
print(trainer_ft.logged_metrics)

# Cell result: metrics on the test dataloader.
trainer_ft.test(dataloaders=finetune_dm.test_dataloader())

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 35.7 K
1 | head          | Head          | 260   
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
35.9 K    Trainable params
0         Non-trainable params
35.9 K    Total params
0.144     Total estimated model params size (MB)


logger.version = 12


Restoring states from the checkpoint path at /home/jovyan/kireev/demo_fix/pytorch-lifestream/demo/lightning_logs/version_12/checkpoints/epoch=2-step=237.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/jovyan/kireev/demo_fix/pytorch-lifestream/demo/lightning_logs/version_12/checkpoints/epoch=2-step=237.ckpt


{'loss': tensor(0.9158), 'seq_len': tensor(876.3438), 'y': tensor(1.2812), 'val_loss': tensor(0.9741), 'valid/Accuracy': tensor(0.5587), 'train/Accuracy': tensor(0.5670)}
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test/Accuracy         0.5820000171661377
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test/Accuracy': 0.5820000171661377}]