# Supervised task with transformer sequence encoder

## Data load

In [1]:
import os

# Download and unpack the dataset only once: everything is skipped when the
# main transactions CSV is already present.
if not os.path.exists('data/transactions_train.csv'):
    # create the target directory (no error if it already exists)
    ! mkdir -p data
    # fetch the archive into the current directory, following redirects (-L)
    # and keeping the remote file name (-O)
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip
    # extract only the CSV files stored under data/ in the archive, dropping the
    # folder prefix (-j) and overwriting existing files without prompting (-o)
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    # keep the downloaded archive next to the extracted files
    ! mv age-prediction-nti-sbebank-2019.zip data/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  239M  100  239M    0     0  52.2M      0  0:00:04  0:00:04 --:--:-- 54.1M
Archive:  age-prediction-nti-sbebank-2019.zip
  inflating: data/test.csv           
  inflating: data/small_group_description.csv  
  inflating: data/train_target.csv   
  inflating: data/transactions_train.csv  
  inflating: data/transactions_test.csv  


## Prepare your data

- Use `Pyspark` in local or cluster mode for big datasets and `Pandas` for small ones.
- Split data into required parts (train, valid, test, ...).
- Use `ptls.preprocessing` for simple data preparation. 
- Transform features to compatible format using `Pyspark` or `Pandas` functions. 
You can also use `ptls.data_load.preprocessing` for common data transformation patterns.
- Split sequences to `ptls-data` format with `ptls.data_load.split_tools`. Save prepared data into `Parquet` format or 
keep it in memory (`Pickle` also works).
- Use one of the available `ptls.data_load.datasets` to define input for the models.

In [15]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset

Read target data

In [16]:
# Labels table: one row per client with its class label in `bins`.
target_csv_path = 'data/train_target.csv'
df_target = pd.read_csv(target_csv_path)
df_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


Split target data into train, validation and test sets

In [17]:
# Two-stage stratified hold-out: first set aside 7000 test clients, then take
# 3000 validation clients out of the remainder. Both splits are stratified on
# `bins` and use the same fixed seed.
df_target_train, df_target_test = train_test_split(
    df_target,
    test_size=7000,
    stratify=df_target['bins'],
    random_state=142,
)
df_target_train, df_target_valid = train_test_split(
    df_target_train,
    test_size=3000,
    stratify=df_target_train['bins'],
    random_state=142,
)
split_sizes = map(len, (df_target, df_target_train, df_target_valid, df_target_test))
print('Split {} records to train: {}, valid: {}, test: {}'.format(*split_sizes))

Split 30000 records to train: 20000, valid: 3000, test: 7000


Load data with transactions

In [18]:
# Raw transaction log: one row per transaction, keyed by `client_id`.
trx_csv_path = 'data/transactions_train.csv'
df_trx = pd.read_csv(trx_csv_path)
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [19]:
# Route every transaction to the split its client belongs to: the inner join
# keeps only transactions whose `client_id` appears in that split's targets.
df_trx_train = df_trx.merge(df_target_train['client_id'], on='client_id', how='inner')
df_trx_valid = df_trx.merge(df_target_valid['client_id'], on='client_id', how='inner')
df_trx_test = df_trx.merge(df_target_test['client_id'], on='client_id', how='inner')
trx_counts = map(len, (df_trx, df_trx_train, df_trx_valid, df_trx_test))
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(*trx_counts))

Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [81]:
# Peek at a few random training transactions; the seed is fixed (same value as
# the splits above) so the preview is identical after Restart & Run All.
df_trx_train.sample(5, random_state=142)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
4159434,42191,308,2,76.361
10477921,30412,628,11,20.237
2682069,1095,353,1,64.957
10469276,20216,160,91,14.234
4149024,48112,580,3,4.579


In [9]:
# transform flat table to dictionaries with client features

In [20]:
from ptls.preprocessing import PandasDataPreprocessor

# Column roles of the flat transaction table. With `return_records=False` the
# transformed output is used below as a DataFrame (`.head()`, `pd.merge`).
preprocessor_kwargs = dict(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=False,
)
preprocessor = PandasDataPreprocessor(**preprocessor_kwargs)

In [21]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

CPU times: user 30.2 s, sys: 6.14 s, total: 36.3 s
Wall time: 36.4 s


In [22]:
# After preprocessing there is one row per client in every split.
n_train, n_valid, n_test = len(df_data_train), len(df_data_valid), len(df_data_test)
print('Record in dataset, train {}, valid {}, test {}\nEach record is a client with list of transactions'.format(
    n_train, n_valid, n_test))

Record in dataset, train 20000, valid 3000, test 7000
Each record is a client with list of transactions


In [23]:
# First three preprocessed clients: each feature cell holds a whole sequence.
df_data_train.head(n=3)

Unnamed: 0,client_id,trans_date,event_time,small_group,amount_rur
0,6,"[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(4), tensor(3), tensor(1), tensor(3), t...","[tensor(4.0540, dtype=torch.float64), tensor(1..."
1,7,"[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(3), tensor(53), tensor(1), tensor(5), ...","[tensor(18.3190, dtype=torch.float64), tensor(..."
2,12,"[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(1), tensor(19), tensor(13), tensor(6),...","[tensor(3.0220, dtype=torch.float64), tensor(2..."


To train our model, we need to add the `target` prefix to the name of the target column, as required by the feature naming rules

In [24]:
# Give the label column the `target` prefix required by the feature naming rules.
df_target = df_target.rename(columns=dict(bins='target_bin'))

In [25]:
# Attach the label to every client record by joining on `client_id`.
df_data_train = df_data_train.merge(df_target, on='client_id')
df_data_valid = df_data_valid.merge(df_target, on='client_id')
df_data_test = df_data_test.merge(df_target, on='client_id')

In [26]:
# Convert each split from a DataFrame into a list of per-client dicts
# (one dict per row, keyed by column name).
df_data_train, df_data_valid, df_data_test = (
    split_frame.to_dict(orient='records')
    for split_frame in (df_data_train, df_data_valid, df_data_test)
)

In [18]:
# Preview one client record: tensor-valued features are cut to their first
# 10 events, every other field is shown unchanged.
rec = df_data_train[0]
rec_preview = {}
for field_name, field_value in rec.items():
    if type(field_value) is torch.Tensor:
        rec_preview[field_name] = field_value[:10]
    else:
        rec_preview[field_name] = field_value
rec_preview

{'client_id': 6,
 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
         12.9380, 28.1620], dtype=torch.float64),
 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'target_bin': 1}

`MemoryMapDataset` is a torch dataset that can also apply filters to preprocess our data

In [27]:
# Wrap each list of client records in a MemoryMapDataset (no extra filters).
dataset_train, dataset_valid, dataset_test = map(
    MemoryMapDataset,
    (df_data_train, df_data_valid, df_data_test),
)

## Build encoder

- All parts are available in `ptls.nn`.
- You can also use pretrained layers.

In this task we will use TransformerSeqEncoder based on transformer architecture

In [30]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, TransformerSeqEncoder, Head

Define TrxEncoder to learn embedding for single transaction

In [43]:
# Per-transaction encoder: an embedding table for the categorical feature and
# a 'log'-scaled numeric feature, with a small noise level on the embeddings.
category_embeddings = {
    'small_group': {'in': 150, 'out': 31},
}
numeric_features = {
    'amount_rur': 'log',
}
trx_encoder = TrxEncoder(
    embeddings=category_embeddings,
    numeric_values=numeric_features,
    embeddings_noise=0.001,
)

# size of the vector produced for a single transaction
trx_encoder.output_size

32

We can choose the parameters of our transformer encoder, for example the number of heads in the multi-head attention, the number of sub-encoder layers in the encoder and the dimension of the linear layer

In [36]:
# Hyperparameters forwarded as keyword arguments to the sequence encoder.
transformer_params = dict(
    n_heads=1,
    dim_hidden=128,
    n_layers=4,
)

Define sequence encoder

In [45]:
# Combine the transaction encoder with the transformer hyperparameters and
# build the sequence encoder from the merged keyword arguments.
seq_encoder_kwargs = dict(transformer_params, trx_encoder=trx_encoder)
seq_encoder = TransformerSeqEncoder(**seq_encoder_kwargs)

## Choose framework for encoder train

- There are both supervised and unsupervised frameworks in `ptls.frames`.
- Keep in mind that each framework requires its own batch format.
Tools for batch collate can be found in the selected framework package.

In [46]:
from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule


* 'schema_extra' has been renamed to 'json_schema_extra'


To define a model for supervised learning, we need a sequence encoder to get an embedding for each user and a head to transform the embeddings into predictions for our task. In this task we will use a linear layer followed by a (log-)softmax, trained with `NLLLoss`

In [47]:
# 4-class classification head on top of the sequence embedding.
classification_head = Head(
    input_size=seq_encoder.embedding_size,
    objective='classification',
    num_classes=4,
)
# Factories for the optimizer (Adam with default settings) and the
# learning-rate scheduler (halve the LR every 4 scheduler steps).
adam_factory = partial(torch.optim.Adam)
step_lr_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5)

sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=classification_head,
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(),
    optimizer_partial=adam_factory,
    lr_scheduler_partial=step_lr_factory,
)

Data module to define data for training, validating and testing, batch sizes and number of workers

In [48]:
# All three splits read the label from `target_bin` as an integer class id.
make_target_dataset = partial(
    SeqToTargetDataset,
    target_col_name='target_bin',
    target_dtype=torch.long,
)

sup_data = PtlsDataModule(
    train_data=make_target_dataset(dataset_train),
    valid_data=make_target_dataset(dataset_valid),
    test_data=make_target_dataset(dataset_test),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

## Train your encoder with selected framework and `pytorch_lightning`

- Provide data with one of the DataLoaders that is compatible with selected framework. 
- Monitor the progress on tensorboard.
- Optionally tune hyperparameters.

In [49]:
import pytorch_lightning as pl

In [50]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard`
# line magic used in the next cell.
%load_ext tensorboard

In [51]:
# Open TensorBoard inside the notebook, reading event files from `lightning_logs`.
%tensorboard --logdir lightning_logs

In [55]:
# Train for two epochs on a single device: one GPU when CUDA is available,
# otherwise the CPU. `accelerator`/`devices` replace the `gpus` argument,
# which was deprecated and later removed from `pl.Trainer`.
trainer = pl.Trainer(
    max_epochs=2,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    enable_progress_bar=True,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [56]:
%%time
# Print the logger's run version before training starts.
print(f'logger.version = {trainer.logger.version}')
# Train the supervised module on the data provided by the data module.
trainer.fit(sup_module, sup_data)


  | Name          | Type                  | Params
--------------------------------------------------------
0 | seq_encoder   | TransformerSeqEncoder | 55.6 K
1 | head          | Head                  | 132   
2 | loss          | NLLLoss               | 0     
3 | train_metrics | ModuleDict            | 0     
4 | valid_metrics | ModuleDict            | 0     
5 | test_metrics  | ModuleDict            | 0     
--------------------------------------------------------
55.7 K    Trainable params
0         Non-trainable params
55.7 K    Total params
0.223     Total estimated model params size (MB)


logger.version = 1


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

CPU times: user 6h 38min 45s, sys: 2h 38min 26s, total: 9h 17min 11s
Wall time: 1h 15min 14s


In [57]:
# train and validation metrics recorded by the trainer during `fit`
print(trainer.logged_metrics)

{'loss': tensor(1.0247), 'seq_len': tensor(851.9688), 'y': tensor(1.6250), 'val_loss': tensor(1.0196), 'valid/Accuracy': tensor(0.5547), 'train/Accuracy': tensor(0.5416)}


In [58]:
# Test metrics: evaluate the checkpoint selected by `ckpt_path='best'`
# on the held-out test loader.
test_loader = sup_data.test_dataloader()
trainer.test(ckpt_path='best', dataloaders=test_loader)

Restoring states from the checkpoint path at /home/jovyan/lightning_logs/version_1/checkpoints/epoch=1-step=314.ckpt
Loaded model weights from checkpoint at /home/jovyan/lightning_logs/version_1/checkpoints/epoch=1-step=314.ckpt


Testing: 0it [00:00, ?it/s]

[{'test/Accuracy': 0.5495714545249939}]

# Make predictions

Let's make predictions to check the metrics

In [59]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [60]:
# Ordered (non-shuffled) loader over the test clients, so prediction rows keep
# the dataset order; batches are collated into feature dicts.
inference_loader_kwargs = dict(
    batch_size=1000,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_feature_dict,
)
inference_dl = torch.utils.data.DataLoader(dataset_test, **inference_loader_kwargs)

In [61]:
# Chain the trained module with a softmax over the class dimension so the
# inference output rows sum to 1; columns are named with the `prob` prefix.
prob_model = torch.nn.Sequential(
    sup_module,
    torch.nn.Softmax(dim=1),
)
inf_module = InferenceModule(prob_model, model_out_name='prob')

In [62]:
# Run inference over the test loader; the result is one item per batch.
df_predict = trainer.predict(model=inf_module, dataloaders=inference_dl)

Predicting: 157it [00:00, ?it/s]

In [72]:
# Stack the per-batch prediction frames row-wise into a single table.
df_predict = pd.concat(objs=df_predict, axis=0)

In [74]:
# Peek at a few random predictions; the seed is fixed (same value as the data
# splits) so the preview is identical after Restart & Run All.
df_predict.sample(10, random_state=142)

Unnamed: 0,client_id,target_bin,prob_0000,prob_0001,prob_0002,prob_0003
530,3769,2,0.06501,0.004084,0.922117,0.00879
958,13871,2,0.262393,0.031538,0.622771,0.083299
671,11754,3,0.068451,0.667012,0.005306,0.259231
708,19111,3,0.099277,0.610973,0.006524,0.283227
368,31272,1,0.213297,0.450815,0.043134,0.292753
520,39318,2,0.045078,0.004748,0.942754,0.00742
272,44799,0,0.511363,0.033735,0.28101,0.173892
615,25731,0,0.501576,0.047266,0.249107,0.202051
811,12658,0,0.229762,0.408421,0.107694,0.254123
72,21866,2,0.371182,0.013171,0.534654,0.080994


In [75]:
# Predicted class = index of the largest of the four probability columns.
prob_columns = [f'prob_{i:04d}' for i in range(4)]
class_probs = df_predict[prob_columns].values
y_pred = np.argmax(class_probs, axis=1)
y_pred

array([1, 0, 1, ..., 1, 2, 2])

In [76]:
# Ground-truth labels carried through inference alongside the predictions.
y_true = df_predict.loc[:, 'target_bin'].values
y_true

array([3, 3, 1, ..., 1, 2, 2])

In [77]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [78]:
# Share of test clients whose predicted class matches the true class.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.5495714285714286

In [79]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[ 667,  155,  698,  215],
       [ 221, 1148,  101,  279],
       [ 155,   35, 1557,   17],
       [ 557,  501,  219,  475]])