# Supervised task

## Data load

In [1]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Prepare your data

- Use `Pyspark` in local or cluster mode for big dataset and `Pandas` for small.
- Split data into required parts (train, valid, test, ...).
- Use `ptls.preprocessing` for simple data preparation. 
- Transform features to compatible format using `Pyspark` or `Pandas` functions. 
You can also use `ptls.data_load.preprocessing` for common data transformation patterns.
- Split sequences to `ptls-data` format with `ptls.data_load.split_tools`. Save prepared data into `Parquet` format or 
keep it in memory (`Pickle` also works).
- Use one of the available `ptls.data_load.datasets` to define input for the models.

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset

In [3]:
# load and split target

In [4]:
df_target = pd.read_csv('data/train_target.csv')
df_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [5]:
df_target_train, df_target_test = train_test_split(
    df_target, test_size=7000, stratify=df_target['bins'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=3000, stratify=df_target_train['bins'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 30000 records to train: 20000, valid: 3000, test: 7000


In [6]:
# load and split transactions

In [7]:
df_trx = pd.read_csv('data/transactions_train.csv')
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [8]:
df_trx_train = pd.merge(df_trx, df_target_train['client_id'], on='client_id', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['client_id'], on='client_id', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['client_id'], on='client_id', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))

Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [9]:
# transform flat table to dictionaries with client features

In [10]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=False,
)

In [11]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

CPU times: user 37.6 s, sys: 10.2 s, total: 47.8 s
Wall time: 1min 2s


In [12]:
print('Record in dataset, train {}, valid {}, test {}\nEach record is a client with list of transactions'.format(
    *[len(df) for df in [df_data_train, df_data_valid, df_data_test]]))

Record in dataset, train 20000, valid 3000, test 7000
Each record is a client with list of transactions


In [13]:
df_data_train.head(3)

Unnamed: 0,client_id,trans_date,small_group,amount_rur,event_time
0,6,"[tensor(0), tensor(5), tensor(10), tensor(11),...","[tensor(4), tensor(3), tensor(1), tensor(3), t...","[tensor(4.0540, dtype=torch.float64), tensor(1...","[tensor(0), tensor(5), tensor(10), tensor(11),..."
1,7,"[tensor(1), tensor(2), tensor(12), tensor(13),...","[tensor(3), tensor(53), tensor(1), tensor(5), ...","[tensor(18.3190, dtype=torch.float64), tensor(...","[tensor(1), tensor(2), tensor(12), tensor(13),..."
2,12,"[tensor(3), tensor(6), tensor(6), tensor(6), t...","[tensor(1), tensor(19), tensor(13), tensor(6),...","[tensor(3.0220, dtype=torch.float64), tensor(2...","[tensor(3), tensor(6), tensor(6), tensor(6), t..."


In [14]:
# join target

In [15]:
df_target = df_target.rename(columns={'bins': 'target_bin'})

In [16]:
df_data_train = pd.merge(df_data_train, df_target, on='client_id')
df_data_valid = pd.merge(df_data_valid, df_target, on='client_id')
df_data_test = pd.merge(df_data_test, df_target, on='client_id')

In [17]:
df_data_train = df_data_train.to_dict(orient='records')
df_data_valid = df_data_valid.to_dict(orient='records')
df_data_test = df_data_test.to_dict(orient='records')

In [18]:
# show first 10 transactions from one record
rec = df_data_train[0]
{k: v[:10] if type(v) is torch.Tensor else v for k, v in rec.items()}

{'client_id': 6,
 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
         12.9380, 28.1620], dtype=torch.float64),
 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'target_bin': 1}

In [19]:
# Make torch datasets

In [20]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
dataset_test = MemoryMapDataset(df_data_test)

## Build encoder

- All parts are available in `ptls.nn`.
- You can also use pretrained layers.

In [21]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head

In [22]:
# check preprocessor dictionary sisez for categoryes
preprocessor.get_category_dictionary_sizes()

{'small_group': 203}

In [23]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'small_group': {'in': 150, 'out': 32},
        },
        numeric_values={
            'amount_rur': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=48,
)

## Choose framework for encoder train

- There are both supervised of unsupervised frameworks in `ptls.frames`.
- Keep in mind that each framework requires his own batch format.
Tools for batch collate can be found in the selected framework package.

In [24]:
from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

In [25]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=4),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(),
    optimizer_partial=partial(torch.optim.Adam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5),
)

In [26]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target_bin', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target_bin', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target_bin', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

## Train your encoder with selected framework and `pytorch_lightning`

- Provide data with one of the DataLoaders that is compatible with selected framework. 
- Monitor the progress on tensorboard.
- Optionally tune hyperparameters.

In [27]:
import pytorch_lightning as pl

In [28]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
%tensorboard --logdir lightning_logs

In [30]:
trainer = pl.Trainer(
    max_epochs=10,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [31]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

logger.version = 2


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 16.8 K
1 | head          | Head          | 196   
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
17.0 K    Trainable params
0         Non-trainable params
17.0 K    Total params
0.068     Total estimated model params size (MB)


CPU times: user 3min 13s, sys: 20.1 s, total: 3min 33s
Wall time: 5min 31s


In [32]:
# train and validation metrics
print(trainer.logged_metrics)

{'loss': tensor(0.8221), 'seq_len': tensor(854.8750), 'val_Accuracy': tensor(0.5311), 'train_Accuracy': tensor(0.5223)}


In [33]:
# test metrics
trainer.test(ckpt_path='best', dataloaders=sup_data.test_dataloader())

Restoring states from the checkpoint path at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_2/checkpoints/epoch=9-step=1570.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from checkpoint at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_2/checkpoints/epoch=9-step=1570.ckpt


[{'test_Accuracy': 0.5738499760627747}]

# Make predict

Let's make predict to check metrics

In [34]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [35]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [36]:
inf_module = InferenceModule(
    torch.nn.Sequential(
        sup_module,
        torch.nn.Softmax(dim=1),
    ),
    model_out_name='prob',
)

In [37]:
df_predict = trainer.predict(inf_module, inference_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


In [38]:
df_predict = pd.concat(df_predict, axis=0)

In [39]:
df_predict.head()

Unnamed: 0,client_id,target_bin,prob_0000,prob_0001,prob_0002,prob_0003
0,6,1,0.029013,0.801605,0.006939,0.162442
1,7,0,0.426665,0.022428,0.444309,0.106599
2,12,2,0.082625,0.004733,0.903185,0.009457
3,13,2,0.22367,0.007996,0.741577,0.026757
4,14,0,0.155119,0.408157,0.018226,0.418498


In [40]:
y_pred = df_predict[[f'prob_{i:04d}' for i in range(4)]].values.argmax(axis=1)
y_pred

array([1, 2, 2, ..., 2, 2, 1])

In [41]:
y_true = df_predict['target_bin'].values
y_true

array([1, 0, 2, ..., 2, 2, 1])

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [43]:
accuracy_score(y_true, y_pred)

0.57385

In [44]:
confusion_matrix(y_true, y_pred)

array([[2272,  305,  784, 1595],
       [ 293, 3220,   48, 1437],
       [1362,   51, 3354,  273],
       [1109, 1158,  108, 2631]])