# Supervised task

## Data load

In [1]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Prepare your data

- Use `Pyspark` in local or cluster mode for big dataset and `Pandas` for small.
- Split data into required parts (train, valid, test, ...).
- Use `ptls.data_preprocessing` for simple data preparation. 
- Transform features to compatible format using `Pyspark` or `Pandas` functions. 
You can also use `ptls.data_load.preprocessing` for common data transformation patterns.
- Split sequences to `ptls-data` format with `ptls.data_load.split_tools`. Save prepared data into `Parquet` format or 
keep it in memory (`Pickle` also works).
- Use one of the available `ptls.data_load.datasets` to define input for the models.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from ptls.data_preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset

In [3]:
# load and split target

In [4]:
df_target = pd.read_csv('data/train_target.csv')
df_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [5]:
df_target_train, df_target_test = train_test_split(
    df_target, test_size=7000, stratify=df_target['bins'], random_state=142)
df_target_train, df_target_valid = train_test_split(
    df_target_train, test_size=3000, stratify=df_target_train['bins'], random_state=142)
print('Split {} records to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_target, df_target_train, df_target_valid, df_target_test]]))

Split 30000 records to train: 20000, valid: 3000, test: 7000


In [6]:
# load and split transactions

In [7]:
df_trx = pd.read_csv('data/transactions_train.csv')
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [8]:
df_trx = df_trx.rename(columns={'trans_date': 'event_time'})

In [9]:
df_trx_train = pd.merge(df_trx, df_target_train['client_id'], on='client_id', how='inner')
df_trx_valid = pd.merge(df_trx, df_target_valid['client_id'], on='client_id', how='inner')
df_trx_test = pd.merge(df_trx, df_target_test['client_id'], on='client_id', how='inner')
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(
    *[len(df) for df in [df_trx, df_trx_train, df_trx_valid, df_trx_test]]))

Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [10]:
# transform flat table to dictionaries with client features

In [11]:
from ptls.data_preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    cols_event_time='trans_date',
    time_transformation='none',
    cols_category=['small_group'],
    cols_log_norm=[],
    cols_identity=['amount_rur'],
    print_dataset_info=False,
)

In [12]:
%%time
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid = preprocessor.transform(df_trx_valid)
df_data_test = preprocessor.transform(df_trx_test)

CPU times: user 27.6 s, sys: 5.63 s, total: 33.2 s
Wall time: 33.2 s


In [13]:
print('Record in dataset, train {}, valid {}, test {}\nEach record is a client with list of transactions'.format(
    *[len(df) for df in [df_data_train, df_data_valid, df_data_test]]))

Record in dataset, train 20000, valid 3000, test 7000
Each record is a client with list of transactions


In [14]:
# show first 10 transactions from one record
rec = df_data_train[0]
{k: v[:10] if type(v) is np.ndarray else v for k, v in rec.items()}

{'client_id': 6,
 'event_time': array([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': array([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': array([ 4.054, 13.738, 20.701, 21.564, 13.499, 23.722,  4.304,  8.625,
        12.938, 28.162])}

In [15]:
# join target

In [16]:
s_target = df_target.set_index('client_id')['bins']

In [17]:
for df in [df_data_train, df_data_valid, df_data_test]:
    for rec in df:
        rec['target_bin'] = s_target.loc[rec['client_id']]

In [18]:
# show first 10 transactions from one record
rec = df_data_train[0]
{k: v[:10] if type(v) is np.ndarray else v for k, v in rec.items()}

{'client_id': 6,
 'event_time': array([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': array([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': array([ 4.054, 13.738, 20.701, 21.564, 13.499, 23.722,  4.304,  8.625,
        12.938, 28.162]),
 'target_bin': 1}

In [19]:
# Make torch datasets

In [20]:
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_train)
dataset_test = MemoryMapDataset(df_data_train)

## Build encoder

- All parts are available in `ptls.nn`.
- You can also use pretrained layers.

In [21]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head

In [22]:
# check preprocessor dictionary sisez for categoryes
preprocessor.get_category_sizes()

{'small_group': 201}

In [23]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'small_group': {'in': 150, 'out': 32},
        },
        numeric_values={
            'amount_rur': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=48,
)

## Choose framework for encoder train

- There are both supervised of unsupervised frameworks in `ptls.frames`.
- Keep in mind that each framework requires his own batch format.
Tools for batch collate can be found in the selected framework package.

In [24]:
from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

In [25]:
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=4),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(),
    optimizer_partial=partial(torch.optim.Adam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5),
)

In [26]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='target_bin', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='target_bin', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='target_bin', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

## Train your encoder with selected framework and `pytorch_lightning`

- Provide data with one of the DataLoaders that is compatible with selected framework. 
- Monitor the progress on tensorboard.
- Optionally tune hyperparameters.

In [27]:
import pytorch_lightning as pl

In [28]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [30]:
%tensorboard --logdir lightning_logs

In [31]:
trainer = pl.Trainer(
    max_epochs=10,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [32]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

Missing logger folder: /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs


logger.version = 0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 16.8 K
1 | head          | Head          | 196   
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
17.0 K    Trainable params
0         Non-trainable params
17.0 K    Total params
0.068     Total estimated model params size (MB)


CPU times: user 2min 54s, sys: 19.7 s, total: 3min 13s
Wall time: 3min 18s


In [33]:
# train and validation metrics
print(trainer.logged_metrics)

{'loss': tensor(1.2590), 'seq_len': tensor(904.1562), 'val_Accuracy': tensor(0.5374), 'train_Accuracy': tensor(0.5243)}


In [34]:
# test metrics
trainer.test(ckpt_path='best', dataloaders=sup_data.test_dataloader())

Restoring states from the checkpoint path at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_0/checkpoints/epoch=9-step=1570.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from checkpoint at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_0/checkpoints/epoch=9-step=1570.ckpt


[{'test_Accuracy': 0.5798500180244446}]

# Make predict

Let's make predict to check metrics

In [35]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [36]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [37]:
inf_module = InferenceModule(
    torch.nn.Sequential(
        sup_module,
        torch.nn.Softmax(dim=1),
    ),
    model_out_name='prob',
)

In [38]:
df_predict = trainer.predict(inf_module, inference_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


In [39]:
df_predict = pd.concat(df_predict, axis=0)

In [40]:
df_predict.head()

Unnamed: 0,client_id,target_bin,prob_0000,prob_0001,prob_0002,prob_0003
0,6,1,0.017108,0.861903,0.00551,0.115479
1,7,0,0.272358,0.006778,0.686012,0.034851
2,12,2,0.053981,0.00293,0.938305,0.004783
3,13,2,0.1496,0.003601,0.826836,0.019963
4,14,0,0.281833,0.245787,0.076031,0.396349


In [41]:
y_pred = df_predict[[f'prob_{i:04d}' for i in range(4)]].values.argmax(axis=1)
y_pred

array([1, 2, 2, ..., 2, 2, 1])

In [42]:
y_true = df_predict['target_bin'].values
y_true

array([1, 0, 2, ..., 2, 2, 1])

In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [44]:
accuracy_score(y_true, y_pred)

0.57985

In [45]:
confusion_matrix(y_true, y_pred)

array([[1908,  327, 1441, 1280],
       [ 354, 3360,  128, 1156],
       [ 715,   64, 4092,  169],
       [1224, 1233,  312, 2237]])