# Colab setup

In [1]:
import sys
if 'google.colab' in str(get_ipython()):
    ! {sys.executable} -m pip install pytorch-lifestream
    ! {sys.executable} -m pip install -U 'torch<2'  # downgrade for ptls==0.5.x
    ! {sys.executable} -m pip install -U 'pytorch-lightning<2'  # downgrade for ptls==0.5.x
    ! {sys.executable} -m pip install -U 'torchvision<0.15.1'  # downgrade for ptls==0.5.x
    ! {sys.executable} -m pip install -U 'torchaudio<2'  # downgrade for ptls==0.5.x

## Data load

In [4]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Setup

In [57]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import numpy as np
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data preproccessing

In [21]:
import os
import pandas as pd

data_path = 'data/'

source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [22]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=True,
)

In [23]:
%%time

dataset = preprocessor.fit_transform(source_data)

CPU times: user 31.2 s, sys: 4.02 s, total: 35.3 s
Wall time: 35.2 s


In [24]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [25]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [26]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

len(train), len(test)

(24000, 6000)

In [27]:
train[0].keys()

dict_keys(['client_id', 'trans_date', 'event_time', 'small_group', 'amount_rur'])

## Embedding training

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

### Model definition

In [28]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.bert import SopNspModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=32,
    type='lstm',
)

model = SopNspModule(
    seq_encoder=seq_encoder,
    hidden_size = 16,
    drop_p = 0.2,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)



## Training for nsp loss

### Data loader

At first let's train our model for nsp loss. For this task we need NspDataset, which makes pairs from every sentence by splitting it into left and right part, and then generates batch with two parts: first part - where lefts and rights are in correct are from one sentence, and second part - where right parts are randomly shuffled, so we get parts from different sentences

In [29]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.bert import NspDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=NspDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

### Trainer

In [30]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Training 

In [31]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 7


  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type              | Params
---------------------------------------------------------
0 | _loss              | BCELoss           | 0     
1 | _seq_encoder       | RnnSeqEncoder     | 25.4 K
2 | _validation_metric | AUROC             | 0     
3 | _head              | SentencePairsHead | 26.5 K
---------------------------------------------------------
26.5 K    Trainable params
0         Non-trainable params
26.5 K    Total params
0.106     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

{'loss': tensor(0.2433)}
CPU times: user 3min 28s, sys: 25.1 s, total: 3min 53s
Wall time: 4min 55s


## Inference 

In [32]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 94it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 94it [00:00, ?it/s]

(torch.Size([24000, 32]), torch.Size([6000, 32]))

In [33]:
# join target and embeddings

df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 34) (6000, 34)


Obtained embeddings can be used as features for model training

For example:

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.4826666666666667

## Training for sop loss

In this section we will use transformer and sop loss

In [49]:
from ptls.nn import TransformerSeqEncoder

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 17},
        'small_group': {'in': 250, 'out': 16},
    },
)

# define our transformer params

transformer_params = {
    "n_heads": 1,
    "n_layers": 1,
    "dim_hidden": 16
}

# define transformer sequence encoder

seq_encoder = TransformerSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    input_size=34,
    **transformer_params
)

# define our model

model = SopNspModule(
    seq_encoder=seq_encoder,
    hidden_size = 16,
    drop_p = 0.2,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)



There we will use SopDataset, which is similar to NspDataset, but we generate parts by changing or not changing order of parts of single sequence

In [50]:
from ptls.frames.bert import SopDataset

train_dl = PtlsDataModule(
    train_data=SopDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

# Trainer

In [51]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


# Training

In [52]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                  | Params
-------------------------------------------------------------
0 | _loss              | BCELoss               | 0     
1 | _seq_encoder       | TransformerSeqEncoder | 23.7 K
2 | _validation_metric | AUROC                 | 0     
3 | _head              | SentencePairsHead     | 24.9 K
-------------------------------------------------------------
24.9 K    Trainable params
0         Non-trainable params
24.9 K    Total params
0.100     Total estimated model params size (MB)


logger.version = 10


Training: 0it [00:00, ?it/s]

{'loss': tensor(0.0787)}
CPU times: user 1min 21s, sys: 6min 42s, total: 8min 4s
Wall time: 10min 8s


# Inference

In [55]:
def get_embeddings(model, train_dl, test_dl):
    model.to('cuda:0')
    train_embs, test_embs = [], []
    for batch in train_dl:
        with torch.no_grad():
            x = model(batch.to('cuda:0')).detach().cpu().numpy()
            train_embs.append(x)
            
    for batch in test_dl:
        with torch.no_grad():
            x = model(batch.to('cuda:0')).detach().cpu().numpy()
            test_embs.append(x)

    train_embs = np.concatenate(train_embs, axis=0)
    test_embs = np.concatenate(test_embs, axis=0)

    return train_embs, test_embs

In [58]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)

train_embeds, test_embeds = get_embeddings(model, train_dl, test_dl)

train_embeds.shape, test_embeds.shape

((24000, 34), (6000, 34))

In [61]:
# join target and embeddings

df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 36) (6000, 36)


# Using embeddings as feature vectors

In [62]:
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

In [63]:
model = GradientBoostingClassifier()

model.fit(x_train, y_train)
model.score(x_test, y_test)

0.3566666666666667