## Data load

In [1]:
import os

# Download and unpack the dataset only when the training transactions file is
# not already present, so re-running the cell does not fetch the archive again.
if not os.path.exists('data/transactions_train.csv'):
    # Shell steps: create the target folder, download the archive, extract its
    # 'data/*.csv' members flat into data/ (overwriting existing files), then
    # move the archive itself into data/.
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Setup

In [2]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data preprocessing

In [3]:
import os
import pandas as pd

# Folder that holds the dataset CSV files.
data_path = 'data/'

# Read the raw transaction log and preview its first two rows.
transactions_csv = os.path.join(data_path, 'transactions_train.csv')
source_data = pd.read_csv(transactions_csv)
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [4]:
# Load pretrained preprocessor
# NOTE(review): 'preprocessor.p' is expected to already exist in the working
# directory; presumably it was saved by the embedding notebook — TODO confirm.
import pickle
# NOTE(review): presumably imported because the pickled object is a
# PandasDataPreprocessor instance — TODO confirm.
from ptls.data_preprocessing import PandasDataPreprocessor

# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load a 'preprocessor.p' that comes from a trusted source.
with open('preprocessor.p', 'rb') as f:
    preprocessor = pickle.load(f)

In [5]:
%%time

# Apply the already-fitted preprocessor to the raw transactions table.
# The result is iterated as per-client records (dict-like, with a 'client_id'
# key) by the cells that follow.
dataset = preprocessor.transform(source_data)

CPU times: user 35.2 s, sys: 7.59 s, total: 42.8 s
Wall time: 42.7 s


In [6]:
# Put the records into a fixed client_id order before they are split.
dataset = sorted(dataset, key=lambda record: record['client_id'])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records as the test part; the fixed random_state keeps
# the split repeatable between runs.
train, test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# Sizes of the two parts.
(len(train), len(test))

(24000, 6000)

In [8]:
# Read the target table, index it by client_id and expose the "bins" column
# under the name "target".
df_target = (
    pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)
df_target.head(5)

Unnamed: 0_level_0,target
client_id,Unnamed: 1_level_1
24662,2
1046,0
34089,2
34848,1
47076,3


In [9]:
# Attach the target label to every record of the train and test parts.

# Keys before the labels are added.
print(train[0].keys())

# Select the label column once instead of re-selecting it for every record.
target_by_client = df_target['target']

# One loop handles both splits, so they cannot drift apart.
for records in (train, test):
    for el in records:
        # .loc makes it explicit that the lookup is by client_id label
        # (df_target is indexed by client_id), not by position.
        el['target'] = target_by_client.loc[el['client_id']]

# Keys after the labels are added ('target' is now present).
print(train[0].keys())

dict_keys(['client_id', 'trans_date', 'small_group', 'amount_rur', 'event_time'])
dict_keys(['client_id', 'trans_date', 'small_group', 'amount_rur', 'event_time', 'target'])


## FineTuning

### load SequenceEncoder obtained from `coles-emb.ipynb`

In [10]:
from ptls.nn import TrxEncoder, RnnSeqEncoder

# Transaction-encoder settings: one numeric feature passed through unchanged
# and two categorical features embedded into 16 dimensions each.
# NOTE(review): these values (and hidden_size / type below) presumably have to
# match the encoder trained in `coles-emb.ipynb`, otherwise load_state_dict
# fails on mismatched keys or shapes — TODO confirm against that notebook.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
)

# GRU sequence encoder on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA (or with a different device layout). The freshly built
# encoder lives on the CPU anyway; the trainer moves it to the GPU later.
# SECURITY: torch.load unpickles the file, so only load a trusted 'coles-emb.pt'.
seq_encoder.load_state_dict(torch.load('coles-emb.pt', map_location='cpu'))

<All keys matched successfully>

### model

In [11]:
from functools import partial
import torch
import torchmetrics
from ptls.frames.supervised import SequenceToTarget
from ptls.nn import Head

# Classification head on top of the sequence embedding, with four output classes.
classification_head = Head(
    input_size=seq_encoder.embedding_size,
    use_batch_norm=True,
    objective='classification',
    num_classes=4,
)

# Optimizer and LR-scheduler factories; the remaining arguments are filled in
# by the module that receives these partials.
adam_partial = partial(torch.optim.Adam, lr=0.02)
step_lr_partial = partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.2)

# Supervised model: pretrained encoder + new head, trained with NLL loss and
# tracked with accuracy.
downstream_model = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=classification_head,
    # NLLLoss expects log-probabilities as input.
    # NOTE(review): presumably the classification Head emits them — TODO confirm.
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    # NOTE(review): presumably a separate, smaller learning rate for the
    # pretrained seq_encoder weights — TODO confirm.
    pretrained_lr=0.0001,
    optimizer_partial=adam_partial,
    lr_scheduler_partial=step_lr_partial,
)


### Data module

In [12]:
from ptls.data_load.data_module.seq_to_target_data_module import SeqToTargetDatamodule


# Data module built from the labelled train records for the downstream model.
# NOTE(review): valid_size=0.05 presumably holds out 5% of `train` for
# validation — TODO confirm.
finetune_dm = SeqToTargetDatamodule(
    dataset=train,
    pl_module=downstream_model,
    target_col='target',
    min_seq_len=0,
    valid_size=0.05,
    random_state=42,
    # Loader settings, identical for the train and validation parts.
    train_batch_size=256,
    train_num_workers=0,
    valid_batch_size=256,
    valid_num_workers=0,
)


### Trainer FineTuning

In [13]:
# Use one GPU when CUDA is available, otherwise none (CPU training).
num_gpus = int(torch.cuda.is_available())

# Short fine-tuning run: four epochs, progress bar switched off.
trainer_ft = pl.Trainer(
    max_epochs=4,
    gpus=num_gpus,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


### Training FineTuning

In [14]:
# Report the logger version of this run (the output below shows it matching
# the lightning_logs/version_N directory the checkpoint is later read from).
print(f'logger.version = {trainer_ft.logger.version}')
# Fine-tune the downstream model on the batches supplied by the data module.
trainer_ft.fit(downstream_model, finetune_dm)
# Show the metrics the trainer has logged by the end of training.
print(trainer_ft.logged_metrics)

logger.version = 10


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 240 K 
1 | head          | Head          | 1.5 K 
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
241 K     Trainable params
0         Non-trainable params
241 K     Total params
0.967     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


{'loss': tensor(0.6763), 'seq_len': tensor(843.3125), 'val_Accuracy': 0.5696536302566528, 'train_Accuracy': 0.5912500023841858}


### Testing

In [15]:
# Build a dataloader over the held-out test records via the fine-tuning data module.
test_dataloader = finetune_dm.get_test_dataloader(test, num_workers=0, batch_size=128)

# No model is passed here; as the cell output shows, the trainer restores the
# weights from the checkpoint written during fit before evaluating.
trainer_ft.test(dataloaders=test_dataloader)

  rank_zero_warn(
Restoring states from the checkpoint path at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_10/checkpoints/epoch=3-step=359.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from checkpoint at /home/kireev/pycharm-deploy/pytorch-lifestream/demo/lightning_logs/version_10/checkpoints/epoch=3-step=359.ckpt
  rank_zero_warn(


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_Accuracy': 0.6203333139419556}
--------------------------------------------------------------------------------


[{'test_Accuracy': 0.6203333139419556}]