## Setup

In [4]:
import numpy as np
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [5]:
%load_ext autoreload
%autoreload 2
    

# import logging
import torch
from torch import nn
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Data preproccessing

In [6]:
import os
import pandas as pd

data_path = 'data/'

source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))
#source_data = pd.read_csv(os.path.join(data_path, 'transactions.csv'))
#source_data = pd.read_csv(os.path.join(data_path, 'transactions_train_tokenized_20bins.csv'))
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [7]:
import ptls

In [8]:
def bin_numeric(data, params):
    for fe in list(params.keys()):
        num_bins = params[fe] - 1
        quantiles = np.quantile(data[fe], [i/num_bins for i in range(num_bins)])
        for i in range(0, num_bins):
            data.loc[(data[fe] > quantiles[i - 1]) * (data[fe] <= quantiles[i]), fe] = i
        data.loc[data[fe] > quantiles[num_bins -1], fe] = num_bins
        data[fe] = data[fe].astype("int64")
    return data

In [9]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=True,
)

data=source_data

In [10]:
from ptls.nn.trx_encoder.glove_embedding import GloveEmbedding

In [11]:
#primary encoding of bin features with bin categories

data = bin_numeric(source_data, {"amount_rur" : 50})

In [12]:
glove_embedding = GloveEmbedding(
        feature_names=["trans_date", "small_group", "amount_rur"],
        calculate_cooccur=True,
        embedding_folder="glove_embeddings",
        glove_params={"alpha" : 0.75, 
                      "x_max" : 5000, 
                      "embedding_size" : 24, 
                      "num_epochs_train" : 50}
    )

In [21]:
#glove_embedding.load()

In [13]:
glove_embedding.fit(data)

train started


  2%|█▋                                                                                 | 1/50 [00:05<04:49,  5.90s/it]

Epoch 0: loss = 14057.688468848433


  4%|███▎                                                                               | 2/50 [00:11<04:30,  5.65s/it]

Epoch 1: loss = 6069.036664531014


  6%|████▉                                                                              | 3/50 [00:16<04:22,  5.59s/it]

Epoch 2: loss = 3501.885680750742


  8%|██████▋                                                                            | 4/50 [00:22<04:15,  5.55s/it]

Epoch 3: loss = 2235.5091814691814


 10%|████████▎                                                                          | 5/50 [00:27<04:09,  5.55s/it]

Epoch 4: loss = 1531.498612244113


 12%|█████████▉                                                                         | 6/50 [00:33<04:08,  5.64s/it]

Epoch 5: loss = 1103.8322947123788


 14%|███████████▌                                                                       | 7/50 [00:39<04:02,  5.64s/it]

Epoch 6: loss = 826.5233783969879


 16%|█████████████▎                                                                     | 8/50 [00:45<03:59,  5.70s/it]

Epoch 7: loss = 637.4969524432029


 18%|██████████████▉                                                                    | 9/50 [00:51<03:55,  5.74s/it]

Epoch 8: loss = 504.0402328820994


 20%|████████████████▍                                                                 | 10/50 [00:56<03:51,  5.79s/it]

Epoch 9: loss = 407.09062228438154


 22%|██████████████████                                                                | 11/50 [01:02<03:46,  5.80s/it]

Epoch 10: loss = 334.9180950354404


 24%|███████████████████▋                                                              | 12/50 [01:08<03:40,  5.80s/it]

Epoch 11: loss = 280.06819134304106


 26%|█████████████████████▎                                                            | 13/50 [01:14<03:36,  5.84s/it]

Epoch 12: loss = 237.95407057340555


 28%|██████████████████████▉                                                           | 14/50 [01:20<03:37,  6.03s/it]

Epoch 13: loss = 205.02953164231266


 30%|████████████████████████▌                                                         | 15/50 [01:27<03:36,  6.17s/it]

Epoch 14: loss = 178.9857703412006


 32%|██████████████████████████▏                                                       | 16/50 [01:33<03:30,  6.20s/it]

Epoch 15: loss = 158.08707270437242


 34%|███████████████████████████▉                                                      | 17/50 [01:39<03:21,  6.11s/it]

Epoch 16: loss = 141.2035379848564


 36%|█████████████████████████████▌                                                    | 18/50 [01:46<03:22,  6.31s/it]

Epoch 17: loss = 127.41894690061815


 38%|███████████████████████████████▏                                                  | 19/50 [01:52<03:11,  6.19s/it]

Epoch 18: loss = 115.99045577653341


 40%|████████████████████████████████▊                                                 | 20/50 [01:58<03:06,  6.23s/it]

Epoch 19: loss = 106.50309436909927


 42%|██████████████████████████████████▍                                               | 21/50 [02:04<02:56,  6.10s/it]

Epoch 20: loss = 98.53721350188651


 44%|████████████████████████████████████                                              | 22/50 [02:10<02:50,  6.10s/it]

Epoch 21: loss = 91.80086323349872


 46%|█████████████████████████████████████▋                                            | 23/50 [02:16<02:43,  6.07s/it]

Epoch 22: loss = 86.05985830599


 48%|███████████████████████████████████████▎                                          | 24/50 [02:22<02:35,  5.99s/it]

Epoch 23: loss = 81.12292151658455


 50%|█████████████████████████████████████████                                         | 25/50 [02:28<02:28,  5.94s/it]

Epoch 24: loss = 76.85578199639411


 52%|██████████████████████████████████████████▋                                       | 26/50 [02:33<02:21,  5.89s/it]

Epoch 25: loss = 73.15396532517015


 54%|████████████████████████████████████████████▎                                     | 27/50 [02:39<02:15,  5.88s/it]

Epoch 26: loss = 69.91049950624785


 56%|█████████████████████████████████████████████▉                                    | 28/50 [02:45<02:08,  5.82s/it]

Epoch 27: loss = 67.05063232323762


 58%|███████████████████████████████████████████████▌                                  | 29/50 [02:51<02:04,  5.93s/it]

Epoch 28: loss = 64.52585434634346


 60%|█████████████████████████████████████████████████▏                                | 30/50 [02:57<01:58,  5.94s/it]

Epoch 29: loss = 62.285389414509076


 62%|██████████████████████████████████████████████████▊                               | 31/50 [03:03<01:51,  5.88s/it]

Epoch 30: loss = 60.272832942338916


 64%|████████████████████████████████████████████████████▍                             | 32/50 [03:09<01:45,  5.84s/it]

Epoch 31: loss = 58.46270380559316


 66%|██████████████████████████████████████████████████████                            | 33/50 [03:15<01:39,  5.85s/it]

Epoch 32: loss = 56.834330020253425


 68%|███████████████████████████████████████████████████████▊                          | 34/50 [03:20<01:33,  5.82s/it]

Epoch 33: loss = 55.35804658532015


 70%|█████████████████████████████████████████████████████████▍                        | 35/50 [03:27<01:29,  5.96s/it]

Epoch 34: loss = 54.01873524441874


 72%|███████████████████████████████████████████████████████████                       | 36/50 [03:33<01:23,  5.97s/it]

Epoch 35: loss = 52.79255632365587


 74%|████████████████████████████████████████████████████████████▋                     | 37/50 [03:38<01:16,  5.91s/it]

Epoch 36: loss = 51.67931820564532


 76%|██████████████████████████████████████████████████████████████▎                   | 38/50 [03:44<01:10,  5.91s/it]

Epoch 37: loss = 50.70263703145416


 78%|███████████████████████████████████████████████████████████████▉                  | 39/50 [03:50<01:04,  5.88s/it]

Epoch 38: loss = 49.68374901322384


 80%|█████████████████████████████████████████████████████████████████▌                | 40/50 [03:56<00:58,  5.87s/it]

Epoch 39: loss = 48.81821487650481


 82%|███████████████████████████████████████████████████████████████████▏              | 41/50 [04:02<00:53,  5.92s/it]

Epoch 40: loss = 47.98499196124035


 84%|████████████████████████████████████████████████████████████████████▉             | 42/50 [04:08<00:47,  5.98s/it]

Epoch 41: loss = 47.21933452459286


 86%|██████████████████████████████████████████████████████████████████████▌           | 43/50 [04:14<00:41,  6.00s/it]

Epoch 42: loss = 46.518114230009466


 88%|████████████████████████████████████████████████████████████████████████▏         | 44/50 [04:20<00:35,  5.99s/it]

Epoch 43: loss = 45.843520483001406


 90%|█████████████████████████████████████████████████████████████████████████▊        | 45/50 [04:26<00:30,  6.02s/it]

Epoch 44: loss = 45.22176668280066


 92%|███████████████████████████████████████████████████████████████████████████▍      | 46/50 [04:32<00:23,  5.97s/it]

Epoch 45: loss = 44.63243638977694


 94%|█████████████████████████████████████████████████████████████████████████████     | 47/50 [04:38<00:17,  5.94s/it]

Epoch 46: loss = 44.08967729799836


 96%|██████████████████████████████████████████████████████████████████████████████▋   | 48/50 [04:44<00:11,  5.91s/it]

Epoch 47: loss = 43.572566422965316


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [04:50<00:05,  5.94s/it]

Epoch 48: loss = 43.076340482519775


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:56<00:00,  5.92s/it]

Epoch 49: loss = 42.6011365113237





In [14]:
data = glove_embedding.tokenize_data(data)

In [15]:
%%time

dataset = preprocessor.fit_transform(data)

CPU times: total: 42.3 s
Wall time: 45.7 s


In [16]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [17]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [18]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

len(train), len(test)

(24000, 6000)

In [19]:
del dataset

## Embedding training

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

### Model definition

In [20]:
import ptls

In [21]:
from functools import partial
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.nn.trx_encoder.trx_encoder_glove import TrxEncoderGlove
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderGlove(glove_embedding, agg_type="mean"),
    hidden_size=256,
    type='gru',
)

# seq_encoder = RnnSeqEncoder(
#     trx_encoder=TrxEncoder(**trx_encoder_params),
#     hidden_size=256,
#     type='gru',
# )

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

### Data loader

In [22]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=1,
    train_batch_size=256,
)

### Trainer

In [23]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    accelerator="gpu",
    #devices=1,
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Training 

In [24]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 27


  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 239 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
216 K     Trainable params
23.0 K    Non-trainable params
239 K     Total params
0.959     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


{'loss': tensor(471.2338), 'seq_len': tensor(112.9271)}
CPU times: total: 27.6 s
Wall time: 1min


### Save sequence encoder for other experiments

In [22]:
torch.save(seq_encoder.state_dict(), "coles-emb.pt")

## Inference 

In [25]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 94it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 94it [00:00, ?it/s]

(torch.Size([24000, 256]), torch.Size([6000, 256]))

In [26]:
# join target and embeddings

df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


Obtained embeddings can be used as features for model training

For example:

In [None]:
from sklearn.ensemble import RandomForestClassifier

sc = 0
for i in range(5):
    embed_columns = [x for x in train_df.columns if x.startswith('embed')]
    x_train, y_train = train_df[embed_columns], train_df['target']
    x_test, y_test = test_df[embed_columns], test_df['target']
    
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    sc += clf.score(x_test, y_test)
sc/5