# Coles with pretrained text encoder

Steps:
1. Load dataset with sequences with text features in events
2. Encode text features with pretrained NLP model
3. Use embeddings from NLP model as event features

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as functional
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from typing import List
from functools import partial
from pathlib import Path

from transformers import AutoTokenizer, AutoModel

from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import ISeqLenLimit, FeatureFilter
from ptls.nn.trx_encoder.encoders import IdentityEncoder

from sklearn.preprocessing import MaxAbsScaler

from pytorch_lightning.loggers import TensorBoardLogger

from lightgbm import LGBMClassifier

In [2]:
data_path = Path('data')
pl.seed_everything(42)
plt.style.use('bmh')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

Global seed set to 42


# Training sequence encoder

creating embeddigns of mcc descriptions

In [4]:
def embed_mcc_descs(mcc_descriptions: List[str], batch_size: int = 10000):
    tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
    bert = AutoModel.from_pretrained('cointegrated/rubert-tiny2').cuda()
    res = dict()
    for i in range(0, len(mcc_descriptions), batch_size):
        descs = mcc_descriptions[i:i+batch_size]
        tokens = tokenizer(descs, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            out = bert(**{k: v.to(bert.device) for k, v in tokens.items()})
        embeddings = functional.normalize(out.last_hidden_state[:, 0, :]).cpu()
        res.update(dict(zip(descs, embeddings)))
    return res

## Data load and preprocessing

In [5]:
joined = pd.merge(
    pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',compression='gzip'),
    pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/small_group_description.csv?download=true').rename(columns={'small_group': 'mcc_description'}),
    left_on='small_group', right_on='small_group_code',
).drop(columns=['small_group', 'small_group_code'])

joined.head()

Unnamed: 0,client_id,trans_date,amount_rur,mcc_description
0,33172,6,71.463,Аптеки
1,33172,34,26.332,Аптеки
2,33172,37,8.569,Аптеки
3,33172,63,4.045,Аптеки
4,33172,76,19.692,Аптеки


In [6]:
embs = embed_mcc_descs(joined['mcc_description'].unique().tolist())
joined['mcc_description_emb'] = joined['mcc_description'].apply(lambda description: embs[description])
joined.head()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,client_id,trans_date,amount_rur,mcc_description,mcc_description_emb
0,33172,6,71.463,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
1,33172,34,26.332,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
2,33172,37,8.569,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
3,33172,63,4.045,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
4,33172,76,19.692,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."


In [30]:
{k: v for k, v in joined.iloc[0].items()}

{'client_id': 33172,
 'trans_date': 6,
 'amount_rur': 71.463,
 'mcc_description': 'Аптеки',
 'mcc_description_emb': tensor([ 2.6240e-02, -2.9494e-02, -2.6936e-02, -3.8379e-02, -2.3740e-02,
         -2.2943e-02,  4.9838e-02,  1.5536e-02, -7.0315e-02, -4.2724e-02,
          4.9333e-02, -3.3095e-02,  1.7819e-02,  3.8584e-02,  3.2053e-02,
         -6.5642e-03,  2.5132e-02,  2.8006e-02, -1.4004e-02,  1.2496e-01,
         -4.6310e-03,  1.4552e-02,  1.3387e-03,  3.4245e-02,  1.1075e-01,
          2.0765e-02, -9.3637e-03,  4.7586e-02,  5.6251e-03,  4.1272e-02,
          4.8527e-03, -6.9148e-02,  6.3368e-03, -1.1395e-02,  8.3917e-04,
          3.2246e-03,  5.9728e-02,  2.4115e-02, -9.0850e-02, -4.7498e-02,
         -8.6390e-02,  8.3203e-02,  5.6650e-02, -9.0638e-02,  1.8658e-02,
          2.0435e-03,  1.1549e-01,  1.6694e-02,  8.7386e-02, -2.8322e-02,
          1.3581e-02,  7.5472e-02, -5.3573e-02, -9.3808e-03, -6.2176e-02,
          7.3927e-02,  3.9668e-02,  7.5701e-02, -1.7884e-02,  1.1072e-0

In [7]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=[],
    cols_numerical=['amount_rur'],
    cols_identity=['mcc_description_emb'],
)

In [8]:
%%time
dataset = MemoryMapDataset(
    data=preprocessor.fit_transform(joined.drop(columns=['mcc_description'])),
    i_filters=[
        ISeqLenLimit(max_seq_len=2000),
    ]
)

CPU times: user 2min 56s, sys: 1min 30s, total: 4min 26s
Wall time: 4min 26s


In [9]:
with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [10]:
{k: (v[:5], v.size()) if type(v) is torch.Tensor else v for k, v in dataset[0].items()}

{'client_id': 4,
 'trans_date': (tensor([0, 2, 3, 4, 6]), torch.Size([720])),
 'mcc_description_emb': (tensor([[ 0.0610,  0.0054,  0.0250,  ...,  0.0379,  0.0161, -0.0923],
          [ 0.0253,  0.0463, -0.0293,  ...,  0.0987,  0.1050, -0.0684],
          [ 0.0610,  0.0054,  0.0250,  ...,  0.0379,  0.0161, -0.0923],
          [ 0.0610,  0.0054,  0.0250,  ...,  0.0379,  0.0161, -0.0923],
          [ 0.0262, -0.0295, -0.0269,  ...,  0.0269,  0.0845, -0.1108]]),
  torch.Size([720, 312])),
 'event_time': (tensor([0, 2, 3, 4, 6]), torch.Size([720])),
 'amount_rur': (tensor([10.2090, 27.7060, 13.0240, 17.6320, 18.0890], dtype=torch.float64),
  torch.Size([720]))}

## Train-validation split

In [11]:
TRAIN_SIZE = int(len(dataset) * 0.8)
VAL_SIZE = len(dataset) - TRAIN_SIZE

train, val = torch.utils.data.random_split(dataset, [TRAIN_SIZE, VAL_SIZE])

## Model definition

In [12]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
    },
    custom_embeddings = {'mcc_description_emb': IdentityEncoder(312)},
    norm_embeddings=False
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
    bidir=False,
    trainable_starter='static'
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

## Data loaders

In [13]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=75,
        ),
    ),
    train_num_workers=4,
    train_batch_size=256,
    valid_data=ColesDataset(
        val,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200
        )
    ),
    valid_batch_size=256,
    valid_num_workers=4
)

## Trainer

In [14]:
trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Training

In [15]:
%%time
print(trainer.logger.version)
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

Missing logger folder: /home/jovyan/src/pytorch-lifestream/demo/lightning_logs


0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 464 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
464 K     Trainable params
0         Non-trainable params
464 K     Total params
1.858     Total estimated model params size (MB)


{'loss': tensor(345.2067), 'seq_len': tensor(45.0979), 'recall_top_k': tensor(0.6402)}
CPU times: user 2min 49s, sys: 2min 35s, total: 5min 25s
Wall time: 6min 1s


In [16]:
# Model saving
torch.save(model.state_dict(), 'seq_encoder.pt')

# Using embeddings for downstream task

In [17]:
model = torch.load('seq_encoder.pt')

## Inference data loaders

In [18]:
train_dl = torch.utils.data.DataLoader(
    dataset=train,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=512,
    num_workers=4
)

val_dl = torch.utils.data.DataLoader(
    dataset=val,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=512,
    num_workers=4
)

## Getting user embeddings

In [19]:
inf_model = InferenceModule(
    seq_encoder
)

In [20]:
df_train = pd.concat(trainer.predict(inf_model, train_dl))
df_val = pd.concat(trainer.predict(inf_model, val_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


## Downstream task

In [21]:
target_df = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')

In [22]:
df_train = df_train.merge(target_df, how='left', on='client_id').dropna()
df_val = df_val.merge(target_df, how='left', on='client_id').dropna()

In [23]:
X_train = df_train.drop(columns=['bins'])
y_train = df_train['bins']
X_val = df_val.drop(columns=['bins'])
y_val = df_val['bins']
scaler = MaxAbsScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [24]:
model = LGBMClassifier(
    n_estimators=1000,
    boosting_type='gbdt',
    objective='multiclass',
    num_class=4,
    metric='multi_error',
    learning_rate=0.02,
    subsample=0.75,
    subsample_freq=1,
    feature_fraction=0.75,
    colsample_bytree=None,
    max_depth=12,
    lambda_l1=1,
    reg_alpha=None,
    lambda_l2=1,
    reg_lambda=None,
    min_data_in_leaf=50,
    min_child_samples=None,
    num_leaves=50,
    random_state=42,
    n_jobs=4,
)

In [25]:
%%time
model = model.fit(X_train, y_train)

CPU times: user 4min 23s, sys: 929 ms, total: 4min 24s
Wall time: 1min 6s


In [26]:
model.score(X_val, y_val)

0.6028333333333333