# Coles with finetuning pretrained text encoder

Steps:
1. Load dataset with sequences with text features in events
2. Include pretrained NLP model into SeqEncoder model
3. Learn all parameters (NLP and SeqEncoder) with CoLES loss


In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as functional
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from functools import partial
from pathlib import Path

from transformers import AutoTokenizer, AutoModel

from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import ISeqLenLimit, FeatureFilter
from ptls.nn.trx_encoder.encoders import BaseEncoder

from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import plot_roc_curve

from pytorch_lightning.loggers import TensorBoardLogger

from lightgbm import LGBMClassifier

In [2]:
data_path = Path('data')
pl.seed_everything(42)
plt.style.use('bmh')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

Global seed set to 42


# Load data

In [3]:
if not data_path.joinpath('transactions_train.csv').exists():
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

# Training Sequence Encoder

## Data load

In [4]:
joined = pd.merge(
    pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true', compression='gzip'),
    pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/small_group_description.csv?download=true').rename(columns={'small_group': 'mcc_description'}),
    left_on='small_group', right_on='small_group_code',
).drop(columns=['small_group', 'small_group_code'])

joined.head()

Unnamed: 0,client_id,trans_date,amount_rur,mcc_description
0,33172,6,71.463,Аптеки
1,33172,34,26.332,Аптеки
2,33172,37,8.569,Аптеки
3,33172,63,4.045,Аптеки
4,33172,76,19.692,Аптеки


## Data preprocessing

In [5]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=[],
    cols_numerical=['amount_rur'],
    cols_identity=['mcc_description'],
)

In [6]:
dataset = MemoryMapDataset(
    data=preprocessor.fit_transform(joined),
    i_filters=[
        ISeqLenLimit(max_seq_len=2000),
        FeatureFilter(keep_feature_names='customer_id')
    ]
)

In [7]:
with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [8]:
{k: (v[:5], v.size()) 
 if type(v) is torch.Tensor else v[:5] 
 if type(v) is np.ndarray else v for k, v in dataset[0].items()}

{'trans_date': (tensor([0, 2, 3, 4, 6]), torch.Size([720])),
 'mcc_description': array(['Сетевые супермаркеты и продуктовые магазины',
        'Оплата телефона и связи',
        'Сетевые супермаркеты и продуктовые магазины',
        'Сетевые супермаркеты и продуктовые магазины', 'Аптеки'],
       dtype=object),
 'event_time': (tensor([0, 2, 3, 4, 6]), torch.Size([720])),
 'amount_rur': (tensor([10.2090, 27.7060, 13.0240, 17.6320, 18.0890], dtype=torch.float64),
  torch.Size([720]))}

## Train-validation split

In [9]:
TRAIN_SIZE = int(len(dataset) * 0.8)
VAL_SIZE = len(dataset) - TRAIN_SIZE

train, val = torch.utils.data.random_split(dataset, [TRAIN_SIZE, VAL_SIZE])

## Embedder

defining embedder for the augmented mcc descriptions

**Each embedder has to inherit BaseEncoder and define `output_size` property**

In [10]:
class Embedder(BaseEncoder):
    def __init__(self, base_model, tokenizer):
        super().__init__()
        self.__output_size = 312
        self.base_model = base_model
        self.tokenizer = tokenizer

    def forward(self, X: np.ndarray):
        tokenized = self.tokenizer(X.tolist(), padding=True, truncation=True, return_tensors='pt')
        return functional.normalize(
            self.base_model(
                **{k: v.to(self.base_model.device) for k, v in tokenized.items()}
            ).last_hidden_state[:, 0, :])

    @property
    def output_size(self):
        return self.__output_size
    
    @property
    def batch_size(self):
        return 2048


In [11]:
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
bert = AutoModel.from_pretrained('cointegrated/rubert-tiny2')
embedder = Embedder(bert, tokenizer)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Model definition

In [12]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
    },
    custom_embeddings={'mcc_description': embedder},
    norm_embeddings=False
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
    bidir=False,
    trainable_starter='static'
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

## Data loaders

In [13]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=75,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        val,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=100
        )
    ),
    valid_batch_size=256,
    valid_num_workers=4
)

## Trainer

In [14]:
trainer = pl.Trainer(
    max_epochs=3,
    gpus=[0] if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Training

In [15]:
%%time
print(trainer.logger.version)
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

10


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 29.7 M
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
29.7 M    Trainable params
0         Non-trainable params
29.7 M    Total params
118.633   Total estimated model params size (MB)


{'loss': tensor(112.8085), 'seq_len': tensor(45.4562), 'recall_top_k': tensor(0.4519)}
CPU times: user 19min 6s, sys: 15.6 s, total: 19min 22s
Wall time: 19min 28s


In [None]:
# Model saving
torch.save(model.state_dict(), 'seq_encoder.pt')

# Using embeddings for downstream task

In [None]:
model = torch.load('seq_encoder.pt')

## Inference data loaders

In [None]:
train_dl = torch.utils.data.DataLoader(
    dataset=train,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=512,
    num_workers=4
)

val_dl = torch.utils.data.DataLoader(
    dataset=val,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=512,
    num_workers=4
)

## Getting user embeddigns

In [None]:
inf_model = InferenceModule(
    seq_encoder
)

In [None]:
df_train = pd.concat(trainer.predict(inf_model, train_dl))
df_val = pd.concat(trainer.predict(inf_model, val_dl))

## Downstream task

In [None]:
target_df = pd.read_csv(data_path / 'gender_train.csv')

In [None]:
df_train = df_train.merge(target_df, how='left', on='customer_id').dropna()

df_val = df_val.merge(target_df, how='left', on='customer_id').dropna()

In [None]:
X_train = df_train.drop(columns=['gender'])
y_train = df_train['gender']
X_val = df_val.drop(columns=['gender'])
y_val = df_val['gender']
scaler = MaxAbsScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
model = LGBMClassifier(
    n_estimators=500,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.02,
    feature_fraction=0.75,
    max_depth=6,
    lambda_l1=1,
    lambda_l2=1,
    min_data_in_leaf=50,
    random_state=42,
    n_jobs=8
)

In [None]:
model = model.fit(X_train, y_train, 
    eval_set=(X_val, y_val),
    eval_metric=['roc_auc', 'accuracy'])

In [None]:
plot_roc_curve(model, X_train, y_train)
plt.title('Train ROC')
plt.show()
plot_roc_curve(model, X_val, y_val)
plt.title('Val ROC')
plt.show()