# CoLES with a pretrained text encoder

Steps:
1. Load a dataset of event sequences in which each event carries a text feature
2. Encode the text features with a pretrained NLP model
3. Use the embeddings produced by the NLP model as event features

# Colab setup

In [None]:
import sys
if "google.colab" in str(get_ipython()):
    ! {sys.executable} -m pip install pytorch-lifestream
    ! {sys.executable} -m pip install catboost

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m143.4/163.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pytorch-lightning>=1.6.0 (from pytorch-lifestream)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (2

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as functional
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from typing import List
from functools import partial
from pathlib import Path

from transformers import AutoTokenizer, AutoModel

from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import ISeqLenLimit, FeatureFilter
from ptls.nn.trx_encoder.encoders import IdentityEncoder

from sklearn.preprocessing import MaxAbsScaler

from lightgbm import LGBMClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Notebook-wide configuration.
# NOTE(review): presumably set to keep the Hugging Face tokenizers from running
# their own parallelism alongside DataLoader worker processes - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed the random number generators so that runs are repeatable.
pl.seed_everything(42)

# Plot style and local data directory.
plt.style.use("bmh")
data_path = Path("data")

INFO:lightning_fabric.utilities.seed:Seed set to 42


# Training sequence encoder

Creating embeddings of MCC descriptions

In [None]:
def embed_mcc_descs(mcc_descriptions: List[str], batch_size: int = 10000):
    """Embed text descriptions with the pretrained ``cointegrated/rubert-tiny2`` model.

    Every description is tokenized and passed through the encoder; the hidden
    state of the first token is L2-normalized and used as the embedding.

    Args:
        mcc_descriptions: Texts to embed. The result is keyed by text, so a
            repeated text appears only once in the output.
        batch_size: Number of descriptions encoded per forward pass. Must be
            positive.

    Returns:
        Dict mapping each description to a 1-D CPU tensor. An empty input
        yields an empty dict without loading the model.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make ``range`` empty and silently return {}.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if not mcc_descriptions:
        # Nothing to embed: skip downloading / loading the model entirely.
        return {}

    # Use the GPU when one is present, otherwise fall back to CPU instead of
    # failing on the unconditional ``.cuda()`` call.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    bert = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(device)
    bert.eval()  # make inference mode explicit (no dropout)

    res = dict()
    for start in range(0, len(mcc_descriptions), batch_size):
        descs = mcc_descriptions[start:start + batch_size]
        tokens = tokenizer(descs, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            out = bert(**{name: tensor.to(device) for name, tensor in tokens.items()})

        # First-token hidden state of each text, normalized to unit L2 norm.
        embeddings = functional.normalize(out.last_hidden_state[:, 0, :]).cpu()
        res.update(zip(descs, embeddings))

    return res

## Data load and preprocessing

In [None]:
# Source files of the dataset: the transactions and the text description of
# every `small_group` code.
path_train = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
path_desc = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/small_group_description.csv?download=true"

# To keep the demo fast only a part of the training data is used, so the
# quality of the resulting model will be poor.
# NOTE(review): the cut is applied to the merged frame, i.e. it keeps the first
# 800000 merged rows rather than a random sample - confirm this is intended.
joined = (
    pd.read_csv(path_train, compression="gzip")
    .merge(
        # The description file's `small_group` column holds the text.
        pd.read_csv(path_desc).rename(columns={"small_group": "mcc_description"}),
        left_on="small_group",
        right_on="small_group_code",
    )
    # Only the text description is kept; both code columns are dropped.
    .drop(columns=["small_group", "small_group_code"])
    .head(800000)
)

joined

Unnamed: 0,client_id,trans_date,amount_rur,mcc_description
0,33172,6,71.463,Аптеки
1,33172,34,26.332,Аптеки
2,33172,37,8.569,Аптеки
3,33172,63,4.045,Аптеки
4,33172,76,19.692,Аптеки
...,...,...,...,...
799995,45113,262,5.973,Аптеки
799996,45113,272,15.616,Аптеки
799997,45113,285,111.099,Аптеки
799998,45113,286,10.739,Аптеки


In [None]:
# Embed each distinct description once ...
distinct_descriptions = joined["mcc_description"].unique().tolist()
embs = embed_mcc_descs(distinct_descriptions)

# ... then look the vector up for every transaction row.
joined["mcc_description_emb"] = joined["mcc_description"].apply(embs.__getitem__)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [None]:
# Display the frame again to check the newly added `mcc_description_emb` column.
joined

Unnamed: 0,client_id,trans_date,amount_rur,mcc_description,mcc_description_emb
0,33172,6,71.463,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
1,33172,34,26.332,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
2,33172,37,8.569,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
3,33172,63,4.045,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
4,33172,76,19.692,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
...,...,...,...,...,...
799995,45113,262,5.973,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
799996,45113,272,15.616,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
799997,45113,285,111.099,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."
799998,45113,286,10.739,Аптеки,"[tensor(0.0262), tensor(-0.0295), tensor(-0.02..."


In [None]:
# Column roles: `client_id` identifies a sequence, `trans_date` is the event
# time (used as is), `amount_rur` is numerical and the precomputed description
# embedding is declared as an identity column.
preprocessor_params = {
    "col_id": "client_id",
    "col_event_time": "trans_date",
    "event_time_transformation": "none",
    "cols_category": [],
    "cols_numerical": ["amount_rur"],
    "cols_identity": ["mcc_description_emb"],
}
preprocessor = PandasDataPreprocessor(**preprocessor_params)

In [None]:
# Filter applied while the dataset is built (presumably caps every sequence at
# 200 events - confirm against the ptls documentation).
seq_len_limit = ISeqLenLimit(max_seq_len=200)

# The raw text column is dropped before preprocessing: only its embedding
# column is passed on.
dataset = MemoryMapDataset(
    data=preprocessor.fit_transform(joined.drop(columns=["mcc_description"])),
    i_filters=[seq_len_limit],
)

## Train-validation split

In [None]:
# 80% of the dataset items go to training, the remainder to validation.
TRAIN_SIZE = int(len(dataset) * 0.8)
VAL_SIZE = len(dataset) - TRAIN_SIZE

# Use a dedicated, explicitly seeded generator: the partition is then the same
# on every execution of this cell, no matter how much of the global random
# stream other cells have consumed. Without it, re-running the cell produces a
# different split and validation items may already have been trained on.
train, val = torch.utils.data.random_split(
    dataset,
    [TRAIN_SIZE, VAL_SIZE],
    generator=torch.Generator().manual_seed(42),
)

## Model definition

In [None]:
# Per-event encoder settings.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "identity"},
    "embeddings": {"trans_date": {"in": 800, "out": 16}},
    # 312 must equal the width of the vectors stored in `mcc_description_emb`
    # (presumably the hidden size of the text model - confirm).
    "custom_embeddings": {"mcc_description_emb": IdentityEncoder(312)},
    "norm_embeddings": False,
}
trx_encoder = TrxEncoder(**trx_encoder_params)

# Unidirectional GRU over the encoded events.
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=256,
    type="gru",
    bidir=False,
    trainable_starter="static",
)

# Factories from which the CoLES module builds its optimizer and LR scheduler.
make_optimizer = partial(torch.optim.Adam, lr=0.001, weight_decay=0.0)
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

## Data loaders

In [None]:
# Sub-sequence samplers: five slices per sequence, with different length
# bounds for training and validation.
train_splitter = SampleSlices(split_count=5, cnt_min=15, cnt_max=75)
valid_splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

# Data module that feeds CoLES training and validation.
train_dl = PtlsDataModule(
    train_data=ColesDataset(train, splitter=train_splitter),
    valid_data=ColesDataset(val, splitter=valid_splitter),
    train_batch_size=256,
    valid_batch_size=256,
    train_num_workers=4,
    valid_num_workers=4,
)

## Training

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    accelerator = "cuda"
else:
    accelerator = "cpu"

trainer = pl.Trainer(
    accelerator=accelerator,
    max_epochs=15,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# `train_dl` is the data module holding both the training and validation data.
trainer.fit(model=model, datamodule=train_dl)
# Metrics logged by the last training / validation step.
print(trainer.logged_metrics)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 464 K  | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
464 K     Trainable params
0         Non-trainable params
464 K     Total params
1.858     Total estimated model params size (MB)
18        Modules in train mode
0         Modules in eval mode
  self.pid = os.fork()
  self.pid = os.fork()
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=15` reached.


{'loss': tensor(163.8855), 'seq_len': tensor(28.1966), 'valid/recall_top_k': tensor(0.9799)}


In [None]:
# Persist the weights of the sequence encoder itself, matching the file name.
# Saving `seq_encoder` instead of `model` also keeps this cell valid after
# `model` is rebound to the downstream classifier later in the notebook.
# Restore with `seq_encoder.load_state_dict(torch.load("seq_encoder.pt"))`.
torch.save(seq_encoder.state_dict(), "seq_encoder.pt")

# Using embeddings for downstream task

## Inference data loaders

In [None]:
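# To reuse saved weights instead of retraining, load the state dict and apply
# it with `load_state_dict` to the object it was saved from. `torch.load`
# returns the state dict itself, not a module, so do not assign it to `model`:
# state_dict = torch.load("seq_encoder.pt")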

In [None]:
# Both inference loaders share the same settings; `shuffle=False` keeps the
# items in dataset order.
make_inference_loader = partial(
    torch.utils.data.DataLoader,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=512,
    num_workers=4,
)

train_dl = make_inference_loader(dataset=train)
val_dl = make_inference_loader(dataset=val)

## Getting user embeddings

In [None]:
# Wrap the sequence encoder in an inference module for use with `trainer.predict`.
inf_model = InferenceModule(seq_encoder)

In [None]:
# Run inference on each loader (train first, then validation) and stack the
# per-batch frames returned by `predict` into a single frame per subset.
df_train, df_val = (
    pd.concat(trainer.predict(inf_model, loader))
    for loader in (train_dl, val_dl)
)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  self.pid = os.fork()
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  self.pid = os.fork()
  self.pid = os.fork()


## Downstream task

In [None]:
# Target table for the downstream task; it is joined on `client_id` below and
# is expected to provide the `bins` label column.
path = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target_df = pd.read_csv(path)

In [None]:
def _with_target(embeddings):
    """Left-join the target table on `client_id` and drop rows containing NaN."""
    return embeddings.merge(target_df, how="left", on="client_id").dropna()


# NOTE: both names are rebound to their merged versions, so this cell should
# be executed once per prediction pass.
df_train = _with_target(df_train)
df_val = _with_target(df_val)

In [None]:
# `client_id` is only the join key and `bins` is the label: neither may be fed
# to the classifier as a feature. Previously `client_id` stayed in the feature
# matrix, so the identifier itself was used as a predictor.
NON_FEATURE_COLUMNS = ["client_id", "bins"]

X_train = df_train.drop(columns=NON_FEATURE_COLUMNS)
y_train = df_train["bins"]
X_val = df_val.drop(columns=NON_FEATURE_COLUMNS)
y_val = df_val["bins"]

# Fit the scaler on the training part only and apply it to both parts.
scaler = MaxAbsScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# Hyper-parameters of the downstream 4-class classifier.
# NOTE(review): several settings appear twice, once under a LightGBM-native
# name with a value and once under a differently named keyword set to None -
# presumably so that only one of two aliases carries a value; confirm.
lgbm_params = {
    "n_estimators": 1000,
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 4,
    "metric": "multi_error",
    "learning_rate": 0.02,
    "subsample": 0.75,
    "subsample_freq": 1,
    "feature_fraction": 0.75,
    "colsample_bytree": None,
    "max_depth": 12,
    "lambda_l1": 1,
    "reg_alpha": None,
    "lambda_l2": 1,
    "reg_lambda": None,
    "min_data_in_leaf": 50,
    "min_child_samples": None,
    "num_leaves": 50,
    "random_state": 42,
    "n_jobs": 4,
}

model = LGBMClassifier(**lgbm_params)

In [None]:
# Fit the classifier on the scaled training features and their `bins` labels.
model = model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65535
[LightGBM] [Info] Number of data points in the train set: 14568, number of used features: 257
[LightGBM] [Info] Start training from score -1.317082
[LightGBM] [Info] Start training from score -1.132755
[LightGBM] [Info] Start training from score -2.234783
[LightGBM] [Info] Start training from score -1.194273


In [None]:
# Evaluate on the validation part (presumably mean accuracy, following the
# scikit-learn classifier `score` convention - confirm).
model.score(X_val, y_val)

0.36673071644249244