# Colab setup

In [None]:
import sys
if "google.colab" in str(get_ipython()):
    ! {sys.executable} -m pip install pytorch-lifestream
    ! {sys.executable} -m pip install catboost
    ! {sys.executable} -m pip install torchmetrics

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pytorch-lightning>=1.6.0 (from pytorch-lifestream)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.9.0 (from pytorch-lifestream)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.1.2->pytorch-lifestream)
  Downloading antlr4-python3-runtime-4.

# Supervised task

## Prepare your data

- Use `Pyspark` in local or cluster mode for big datasets and `Pandas` for small ones.
- Split data into required parts (train, valid, test, ...).
- Use `ptls.preprocessing` for simple data preparation.
- Transform features to compatible format using `Pyspark` or `Pandas` functions.
You can also use `ptls.data_load.preprocessing` for common data transformation patterns.
- Split sequences to `ptls-data` format with `ptls.data_load.split_tools`. Save prepared data into `Parquet` format or
keep it in memory (`Pickle` also works).
- Use one of the available `ptls.data_load.datasets` to define input for the models.

In [None]:
import torch

import numpy as np
import pandas as pd
import torchmetrics
import pytorch_lightning as pl

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from functools import partial
from ptls.frames import PtlsDataModule
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [None]:
# Target table for the task; the preview shows `client_id` and `bins` columns.
target_url = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
df_target = pd.read_csv(target_url)
df_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [None]:
# Two-stage split, stratified on `bins` both times: first hold out 7000 rows
# for test, then take 3000 validation rows from what remains.
df_target_rest, df_target_test = train_test_split(
    df_target,
    test_size=7000,
    stratify=df_target["bins"],
    random_state=142,
)
df_target_train, df_target_valid = train_test_split(
    df_target_rest,
    test_size=3000,
    stratify=df_target_rest["bins"],
    random_state=142,
)
print(
    "Split {} records to train: {}, valid: {}, test: {}".format(
        len(df_target),
        len(df_target_train),
        len(df_target_valid),
        len(df_target_test),
    )
)

Split 30000 records to train: 20000, valid: 3000, test: 7000


In [None]:
# Gzip-compressed transaction log; each row carries a `client_id`.
trx_url = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
df_trx = pd.read_csv(trx_url, compression="gzip")
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [None]:
# Keep, for each split, only the transactions whose client belongs to that
# split: an inner join of the transactions with the split's client ids.
df_trx_train, df_trx_valid, df_trx_test = (
    df_trx.merge(split_targets[["client_id"]], on="client_id", how="inner")
    for split_targets in (df_target_train, df_target_valid, df_target_test)
)
print(
    "Split {} transactions to train: {}, valid: {}, test: {}".format(
        len(df_trx),
        len(df_trx_train),
        len(df_trx_valid),
        len(df_trx_test),
    )
)

Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [None]:
# Column roles for the preprocessor. The result is used as a DataFrame in the
# following cells (it is merged and then converted with `to_dict`), hence
# `return_records=False`.
preprocessor_params = {
    "col_id": "client_id",
    "col_event_time": "trans_date",
    # "none": the record preview further down shows `event_time` equal to `trans_date`.
    "event_time_transformation": "none",
    "cols_category": ["small_group"],
    "cols_numerical": ["amount_rur"],
    "return_records": False,
}
preprocessor = PandasDataPreprocessor(**preprocessor_params)

In [None]:
# Fit on the training transactions only, then reuse the fitted preprocessor
# for the validation and test transactions (in that order).
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid, df_data_test = (
    preprocessor.transform(split_trx) for split_trx in (df_trx_valid, df_trx_test)
)

In [None]:
# Row counts of the preprocessed frames for the three splits.
record_counts = (len(df_data_train), len(df_data_valid), len(df_data_test))
print("Record in dataset, train {}, valid {}, test {}".format(*record_counts))
print("Each record is a client with list of transactions")

Record in dataset, train 20000, valid 3000, test 7000
Each record is a client with list of transactions


In [None]:
# Give the label column the name used as `target_col_name` later on.
df_target = df_target.rename({"bins": "target_bin"}, axis="columns")

In [None]:
# Attach the label to every preprocessed client row by joining on `client_id`.
df_data_train = df_data_train.merge(df_target, on="client_id")
df_data_valid = df_data_valid.merge(df_target, on="client_id")
df_data_test = df_data_test.merge(df_target, on="client_id")

In [None]:
# Turn each frame into a list of per-row dicts (one dict per client record).
df_data_train, df_data_valid, df_data_test = (
    frame.to_dict(orient="records")
    for frame in (df_data_train, df_data_valid, df_data_test)
)

In [None]:
# Preview one prepared record: every tensor field is cut to its first 10
# events, non-tensor fields (the id and the target) are shown unchanged.
# `isinstance` is used instead of an exact type comparison so that tensor
# subclasses are truncated as well.
rec = df_data_train[0]
{
    name: value[:10] if isinstance(value, torch.Tensor) else value
    for name, value in rec.items()
}

{'client_id': 6,
 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]),
 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 18,  2]),
 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
         12.9380, 28.1620], dtype=torch.float64),
 'target_bin': 1}

In [None]:
# Wrap each list of records in a MemoryMapDataset (train, valid, test order).
dataset_train, dataset_valid, dataset_test = map(
    MemoryMapDataset, (df_data_train, df_data_valid, df_data_test)
)

## Build encoder

- All parts are available in `ptls.nn`.
- You can also use pretrained layers.

In [None]:
# Dictionary sizes reported by the fitted preprocessor for each categorical
# column; useful when choosing the embedding table size for the encoder.
category_sizes = preprocessor.get_category_dictionary_sizes()
category_sizes

{'small_group': 203}

In [None]:
# Seed the random number generators before any weights are created, so that
# parameter initialisation and the shuffling done later during training can be
# repeated from run to run (as far as the compute backend allows).
pl.seed_everything(42, workers=True)

# Per-transaction encoder: an embedding for the categorical `small_group`
# field and a "log" treatment for the numeric `amount_rur` field.
trx_encoder = TrxEncoder(
    embeddings={
        # NOTE(review): the preprocessor reports 203 codes for `small_group`
        # while "in" is 150 — presumably TrxEncoder clips out-of-range codes;
        # confirm this is intended rather than an undersized table.
        "small_group": {"in": 150, "out": 32},
    },
    numeric_values={
        "amount_rur": "log",
    },
    embeddings_noise=0.001,
)

# Sequence encoder: an RNN over the encoded transactions with a 48-unit hidden state.
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=48,
)

## Choose framework for encoder train

- There are both supervised and unsupervised frameworks in `ptls.frames`.
- Keep in mind that each framework requires its own batch format.
Tools for batch collate can be found in the selected framework package.

In [None]:
# Four target classes, matching the four rows/columns of the confusion matrix
# at the end of the notebook.
num_classes = 4

# Classification head on top of the sequence embedding.
# NOTE(review): NLLLoss expects log-probabilities, so this assumes the
# "classification" head ends in a log-softmax — confirm.
classification_head = Head(
    input_size=seq_encoder.embedding_size,
    objective="classification",
    num_classes=num_classes,
)
accuracy_metric = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)

# Adam with its default settings; the StepLR schedule multiplies the learning
# rate by 0.5 every 4 scheduler steps.
make_optimizer = partial(torch.optim.Adam)
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5)

sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=classification_head,
    loss=torch.nn.NLLLoss(),
    metric_list=accuracy_metric,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

In [None]:
# All three splits read the label from `target_bin` as an integer (long) target.
to_target_dataset = partial(
    SeqToTargetDataset,
    target_col_name="target_bin",
    target_dtype=torch.long,
)

sup_data = PtlsDataModule(
    train_data=to_target_dataset(dataset_train),
    valid_data=to_target_dataset(dataset_valid),
    test_data=to_target_dataset(dataset_test),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

## Train your encoder with selected framework and `pytorch_lightning`

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    accelerator = "gpu"
else:
    accelerator = "cpu"

# A single epoch keeps the demo short; the progress bar is switched off.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator=accelerator,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the supervised module on the data module's train/valid splits.
trainer.fit(model=sup_module, datamodule=sup_data)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type          | Params | Mode 
--------------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 16.8 K | train
1 | head          | Head          | 196    | train
2 | loss          | NLLLoss       | 0      | train
3 | train_metrics | ModuleDict    | 0      | train
4 | valid_metrics | ModuleDict    | 0      | train
5 | test_metrics  | ModuleDict    | 0      | train
--------------------------------------------------------
17.0 K    Trainable params
0         Non-trainable params
17.0 K    Total params
0.068     Total estimated model params size (MB)
24        Modules in train mode
0         Modules in eval mode
  self.pid = os.fork()
  self.pid = os.fork()
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# Metrics logged by the trainer during fit (training and validation).
fit_metrics = trainer.logged_metrics
print(fit_metrics)

{'loss': tensor(1.0582), 'seq_len': tensor(849.3438), 'y': tensor(1.7188), 'val_loss': tensor(1.1752), 'valid/MulticlassAccuracy': tensor(0.4507)}


In [None]:
# Test metrics, computed after restoring the "best" checkpoint from the fit.
test_loader = sup_data.test_dataloader()
trainer.test(dataloaders=test_loader, ckpt_path="best")

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=0-step=157.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=0-step=157.ckpt


[{'test/MulticlassAccuracy': 0.44699999690055847}]

# Make predictions

Let's make predictions to check the metrics.

In [None]:
# Plain DataLoader over the test records: fixed order (no shuffling), batches
# of 1000, collated into feature dicts.
inference_dl = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1000,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_feature_dict,
)



In [None]:
# Chain the trained module with a softmax over dim=1 so that each row of the
# output sums to one; the results are exposed under the `prob` name (the
# `prob_0000` ... columns in the prediction table).
prob_model = torch.nn.Sequential(
    sup_module,
    torch.nn.Softmax(dim=1),
)
inf_module = InferenceModule(prob_model, model_out_name="prob")

In [None]:
# Run inference over the test loader; the result is one item per batch.
df_predict = trainer.predict(model=inf_module, dataloaders=inference_dl)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Stack the per-batch prediction frames into one table. The name is rebound
# from a list of frames to a single DataFrame, so skip the concat when the
# cell is re-run: `pd.concat` raises TypeError if handed a DataFrame directly.
if not isinstance(df_predict, pd.DataFrame):
    df_predict = pd.concat(df_predict, axis=0)

In [None]:
# First five prediction rows: client id, true label and per-class probabilities.
df_predict.head(n=5)

Unnamed: 0,client_id,target_bin,prob_0000,prob_0001,prob_0002,prob_0003
0,10,3,0.149224,0.51938,0.025183,0.306213
1,11,3,0.210812,0.052006,0.653654,0.083527
2,18,1,0.198591,0.430765,0.046228,0.324416
3,33,0,0.27337,0.04358,0.577292,0.105758
4,34,2,0.336681,0.175975,0.179719,0.307625


In [None]:
# Predicted class = position of the largest probability among the four
# `prob_0000` ... `prob_0003` columns.
prob_columns = [f"prob_{class_idx:04d}" for class_idx in range(4)]
y_pred = np.argmax(df_predict[prob_columns].values, axis=1)
y_pred

array([1, 2, 1, ..., 1, 2, 2])

In [None]:
# True labels, taken from the `target_bin` column of the prediction table.
y_true = df_predict.loc[:, "target_bin"].values
y_true

array([3, 3, 1, ..., 1, 2, 2])

In [None]:
# Share of test clients whose predicted class equals the true class.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.447

In [None]:
# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[ 319,  260,  928,  228],
       [ 164,  983,  368,  234],
       [ 125,   40, 1549,   50],
       [ 294,  656,  524,  278]])