# Colab setup

In [None]:
import sys
# Install the extra dependencies only when the notebook runs inside Google Colab.
if "google.colab" in str(get_ipython()):
    # pip is invoked through the running interpreter (sys.executable) so the
    # packages are installed for the same Python that executes this notebook.
    ! {sys.executable} -m pip install pytorch-lifestream
    ! {sys.executable} -m pip install catboost

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/163.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pytorch-lightning>=1.6.0 (from pytorch-lifestream)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (2

## Libs

In [None]:
import os

import torch
import pickle
import numpy as np
import pandas as pd
import torchmetrics
from functools import partial
import pytorch_lightning as pl

from ptls.nn import Head
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames import PtlsDataModule
from ptls.frames.supervised import SequenceToTarget
from ptls.frames.supervised import SeqToTargetDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, metrics

## Data preprocessing

In [None]:
# Download the raw transactions table (one row per transaction).
path = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
# NOTE(review): compression is passed explicitly, presumably because the URL ends
# in a query string rather than ".gz" — confirm.
source_data = pd.read_csv(path, compression="gzip")
source_data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [None]:
def trx_transform(df):
    """Add derived time columns to the transactions frame and return it.

    Two columns are written onto ``df`` itself (the input is mutated):

    * ``event_time`` - ``trans_date`` cast to float;
    * ``weekday`` - ``trans_date`` modulo 7.

    Returns the same ``df`` object that was passed in.
    """
    days = df["trans_date"]
    df["event_time"] = days.astype(float)
    df["weekday"] = days % 7
    return df

# trx_transform adds the columns to the frame it receives and returns that same frame.
source_data = trx_transform(source_data)

In [None]:
# Preprocessor configuration: which columns identify the client, carry the event
# time, and are treated as categorical / numerical features.
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    # trans_date is already numeric, so no event-time conversion is requested.
    event_time_transformation="none",
    cols_category=["small_group", "weekday"],
    # NOTE(review): presumably encodes categories by frequency rank — confirm in ptls docs.
    category_transformation="frequency",
    cols_numerical=["amount_rur"],
    return_records=True,
)

In [None]:
# Fit the preprocessor on the transactions and transform them in one step.
dataset = preprocessor.fit_transform(source_data)
# Order the records by client_id; presumably this keeps the seeded splits
# further down reproducible — verify.
dataset = sorted(dataset, key=lambda x: x["client_id"])

In [None]:
# NOTE(review): this split is repeated with the same arguments after the targets
# are attached, and `train` / `test` are rebound there — this cell looks redundant.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

In [None]:
path = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
# Load the labels, index them by client and expose the "bins" column as "target".
df_target = (
    pd.read_csv(path)
    .set_index("client_id")
    .rename(columns={"bins": "target"})
)
df_target

Unnamed: 0_level_0,target
client_id,Unnamed: 1_level_1
24662,2
1046,0
34089,2
34848,1
47076,3
...,...
14303,1
22301,2
25731,0
16820,3


In [None]:
# Attach each client's label to its record, looked up by client_id.
target_column = df_target["target"]
for record in dataset:
    record["target"] = target_column[record["client_id"]]

# Show the fields of one record as a sanity check.
print(dataset[0].keys())

dict_keys(['client_id', 'trans_date', 'event_time', 'small_group', 'weekday', 'amount_rur', 'target'])


In [None]:
# Hold out 20% of the records for testing ...
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
# ... then carve 10% of the remaining training part off for validation.
train, valid = train_test_split(train, test_size=0.1, random_state=42)

## FineTuning

### Load the SequenceEncoder obtained from `coles-emb.ipynb`

In [None]:
# Keyword arguments for TrxEncoder (unpacked when the sequence encoder is built).
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "identity"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
}

# GRU sequence encoder on top of the transaction encoder.
# NOTE(review): the architecture presumably has to match the one saved by
# coles-emb.ipynb for the checkpoint to load — confirm.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type="gru",
)

# Restore the pretrained encoder weights.
# map_location="cpu" deserializes the checkpoint onto the CPU, so loading also
# works on runtimes without a GPU (the trainer below falls back to CPU there);
# load_state_dict then copies the values into seq_encoder's own parameters.
seq_encoder.load_state_dict(torch.load("/content/coles-emb.pt", map_location="cpu"))

<All keys matched successfully>

### Model

In [None]:
# Fine-tuning model: the pretrained sequence encoder followed by a 4-class head.
downstream_model = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(
        input_size=seq_encoder.embedding_size,
        use_batch_norm=True,
        objective="classification",
        num_classes=4,
    ),
    # NOTE(review): NLLLoss expects log-probabilities, so this presumes the
    # classification head emits them — confirm.
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task="multiclass", num_classes=4),
    # presumably a separate learning rate for the pretrained encoder, as opposed
    # to lr=0.02 given to the optimizer below — verify against ptls docs
    pretrained_lr=0.0001,
    optimizer_partial=partial(torch.optim.Adam, lr=0.02),
    # StepLR multiplies the learning rate by gamma every step_size scheduler steps.
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.2),
)

### Data module

In [None]:
def get_dataset(data):
    """Build the supervised dataset for one split.

    ``data`` is wrapped in a ``MemoryMapDataset``, which is then handed to
    ``SeqToTargetDataset`` with the label read from the ``"target"`` field.
    """
    records = MemoryMapDataset(data=data)
    return SeqToTargetDataset(records, target_col_name="target")

# Data module bundling the three splits with their loader settings.
finetune_dm = PtlsDataModule(
    train_data=get_dataset(train),
    valid_data=get_dataset(valid),
    test_data=get_dataset(test),
    train_num_workers=4,
    train_batch_size=256,
    # Only the train and test batch sizes are set explicitly here.
    test_batch_size=128,
)

### Trainer FineTuning

In [None]:
# Trainer for the fine-tuning run: 4 epochs, no progress bar.
trainer_ft = pl.Trainer(
    max_epochs=4,
    # Use the GPU when one is visible to torch, otherwise run on the CPU.
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


### Training FineTuning

In [None]:
# Fine-tune the model on the data module's splits.
trainer_ft.fit(downstream_model, finetune_dm)
# Print the metrics the trainer has logged once fitting is done.
print(trainer_ft.logged_metrics)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type          | Params | Mode 
--------------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 240 K  | train
1 | head          | Head          | 1.5 K  | train
2 | loss          | NLLLoss       | 0      | train
3 | train_metrics | ModuleDict    | 0      | train
4 | valid_metrics | ModuleDict    | 0      | train
5 | test_metrics  | ModuleDict    | 0      | train
--------------------------------------------------------
242 K     Trainable params
0         Non-trainable params
242 K     Total params
0.968     Total estimated model params size (MB)
27        Modules in train mode
0         Modules in eval mode
  self.pid = os.fork()
  self.pid = os.fork()
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


{'loss': tensor(0.9243), 'seq_len': tensor(874.9375), 'y': tensor(1.4688), 'val_loss': tensor(0.9168), 'valid/MulticlassAccuracy': tensor(0.5850)}


### Testing

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
trainer_ft.test(
    downstream_model,
    dataloaders=finetune_dm.test_dataloader()
)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[{'test/MulticlassAccuracy': 0.6111666560173035}]