In [1]:
from __future__ import annotations

import polars
import torch
import pytorch_lightning as pl
from tqdm import tqdm

from mts_ml_cup.preprocessing import _polars_map

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%time
df = polars.read_parquet("../data/processed/sessions.pq")

CPU times: user 54.3 s, sys: 14.3 s, total: 1min 8s
Wall time: 27.9 s


# Data preparation

In [3]:
%%time
urls = df["url_host"].unique()
urls_mapping = dict(zip(sorted(urls), range(1, len(urls) + 1)))
df = (
    df
    .sort(["user_id", "date", "part_of_day_id"])
    .join(
        other=_polars_map(
            mapping=urls_mapping,
            key_name="url_host",
            id_name="url_id",
            id_dtype=polars.Int32,
        ),
        how="left",
        on="url_host",
    )
)

CPU times: user 3min 13s, sys: 32.5 s, total: 3min 45s
Wall time: 56.7 s


In [4]:
print(
    f"{df['region_id'].n_unique() = }\n"
    f"{df['city_id'].n_unique() = }\n"
    f"{df['manufacturer_id'].n_unique() = }\n"
    f"{df['model_id'].n_unique() = }\n"
    f"{df['part_of_day_id'].n_unique() = }\n"
    f"{df['url_id'].n_unique() = }"
)

df['region_id'].n_unique() = 81
df['city_id'].n_unique() = 1000
df['manufacturer_id'].n_unique() = 31
df['model_id'].n_unique() = 603
df['part_of_day_id'].n_unique() = 4
df['url_id'].n_unique() = 199527


In [7]:
def dataset_for_ptls(df: polars.DataFrame) -> dict[str, torch.Tensor | float]:
    dataset = []
    for user_id, user_df in tqdm(df.groupby("user_id"), total=df["user_id"].n_unique()):
        user_encoded = {
            "user_id": user_id,
            "region_id": torch.from_numpy(user_df["region_id"].to_numpy()),
            "city_id": torch.from_numpy(user_df["city_id"].to_numpy().astype("int16")),
            "manufacturer_id": torch.from_numpy(user_df["manufacturer_id"].to_numpy()),
            "model_id": torch.from_numpy(user_df["model_id"].to_numpy().astype("int16")),
            "event_time": torch.from_numpy(user_df["date"].to_numpy().astype("datetime64[s]").astype("int64")),
            "part_of_day_id": torch.from_numpy(user_df["part_of_day_id"].to_numpy()),
            "url_id": torch.from_numpy(user_df["url_id"].to_numpy()),
            "request_cnt": torch.from_numpy(user_df["request_cnt"].to_numpy()),
        }
        dataset.append(user_encoded)
    return dataset

In [8]:
dataset = dataset_for_ptls(df)

100%|█████████████████████████████████| 415317/415317 [03:53<00:00, 1777.26it/s]


In [9]:
%%time
torch.save(dataset, "../data/ptls/dataset.pt")

CPU times: user 2min 22s, sys: 12.9 s, total: 2min 35s
Wall time: 2min 41s


In [10]:
! ls -lh ../data/ptls/

total 6.9G
-rw-rw-r-- 1 ababkin ababkin 6.9G Mar  1 19:30 dataset.pt


# Fit self-supervised

In [2]:
%%time
dataset = torch.load("../data/ptls/dataset.pt")

CPU times: user 2min 2s, sys: 18.5 s, total: 2min 20s
Wall time: 12min 38s


In [3]:
dataset[0].keys()

dict_keys(['user_id', 'region_id', 'city_id', 'manufacturer_id', 'model_id', 'event_time', 'part_of_day_id', 'url_id', 'request_cnt'])

In [4]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

In [5]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            "region_id": {"in": 81, "out": 4},
            "city_id": {"in": 1_000, "out": 8},
            "manufacturer_id": {"in": 31, "out": 4},
            "model_id": {"in": 603, "out": 8},
            "part_of_day_id": {"in": 4, "out": 2},
            "url_id": {"in": 199527, "out": 128},
        },
        embeddings_noise=0.003,
        numeric_values={
            "request_cnt": "log",
        },
        use_batch_norm_with_lens=True,
        orthogonal_init=True,
    ),
    hidden_size=256,
    type="gru",
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [6]:
sum(p.numel() for p in seq_encoder.parameters())

25870178

In [7]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=0,
    train_batch_size=256,
)

In [8]:
trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 25.9 M
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
25.9 M    Trainable params
0         Non-trainable params
25.9 M    Total params
103.481   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:  82%|▊| 1180/1443 [06:35<01:28,  2.99it/s, loss=71.3, v_num=5, seq_len=

In [11]:
print(trainer.logged_metrics)

{'loss': tensor(30.7693), 'seq_len': tensor(95.3654)}


In [15]:
torch.save(seq_encoder, "../data/ptls/seq-encoder-v1.pt")

In [18]:
from ptls.data_load.datasets import inference_data_loader

In [28]:
inference_dl = inference_data_loader(dataset, num_workers=0, batch_size=256)

In [22]:
model.eval();

In [29]:
coles_embs = trainer.predict(model, inference_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: : 1623it [06:28,  2.16s/it]   


In [34]:
%%time
coles_embs = torch.vstack(coles_embs)

CPU times: user 461 ms, sys: 232 ms, total: 693 ms
Wall time: 176 ms


In [36]:
torch.save(coles_embs, "../data/ptls/embeddings-v1.pt")

In [37]:
coles_embs.shape

torch.Size([415317, 256])

In [40]:
user_ids = [d["user_id"] for d in dataset]

In [41]:
import joblib as jbl
jbl.dump(user_ids, "../data/ptls/user_ids.jbl")

['../data/ptls/user_ids.jbl']