Reference CPU: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics        3.80 GHz

In [1]:
import pandas as pd
import torch
from joblib import cpu_count

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from sklearn.model_selection import train_test_split
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.preprocessing.pandas.pandas_preprocessor import PandasDataPreprocessor
from pyinstrument import Profiler
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BATCH_SIZE = 48

In [3]:
profiler = Profiler()

def define_data():
    source_data = pd.read_csv(
        'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',
        compression='gzip')

    preprocessor = PandasDataPreprocessor(
        col_id='client_id',
        col_event_time='trans_date',
        event_time_transformation='none',
        cols_category=['small_group'],
        cols_numerical=['amount_rur'],
        return_records=True
    )
    return preprocessor, source_data


def define_model():
    trx_encoder_params = dict(embeddings_noise=0.003,
                              numeric_values={'amount_rur': 'identity'},
                              embeddings={'trans_date': {'in': 800, 'out': 16},
                                          'small_group': {'in': 250, 'out': 16}})

    seq_encoder = RnnSeqEncoder(trx_encoder=TrxEncoder(**trx_encoder_params), hidden_size=256, type='gru')
    model = CoLESModule(seq_encoder=seq_encoder, optimizer_partial=partial(torch.optim.Adam, lr=0.001),
                        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9))
    return model

In [4]:
if __name__ == "__main__":
    accelerator = "cuda" if torch.cuda.is_available() else "cpu"
    device = 1 if torch.cuda.is_available() else "auto"
    data_loader_workers = 1 if torch.cuda.is_available() else cpu_count()
    preprocessor, source_data = define_data()
    model = define_model()

    dataset = preprocessor.fit_transform(source_data)
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)
    
    len_filter = SeqLenFilter(min_seq_len=25)
    in_memory_dataset = MemoryMapDataset(data=train, i_filters=[len_filter])
    data_splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)
    coles_df = ColesDataset(data=in_memory_dataset, splitter=data_splitter)
    train_dl = PtlsDataModule(
        train_data=coles_df,
        train_num_workers=data_loader_workers,
        train_batch_size=BATCH_SIZE,
    )
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator=accelerator,
        devices=device,
        enable_progress_bar=True,
        # prepare_data_per_node=False,
        # replace_sampler_ddp=False,
        # sync_batchnorm=True
    )

Creating Dask Server
Link Dask Server - http://192.168.0.105:8787/status


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\1\ptls\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [5]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

c:\Users\1\ptls\.venv\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 240 K  | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.962     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
c:\Users\1\ptls\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initializat

logger.version = 145






Epoch 0: 100%|██████████| 500/500 [06:34<00:00,  1.27it/s, v_num=145, seq_len=109.0]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 500/500 [06:34<00:00,  1.27it/s, v_num=145, seq_len=109.0]
{'loss': tensor(74.2729), 'seq_len': tensor(108.9583)}


### Save sequence encoder for other experiments

In [6]:
torch.save(model.state_dict(), "coles-emb.pt")

In [5]:
model.load_state_dict(torch.load("coles-emb.pt"))

  model.load_state_dict(torch.load("coles-emb.pt"))


<All keys matched successfully>

## Inference 

In [6]:
from ptls.data_load.datasets import inference_data_loader
train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE)
batch = next(iter(train_dl))
out = model(batch)
out

tensor([[-0.9735,  0.9915, -0.3622,  ...,  0.9922,  0.0333,  0.5107],
        [-0.9772,  0.9890, -0.4413,  ...,  0.9930,  0.3060,  0.3452],
        [-0.9811,  0.9920, -0.3003,  ...,  0.9889,  0.4440,  0.5506],
        ...,
        [-0.9769,  0.9884, -0.5546,  ...,  0.9910,  0.1383,  0.6291],
        [-0.9868,  0.9934, -0.4172,  ...,  0.9950,  0.2357,  0.6615],
        [-0.9772,  0.9937, -0.6985,  ...,  0.9937,  0.2834,  0.4918]],
       grad_fn=<SliceBackward0>)

In [7]:
from ptls.frames.inference_module import InferenceModule

inf_model = InferenceModule(model, model_out_name="coles-emb.pt", pandas_output=False)

batch = next(iter(train_dl))
out = inf_model(batch)
out

{'coles-emb.pt': tensor([[-0.9735,  0.9915, -0.3598,  ...,  0.9921,  0.0340,  0.5108],
         [-0.9772,  0.9890, -0.4384,  ...,  0.9930,  0.3072,  0.3461],
         [-0.9811,  0.9920, -0.2973,  ...,  0.9889,  0.4447,  0.5513],
         ...,
         [-0.9768,  0.9884, -0.5521,  ...,  0.9910,  0.1389,  0.6295],
         [-0.9868,  0.9934, -0.4147,  ...,  0.9950,  0.2362,  0.6621],
         [-0.9772,  0.9937, -0.6976,  ...,  0.9937,  0.2840,  0.4926]],
        grad_fn=<SliceBackward0>)}

## ONNX Inference 

In [8]:
from sklearn.ensemble import RandomForestClassifier
from ptls.frames.inference_module import ONNXInferenceModule
from ptls.data_load.datasets import inference_data_loader

%load_ext pyinstrument

torch.set_float32_matmul_precision('high')

train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE, onnx=True)
test_dl = inference_data_loader(test, num_workers=0, batch_size=BATCH_SIZE, onnx=True)

onnx_model = ONNXInferenceModule(model, model_out_name="coles-emb.onnx", dl=train_dl).to(accelerator)
batch = next(iter(train_dl))
out = onnx_model(batch)
out

  assert x.payload.size()[1] > 0, "Batch can'not have 0 transactions"


tensor([[-0.9736,  0.9917, -0.3616,  ...,  0.9922,  0.0328,  0.5117],
        [-0.9771,  0.9888, -0.4402,  ...,  0.9932,  0.3062,  0.3467],
        [-0.9810,  0.9922, -0.2993,  ...,  0.9888,  0.4436,  0.5518],
        ...,
        [-0.9771,  0.9883, -0.5537,  ...,  0.9912,  0.1379,  0.6299],
        [-0.9868,  0.9932, -0.4163,  ...,  0.9951,  0.2357,  0.6626],
        [-0.9771,  0.9937, -0.6982,  ...,  0.9937,  0.2837,  0.4929]],
       dtype=torch.float16)

### Inference with pl.trainer

In [None]:
%%pyinstrument

train_embeds = torch.vstack(trainer.predict(onnx_model, train_dl, ))
test_embeds = torch.vstack(trainer.predict(onnx_model, test_dl))

In [12]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [13]:
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.5483333333333333

### Inference w/o pl.trainer (float32)

In [None]:
%%pyinstrument

train_embeds = torch.vstack(onnx_model.predict(train_dl, dtype=torch.float32))
test_embeds = torch.vstack(onnx_model.predict(test_dl, dtype=torch.float32))

In [10]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [None]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

### Inference w/o pl.trainer (float16)

In [None]:
%%pyinstrument

train_embeds = torch.vstack(onnx_model.predict(train_dl))
test_embeds = torch.vstack(onnx_model.predict(test_dl))

In [13]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [None]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

### Regular Inference

In [15]:
train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE)
test_dl = inference_data_loader(test, num_workers=0, batch_size=BATCH_SIZE)

In [None]:
%%pyinstrument

train_embeds = torch.vstack(trainer.predict(model, train_dl, ))
test_embeds = torch.vstack(trainer.predict(model, test_dl))

In [17]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [None]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)