# Description

This notebook demonstrates the benefits of compressing PTLS models with quantisation techniques.


In [None]:
import os

import torch

# Overwrite PYTORCH_CUDA_ALLOC_CONF with an empty string for this process.
# NOTE(review): presumably meant to drop allocator options inherited from the
# shell; the variable is assigned after `import torch` — confirm that is early enough.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ""

# Record the PyTorch version used for this run.
print(torch.__version__)

def estimate_size(model: torch.nn.Module):
    """Return the memory footprint of ``model`` in bytes.

    Adds up ``numel() * element_size()`` over every tensor yielded by
    ``model.parameters()`` and ``model.buffers()``.

    Args:
        model: module whose parameters and buffers are measured.

    Returns:
        The byte count as a float (the integer total is divided by a unit
        divisor of 1).
    """
    unit_divisor = 1  # 1 keeps the result in bytes

    def tensor_bytes(tensor):
        return tensor.numel() * tensor.element_size()

    param_bytes = sum(tensor_bytes(param) for param in model.parameters())
    buffer_bytes = sum(tensor_bytes(buf) for buf in model.buffers())
    return (param_bytes + buffer_bytes) / unit_divisor

def gain(base, comp):
    """Return how much smaller ``comp`` is than ``base``, in percent.

    Both arguments are measured with ``estimate_size``; the result is
    ``(size(base) - size(comp)) / size(base) * 100``.

    Args:
        base: reference model.
        comp: model compared against the reference.

    Returns:
        Size reduction as a percentage of the size of ``base``.

    Raises:
        ZeroDivisionError: if ``estimate_size(base)`` is zero.
    """
    base_bytes = estimate_size(base)
    comp_bytes = estimate_size(comp)
    saved_bytes = base_bytes - comp_bytes
    return saved_bytes / base_bytes * 100

## Data Acquisition

We'll use the Age-Group Prediction dataset.

In [2]:
import torch

import numpy as np
import pandas as pd
import torchmetrics
import pytorch_lightning as pl

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from functools import partial
from ptls.frames import PtlsDataModule
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing.pandas.pandas_preprocessor import PandasDataPreprocessor
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [None]:
# Target table: its "bins" column drives the stratified splits and its
# "client_id" column selects each split's transactions further down.
df_target = pd.read_csv(
    "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
)

# Hold out 7000 rows for test, then 3000 of the remaining rows for validation.
df_target_train, df_target_test = train_test_split(
    df_target,
    test_size=7000,
    stratify=df_target["bins"],
    random_state=142,
)
df_target_train, df_target_valid = train_test_split(
    df_target_train,
    test_size=3000,
    stratify=df_target_train["bins"],
    random_state=142,
)
print(
    "Split {} records to train: {}, valid: {}, test: {}".format(
        len(df_target), len(df_target_train), len(df_target_valid), len(df_target_test)
    )
)

# Transaction table (gzip-compressed CSV).
df_trx = pd.read_csv(
    "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true",
    compression="gzip",
)

# An inner join on "client_id" keeps only the transactions whose client is in the split.
df_trx_train = df_trx.merge(df_target_train["client_id"], on="client_id", how="inner")
df_trx_valid = df_trx.merge(df_target_valid["client_id"], on="client_id", how="inner")
df_trx_test = df_trx.merge(df_target_test["client_id"], on="client_id", how="inner")
print(
    "Split {} transactions to train: {}, valid: {}, test: {}".format(
        len(df_trx), len(df_trx_train), len(df_trx_valid), len(df_trx_test)
    )
)

In [None]:
# Preprocessor keyed by "client_id" and ordered by "trans_date", with one
# categorical feature ("small_group") and one numerical feature ("amount_rur").
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

# Fit on the training transactions only; validation and test reuse the fitted preprocessor.
df_data_train = preprocessor.fit_transform(df_trx_train)
df_data_valid, df_data_test = (
    preprocessor.transform(split_trx) for split_trx in (df_trx_valid, df_trx_test)
)

print(
    "Record in dataset, train {}, valid {}, test {}".format(
        len(df_data_train), len(df_data_valid), len(df_data_test)
    )
)
print("Each record is a client with list of transactions")

# Rename the label column to "target_bin", the name passed as `target_col_name`
# when the supervised datasets are built.
df_target = df_target.rename(columns={"bins": "target_bin"})

In [None]:
# Attach the target columns to every split by "client_id", then convert each
# frame into a list of per-row dicts.
# NOTE: this rebinds df_data_* from DataFrames to lists, so the cell cannot be
# re-run on its own; re-run the preprocessing cell first.
df_data_train = pd.merge(df_data_train, df_target, on="client_id").to_dict(orient="records")
df_data_valid = pd.merge(df_data_valid, df_target, on="client_id").to_dict(orient="records")
df_data_test = pd.merge(df_data_test, df_target, on="client_id").to_dict(orient="records")

# Wrap each list of records in a MemoryMapDataset.
dataset_train, dataset_valid, dataset_test = map(
    MemoryMapDataset, (df_data_train, df_data_valid, df_data_test)
)

In [6]:
from pytorch_lightning import LightningDataModule
# Every split reads its label from "target_bin" and casts it to torch.long.
_to_target_dataset = partial(
    SeqToTargetDataset, target_col_name="target_bin", target_dtype=torch.long
)

sup_data: LightningDataModule = PtlsDataModule(
    train_data=_to_target_dataset(dataset_train),
    valid_data=_to_target_dataset(dataset_valid),
    test_data=_to_target_dataset(dataset_test),
    train_batch_size=128,
    valid_batch_size=1024,
    train_num_workers=8,
)

In [7]:
# Keep one training batch around; it can be handy when inspecting a model's structure.
first_train_batch = next(iter(sup_data.train_dataloader()))

# Only the first element of the batch is kept.
# NOTE(review): presumably the input features, with the target in the other slot — confirm.
example_batch_coles = first_train_batch[0]

## Model instantiation

RNN-based model

In [8]:
# Transaction encoder: a 32-dim embedding for "small_group" plus the
# "log"-transformed "amount_rur" value.
rnn_trx_encoder = TrxEncoder(
    embeddings={"small_group": {"in": 150, "out": 32}},
    numeric_values={"amount_rur": "log"},
    embeddings_noise=0.001,
)

# LSTM sequence encoder on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(trx_encoder=rnn_trx_encoder, hidden_size=48, type='lstm')

# 4-class classification head sized to the encoder's embedding.
rnn_head = Head(
    input_size=seq_encoder.embedding_size,
    objective="classification",
    num_classes=4,
)

# Supervised module: NLL loss, multiclass accuracy, Adam with its default
# settings, and a StepLR schedule (step_size=4, gamma=0.5).
sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=rnn_head,
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task="multiclass", num_classes=4),
    optimizer_partial=partial(torch.optim.Adam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5),
)

Transformer-based model

In [None]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, TransformerSeqEncoder, Head

# Transaction encoder for the transformer model.
# NOTE(review): 'out' is 31 here versus 32 in the RNN cell — presumably chosen so
# that the encoder's total output width suits the transformer; confirm.
trx_encoder = TrxEncoder(
    embeddings={
        'small_group': {'in': 150, 'out': 31},
    },
    numeric_values={
        'amount_rur': 'log',
    },
    embeddings_noise=0.001,
)

# Transformer hyper-parameters, unpacked into TransformerSeqEncoder below.
transformer_params = {
    "n_heads": 1,
    "dim_hidden": 128,
    "n_layers": 4,
}

# NOTE: this rebinds `seq_encoder`; the RNN encoder was already passed to
# `sup_module` when that module was constructed.
seq_encoder = TransformerSeqEncoder(
    trx_encoder=trx_encoder,
    **transformer_params
)

# Same head, loss, metric, optimizer and scheduler setup as the RNN module.
# `task` is passed by keyword so the call does not depend on where `task` sits
# in the torchmetrics.Accuracy signature and matches the RNN cell.
transformer_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=4),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task='multiclass', num_classes=4),
    optimizer_partial=partial(torch.optim.Adam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5),
)

Compressor run

In [9]:
from ptls.fedcore_compression import (
    PTQ_1 as experimental_setup_dynamic, 
    fedcore_fit, 
    eval_computational_metrics
)


In [None]:
# Run the compressor on the RNN classifier: `sup_module` and `sup_data` are passed
# to `fedcore_fit` together with the PTQ_1 setup (imported as
# `experimental_setup_dynamic`). n_cls=4 matches the num_classes=4 used for the
# head and the accuracy metric.
# The returned object is read later through its `.original_model` and
# `.optimised_model` attributes.
fedcore_compressor_rnn = fedcore_fit(sup_module, sup_data, experimental_setup_dynamic, n_cls=4)

In [None]:
# Run the compressor on the transformer classifier with the same data module,
# the same PTQ_1 setup (`experimental_setup_dynamic`) and the same class count
# (n_cls=4) as the RNN run.
fedcore_compressor_transformer = fedcore_fit(transformer_module, sup_data, experimental_setup_dynamic, n_cls=4)

## Computational metrics evaluation

In [None]:
def computational_eval(fedcore_compressor, sup_data):
    """Collect computational metrics for the optimised and the original model.

    For each of the two models, ``eval_computational_metrics`` is called on a
    fresh ``sup_data.test_dataloader()`` with ``n_batches=1`` on the CPU, with
    ``'computational.txt'`` as its third argument and the model's tag as ``id``.

    Args:
        fedcore_compressor: object exposing ``optimised_model`` and
            ``original_model`` attributes.
        sup_data: data module providing ``test_dataloader()``.

    Returns:
        dict with keys ``'optimised'`` and ``'original'`` (in that order), each
        holding the value returned by ``eval_computational_metrics``.
    """
    cpu = torch.device('cpu')
    computational_results = {}
    # The optimised model is evaluated first, then the original one.
    for tag in ('optimised', 'original'):
        computational_results[tag] = eval_computational_metrics(
            getattr(fedcore_compressor, f'{tag}_model'),
            sup_data.test_dataloader(),
            'computational.txt',
            id=tag,
            n_batches=1,
            device=cpu,
        )
    return computational_results

In [None]:
# Computational metrics of the compressed RNN classifier
rnn_computational = computational_eval(
    fedcore_compressor=fedcore_compressor_rnn,
    sup_data=sup_data,
)

# Computational metrics of the compressed transformer classifier
transformer_computational = computational_eval(
    fedcore_compressor=fedcore_compressor_transformer,
    sup_data=sup_data,
)

## Quality evaluation

In [25]:
def quality_eval(fedcore_compressor, sup_data):
    """Run the Lightning test loop on the original and the optimised model.

    A single CPU ``pl.Trainer`` (progress bar disabled) tests both models, each
    on a fresh ``sup_data.test_dataloader()``.

    Args:
        fedcore_compressor: object exposing ``original_model`` and
            ``optimised_model`` attributes.
        sup_data: data module providing ``test_dataloader()``.

    Returns:
        dict with keys ``'original'`` and ``'optimised'``, each holding the
        value returned by ``trainer.test`` for that model.
    """
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="cpu",
        enable_progress_bar=False,
    )
    return {
        'original': trainer.test(
            fedcore_compressor.original_model,
            dataloaders=sup_data.test_dataloader(),
        ),
        # Read `optimised_model`, the attribute `computational_eval` uses,
        # rather than `optimised`.
        'optimised': trainer.test(
            fedcore_compressor.optimised_model,
            dataloaders=sup_data.test_dataloader(),
        ),
    }

In [None]:
# Test-set quality of the RNN classifier, before and after compression
rnn_quality = quality_eval(
    fedcore_compressor=fedcore_compressor_rnn,
    sup_data=sup_data,
)

# Test-set quality of the transformer classifier, before and after compression
transformer_quality = quality_eval(
    fedcore_compressor=fedcore_compressor_transformer,
    sup_data=sup_data,
)