# <font size="7">Libs</font>

In [83]:
import torch

from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
print('torch version (<2): ', torch.__version__)

import ptls
from ptls.frames import PtlsDataModule
from ptls.frames.coles import losses, sampling_strategies
from ptls.frames.coles import split_strategy

from ptls.frames.inference_module import InferenceModule

from ptls.nn.seq_encoder import agg_feature_seq_encoder
from ptls.nn import RnnSeqEncoder, TrxEncoder, Head
from ptls.nn.seq_encoder.agg_feature_seq_encoder import AggFeatureSeqEncoder

from ptls.data_load.datasets import AugmentationDataset, MemoryMapDataset
from ptls.data_load.augmentations import AllTimeShuffle, DropoutTrx
from ptls.data_load.datasets.parquet_dataset import ParquetDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.data_load.datasets import parquet_file_scan
from ptls.data_load.datasets import ParquetDataset, ParquetFiles, AugmentationDataset
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.data_load.augmentations import DropoutTrx
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.utils import collate_feature_dict

from functools import partial

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
import hydra
import seaborn as sns
# import umap

torch version (<2):  1.12.1+cu102


In [81]:
# Notebook-wide plot defaults: reset matplotlib to its stock style, then apply
# the seaborn theme with the 'white' axes style and an 8x6 inch figure size.
plt.style.use('default')
sns.set(style='white', rc={'figure.figsize': (8, 6)})
# NOTE(review): no axes exist at this point, so this call only instantiates an
# empty figure (see the `<Figure ... with 0 Axes>` cell output) — confirm intent.
sns.despine()

<Figure size 800x600 with 0 Axes>

# <font size="5">Data Module</font>

In [82]:
# Parquet file that both the train and the validation dataset are pointed at.
train_parquet_path = 'src/ptls-experiments/scenario_rosbank/data/train_trx.parquet'

# Experiment-level config and the dataset description it is combined with.
conf = OmegaConf.load('src/ptls-experiments/scenario_rosbank/conf/mles_params.yaml')
data_conf = OmegaConf.load('src/ptls-experiments/scenario_rosbank/conf/dataset_unsupervised/parquet.yaml')

# The train entry nests its file list one `data` level deeper than the valid entry.
data_conf['train']['data']['data']['data_files']['file_path'] = train_parquet_path
data_conf['valid']['data']['data_files']['file_path'] = train_parquet_path

# Plug the patched dataset descriptions into the data-module config and build it.
module_conf = conf['data_module']
module_conf['train_data']['data'] = data_conf['train']
module_conf['valid_data']['data'] = data_conf['valid']

data_module = hydra.utils.instantiate(module_conf)

# <font size="7">Model</font>

# <font size="6">Loss Block</font>

In [21]:
import torch
from torch import nn as nn
from torch.nn import functional as F

class L2NormEncoder(nn.Module):
    """Scale vectors along the last dimension to (approximately) unit L2 norm.

    Parameters
    ----------
    eps : float
        Added to the squared norm before the square root so that all-zero
        vectors do not cause a division by zero.
    """

    def __init__(self, eps=1e-9):
        super().__init__()
        self.eps = eps

    def forward(self, x: torch.Tensor):
        """Return `x` divided by sqrt(sum(x**2, dim=-1) + eps); shape is unchanged."""
        squared_norm = x.pow(2).sum(dim=-1, keepdim=True)
        denominator = (squared_norm + self.eps).pow(0.5)
        return x / denominator

class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss evaluated on L2-normalised embeddings.

    Parameters
    ----------
    margin : float
        Negative pairs closer than this distance are penalised.
    sampling_strategy : object
        Must expose `get_pairs(embeddings, target)` returning two index
        tensors `(positive_pairs, negative_pairs)`; each is indexed as
        `[:, 0]` / `[:, 1]` to pick the two members of every pair.
    """

    def __init__(self, margin, sampling_strategy):
        super().__init__()
        self.margin = margin
        self.pair_selector = sampling_strategy
        self.norm = L2NormEncoder()

    def forward(self, embeddings, target):
        """Return the summed (not averaged) loss over all selected pairs."""
        normed = self.norm(embeddings)

        # Pair mining is done on the normalised embeddings.
        pos_idx, neg_idx = self.pair_selector.get_pairs(normed, target)

        pos_dist = F.pairwise_distance(normed[pos_idx[:, 0]], normed[pos_idx[:, 1]])
        neg_dist = F.pairwise_distance(normed[neg_idx[:, 0]], normed[neg_idx[:, 1]])

        # Positives: squared distance. Negatives: squared hinge on (margin - distance).
        positive_term = pos_dist.pow(2)
        negative_term = F.relu(self.margin - neg_dist).pow(2)

        return torch.cat([positive_term, negative_term], dim=0).sum()

In [22]:
class VICRegLoss(nn.Module):
    """Two regularisation terms relating embeddings to aggregate features.

    `forward` returns a pair `(cov_loss, std_loss)`:
    * `cov_loss` - sum of squared entries of the product between the
      L2-normalised, transposed aggregate features and the embeddings,
      divided by `embeddings.shape[0]`;
    * `std_loss` - mean hinge `relu(1 - std)` over the per-column standard
      deviation of the embeddings.
    """

    def __init__(self):
        super().__init__()

        # Categorical features and their dictionary sizes for the aggregate encoder.
        agg_embeddings = {
            "mcc": {"in": 100},
            "channel_type": {"in": 4},
            "currency": {"in": 4},
            "trx_category": {"in": 10},
        }
        self._agg_encoder = AggFeatureSeqEncoder(
            embeddings=agg_embeddings,
            numeric_values={'amount': 'identity'},
            was_logified=True,
            log_scale_factor=1,
        )

        # NOTE(review): this layer is never called in `forward`; it only
        # contributes (unused) affine parameters to the module — confirm whether
        # it should be applied to the aggregate features or removed.
        self.instanceNormAggs = nn.InstanceNorm1d(362, affine=True)

        self.norm = L2NormEncoder()

    def forward(self, embeddings, aggs):
        """Compute `(cov_loss, std_loss)` for a batch.

        `aggs` is the raw input batch handed to the aggregate-feature encoder.
        """
        # assumes the encoder output is (batch, n_agg_features) — TODO confirm
        agg_features = self._agg_encoder(aggs)
        # After the transpose, normalisation runs along the last (originally first) axis.
        agg_features = self.norm(agg_features.T)

        n_rows = embeddings.shape[0]
        cross_product = (agg_features @ embeddings) / n_rows
        cov_loss = cross_product.pow_(2).sum()

        # 0.0001 keeps the square root finite when a column has zero variance.
        std_embeddings = (embeddings.var(dim=0) + 0.0001).sqrt()
        std_loss = F.relu(1 - std_embeddings).mean()

        return (cov_loss, std_loss)

In [23]:
class Loss(nn.Module):
    """Weighted sum of a contrastive loss and the two terms of a VICReg-style loss.

    Parameters
    ----------
    contrastiveLoss : nn.Module
        Called as `contrastiveLoss(embeddings, target)`; returns a scalar loss.
    vicregLoss : nn.Module
        Called as `vicregLoss(embeddings, aggs)`; returns `(cov_loss, std_loss)`.
    con_weight, cov_weight, std_weight : float
        Multipliers of the three terms in the total loss. The defaults
        reproduce the previously hard-coded 0.55 / 1 / 1 weighting.
    """

    def __init__(self, contrastiveLoss, vicregLoss,
                 con_weight=0.55, cov_weight=1.0, std_weight=1.0):
        super().__init__()

        self.contrastiveLoss = contrastiveLoss
        self.vicregLoss = vicregLoss

        self.con_weight = con_weight
        self.cov_weight = cov_weight
        self.std_weight = std_weight

    def forward(self, embeddings, target, aggs):
        """Return `((con_loss, cov_loss, std_loss), total_loss)`."""
        cov_loss, std_loss = self.vicregLoss(embeddings, aggs)
        # Invoke the module itself rather than `.forward` so that any hooks
        # registered on the contrastive loss are executed.
        con_loss = self.contrastiveLoss(embeddings, target)

        total_loss = (self.con_weight * con_loss
                      + self.cov_weight * cov_loss
                      + self.std_weight * std_loss)
        return ((con_loss, cov_loss, std_loss), total_loss)

# <font size="6">ABS Module</font>

In [24]:
import torch
import pytorch_lightning as pl
from ptls.data_load.padded_batch import PaddedBatch


class ABSModule(pl.LightningModule):
    """Base Lightning module tying together a sequence encoder, a loss and a metric.

    Subclasses must implement `metric_name`, `is_requires_reduced_sequence`
    and `shared_step`.
    """

    @property
    def metric_name(self):
        """Name under which the validation metric is logged (and monitored)."""
        raise NotImplementedError()

    @property
    def is_requires_reduced_sequence(self):
        """Value assigned to `seq_encoder.is_reduce_sequence` at construction time."""
        raise NotImplementedError()

    def shared_step(self, x, y):
        """Run the model on one batch.

        Args:
            x: model input taken from the batch.
            y: target taken from the batch.

        Returns: y_h, y

        """
        raise NotImplementedError()

    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       loss=None,
                       optimizer_partial=None,
                       lr_scheduler_partial=None):
        """
        Parameters
        ----------
        validation_metric :
            Metric object; called as `metric(y_h, y)` on every validation batch,
            then `compute()` and `reset()` at the end of the validation epoch.
        seq_encoder : torch.nn.Module
            Sequence encoder. Required despite the `None` default: its
            `is_reduce_sequence` attribute is set here.
        loss :
            Called in `training_step` as `loss(y_h, y, x)` and expected to return
            `((con_loss, cov_loss, std_loss), total_loss)`.
        optimizer_partial : callable
            Called with the module parameters; returns the optimizer.
        lr_scheduler_partial : callable or None
            Called with the optimizer; returns the LR scheduler. When `None`,
            training runs without a scheduler.
        """
        super().__init__()
        # self.save_hyperparameters()

        self._loss = loss
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = self.is_requires_reduced_sequence
        self._validation_metric = validation_metric

        self._optimizer_partial = optimizer_partial
        self._lr_scheduler_partial = lr_scheduler_partial

    @property
    def seq_encoder(self):
        """The wrapped sequence encoder."""
        return self._seq_encoder

    def forward(self, x):
        """Encode `x` with the sequence encoder."""
        return self._seq_encoder(x)

    def training_step(self, batch, _):
        """Compute and log the loss terms for one batch; return the total loss.

        Raises
        ------
        AssertionError
            If `batch` is not a tuple.
        """
        # Validate up front, before the batch is unpacked and used.
        if not isinstance(batch, tuple):
            raise AssertionError('batch is not a tuple')

        y_h, y = self.shared_step(*batch)

        # The loss additionally receives the raw model input (first batch element).
        x = batch[0]
        (con_loss, cov_loss, std_loss), loss = self._loss(y_h, y, x)

        self.log('con_loss', con_loss)
        self.log('cov_loss', cov_loss)
        self.log('std_loss', std_loss)
        self.log('loss', loss)

        if isinstance(x, PaddedBatch):
            self.log('seq_len', x.seq_lens.float().mean(), prog_bar=True)
        return loss

    def validation_step(self, batch, _):
        """Update the validation metric with the predictions for one batch."""
        y_h, y = self.shared_step(*batch)
        self._validation_metric(y_h, y)

    def validation_epoch_end(self, outputs):
        """Log the accumulated validation metric and reset it for the next epoch."""
        self.log(self.metric_name, self._validation_metric.compute(), prog_bar=True)
        self._validation_metric.reset()

    def configure_optimizers(self):
        """Build the optimizer and, if a scheduler factory was given, the scheduler."""
        optimizer = self._optimizer_partial(self.parameters())

        # `lr_scheduler_partial` defaults to None: train with the optimizer only.
        if self._lr_scheduler_partial is None:
            return optimizer

        scheduler = self._lr_scheduler_partial(optimizer)

        # ReduceLROnPlateau needs to know which logged value to watch.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler = {
                'scheduler': scheduler,
                'monitor': self.metric_name,
            }
        return [optimizer], [scheduler]

# <font size="6">CoLES Module</font>

In [25]:
from ptls.frames.coles.metric import BatchRecallTopK
from ptls.frames.coles.sampling_strategies import HardNegativePairSelector
from ptls.nn.head import Head
from ptls.nn.seq_encoder.containers import SeqEncoderContainer


class CoLESModule(ABSModule):
    """Contrastive sequence-embedding module built on `ABSModule`.

    Parameters
    ----------
    seq_encoder : SeqEncoderContainer
        Sequence encoder producing one embedding per sequence.
    head : torch.nn.Module, optional
        Applied to the encoder output. Defaults to `Head(use_norm_encoder=True)`.
    loss : torch.nn.Module, optional
        Must follow the contract used by `ABSModule.training_step`: called as
        `loss(embeddings, target, x)` and returning
        `((con_loss, cov_loss, std_loss), total_loss)`. Defaults to the combined
        `Loss(ContrastiveLoss(...), VICRegLoss())`.
    validation_metric : optional
        Defaults to `BatchRecallTopK(K=4, metric='cosine')`.
    optimizer_partial, lr_scheduler_partial : callable
        Factories forwarded to `ABSModule`.
    """

    def __init__(self,
                 seq_encoder: SeqEncoderContainer = None,
                 head=None,
                 loss=None,
                 validation_metric=None,
                 optimizer_partial=None,
                 lr_scheduler_partial=None):

        if head is None:
            head = Head(use_norm_encoder=True)

        if loss is None:
            # A bare ContrastiveLoss takes two arguments and returns one tensor,
            # which does not match what training_step calls and unpacks; wrap it
            # in the combined loss so the default is actually trainable.
            loss = Loss(
                ContrastiveLoss(margin=0.5,
                                sampling_strategy=HardNegativePairSelector(neg_count=5)),
                VICRegLoss(),
            )

        if validation_metric is None:
            validation_metric = BatchRecallTopK(K=4, metric='cosine')

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial
                        )

        # Assigned after super().__init__() so the head is registered as a submodule.
        self._head = head

    @property
    def metric_name(self):
        """Name under which the validation metric is logged."""
        return 'recall_top_k'

    @property
    def is_requires_reduced_sequence(self):
        """The encoder is asked to reduce each sequence (flag set to True)."""
        return True

    def shared_step(self, x, y):
        """Encode `x`, apply the head if present, and return `(y_h, y)`."""
        y_h = self(x)
        if self._head is not None:
            y_h = self._head(y_h)
        return y_h, y

# <font size="6">Model instantiation and training</font>


In [26]:
# Components are built in the same order as they appear in the CoLESModule call
# so that parameter initialisation consumes the random state identically.

# Validation metric.
recall_metric = ptls.frames.coles.metric.BatchRecallTopK(K=4, metric="cosine")

# Per-transaction encoder: categorical embeddings plus the raw `amount` value.
trx_encoder = TrxEncoder(
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.0003,
    embeddings={
        "mcc": {"in": 100, "out": 24},
        "channel_type": {"in": 4, "out": 4},
        "currency": {"in": 4, "out": 4},
        "trx_category": {"in": 10, "out": 4},
    },
    numeric_values={
        'amount': 'identity',
    },
)

# Unidirectional LSTM over the encoded transactions.
rnn_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    type="lstm",
    hidden_size=1024,
    bidir=False,
    trainable_starter="static",
)

# Head without its own normalisation; input size matches the LSTM hidden size.
projection_head = Head(
    use_norm_encoder=False,
    input_size=1024,
)

# Contrastive term combined with the VICReg-style terms.
combined_loss = Loss(
    ContrastiveLoss(
        margin=0.5,
        sampling_strategy=sampling_strategies.HardNegativePairSelector(neg_count=5),
    ),
    VICRegLoss(),
)

# Optimizer / scheduler factories; the module calls them in configure_optimizers.
adam_factory = partial(torch.optim.Adam, lr=0.004, weight_decay=0.0)
step_lr_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9025)

model = CoLESModule(
    validation_metric=recall_metric,
    seq_encoder=rnn_encoder,
    head=projection_head,
    loss=combined_loss,
    optimizer_partial=adam_factory,
    lr_scheduler_partial=step_lr_factory,
)

In [27]:
# Trained models, one entry per loop iteration.
# NOTE(review): every iteration fits and appends the same `model` object, so
# raising the range above 1 would keep training one model rather than produce
# independent runs — confirm before increasing it.
models = []
for i in range(1):
    # One TensorBoard run per iteration, distinguished by the step index in its name.
    logger = TensorBoardLogger(
        save_dir='src/ptls-experiments/scenario_rosbank/lightning_logs',
        name=f'CoLES VICReg l2Norm, hidden=1024, step={i}',
    )

    # Single-GPU run, 60 epochs, no sanity-check validation and no checkpoints.
    trainer = pl.Trainer(
        max_epochs=60,
        gpus=1,
        auto_select_gpus=False,
        logger=logger,
        num_sanity_val_steps=0,
        enable_checkpointing=False,
        deterministic=True,
    )
    trainer.fit(model, data_module)

    # Last logged values of the loss terms and the validation metric.
    print(trainer.logged_metrics)

    models.append(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | Loss            | 724   
1 | _seq_encoder       | RnnSeqEncoder   | 4.4 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.433    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

{'con_loss': 15.092844009399414, 'cov_loss': 2.5963964462280273, 'std_loss': 0.9734358191490173, 'loss': 11.870896339416504, 'seq_len': 50.16666793823242, 'recall_top_k': 0.8799718022346497}


# <font size="6">Inference</font>

In [30]:
# The inference data does not depend on the loop index, so build it once.
# Both the train and the test parquet files are read, giving one embedding
# table that covers every client.
iterable_inference_dataset = ParquetDataset(
    data_files=ParquetFiles(['src/ptls-experiments/scenario_rosbank/data/train_trx.parquet',
                             'src/ptls-experiments/scenario_rosbank/data/test_trx.parquet'],
                            ).data_files,
    # presumably keeps these two fields next to the sequence features so they
    # appear in the output frame — verify against FeatureFilter
    i_filters=[FeatureFilter(['target_flag', 'cl_id'])],
)

inference_dl = torch.utils.data.DataLoader(
    dataset=iterable_inference_dataset,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

for i in range(1):
    # Wrap the trained model so predictions come back as pandas frames with
    # embedding columns prefixed by 'emb'.
    mod = InferenceModule(models[i], pandas_output=True, model_out_name='emb')

    pred = pl.Trainer(gpus=1).predict(mod, inference_dl)

    # One frame per batch -> a single table; the target column is not exported.
    embeddings_train_test = pd.concat(pred, axis=0)
    embeddings_train_test = embeddings_train_test.drop(columns='target_flag')

    embeddings_train_test['cl_id'] = embeddings_train_test['cl_id'].astype('int64')
    embeddings_train_test.to_csv(f'src/ptls-experiments/scenario_rosbank/data/norm_embeddings{i}.csv',
                                 index=False)

    print(f'iter: {i}')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

iter: 0


In [31]:
# Display the embedding table produced by the inference cell (train + test files).
embeddings_train_test

Unnamed: 0,cl_id,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,10003,-0.001731,-1.431841e-07,0.000211,0.000141,-0.000226,-0.226500,0.000022,0.000157,0.185809,...,0.000593,0.001785,0.001677,0.015794,-0.015411,0.001391,0.008914,-0.000612,-0.000997,0.001858
1,10010,0.002428,6.418203e-05,0.000138,0.002095,-0.000329,-0.849132,0.000084,-0.000014,0.253986,...,0.003688,-0.003765,0.000018,0.013684,-0.003174,-0.000692,-0.007369,-0.000346,-0.000337,0.178840
2,10011,0.000059,2.117831e-05,0.000071,0.001216,-0.000091,-0.151976,0.000113,0.000028,0.221929,...,0.004092,-0.000260,0.000517,0.009177,-0.049346,0.000150,0.001325,-0.000213,-0.000085,0.100845
3,10059,0.005948,2.706527e-04,0.000318,-0.000533,0.000541,0.012461,0.014339,-0.000130,0.082362,...,0.004031,-0.001820,0.003598,0.006443,-0.047055,-0.003576,0.007214,-0.000818,0.000883,0.134276
4,10071,0.001036,3.852190e-05,-0.000116,0.000828,-0.000190,-0.359909,0.000026,0.000160,0.122689,...,0.010642,0.001145,0.000165,0.009871,-0.040940,-0.000459,0.000879,-0.000597,-0.000314,0.522061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,5712,0.001882,6.773246e-05,-0.000147,0.000326,-0.000180,0.252845,0.000034,-0.000014,0.077719,...,0.002844,0.000137,0.000384,0.012474,-0.006680,-0.000201,0.012806,0.000126,-0.000153,-0.064928
101,7975,0.003234,1.852722e-04,0.001260,0.001907,-0.000263,-0.043849,0.000233,-0.000119,0.088815,...,0.008077,0.000226,0.000811,0.005572,-0.070660,-0.000940,0.002663,0.000113,-0.000045,0.617823
102,8507,-0.000006,4.133244e-06,0.000033,-0.000081,-0.000060,0.121561,0.000076,-0.000003,0.175056,...,0.009895,-0.003344,0.000480,0.010507,-0.072358,-0.000196,-0.000405,-0.000138,-0.000061,0.164011
103,8928,0.000307,1.383075e-05,-0.000068,0.000329,-0.000062,-0.084627,0.000021,0.000069,0.130049,...,0.011867,0.005993,0.000682,0.005578,-0.041317,-0.000251,-0.001775,-0.000395,-0.000176,-0.020858


In [33]:
# Load the aggregate-feature table from CSV.
# NOTE(review): presumably pre-computed hand-crafted aggregates per client — confirm
# where this file is generated.
aggs = pd.read_csv('src/ptls-experiments/scenario_rosbank/data/agg_data.csv')

In [36]:
!pip install seaborn
import seaborn as sns

Collecting seaborn
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2


In [38]:
# Display the aggregate-feature table loaded from agg_data.csv.
aggs

Unnamed: 0,cl_id,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,...,emb_0352,emb_0353,emb_0354,emb_0355,emb_0356,emb_0357,emb_0358,emb_0359,emb_0360,emb_0361
0,10003,67.0,351750.900,5250.01370,19958.3850,0.0,0.0,33.0,0.0,1.0,...,0.000,0.0000,0.000,0.00000,0.0,0.0000,24.0,1.0,1.0,1.0
1,10010,109.0,53243.530,488.47278,1487.9280,0.0,0.0,65.0,0.0,2.0,...,0.000,0.0000,0.000,0.00000,0.0,0.0000,20.0,1.0,1.0,1.0
2,10011,133.0,398903.500,2999.27440,8413.3030,0.0,0.0,29.0,5.0,2.0,...,28595.576,0.0000,0.000,0.00000,0.0,19534.5300,25.0,1.0,1.0,3.0
3,10059,12.0,51841.008,4320.08400,5625.0950,0.0,0.0,1.0,0.0,1.0,...,0.000,0.0000,0.000,0.00000,0.0,0.0000,8.0,1.0,1.0,2.0
4,10071,113.0,441330.940,3905.58350,7387.3780,0.0,0.0,17.0,20.0,4.0,...,0.000,0.0000,0.000,0.00000,0.0,212.1320,24.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10212,5712,120.0,975464.400,8128.86960,18873.9860,0.0,0.0,26.0,6.0,5.0,...,19849.432,0.0000,14912.876,0.00000,36557.6,0.0000,17.0,1.0,2.0,6.0
10213,7975,52.0,295692.560,5686.39550,10053.2310,0.0,0.0,3.0,8.0,6.0,...,0.000,0.0000,0.000,0.00000,0.0,0.0000,18.0,1.0,1.0,5.0
10214,8507,206.0,1530374.400,7429.00200,20584.1500,0.0,0.0,59.0,41.0,6.0,...,16464.836,0.0000,0.000,0.00000,0.0,0.0000,28.0,1.0,1.0,5.0
10215,8928,84.0,1472858.900,17534.03500,35396.9100,0.0,0.0,12.0,18.0,4.0,...,57744.953,0.0000,0.000,0.00000,0.0,173.9497,20.0,1.0,1.0,6.0


In [82]:
# Export the embedding table to a second CSV file (separate from the
# norm_embeddings{i}.csv written by the inference cell).
# The client id is cast to int64 first so it is written as an integer column.
cl_id_as_int = embeddings_train_test['cl_id'].astype('int64')
embeddings_train_test['cl_id'] = cl_id_as_int
embeddings_train_test.to_csv(
    'src/ptls-experiments/scenario_rosbank/data/embeddings.csv',
    index=False,
)