# Multicoles modules

In this notebook we will learn how to use multicoles modules. Instead of training one model they train two separete models with constraint on mutual information of built embeddings. There are two approaches:
There are two approaches:
1) Train two model sequentially. So that second model trained while first is frozen.
2) Train both model simultaneously.

## Sequential approach

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import ptls
import pytorch_lightning as pl
import ptls.frames
import ptls.frames.coles
from ptls.data_load.datasets import ParquetFiles, ParquetDataset, MemoryMapDataset
from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset, ColesIterableDataset
from ptls.frames.supervised import SeqToTargetIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.nn.normalization import L2NormEncoder
from ptls.frames.coles.losses import ContrastiveLoss, MultiContrastiveLoss, CLUBLoss
from ptls.frames.coles.sampling_strategies import HardNegativePairSelector
from ptls.frames.coles.metric import BatchRecallTopK, MultiBatchRecallTopK
from functools import partial
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

  warn(
2024-08-30 06:04:22.028569: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(


In [2]:
# load data

def get_synthetic_coles_datamodule():
    path = "./syndata/example_data/"

    train_files = ParquetFiles(os.path.join(path, "train"))
    train_dataset = ParquetDataset(train_files, shuffle_files=True)
    eval_files = ParquetFiles(os.path.join(path, "eval"))
    eval_dataset = ParquetDataset(eval_files)

    coles_datamodule = PtlsDataModule(
        train_data=ColesIterableDataset(
            train_dataset,
            splitter=SampleSlices(
                split_count=5,
                cnt_min=50,
                cnt_max=100,
            ),
        ),
        valid_data=ColesIterableDataset(
            eval_dataset,
            splitter=SampleSlices(
                split_count=5,
                cnt_min=50,
                cnt_max=100, ),
        ),
        train_num_workers=4,
        train_batch_size=512,
        valid_num_workers=4,
        valid_batch_size=512,
    )

    return coles_datamodule

data = get_synthetic_coles_datamodule()

In [3]:
# get coles module

def get_coles_module(trx_conf, input_size, hsize):
    pl_module = ptls.frames.coles.CoLESModule(
        validation_metric=BatchRecallTopK(K=4, metric='cosine'),
        seq_encoder=ptls.nn.RnnSeqEncoder(
            trx_encoder=ptls.nn.TrxEncoder(**trx_conf),
            input_size=input_size,
            type='gru',
            hidden_size=hsize,
            is_reduce_sequence=True
        ),
        head=ptls.nn.Head(use_norm_encoder=True),
        loss=ContrastiveLoss(
            margin=1.,
            sampling_strategy=HardNegativePairSelector(neg_count=5),
        ),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025)
    )
    return pl_module


trx_conf = {
    'embeddings_noise': 0.001,
    'embeddings': {
        'A': {'in': 64, 'out': 16},
        'B': {'in': 64, 'out': 16},
    },
}

input_size = 16 * 2
hsize = 32    # dimensionaliti of the encoder's hidden space

coles_model = get_coles_module(trx_conf, input_size, hsize)

In [4]:
# train it and save

trainer = pl.Trainer(gpus=[0], max_epochs=10, enable_progress_bar=True)
trainer.fit(coles_model, data)
torch.save(coles_model.seq_encoder.state_dict(), 'first_model.pth')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 8.4 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [5]:
# now lets test it

def get_synthetic_sup_datamodule():
    path = "./syndata/example_data/"
    
    train_files = ParquetFiles(os.path.join(path, "train"))
    train_dataset = ParquetDataset(train_files, shuffle_files=True)
    test_files = ParquetFiles(os.path.join(path, "eval"))
    test_dataset = ParquetDataset(test_files, shuffle_files=True)

    sup_datamodule = PtlsDataModule(
        train_data=SeqToTargetIterableDataset(train_dataset, target_col_name='class_label', target_dtype=torch.long),
        test_data=SeqToTargetIterableDataset(test_dataset, target_col_name='class_label', target_dtype=torch.long),
        train_batch_size=512,
        test_batch_size=512,
        train_num_workers=4,
        test_num_workers=4,
    )
    return sup_datamodule


def eval_dataloader(model, dl, device='cuda:0'):
    embs, yy = list(), list()
    model.to(device)
    model.eval()
    for batch in dl:
        x, y = batch
        yy.append(y.numpy())
        with torch.no_grad():
            embs.append(model(x.to(device)).cpu().numpy())
    return {'x': np.concatenate(embs, axis=0), 'y': np.concatenate(yy, axis=0)}
        


def eval_embeddings(coles_model, data):
    train_gbm_data = eval_dataloader(coles_model, data.train_dataloader())
    test_gbm_data = eval_dataloader(coles_model, data.test_dataloader())
    return train_gbm_data, test_gbm_data


def gbm(train_gbm_data, test_gbm_data):
    accs = list()
    for gbm_i in range(5):
        gbm_model = LGBMClassifier(**{
              'n_estimators': 50,
              'boosting_type': 'gbdt',
              'objective': 'binary',
              'learning_rate': 0.02,
              'subsample': 0.75,
              'subsample_freq': 1,
              'feature_fraction': 0.75,
              'colsample_bytree': None,
              'max_depth': 12,
              'lambda_l1': 1,
              'reg_alpha': None,
              'lambda_l2': 1,
              'reg_lambda': None,
              'min_data_in_leaf': 50,
              'min_child_samples': None,
              'num_leaves': 50,
              'random_state': 42+gbm_i,
              'n_jobs': 4,
        })
        
        gbm_model.fit(train_gbm_data['x'], train_gbm_data['y'])
        acc = roc_auc_score(test_gbm_data['y'], gbm_model.predict_proba(test_gbm_data['x'])[:, 1])
        accs.append(acc)
    mean, std = np.mean(accs), np.std(accs)
    print(f'mean roc_auc: {mean:.4f} std : {std:.4f}')


eval_datamodule = get_synthetic_sup_datamodule()
train_gbm_data, test_gbm_data = eval_embeddings(coles_model, eval_datamodule)
gbm(train_gbm_data, test_gbm_data)

[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-thre

In [6]:
# lets train second model now
# first of all we need some network to be our estimator for mutual information

class ResNet(torch.nn.Module):
    def __init__(self, h, use_layernorm=True, dropout=0):
        super().__init__()
        layers = list()
        for _ in range(2):
            layers.append(torch.nn.Linear(h, h))
            if use_layernorm:
                layers.append(torch.nn.LayerNorm(h))
            if dropout > 0:
                layers.append(torch.nn.Dropout(dropout))
            layers.append(torch.nn.ReLU())
        self.net = torch.nn.Sequential(*layers)

    def forward(self, inp):
        return self.net(inp) + inp


class ClfDisc(torch.nn.Module):
    def __init__(self, inp1=400, inp2=400, h=512, n_res_blocks=3, use_bn=True, use_l2_norm=True):
        super().__init__()
        layers_a = [torch.nn.Linear(inp1, h), torch.nn.ReLU()]
        layers_a.extend([ResNet(h) for _ in range(n_res_blocks)])
        if use_bn:
            layers_a.append(torch.nn.BatchNorm1d(h, affine=False))
        if use_l2_norm:
            layers_a.append(L2NormEncoder())
        self.a = torch.nn.Sequential(*layers_a)

        layers_b = [torch.nn.Linear(inp2, h), torch.nn.ReLU()]
        layers_b.extend([ResNet(h) for _ in range(n_res_blocks)])
        if use_bn:
            layers_b.append(torch.nn.BatchNorm1d(h, affine=False))
        if use_l2_norm:
            layers_b.append(L2NormEncoder())
        self.b = torch.nn.Sequential(*layers_b)

    def forward(self, domain_a, domain_b):
        a = self.a(domain_a)
        b = self.b(domain_b)
        return -(((a - b) ** 2).sum(axis=-1, keepdims=True))

In [7]:
# now we will define and train second model

def get_multicoles_module(trx_conf, input_size, embed_coef, hsize, clf_hsize, first_model_name):
    coles_loss = ContrastiveLoss(margin=1., sampling_strategy=HardNegativePairSelector(neg_count=5))
    club_loss = CLUBLoss()
    discriminator_model = ClfDisc(inp1=hsize, inp2=hsize, h=clf_hsize)

    seq_encoder = ptls.nn.RnnSeqEncoder(
        trx_encoder=ptls.nn.TrxEncoder(**trx_conf),
        input_size=input_size,
        type='gru',
        hidden_size=hsize,
        is_reduce_sequence=True
    )

    pl_module = ptls.frames.coles.MultiCoLESModule(
        head=ptls.nn.Head(use_norm_encoder=True),
        validation_metric=BatchRecallTopK(K=4, metric='cosine'),
        loss=coles_loss,
        discriminator_loss=club_loss,
        seq_encoder=seq_encoder,
        discriminator=discriminator_model,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
        d_optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        trained_encoders=[first_model_name],
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025),
        coles_coef=1.,
        embed_coef=embed_coef
    )
    return pl_module



embed_coef = 0.1   # discriminator loss coefficent
hsize = 32   # dimensionaliti of the encoder's hidden space
clf_hsize = 32 * 4   # dimensionaliti of the discriminator network's hidden space
first_model_name = 'first_model.pth'   # path to previously trained model

multicoles_model = get_multicoles_module(trx_conf, input_size, embed_coef,
                                         hsize, clf_hsize, first_model_name)

  self.trained_models[i].load_state_dict(torch.load(enc_path))


In [8]:
# train it

trainer = pl.Trainer(gpus=[0], max_epochs=10, enable_progress_bar=True)
trainer.fit(multicoles_model, data)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                    | Type            | Params
------------------------------------------------------------
0 | _loss                   | ContrastiveLoss | 0     
1 | _seq_encoder            | RnnSeqEncoder   | 8.4 K 
2 | _validation_metric      | BatchRecallTopK | 0     
3 | discriminator_loss      | CLUBLoss        | 0     
4 | trained_models          | ModuleList      | 8.4 K 
5 | discriminator           | ClfDisc         | 209 K 
6 | reference_discriminator | ClfDisc         | 209 K 
7 | _head                   | Head            | 0     
------------------------------------------------------------
427 K     Trainable params
8.4 K     Non-trainable params
436 K     Total params
1.745     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [9]:
# and test it

eval_datamodule = get_synthetic_sup_datamodule()
train_gbm_data, test_gbm_data = eval_embeddings(multicoles_model, eval_datamodule)
gbm(train_gbm_data, test_gbm_data)

[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-thre

## Simultaneous approach

In [10]:
# we don't need frozen model for second approach

def get_multicoles_sml_module(trx_conf, input_size, embed_coef, hsize, clf_hsize):
    coles_loss = MultiContrastiveLoss(margin=1., sampling_strategy=HardNegativePairSelector(neg_count=5))
    club_loss = CLUBLoss()
    discriminator_model = ClfDisc(inp1=hsize, inp2=hsize, h=clf_hsize)

    seq_encoder_constructor = partial(ptls.nn.RnnSeqEncoder,
                                      trx_encoder=ptls.nn.TrxEncoder(**trx_conf),
                                      input_size=input_size,
                                      type='gru',
                                      hidden_size=hsize,
                                      is_reduce_sequence=True
                                      )
    head_constructor = partial(ptls.nn.Head, use_norm_encoder=True)

    pl_module = ptls.frames.coles.MultiCoLESSMLModule(
        seq_encoder_constructor=seq_encoder_constructor,
        head_constructor=head_constructor,
        n_models=2,
        discriminator=discriminator_model,
        loss=coles_loss,
        discriminator_loss=club_loss,
        validation_metric=MultiBatchRecallTopK(n=2, K=4, metric='cosine'),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
        d_optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025),
        coles_coef=1.,
        embed_coef=embed_coef
    )
    return pl_module


embed_coef = 0.1
hsize = 32
clf_hsize = 32*4

multicoles_sml_model = get_multicoles_sml_module(trx_conf, input_size, embed_coef, hsize, clf_hsize)

In [11]:
# train it

trainer = pl.Trainer(gpus=[0], max_epochs=10, enable_progress_bar=True)
trainer.fit(multicoles_sml_model, data)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                    | Type                 | Params
-----------------------------------------------------------------
0 | _loss                   | MultiContrastiveLoss | 0     
1 | _seq_encoder            | ParallelModels       | 14.8 K
2 | discriminator_loss      | CLUBLoss             | 0     
3 | discriminator           | ClfDisc              | 209 K 
4 | reference_discriminator | ClfDisc              | 209 K 
-----------------------------------------------------------------
434 K     Trainable params
0         Non-trainable params
434 K     Total params
1.736     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [12]:
# and test it

eval_datamodule = get_synthetic_sup_datamodule()
train_gbm_data, test_gbm_data = eval_embeddings(multicoles_sml_model, eval_datamodule)
gbm(train_gbm_data, test_gbm_data)

[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-th

## Lets compare with usual coles model

In [13]:
input_size = 16 * 2
hsize = 64    # X2 hidden dimentionality

mono_coles_model = get_coles_module(trx_conf, input_size, hsize)

trainer = pl.Trainer(gpus=[0], max_epochs=10, enable_progress_bar=True)
trainer.fit(mono_coles_model, data)

eval_datamodule = get_synthetic_sup_datamodule()
train_gbm_data, test_gbm_data = eval_embeddings(mono_coles_model, eval_datamodule)
gbm(train_gbm_data, test_gbm_data)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 20.9 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
20.9 K    Trainable params
0         Non-trainable params
20.9 K    Total params
0.084     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-th

Multicolor model trained in simultaneous manner has higher score and lower dispersion