In [1]:
import os
import numpy as np
import pandas as pd
import torch
import ptls
import pytorch_lightning as pl
from ptls.data_load.datasets import ParquetFiles, ParquetDataset
from ptls.frames import PtlsDataModule
from ptls.frames.coles import InfoMaxIterableDataset, InfoMaxModule
from ptls.frames.supervised import SeqToTargetIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.coles.losses import IMContrastiveLoss
from functools import partial
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from itertools import chain
from ptls.frames.coles.sampling_strategies import HardNegativeNSelector

  warn(
2024-09-02 04:52:20.252078: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(


In [2]:
# Root directory of the synthetic example dataset.
path = "./syndata/example_data/"

# Training files are read with shuffle_files=True; no shuffle flag is passed for evaluation.
train_files = ParquetFiles(os.path.join(path, "train"))
train_dataset = ParquetDataset(train_files, shuffle_files=True)
eval_files = ParquetFiles(os.path.join(path, "eval"))
eval_dataset = ParquetDataset(eval_files)


def _make_infomax_dataset(data):
    """Wrap ``data`` in an InfoMaxIterableDataset with the settings shared by train and validation.

    Each call builds its own SampleSlices splitter, so the two datasets
    do not share a splitter instance.
    """
    return InfoMaxIterableDataset(
        outside_split_count=15,
        sample_chains=True,
        neg_cnt_min=50,
        neg_cnt_max=100,
        data=data,
        splitter=SampleSlices(split_count=5, cnt_min=50, cnt_max=100),
    )


# Train and validation use identical sampling settings, batch size and worker count.
infomax_data_module = PtlsDataModule(
    train_data=_make_infomax_dataset(train_dataset),
    valid_data=_make_infomax_dataset(eval_dataset),
    train_batch_size=256,
    train_num_workers=4,
    valid_num_workers=4,
    valid_batch_size=256,
)

In [3]:
# Keyword arguments for TrxEncoder: embedding noise plus per-feature
# embedding specs for the two features 'A' and 'B'.
trx_conf = dict(
    embeddings_noise=0.001,
    embeddings={
        'A': {'in': 64, 'out': 16},
        'B': {'in': 64, 'out': 16},
    },
)

# Encoder stages are built separately, then chained in the same order.
trx_encoder = ptls.nn.TrxEncoder(**trx_conf)
rnn_encoder = ptls.nn.RnnEncoder(
    # 32 equals the sum of the two 'out' sizes above (16 + 16);
    # presumably this must match the TrxEncoder output width -- TODO confirm.
    input_size=32,
    type='gru',
    hidden_size=32,
    is_reduce_sequence=True,
)
seq_encoder = torch.nn.Sequential(trx_encoder, rnn_encoder)

# Factories that the module calls later to create the optimizer and LR scheduler.
optimizer_factory = partial(torch.optim.Adam, lr=0.001, weight_decay=0.0)
scheduler_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025)

infomax_module = InfoMaxModule(
    seq_encoder=seq_encoder,
    sampler=HardNegativeNSelector(neg_count=5),
    head=ptls.nn.Head(use_norm_encoder=True),
    coles_loss=IMContrastiveLoss(1.0),
    optimizer_partial=optimizer_factory,
    lr_scheduler_partial=scheduler_factory,
)

In [4]:
# Train for 10 epochs on CUDA device 0.
# `accelerator`/`devices` replace the `gpus` argument, which was deprecated in
# PyTorch Lightning 1.7 and removed in 2.0; this spelling works on both.
trainer = pl.Trainer(
    accelerator='gpu',
    devices=[0],
    max_epochs=10,
    enable_progress_bar=True,
)

trainer.fit(infomax_module, infomax_data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type              | Params
---------------------------------------------------------
0 | _seq_encoder       | Sequential        | 8.4 K 
1 | _validation_metric | MeanMetric        | 0     
2 | _head              | Head              | 0     
3 | _coles_loss        | IMContrastiveLoss | 0     
---------------------------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [5]:
# Now let's test it: embed the data with the trained encoder and score the embeddings with LightGBM.

def get_synthetic_sup_datamodule():
    """Build the supervised data module over the synthetic parquet data.

    The "train" and "eval" sub-directories are both read with
    ``shuffle_files=True`` and wrapped so that the ``class_label`` column
    is used as a ``torch.long`` target.

    Returns:
        PtlsDataModule: train data from "train", test data from "eval",
        batch size 512 and 4 workers for each.
    """
    data_root = "./syndata/example_data/"

    # Build both parquet datasets first ("train", then "eval").
    parquet_by_split = {
        split: ParquetDataset(
            ParquetFiles(os.path.join(data_root, split)),
            shuffle_files=True,
        )
        for split in ("train", "eval")
    }

    def _with_target(dataset):
        # Attach the supervised target column to an iterable parquet dataset.
        return SeqToTargetIterableDataset(
            dataset,
            target_col_name='class_label',
            target_dtype=torch.long,
        )

    return PtlsDataModule(
        train_data=_with_target(parquet_by_split["train"]),
        test_data=_with_target(parquet_by_split["eval"]),
        train_batch_size=512,
        test_batch_size=512,
        train_num_workers=4,
        test_num_workers=4,
    )


def eval_dataloader(model, dl, device='cuda:0'):
    """Run ``model`` over every batch of ``dl`` and collect outputs and labels.

    The model is moved to ``device`` (in place) and switched to eval mode for
    the duration of the pass. Its previous train/eval mode is restored
    afterwards, even if a batch raises, so calling this helper does not
    silently change how the model behaves in later training.

    Args:
        model: a ``torch.nn.Module`` that is callable on the ``x`` part of a batch.
        dl: iterable yielding ``(x, y)`` pairs; ``x`` must support ``.to(device)``
            and ``y`` must be a tensor.
        device: device used for inference.

    Returns:
        dict: ``{'x': outputs, 'y': labels}``, both numpy arrays concatenated
        along axis 0 in iteration order.

    Raises:
        ValueError: if ``dl`` yields no batches.
    """
    was_training = model.training
    model.to(device)
    model.eval()

    embs, yy = [], []
    try:
        # One no_grad context for the whole pass instead of one per batch.
        with torch.no_grad():
            for x, y in dl:
                # detach()/cpu() keep this working for labels that live on an
                # accelerator or carry grad; for plain CPU tensors it is a no-op.
                yy.append(y.detach().cpu().numpy())
                embs.append(model(x.to(device)).cpu().numpy())
    finally:
        # Restore the caller's train/eval mode.
        model.train(was_training)

    if not embs:
        # np.concatenate would also raise ValueError here, with a less helpful message.
        raise ValueError('eval_dataloader: dataloader yielded no batches')

    return {'x': np.concatenate(embs, axis=0), 'y': np.concatenate(yy, axis=0)}
        


def eval_embeddings(coles_model, data):
    """Embed the train and test splits of ``data`` with ``coles_model``.

    Args:
        coles_model: model passed through to ``eval_dataloader``.
        data: object exposing ``train_dataloader()`` and ``test_dataloader()``.

    Returns:
        tuple: ``(train_result, test_result)``, each the dict returned by
        ``eval_dataloader`` for the corresponding loader.
    """
    # The generator is consumed in order: the train loader is created and
    # evaluated before the test loader is created.
    loader_factories = (data.train_dataloader, data.test_dataloader)
    return tuple(
        eval_dataloader(coles_model, make_loader())
        for make_loader in loader_factories
    )


def gbm(train_gbm_data, test_gbm_data):
    """Fit five LightGBM classifiers on the embeddings and print the ROC AUC summary.

    The classifier is fitted five times with ``random_state`` 42..46. Each fit
    is scored by ROC AUC on the test split, using the second column of
    ``predict_proba``. The mean and standard deviation of the five scores are
    printed; nothing is returned.

    Args:
        train_gbm_data: dict with ``'x'`` (features) and ``'y'`` (labels) used for fitting.
        test_gbm_data: dict with ``'x'`` and ``'y'`` used for scoring.
    """
    # Settings shared by every run. Several sklearn-style names are set to None
    # while their LightGBM-native counterparts carry the value -- presumably to
    # avoid alias-conflict warnings; TODO confirm.
    shared_params = {
        'n_estimators': 50,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.02,
        'subsample': 0.75,
        'subsample_freq': 1,
        'feature_fraction': 0.75,
        'colsample_bytree': None,
        'max_depth': 12,
        'lambda_l1': 1,
        'reg_alpha': None,
        'lambda_l2': 1,
        'reg_lambda': None,
        'min_data_in_leaf': 50,
        'min_child_samples': None,
        'num_leaves': 50,
        'n_jobs': 4,
    }

    auc_scores = []
    for seed_offset in range(5):
        classifier = LGBMClassifier(**shared_params, random_state=42 + seed_offset)
        classifier.fit(train_gbm_data['x'], train_gbm_data['y'])
        positive_proba = classifier.predict_proba(test_gbm_data['x'])[:, 1]
        auc_scores.append(roc_auc_score(test_gbm_data['y'], positive_proba))

    mean = np.mean(auc_scores)
    std = np.std(auc_scores)
    print(f'mean roc_auc: {mean:.4f} std : {std:.4f}')


# Downstream evaluation: build the supervised data module, embed its train and
# test splits with the trained InfoMax module, then fit LightGBM on the train
# embeddings and print the ROC AUC measured on the test embeddings.
eval_datamodule = get_synthetic_sup_datamodule()
train_gbm_data, test_gbm_data = eval_embeddings(infomax_module, eval_datamodule)
gbm(train_gbm_data, test_gbm_data)

[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 51200, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 25600, number of negative: 25600
[LightGBM] [Info] Auto-choosing col-wise multi-thre