In [1]:
%load_ext autoreload
%autoreload 2

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

# import logging
import torch
import pytorch_lightning as pl
import warnings
import numpy as np

warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

2023-12-19 06:41:47.177738: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from ptls.data_load.datasets import CategoryFeature, State, HMM


def get_state_features(const_p, beta_a, beta_b):
    feats = {
        #"s_feat_1": CategoryFeature(n=10, dist_type='uniform'),
        "s_feat_2": CategoryFeature(n=40, dist_type='const', dist_args={'p': const_p}),
        "s_feat_3": CategoryFeature(n=50, dist_type='beta', dist_args={'a': beta_a, 'b': beta_b}),
    }
    return feats


def get_h_state_features(const_p, beta_a, beta_b):
    feats = {
        "h_feat_1": CategoryFeature(n=10, dist_type='beta', dist_args={'a': beta_a, 'b': beta_b}),
        "h_feat_2": CategoryFeature(n=50, dist_type='const', dist_args={'p': const_p}),
    }
    return feats


def gen_a_b(min=1, max=50):
    a = np.random.randint(min, max)
    if np.random.rand()>=0.5:
        mult = 0.8 + 0.4 * np.random.rand()
        b = int(a * mult)
        if b < 1:
            b = 1
    else:
        b = np.random.randint(min, max)
    return a, b


def gen_p(n, drop_percentile=30):
    p = np.random.randn(n)
    p = np.exp(p)/np.sum(np.exp(p))
    v = np.percentile(p, drop_percentile)
    p = np.where(p>=v, p, 0)
    p = p/p.sum()
    return p


states = [
    State(get_state_features(gen_p(40), *gen_a_b()), ind=i) for i in range(16)
]


hidden_states = [
    State(get_h_state_features(gen_p(50), *gen_a_b()), ind=i) for i in range(4)
]


state_transition_tensor = np.random.randn(16*16*4).reshape(16,16,4)
exp_state_transition_tensor = np.exp(state_transition_tensor)
state_transition_tensor = exp_state_transition_tensor / exp_state_transition_tensor.sum(axis=1, keepdims=True)


hidden_state_transition_matrix = np.eye(4) + 0.001
hidden_state_transition_matrix /= np.sum(hidden_state_transition_matrix, axis=1, keepdims=True)


test_hmm = HMM(states, hidden_states, state_transition_tensor, hidden_state_transition_matrix, noise=0.)

In [3]:
test_seq = test_hmm.gen_seq()

In [4]:
hidden_state_transition_matrix = np.eye(4) + 0.08
hidden_state_transition_matrix /= np.sum(hidden_state_transition_matrix, axis=1, keepdims=True)

hmms = list()
for i in range(2):
    
    state_transition_tensor = np.random.randn(16*16*4).reshape(16,16,4)
    if i==0:
        state_transition_tensor += 3
    elif i==1:
        state_transition_tensor -= 3
    
    exp_state_transition_tensor = np.exp(state_transition_tensor)
    state_transition_tensor = exp_state_transition_tensor / \
                              exp_state_transition_tensor.sum(axis=1, keepdims=True)
    
    hmms.append(HMM(states, hidden_states, state_transition_tensor,
                    hidden_state_transition_matrix, noise=0.))


In [5]:
from functools import partial
from ptls.data_load.datasets import SyntheticDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

In [6]:
dataset_train = SyntheticDataset(hmms, seq_len=100, dataset_size=10000)
dataset_valid = SyntheticDataset(hmms, seq_len=100, dataset_size=1000)
dataset_test = SyntheticDataset(hmms, seq_len=100, dataset_size=5000)

In [7]:
sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='class_label', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='class_label', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='class_label', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=256,
    train_num_workers=16,
)

In [8]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            #'s_feat_1': {'in': 100, 'out': 32},
            's_feat_2': {'in': 40, 'out': 32},
            's_feat_3': {'in': 50, 'out': 32},
            'h_feat_1': {'in': 10, 'out': 32},
            'h_feat_2': {'in': 50, 'out': 32},
        },
        embeddings_noise=0.001,
    ),
    hidden_size=128,
)

sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2),
    loss=torch.nn.CrossEntropyLoss(),
    metric_list=torchmetrics.Accuracy(),
    optimizer_partial=partial(torch.optim.Adam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
)

In [9]:
import pytorch_lightning as pl
trainer = pl.Trainer(
    max_epochs=100,
    #gpus=1 if torch.cuda.is_available() else 0,
    #gpus = 0,
    gpus=[0],
    enable_progress_bar=False,
)
trainer.fit(sup_module, sup_data)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params
---------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder    | 104 K 
1 | head          | Head             | 258   
2 | loss          | CrossEntropyLoss | 0     
3 | train_metrics | ModuleDict       | 0     
4 | valid_metrics | ModuleDict       | 0     
5 | test_metrics  | ModuleDict       | 0     
---------------------------------------------------
104 K     Trainable params
0         Non-trainable params
104 K     Total params
0.417     Total estimated model params size (MB)


In [10]:
print(trainer.logged_metrics)

{'loss': tensor(0.1954), 'seq_len': tensor(100.), 'y': tensor(0.5000), 'val_loss': tensor(0.2552), 'valid/Accuracy': tensor(0.8920), 'train/Accuracy': tensor(0.8746)}


### Data loader

In [9]:
from ptls.data_load.datasets import SyntheticDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule


dataset_train = SyntheticDataset(hmms, seq_len=400, dataset_size=10000)
dataset_valid = SyntheticDataset(hmms, seq_len=400, dataset_size=1000)
dataset_test = SyntheticDataset(hmms, seq_len=400, dataset_size=5000)


train_dl = PtlsDataModule(
    train_data=ColesDataset(
        dataset_train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,
        ),
    ),
    valid_data=ColesDataset(
        dataset_valid,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,),
    ),
    train_num_workers=16,
    train_batch_size=256,
    valid_num_workers=2,
    valid_batch_size=256,
)

### Model definition

In [10]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            #'s_feat_1': {'in': 100, 'out': 32},
            's_feat_2': {'in': 40, 'out': 32},
            's_feat_3': {'in': 50, 'out': 32},
            'h_feat_1': {'in': 10, 'out': 32},
            'h_feat_2': {'in': 50, 'out': 32},
        },
        embeddings_noise=0.001,
    ),
    hidden_size=128,
    type='gru'
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
)

In [11]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=30,
    gpus=[0],
    enable_progress_bar=False,
)
trainer.fit(model, train_dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 104 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
104 K     Trainable params
0         Non-trainable params
104 K     Total params
0.416     Total estimated model params size (MB)


In [12]:
print(trainer.logged_metrics)

{'loss': tensor(0.0573, device='cuda:0'), 'seq_len': tensor(34.9750, device='cuda:0'), 'recall_top_k': tensor(0.1123, device='cuda:0')}


In [18]:
from tqdm import tqdm

xx, yy = list(), list()
dl = iter(sup_data.train_dataloader())
for batch in tqdm(dl):
    
    with torch.no_grad():
        x, y = batch
        yy.append(y.numpy())
        x = model(x.to('cuda:0')).detach().cpu().numpy()
        xx.append(x)

xx = np.concatenate(xx, axis=0)
yy = np.concatenate(yy, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:04<00:00, 16.09it/s]


In [19]:
xx_test, yy_test = list(), list()
dl = iter(sup_data.train_dataloader())
for batch in tqdm(dl):
    
    with torch.no_grad():
        x, y = batch
        yy_test.append(y.numpy())
        x = model(x.to('cuda:0')).detach().cpu().numpy()
        xx_test.append(x)

xx_test = np.concatenate(xx_test, axis=0)
yy_test = np.concatenate(yy_test, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:04<00:00, 16.00it/s]


In [57]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [58]:
clf.fit(xx, yy)

In [59]:
y_pred = clf.predict_proba(xx_test)[:,1]

In [60]:
roc_auc_score(yy_test, y_pred)

0.53591118