In [1]:
%load_ext autoreload
%autoreload 2

import os
from collections import defaultdict
from tqdm import tqdm
from IPython.display import clear_output
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

# import logging
import torch
import pytorch_lightning as pl
import warnings
import numpy as np
import pandas as pd


from functools import partial
from ptls.data_load.datasets import SyntheticDataset, ParquetFiles, ParquetDataset
from ptls.frames.supervised import SeqToTargetDataset, SeqToTargetIterableDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from functools import partial


from ptls.data_load.datasets import SyntheticDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices


import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
from ptls.frames.coles import CoLESModule


import pytorch_lightning as pl
import pickle

warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

2024-01-26 02:04:41.705765: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from ptls.data_load.datasets import CategoryFeature, ConstFeature, State, HMM
from ptls.data_load.datasets import SquareSampler, SphereSampler, PlaneClassAssigner, TransitionTensorGenerator


n_states = 8
n_hidden_states = 4
hidden_state_c = 0.7
sphere_r = 16
hidden_sphere_r = 4
delta_shift = None
use_pairs_to_states = True


def get_state_features(const_p, beta_a, beta_b, c_value=None):
    feats = {
        "s_feat_1": ConstFeature(value = c_value),
        #"s_feat_2": CategoryFeature(n=10, dist_type='const', dist_args={'p': const_p}),
        #"s_feat_3": CategoryFeature(n=20, dist_type='beta', dist_args={'a': beta_a, 'b': beta_b}),
        
    }
    return feats


def get_h_state_features(const_p, beta_a, beta_b, c_value=None):
    feats = {
        "h_feat_1": ConstFeature(value = c_value),
        #"h_feat_1": CategoryFeature(n=20, dist_type='beta', dist_args={'a': beta_a, 'b': beta_b}),
        #"h_feat_2": CategoryFeature(n=10, dist_type='const', dist_args={'p': const_p}),
    }
    return feats


def gen_a_b(min=1, max=50):
    a = np.random.randint(min, max)
    if np.random.rand()>=0.5:
        mult = 0.8 + 0.4 * np.random.rand()
        b = int(a * mult)
        if b < 1:
            b = 1
    else:
        b = np.random.randint(min, max)
    return a, b


def gen_p(n, drop_percentile=30):
    p = np.random.randn(n)
    p = np.exp(p)/np.sum(np.exp(p))
    v = np.percentile(p, drop_percentile)
    p = np.where(p>=v, p, 0)
    p = p/p.sum()
    return p


states = [
    State(get_state_features(gen_p(10), *gen_a_b(), c_value=i), ind=i) for i in range(n_states)
]


hidden_states = [
    State(get_h_state_features(gen_p(10), *gen_a_b(), c_value=i), ind=i) for i in range(n_hidden_states)
]


hidden_state_new = (1 - hidden_state_c) / (n_hidden_states - 1)
hidden_state_transition_matrix = np.eye(n_hidden_states) * (hidden_state_c - hidden_state_new)
hidden_state_transition_matrix += np.ones((n_hidden_states, n_hidden_states)) * hidden_state_new
default_hidden_state_transition_matrix = hidden_state_transition_matrix


def sample_h_mtx(n):
    sampler = SphereSampler(n_hidden_states)
    h_mtx = sampler.sample(n, to_matrix=False)
    h_mtx = h_mtx / np.sqrt((h_mtx**2).sum(axis=-1, keepdims=True)) * hidden_sphere_r
    h_mtx = h_mtx.reshape(h_mtx.shape[0], n_hidden_states, n_hidden_states)
    h_mtx = np.exp(h_mtx)/np.exp(h_mtx).sum(axis=-1, keepdims=True)
    return list(h_mtx)


def get_datasets(n=10000, n_train=5000, n_eval=2000, n_test=3000, noise=0., delta_shift=10, assigner=None):
    n_train, n_eval, n_test, n = int(n_train), int(n_eval), int(n_test), int(n)
    assert sum([n_train, n_eval, n_test]) == n

    hidden_state_transition_matrix = sample_h_mtx(n)
    noise_hidden_state_transition_matrix = sample_h_mtx(n)
    
    sampler = SphereSampler(n_states)
        
    tensor_gen = TransitionTensorGenerator(sampler, assigner, n_hidden_states)
    pos_tensors, neg_tensors = tensor_gen.gen_tensors(int(n), soft_norm=True, sphere_norm=True, sphere_r=sphere_r)


    get_hmm = partial(HMM, states=states, hidden_states=hidden_states, noise=noise)


    train_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[:n_train],
                             hidden_state_transition_matrix=hidden_state_transition_matrix[:n_train],
                             noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[:n_train])
    train_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[:n_train],
                             hidden_state_transition_matrix=hidden_state_transition_matrix[:n_train],
                             noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[:n_train])


    valid_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[n_train:n_eval+n_train],
                             hidden_state_transition_matrix=hidden_state_transition_matrix[n_train:n_eval+n_train],
                             noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[n_train:n_eval+n_train])
    valid_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[n_train:n_eval+n_train],
                             hidden_state_transition_matrix=hidden_state_transition_matrix[n_train:n_eval+n_train],
                             noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[n_train:n_eval+n_train])


    test_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[n_eval+n_train:],
                            hidden_state_transition_matrix=hidden_state_transition_matrix[n_eval+n_train:],
                            noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[n_eval+n_train:])
    test_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[n_eval+n_train:],
                            hidden_state_transition_matrix=hidden_state_transition_matrix[n_eval+n_train:],
                            noise_hidden_state_transition_matrix=noise_hidden_state_transition_matrix[n_eval+n_train:])


    dataset_train = SyntheticDataset([train_pos_hmms, train_neg_hmms], seq_len=512)
    dataset_valid = SyntheticDataset([valid_pos_hmms, valid_neg_hmms], seq_len=512)
    dataset_test = SyntheticDataset([test_pos_hmms, test_neg_hmms], seq_len=512)
    
    return dataset_train, dataset_valid, dataset_test

In [3]:
def get_sup_datamodule(noise, n_train=5000, n_eval=2000, n_test=3000, assigner=None):
    n = n_train + n_eval + n_test
    
    dataset_train, dataset_valid, dataset_test = get_datasets(n=n, n_train=n_train, n_eval=n_eval,
                                                              n_test=n_test, noise=noise, assigner=assigner)

    sup_data = PtlsDataModule(
        train_data=SeqToTargetDataset(dataset_train, target_col_name='class_label', target_dtype=torch.long),
        valid_data=SeqToTargetDataset(dataset_valid, target_col_name='class_label', target_dtype=torch.long),
        test_data=SeqToTargetDataset(dataset_test, target_col_name='class_label', target_dtype=torch.long),
        train_batch_size=256,
        valid_batch_size=256,
        test_batch_size=256,
        train_num_workers=16,
        valid_num_workers=16,
        test_num_workers=16
    )
    return sup_data

In [4]:
pairs_to_states_tensor = np.arange(n_states * n_states).reshape(n_states, n_states)
h_pairs_to_states_tensor = np.arange(n_hidden_states * n_hidden_states).reshape(n_hidden_states, n_hidden_states)


def pairs_to_states(s):
    return pairs_to_states_tensor[s[:-1], s[1:]]


def h_pairs_to_states(s):
    return h_pairs_to_states_tensor[s[:-1], s[1:]]


def trunc_h_states(h):
    return h[:-1]


def trunc_t(t):
    return t[:-1]

In [5]:
def write_dataset(main_folder, assigner, noise=0.,
                  train_num_files=200, eval_num_files=100, test_num_files=100,
                  n_train=256*10, n_eval=256*10, n_test=256*10):
    
    train_folder = os.path.join(main_folder, "train")
    eval_folder = os.path.join(main_folder, "eval")
    test_folder = os.path.join(main_folder, "test")
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(eval_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    
    for fn in tqdm(range(max(train_num_files, eval_num_files, test_num_files))):
        data = get_sup_datamodule(noise, n_train, n_eval, n_test, assigner=assigner)
        
        if fn < train_num_files:
            df = defaultdict(list)
            for i, batch in enumerate(data.train_dataloader()):
                x, y = batch
                x_d = x.payload
                for k in x_d:
                    df[k].extend(x_d[k].int().tolist())
                df['class_label'].extend(y.int().tolist())
            df = pd.DataFrame(df)
            
            if use_pairs_to_states:
                df["s_feat_1"] = df["s_feat_1"].map(pairs_to_states)
                df["h_feat_1"] = df["h_feat_1"].map(h_pairs_to_states)
                df["event_time"] = df["event_time"].map(trunc_t)
            
            df.to_parquet(os.path.join(train_folder, "train_"+str(fn)+".parquet"))
        
        
        if fn < eval_num_files:
            df = defaultdict(list)
            for i, batch in enumerate(data.val_dataloader()):
                x, y = batch
                x_d = x.payload
                for k in x_d:
                    df[k].extend(x_d[k].int().tolist())
                df['class_label'].extend(y.int().tolist())
            df = pd.DataFrame(df)
            
            if use_pairs_to_states:
                df["s_feat_1"] = df["s_feat_1"].map(pairs_to_states)
                df["h_feat_1"] = df["h_feat_1"].map(h_pairs_to_states)
                df["event_time"] = df["event_time"].map(trunc_t)
            
            df.to_parquet(os.path.join(eval_folder, "eval_"+str(fn)+".parquet"))
        
        
        if fn < test_num_files:
            df = defaultdict(list)
            for i, batch in enumerate(data.test_dataloader()):
                x, y = batch
                x_d = x.payload
                for k in x_d:
                    df[k].extend(x_d[k].int().tolist())
                df['class_label'].extend(y.int().tolist())
            df = pd.DataFrame(df)
            
            if use_pairs_to_states:
                df["s_feat_1"] = df["s_feat_1"].map(pairs_to_states)
                df["h_feat_1"] = df["h_feat_1"].map(h_pairs_to_states)
                df["event_time"] = df["event_time"].map(trunc_t)
            
            df.to_parquet(os.path.join(test_folder, "test_"+str(fn)+".parquet"))

In [6]:
#'''
assigner = PlaneClassAssigner(n_states, n_hidden_states, delta_shift=delta_shift)
assigner.set_random_vector()

with open("assigner.pickle", "wb") as f:
    pickle.dump(assigner, f)
#'''

In [7]:
#'''
with open("states.pickle", "wb") as f:
    pickle.dump(states, f)

with open("hidden_states.pickle", "wb") as f:
    pickle.dump(hidden_states, f)

with open("assigner.pickle", "wb") as f:
    pickle.dump(assigner, f)
#'''

In [8]:
#"""
with open("states.pickle", "rb") as f:
    states = pickle.load(f)

with open("hidden_states.pickle", "rb") as f:
    hidden_states = pickle.load(f)

with open("assigner.pickle", "rb") as f:
    assigner = pickle.load(f)
#"""

In [9]:
write_dataset("syndata/cor_distinct_h_transitions_noise_0", assigner, noise=0.,
              train_num_files=250, eval_num_files=50, test_num_files=0,
              n_train=256*4, n_eval=256*4, n_test=256*4)

  1%|▎                                        | 2/250 [00:29<1:00:51, 14.73s/it]


KeyboardInterrupt: 

In [6]:
def get_sup_data(path):
    train_files = ParquetFiles(os.path.join(path, "train"))
    train_dataset = ParquetDataset(train_files, shuffle_files=True)
    eval_files = ParquetFiles(os.path.join(path, "eval"))
    eval_dataset = ParquetDataset(eval_files, shuffle_files=True)
    test_files = ParquetFiles(os.path.join(path, "test"))
    test_dataset = ParquetDataset(test_files, shuffle_files=True)
    
    sup_data = PtlsDataModule(
        train_data=SeqToTargetIterableDataset(train_dataset, target_col_name='class_label', target_dtype=torch.long),
        valid_data=SeqToTargetIterableDataset(eval_dataset, target_col_name='class_label', target_dtype=torch.long),
        #test_data=SeqToTargetIterableDataset(test_dataset, target_col_name='class_label', target_dtype=torch.long),
        train_batch_size=256,
        valid_batch_size=256,
        #test_batch_size=256,
        train_num_workers=4,
        valid_num_workers=4,
        #test_num_workers=4
    )
    return sup_data

In [7]:
def get_sup_model():
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                #'s_feat_1': {'in': 100, 'out': 32},
                's_feat_2': {'in': 10, 'out': 32},
                's_feat_3': {'in': 20, 'out': 32},
                'h_feat_1': {'in': 20, 'out': 32},
                'h_feat_2': {'in': 10, 'out': 32},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=128,
        type='lstm',
    )

    sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.Accuracy(),
        optimizer_partial=partial(torch.optim.Adam, lr=1e-4),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
    )
    return sup_module

In [8]:
#for path in ["syndata/noise_0/", "syndata/noise_05/", "syndata/noise_1/"]:
for path in ["syndata/small_noise_0/"]:
    trainer = pl.Trainer(
        max_epochs=50,
        gpus=[0],
        enable_progress_bar=False,
    )
    
    sup_data = get_sup_data(path)
    sup_module = get_sup_model()
    
    trainer.fit(sup_module, sup_data)
    clear_output()

In [17]:
def test_embed_model(model):
    model.to('cuda:0')
    
    xx, yy = list(), list()
    dl = iter(sup_data.train_dataloader())
    for batch in dl:
    
        with torch.no_grad():
            x, y = batch
            yy.append(y.numpy())
            x = model(x.to('cuda:0')).detach().cpu().numpy()
            xx.append(x)

    xx = np.concatenate(xx, axis=0)
    yy = np.concatenate(yy, axis=0)
    
    xx_eval, yy_eval = list(), list()
    dl = iter(sup_data.eval_dataloader())
    for batch in dl:

        with torch.no_grad():
            x, y = batch
            yy_eval.append(y.numpy())
            x = model(x.to('cuda:0')).detach().cpu().numpy()
            xx_eval.append(x)

    xx_eval = np.concatenate(xx_eval, axis=0)
    yy_eval = np.concatenate(yy_eval, axis=0)
    
    
    xx_test, yy_test = list(), list()
    dl = iter(sup_data.test_dataloader())
    for batch in dl:

        with torch.no_grad():
            x, y = batch
            yy_test.append(y.numpy())
            x = model(x.to('cuda:0')).detach().cpu().numpy()
            xx_test.append(x)

    xx_test = np.concatenate(xx_test, axis=0)
    yy_test = np.concatenate(yy_test, axis=0)
    
    
    clf = LGBMClassifier(max_depth=-1)
    clf.fit(xx, yy, eval_set=(xx_eval, yy_eval), eval_metric="auc")
    
    y_pred = clf.predict_proba(xx)[:,1]
    train_score = roc_auc_score(yy, y_pred)
    
    y_pred = clf.predict_proba(xx_eval)[:,1]
    eval_score = roc_auc_score(yy_eval, y_pred)
    
    y_pred = clf.predict_proba(xx_test)[:,1]
    test_score = roc_auc_score(yy_test, y_pred)
    
    return train_score, eval_score, test_score

### Data loader

In [18]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        dataset_train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,
        ),
    ),
    valid_data=ColesDataset(
        dataset_valid,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,),
    ),
    train_num_workers=16,
    train_batch_size=256,
    valid_num_workers=16,
    valid_batch_size=256,
)

NameError: name 'dataset_train' is not defined

### Model definition

In [11]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            #'s_feat_1': {'in': 100, 'out': 32},
            's_feat_2': {'in': 16, 'out': 32},
            's_feat_3': {'in': 32, 'out': 32},
            'h_feat_1': {'in': 32, 'out': 32},
            'h_feat_2': {'in': 16, 'out': 32},
        },
        embeddings_noise=0.001,
    ),
    hidden_size=128,
    type='lstm'
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
)

In [12]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[0],
    enable_progress_bar=False,
)
trainer.fit(model, train_dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 134 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
134 K     Trainable params
0         Non-trainable params
134 K     Total params
0.537     Total estimated model params size (MB)


In [13]:
print(trainer.logged_metrics)

{'loss': tensor(0.0531), 'seq_len': tensor(33.7000), 'recall_top_k': tensor(0.2499)}


In [16]:
model.to('cuda:0')

CoLESModule(
  (_loss): ContrastiveLoss()
  (_seq_encoder): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (s_feat_2): NoisyEmbedding(
          10, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (s_feat_3): NoisyEmbedding(
          20, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (h_feat_1): NoisyEmbedding(
          20, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (h_feat_2): NoisyEmbedding(
          10, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
    )
    (seq_encoder): RnnEncoder(
      (rnn): LSTM(128, 128, batch_first=True)
      (reducer): LastStepEncoder()
    )
  )
  (_validation_metric): BatchRecallTopK()
  (_head): Head(
    (model): Sequential(
      (0): L2NormEncoder()
    )
  )
)

In [47]:
from tqdm import tqdm

xx, yy = list(), list()

for i in range(20):
    dl = iter(sup_data.train_dataloader())
    for batch in tqdm(dl):
    
        with torch.no_grad():
            x, y = batch
            yy.append(y.numpy())
            x = model(x.to('cuda:0')).detach().cpu().numpy()
            xx.append(x)

xx = np.concatenate(xx, axis=0)
yy = np.concatenate(yy, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.69it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.70it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.59it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.66it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.74it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.71it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.67it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.51it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.70it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.64it/s]
100%|███████████████████████████████████████████| 79/79 [00:11<00:00,  6.88it/s]
100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.57it/s]
100%|███████████████████████

In [18]:
xx_test, yy_test = list(), list()
dl = iter(sup_data.test_dataloader())
for batch in tqdm(dl):
    
    with torch.no_grad():
        x, y = batch
        yy_test.append(y.numpy())
        x = model(x.to('cuda:0')).detach().cpu().numpy()
        xx_test.append(x)

xx_test = np.concatenate(xx_test, axis=0)
yy_test = np.concatenate(yy_test, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:10<00:00,  7.74it/s]


In [85]:
from lightgbm import LGBMClassifier

clf = LGBMClassifier(max_depth=-1)
clf.fit(xx, yy, eval_set=(xx_test, yy_test), eval_metric="auc")

[LightGBM] [Info] Number of positive: 100000, number of negative: 100000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [86]:
y_pred = clf.predict_proba(xx_test)[:,1]
roc_auc_score(yy_test, y_pred)

0.5959359200000001

In [87]:
y_pred = clf.predict_proba(xx)[:,1]
roc_auc_score(yy, y_pred)

0.6708364566

In [88]:
get_hmm = partial(HMM, states=states, hidden_states=hidden_states,
                  hidden_state_transition_matrix=hidden_state_transition_matrix, noise=1.)

train_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[:5000])
train_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[:5000])

valid_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[5000:7000])
valid_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[5000:7000])

test_pos_hmms = get_hmm(state_transition_tensors=pos_tensors[7000:])
test_neg_hmms = get_hmm(state_transition_tensors=neg_tensors[7000:])



dataset_train = SyntheticDataset([train_pos_hmms, train_neg_hmms], seq_len=200)
dataset_valid = SyntheticDataset([valid_pos_hmms, valid_neg_hmms], seq_len=200)
dataset_test = SyntheticDataset([test_pos_hmms, test_neg_hmms], seq_len=200)



sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(dataset_train, target_col_name='class_label', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(dataset_valid, target_col_name='class_label', target_dtype=torch.long),
    test_data=SeqToTargetDataset(dataset_test, target_col_name='class_label', target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=256,
    train_num_workers=16,
)



train_dl = PtlsDataModule(
    train_data=ColesDataset(
        dataset_train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,
        ),
    ),
    valid_data=ColesDataset(
        dataset_valid,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=20,
            cnt_max=50,),
    ),
    train_num_workers=16,
    train_batch_size=256,
    valid_num_workers=16,
    valid_batch_size=256,
)



seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            #'s_feat_1': {'in': 100, 'out': 32},
            's_feat_2': {'in': 10, 'out': 32},
            's_feat_3': {'in': 20, 'out': 32},
            'h_feat_1': {'in': 20, 'out': 32},
            'h_feat_2': {'in': 10, 'out': 32},
        },
        embeddings_noise=0.001,
    ),
    hidden_size=128,
    type='lstm'
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
)

In [89]:
trainer = pl.Trainer(
    max_epochs=50,
    gpus=[0],
    enable_progress_bar=False,
)
trainer.fit(model, train_dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 134 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
134 K     Trainable params
0         Non-trainable params
134 K     Total params
0.537     Total estimated model params size (MB)
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f9ec5f1b0a0>
Traceback (most recent call last):
  File "/home/al/Applications/miniconda3/envs/rlbnb/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/al/Applications/

Exception ignored in: Exception ignored in: 
      File "/home/al/Applications/miniconda3/envs/rlbnb/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    AssertionError<function _MultiProcessingDataLoaderIter.__del__ at 0x7f9ec5f1b0a0>    <function _MultiProcessingDataLoaderIter.__del__ at 0x7f9ec5f1b0a0>assert self._parent_pid == os.getpid(), 'can only test a child process'    self._shutdown_workers(): 
self._shutdown_workers()
self._shutdown_workers()

can only test a child processTraceback (most recent call last):
AssertionError
Traceback (most recent call last):

  File "/home/al/Applications/miniconda3/envs/rlbnb/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers

  File "/home/al/Applications/miniconda3/envs/rlbnb/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
  File "/home/al/Applications/miniconda3/envs/rlbnb/lib/python3.10/site-packages/torch/utils/data/dataloader

In [90]:
print(trainer.logged_metrics)

{'loss': tensor(0.0529), 'seq_len': tensor(34.5125), 'recall_top_k': tensor(0.2531)}


In [91]:
model.to('cuda:0')

CoLESModule(
  (_loss): ContrastiveLoss()
  (_seq_encoder): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (s_feat_2): NoisyEmbedding(
          10, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (s_feat_3): NoisyEmbedding(
          20, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (h_feat_1): NoisyEmbedding(
          20, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (h_feat_2): NoisyEmbedding(
          10, 32, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
    )
    (seq_encoder): RnnEncoder(
      (rnn): LSTM(128, 128, batch_first=True)
      (reducer): LastStepEncoder()
    )
  )
  (_validation_metric): BatchRecallTopK()
  (_head): Head(
    (model): Sequential(
      (0): L2NormEncoder()
    )
  )
)

In [92]:
from tqdm import tqdm

xx, yy = list(), list()

for i in range(20):
    dl = iter(sup_data.train_dataloader())
    for batch in tqdm(dl):
    
        with torch.no_grad():
            x, y = batch
            yy.append(y.numpy())
            x = model(x.to('cuda:0')).detach().cpu().numpy()
            xx.append(x)

xx = np.concatenate(xx, axis=0)
yy = np.concatenate(yy, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.28it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.37it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.33it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.41it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.19it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.44it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.22it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.37it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.26it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.30it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.39it/s]
100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.36it/s]
100%|███████████████████████

In [93]:
xx_test, yy_test = list(), list()
dl = iter(sup_data.train_dataloader())
for batch in tqdm(dl):
    
    with torch.no_grad():
        x, y = batch
        yy_test.append(y.numpy())
        x = model(x.to('cuda:0')).detach().cpu().numpy()
        xx_test.append(x)

xx_test = np.concatenate(xx_test, axis=0)
yy_test = np.concatenate(yy_test, axis=0)

100%|███████████████████████████████████████████| 79/79 [00:12<00:00,  6.27it/s]


In [94]:
clf = LGBMClassifier(max_depth=-1)
clf.fit(xx, yy, eval_set=(xx_test, yy_test), eval_metric="auc")

[LightGBM] [Info] Number of positive: 100000, number of negative: 100000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [95]:
y_pred = clf.predict_proba(xx_test)[:,1]
roc_auc_score(yy_test, y_pred)

0.5920572000000001

In [96]:
y_pred = clf.predict_proba(xx)[:,1]
roc_auc_score(yy, y_pred)

0.6710661666

In [15]:
a = np.random.randn(4)
a = a / np.sqrt((a**2).sum()) * 4
a = np.exp(a) / np.exp(a).sum()
a

array([0.55487161, 0.06509573, 0.00810829, 0.37192438])

In [16]:
list(np.arange(25).reshape(5,5))

[array([0, 1, 2, 3, 4]),
 array([5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14]),
 array([15, 16, 17, 18, 19]),
 array([20, 21, 22, 23, 24])]