In [1]:
%load_ext autoreload
%autoreload 2

import os
from collections import defaultdict
from tqdm import tqdm
from IPython.display import clear_output
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import logging
import torch
import pytorch_lightning as pl
import warnings
import numpy as np
import pandas as pd


from functools import partial
from ptls.data_load.datasets import SyntheticDataset, ParquetFiles, ParquetDataset
from ptls.frames.supervised import SeqToTargetDataset, SeqToTargetIterableDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from functools import partial


from ptls.data_load.datasets import SyntheticDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset, ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices


from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head, TransformerSeqEncoder
from ptls.frames.coles import CoLESModule


import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

from time import sleep

# Initial sizes used as embedding input dimensions ('in') by the model builders.
# Both `n_states` and `n_hidden_states` are assigned again in the run
# configuration cell further down (where `n_hidden_states` becomes 16).
n_states, n_hidden_states = 64, 4
# Not referenced anywhere else in the visible part of this notebook.
hidden_state_c = 0.7

In [2]:
# Block the kernel for 45 minutes before the experiment cells below start.
# NOTE(review): presumably this waits for a GPU or for another job to finish —
# confirm, and drop this cell if the delay is no longer needed, since it stalls
# every "Restart Kernel -> Run All".
sleep(45*60)

In [3]:
# Run configuration shared by every experiment cell below.
gpu_n = 0  # index of the CUDA device used for training and inference
path_to_data = "syndata/new_data_0/"  # root folder holding the train/eval splits
noise_level = "new_data_more_modalities_noise_0"  # tag used in logger names and log rows
path_to_log = "new_data_more_modalities_noise_0.txt"  # CSV-like results log

# Embedding input sizes; these replace the values set in the import cell.
n_states, n_hidden_states = 64, 16

# Create the results log, or empty it if a previous run left one behind.
open(path_to_log, "w").close()

In [4]:
def test_sup_model(model, sup_data, exp_name):
    """Score a supervised model by ROC AUC on the validation loader and log it.

    Args:
        model: trained module; calling it on a batch must return a 2-D tensor of
            per-class scores (column 1 is used as the positive-class score).
        sup_data: data module whose ``val_dataloader()`` yields ``(x, y)`` batches.
        exp_name: experiment label written as the first field of the log row.

    Returns:
        The ROC AUC computed over all validation batches.

    Side effects:
        Moves ``model`` to ``cuda:<gpu_n>`` and appends the row
        ``exp_name,noise_level,score`` to ``path_to_log``.
    """
    device = 'cuda:' + str(gpu_n)
    model.to(device)

    # The module may still be in training mode after `trainer.fit`; switch to
    # eval mode so training-only behaviour (dropout and, presumably, the
    # `embeddings_noise` configured on the encoder) does not perturb the scores.
    was_training = model.training
    model.eval()

    yy_pred_eval, yy_eval = [], []
    try:
        with torch.no_grad():
            for x, y in sup_data.val_dataloader():
                yy_eval.append(y.numpy())
                yy_pred_eval.append(model(x.to(device)).cpu().numpy())
    finally:
        # Leave the module in whatever mode the caller had it in.
        model.train(was_training)

    # Keep only the score of class 1 for the binary ROC AUC.
    yy_pred_eval = np.concatenate(yy_pred_eval, axis=0)[:, 1]
    yy_eval = np.concatenate(yy_eval, axis=0)

    score = roc_auc_score(yy_eval, yy_pred_eval)

    with open(path_to_log, "a") as f:
        f.write(f'{exp_name},{noise_level},{score}\n')

    return score

In [5]:
def get_sup_data(path):
    """Build the supervised data module from the parquet splits under ``path``.

    Args:
        path: directory containing ``train`` and ``eval`` sub-folders of parquet files.

    Returns:
        A ``PtlsDataModule`` with iterable train and validation datasets whose
        target is the ``class_label`` column cast to ``torch.long``.

    Note:
        The ``test`` split is not loaded: it was previously constructed here but
        never passed to the data module, so the unused construction was removed.
    """
    def _split(name):
        # One sequence-to-target dataset per split folder; files are shuffled
        # for both splits, as before.
        files = ParquetFiles(os.path.join(path, name))
        return SeqToTargetIterableDataset(
            ParquetDataset(files, shuffle_files=True),
            target_col_name='class_label',
            target_dtype=torch.long,
        )

    return PtlsDataModule(
        train_data=_split("train"),
        valid_data=_split("eval"),
        train_batch_size=512,
        valid_batch_size=512,
        train_num_workers=4,
        valid_num_workers=4,
    )

In [6]:
def get_sup_model():
    """Create the early-fusion supervised classifier.

    All four categorical features ('B', 'A', 'D', 'C') are embedded to 16
    dimensions each by one ``TrxEncoder`` and fed to an LSTM sequence encoder
    with hidden size 256, followed by a 2-class classification head trained
    with ``NLLLoss``.

    Returns:
        A freshly initialised ``SequenceToTarget`` module.
    """
    # (feature name, embedding input size), in the order they are registered.
    feature_sizes = (
        ('B', n_states),
        ('A', n_hidden_states),
        ('D', n_states),
        ('C', n_hidden_states),
    )
    trx_encoder = TrxEncoder(
        embeddings={name: {'in': size, 'out': 16} for name, size in feature_sizes},
        embeddings_noise=0.001,
    )
    seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='lstm')
    head = Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2)

    return SequenceToTarget(
        seq_encoder=seq_encoder,
        head=head,
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.Accuracy(),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
    )

In [7]:
# Early-fusion supervised baseline: train the four-feature LSTM classifier for
# up to 13 epochs on GPU `gpu_n`, then report its ROC AUC on the validation loader.
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="early_fusion_supervised_" + str(noise_level))

trainer = pl.Trainer(
    max_epochs=13,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)

# `sup_data` and `sup_model` are module-level names that later cells rebind.
sup_data = get_sup_data(path_to_data)
sup_model = get_sup_model()

trainer.fit(sup_model, sup_data)
# Drop the training log output so only the final score remains in the cell.
clear_output()

# Last expression: the returned ROC AUC is displayed and also appended to the log file.
test_sup_model(sup_model, sup_data, "early_fusion_supervised")

0.9285660399260915

# States Only Supervised

In [8]:
def get_states_sup_model():
    """Create the single-feature supervised classifier.

    Only feature 'B' (input size ``n_states``) is embedded; otherwise the
    architecture and optimisation settings match the early-fusion supervised
    model: LSTM encoder with hidden size 256 and a 2-class head with ``NLLLoss``.

    Returns:
        A freshly initialised ``SequenceToTarget`` module.
    """
    trx_encoder = TrxEncoder(
        embeddings={'B': {'in': n_states, 'out': 16}},
        embeddings_noise=0.001,
    )
    seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='lstm')
    head = Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2)

    return SequenceToTarget(
        seq_encoder=seq_encoder,
        head=head,
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.Accuracy(),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
    )

In [9]:
# States-only supervised baseline: same training setup as the early-fusion
# supervised run, but the model only sees feature 'B'.
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="states_only_supervised_" + str(noise_level))

trainer = pl.Trainer(
    max_epochs=13,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)

# Rebinds the module-level `sup_data` / `sup_model`; the `sup_data` created here
# is the one the CoLES evaluation cells further down read.
sup_data = get_sup_data(path_to_data)
sup_model = get_states_sup_model()

trainer.fit(sup_model, sup_data)
# Drop the training log output so only the final score remains in the cell.
clear_output()

# Last expression: the returned ROC AUC is displayed and also appended to the log file.
test_sup_model(sup_model, sup_data, "states_only_supervised")

0.7615754571571675

# Early Fusion CoLES

In [10]:
def test_embed_model(model, sup_data, exp_name):
    """Evaluate an embedding model with a LightGBM classifier on its embeddings.

    The model is run over the train and validation loaders of ``sup_data`` to
    produce embeddings. The first half of the validation embeddings is held
    out as the test set, and the second half is passed to LightGBM as
    ``eval_set`` (no early stopping is configured here).

    Args:
        model: trained module; calling it on a batch must return a 2-D tensor
            of embeddings.
        sup_data: data module whose ``train_dataloader()`` and
            ``val_dataloader()`` yield ``(x, y)`` batches.
        exp_name: experiment label written as the first field of the log row.

    Returns:
        Tuple ``(train_score, eval_score, test_score)`` of ROC AUC values.

    Side effects:
        Moves ``model`` to ``cuda:<gpu_n>`` and appends the row
        ``exp_name,noise_level,test_score`` to ``path_to_log``.
    """
    device = 'cuda:' + str(gpu_n)
    model.to(device)

    def _embed(dataloader):
        """Return (embeddings, targets) stacked over every batch of `dataloader`."""
        feats, targets = [], []
        with torch.no_grad():
            for x, y in dataloader:
                targets.append(y.numpy())
                feats.append(model(x.to(device)).cpu().numpy())
        return np.concatenate(feats, axis=0), np.concatenate(targets, axis=0)

    # The module may still be in training mode after `trainer.fit`; switch to
    # eval mode so training-only behaviour (dropout and, presumably, the
    # `embeddings_noise` configured on the encoder) does not leak into the
    # extracted embeddings.
    was_training = model.training
    model.eval()
    try:
        xx, yy = _embed(sup_data.train_dataloader())
        xx_eval, yy_eval = _embed(sup_data.val_dataloader())
    finally:
        # Leave the module in whatever mode the caller had it in.
        model.train(was_training)

    # First half of the validation embeddings -> test, second half -> eval.
    n = xx_eval.shape[0] // 2
    xx_test, yy_test = xx_eval[:n], yy_eval[:n]
    xx_eval, yy_eval = xx_eval[n:], yy_eval[n:]

    clf = LGBMClassifier(max_depth=-1)
    clf.fit(xx, yy, eval_set=(xx_eval, yy_eval), eval_metric="auc")

    train_score = roc_auc_score(yy, clf.predict_proba(xx)[:, 1])
    eval_score = roc_auc_score(yy_eval, clf.predict_proba(xx_eval)[:, 1])
    test_score = roc_auc_score(yy_test, clf.predict_proba(xx_test)[:, 1])

    with open(path_to_log, "a") as f:
        f.write(f'{exp_name},{noise_level},{test_score}\n')

    return train_score, eval_score, test_score

In [11]:
def get_coles_data(path):
    """Build the CoLES (contrastive) data module from the parquet splits under ``path``.

    Args:
        path: directory containing ``train`` and ``eval`` sub-folders of parquet files.

    Returns:
        A ``PtlsDataModule`` whose train and validation datasets are
        ``ColesIterableDataset`` instances, each using its own ``SampleSlices``
        splitter (5 slices per sequence, slice length between 70 and 100).

    Note:
        The ``test`` split is not loaded: it was previously constructed here but
        never passed to the data module, so the unused construction was removed.
    """
    def _coles_split(name):
        # One contrastive dataset per split folder; each gets a fresh splitter
        # so train and validation do not share splitter state.
        files = ParquetFiles(os.path.join(path, name))
        return ColesIterableDataset(
            ParquetDataset(files, shuffle_files=True),
            splitter=SampleSlices(
                split_count=5,
                cnt_min=70,
                cnt_max=100,
            ),
        )

    return PtlsDataModule(
        train_data=_coles_split("train"),
        valid_data=_coles_split("eval"),
        train_num_workers=4,
        train_batch_size=512,
        valid_num_workers=4,
        valid_batch_size=512,
    )

In [12]:
def get_coles_model():
    """Create the early-fusion CoLES model.

    All four categorical features ('B', 'A', 'D', 'C') are embedded to 16
    dimensions each by one ``TrxEncoder`` and fed to an LSTM sequence encoder
    with hidden size 256, wrapped in a ``CoLESModule`` optimised with Adam
    (lr 0.001) and a ``StepLR`` schedule (step 5, gamma 0.9).

    Returns:
        A freshly initialised ``CoLESModule``.
    """
    # An earlier TransformerSeqEncoder variant (over features 's_feat_1' and
    # 'h_feat_1') used to sit here as disabled code; the LSTM encoder below is
    # the one in use.
    feature_sizes = (
        ('B', n_states),
        ('A', n_hidden_states),
        ('D', n_states),
        ('C', n_hidden_states),
    )
    trx_encoder = TrxEncoder(
        embeddings={name: {'in': size, 'out': 16} for name, size in feature_sizes},
        embeddings_noise=0.001,
    )
    seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='lstm')

    return CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    )

In [13]:
# Early-fusion CoLES: train the four-feature contrastive encoder for up to 50
# epochs, then score its embeddings with the LightGBM-based evaluation.
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="early_fusion_coles_" + str(noise_level))

# `coles_dl` is also reused by the later CoLES training cells.
coles_dl = get_coles_data(path_to_data)
coles_model = get_coles_model()

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
trainer.fit(coles_model, coles_dl)
# Drop the training log output so only the evaluation output remains in the cell.
clear_output()

# `sup_data` is the module-level data module left behind by the supervised
# cells above; it supplies the labelled train/validation batches.
# Last expression: the (train, eval, test) ROC AUC tuple is displayed.
test_embed_model(coles_model, sup_data, "early_fusion_coles")

[LightGBM] [Info] Number of positive: 128063, number of negative: 127937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500246 -> initscore=0.000984
[LightGBM] [Info] Start training from score 0.000984


(0.8042297443229891, 0.790790162128666, 0.7870238850449083)

# States Only CoLES

In [14]:
def get_coles_states_model():
    """Create the single-feature CoLES model.

    Only feature 'B' (input size ``n_states``) is embedded; the LSTM encoder
    (hidden size 256) and the optimiser/scheduler settings match the
    early-fusion CoLES model.

    Returns:
        A freshly initialised ``CoLESModule``.
    """
    trx_encoder = TrxEncoder(
        embeddings={'B': {'in': n_states, 'out': 16}},
        embeddings_noise=0.001,
    )
    seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='lstm')

    return CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    )

In [15]:
# States-only CoLES: same training setup as the early-fusion CoLES run, but the
# encoder only sees feature 'B'.
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="states_only_coles_" + str(noise_level))

# `coles_dl` is reused from the early-fusion CoLES cell above rather than rebuilt.
# `model_s` is kept at module level: the late-fusion cell uses it as its first model.
model_s = get_coles_states_model()

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
trainer.fit(model_s, coles_dl)
# Drop the training log output so only the evaluation output remains in the cell.
clear_output()

# `sup_data` is the module-level data module left behind by the supervised cells.
# Last expression: the (train, eval, test) ROC AUC tuple is displayed.
test_embed_model(model_s, sup_data, "states_only_coles")

[LightGBM] [Info] Number of positive: 128063, number of negative: 127937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099854 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500246 -> initscore=0.000984
[LightGBM] [Info] Start training from score 0.000984


(0.796303059419363, 0.7744222883801596, 0.7733039959007282)

# Late Fusion

In [16]:
def test_late_embed_model(models, sup_data, exp_name):
    """Evaluate late fusion: LightGBM on the concatenated embeddings of several models.

    Every model is run on the same batches, and their embeddings are
    concatenated along the last axis. The first half of the validation
    embeddings is held out as the test set, and the second half is passed to
    LightGBM as ``eval_set`` (no early stopping is configured here).

    Args:
        models: sequence of trained modules (iterated several times); calling
            each on a batch must return a 2-D tensor of embeddings.
        sup_data: data module whose ``train_dataloader()`` and
            ``val_dataloader()`` yield ``(x, y)`` batches.
        exp_name: experiment label written as the first field of the log row.

    Returns:
        Tuple ``(train_score, eval_score, test_score)`` of ROC AUC values.

    Side effects:
        Moves every model to ``cuda:<gpu_n>`` and appends the row
        ``exp_name,noise_level,test_score`` to ``path_to_log``.
    """
    device = 'cuda:' + str(gpu_n)
    for m in models:
        m.to(device)

    def _embed(dataloader):
        """Return (fused embeddings, targets) stacked over every batch of `dataloader`."""
        feats, targets = [], []
        with torch.no_grad():
            for x, y in dataloader:
                targets.append(y.numpy())
                per_model = [m(x.to(device)).cpu().numpy() for m in models]
                feats.append(np.concatenate(per_model, axis=-1))
        return np.concatenate(feats, axis=0), np.concatenate(targets, axis=0)

    # The modules may still be in training mode after `trainer.fit`; switch to
    # eval mode so training-only behaviour (dropout and, presumably, the
    # `embeddings_noise` configured on the encoders) does not leak into the
    # extracted embeddings.
    previous_modes = [m.training for m in models]
    for m in models:
        m.eval()
    try:
        xx, yy = _embed(sup_data.train_dataloader())
        xx_eval, yy_eval = _embed(sup_data.val_dataloader())
    finally:
        # Leave every module in whatever mode the caller had it in.
        for m, mode in zip(models, previous_modes):
            m.train(mode)

    # First half of the validation embeddings -> test, second half -> eval.
    n = xx_eval.shape[0] // 2
    xx_test, yy_test = xx_eval[:n], yy_eval[:n]
    xx_eval, yy_eval = xx_eval[n:], yy_eval[n:]

    clf = LGBMClassifier(max_depth=-1)
    clf.fit(xx, yy, eval_set=(xx_eval, yy_eval), eval_metric="auc")

    train_score = roc_auc_score(yy, clf.predict_proba(xx)[:, 1])
    eval_score = roc_auc_score(yy_eval, clf.predict_proba(xx_eval)[:, 1])
    test_score = roc_auc_score(yy_test, clf.predict_proba(xx_test)[:, 1])

    with open(path_to_log, "a") as f:
        f.write(f'{exp_name},{noise_level},{test_score}\n')

    return train_score, eval_score, test_score

In [17]:
def get_h_coles_models(letter, nn):
    """Create a single-feature CoLES model for an arbitrary feature.

    Args:
        letter: name of the categorical feature to embed.
        nn: embedding input size ('in') for that feature.

    Returns:
        A freshly initialised ``CoLESModule`` with an LSTM encoder (hidden size
        256) over a 16-dimensional embedding of the given feature.
    """
    trx_encoder = TrxEncoder(
        embeddings={letter: {'in': nn, 'out': 16}},
        embeddings_noise=0.001,
    )
    encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='lstm')

    return CoLESModule(
        seq_encoder=encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    )

In [None]:
# Late fusion: train one single-feature CoLES model per remaining feature and
# evaluate the concatenation of all unimodal embeddings. The 'B' model trained
# in the states-only CoLES cell (`model_s`) is reused as the first entry.
unimodal_models = [model_s]

# (feature name, embedding input size) for the features still to be trained.
remaining_features = [('A', n_hidden_states), ('C', n_hidden_states), ('D', n_states)]

for modality, input_size in remaining_features:
    tb_logger = TensorBoardLogger(
        save_dir="lightning_logs",
        name="h_states_only_coles_" + str(noise_level) + "_modality_" + modality,
    )
    model_h = get_h_coles_models(modality, input_size)

    trainer = pl.Trainer(
        max_epochs=50,
        gpus=[gpu_n],
        enable_progress_bar=False,
        logger=tb_logger,
    )
    trainer.fit(model_h, coles_dl)
    # Drop the training log output of this model before the next one starts.
    clear_output()
    unimodal_models.append(model_h)

# Last expression: the (train, eval, test) ROC AUC tuple is displayed.
test_late_embed_model(unimodal_models, sup_data, "late_fusion_coles")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: lightning_logs/h_states_only_coles_new_data_more_modalities_noise_0_modality_C
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 281 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
281 K     Trainable params
0         Non-trainable params
281 K     Total params
1.124     Total estimated model params size (MB)


In [None]:
#noise1 = (0.8007202911224365, 0.783103692626953, 0.780410511779785)

In [None]:
#noise05 = (0.9408758544464111, 0.9335008178710937, 0.9345132614135743)

In [None]:
#noise02 = (0.9177455666351317, 0.9057953903198244, 0.9047988479614257)

In [None]:
#noise0 = (0.9709027706604004, 0.9667222106933593, 0.9655987045288088)