In [1]:
%load_ext autoreload
%autoreload 2

#import multiprocessing
#multiprocessing.set_start_method('fork')

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import os
from collections import defaultdict
from tqdm import tqdm
from IPython.display import clear_output

import logging
import torch
import pytorch_lightning as pl
import warnings
import numpy as np
import pandas as pd


from functools import partial
from ptls.data_load.datasets import SyntheticDataset, ParquetFiles, ParquetDataset
from ptls.frames.supervised import SeqToTargetDataset, SeqToTargetIterableDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from functools import partial


from ptls.data_load.datasets import SyntheticDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset, ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices


from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head, TransformerSeqEncoder, SphereHead
from ptls.frames.coles import CoLESModule, MultiCoLESModule
from ptls.frames.coles.losses import CLUBLoss, MultiContrastiveLoss, MultiLoss, ContrastiveLoss
from ptls.frames.coles.metric import MultiBatchRecallTopK, BatchAccuracy
from ptls.frames.coles.sampling_strategies import HardNegativePairSelector
from ptls.nn.normalization import L2NormEncoder


import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

from ptls.data_load.datasets import Config, MonoTargetSyntheticDatasetWriter


warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

from time import sleep

In [2]:
gpu_n = 1
n_models=1

n_s = 8
n_hs = 4
seq_len = 1024

a_sat = 0.
b_sat = 1.
noise = 0.

n_states = n_s ** 2
n_hidden_states = n_hs ** 2

name = "corr_new_data_"+ "_".join([str(x) for x in [a_sat, b_sat, noise]])
path_to_data = "syndata/" + name + "/"
noise_level = name
path_to_log = name + ".txt"

#with open(path_to_log, "w") as f:
#    pass

In [3]:
def get_config(noise):
    chain_confs = {
        "A": (n_hs, 4, noise),
        "B": (n_s, 16, 0),
    }

    state_from = ["A"]
    state_to = ["B"]

    labeling_conf = {
        0: {"A": a_sat,
            "B": b_sat}
    }

    config = Config(chain_confs, state_from, state_to, labeling_conf)
    return config


config = get_config(noise)
writer = MonoTargetSyntheticDatasetWriter(config, path=path_to_data, seq_len=seq_len,
                                          n_train_files=250, n_eval_files=50, n_test_files=0,
                                          train_per_file=256*4, eval_per_file=256*4, test_per_file=256*4,
                                          save_config_name="corr_config", save=False, load=True, n_procs=4)
#writer.write_dataset()

In [4]:
def test_sup_model(model, sup_data, exp_name):
    model.to('cuda:' + str(gpu_n))
    
    yy_pred_eval, yy_eval = list(), list()
    dl = iter(sup_data.val_dataloader())
    for batch in dl:

        with torch.no_grad():
            x, y = batch
            yy_eval.append(y.numpy())
            y_pred = model(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy()
            yy_pred_eval.append(y_pred)

    yy_pred_eval = np.concatenate(yy_pred_eval, axis=0)[:, 1]
    yy_eval = np.concatenate(yy_eval, axis=0)

    score = roc_auc_score(yy_eval, yy_pred_eval)

    with open(path_to_log, "a") as f:
        f.writelines([f'{exp_name},{noise_level},{score}\n'])
    
    return score

In [5]:
def get_sup_data(path):
    train_files = ParquetFiles(os.path.join(path, "train"))
    train_dataset = ParquetDataset(train_files, shuffle_files=True)
    eval_files = ParquetFiles(os.path.join(path, "eval"))
    eval_dataset = ParquetDataset(eval_files, shuffle_files=True)
    test_files = ParquetFiles(os.path.join(path, "test"))
    test_dataset = ParquetDataset(test_files, shuffle_files=True)
    
    sup_data = PtlsDataModule(
        train_data=SeqToTargetIterableDataset(train_dataset, target_col_name='class_label', target_dtype=torch.long),
        valid_data=SeqToTargetIterableDataset(eval_dataset, target_col_name='class_label', target_dtype=torch.long),
        #test_data=SeqToTargetIterableDataset(test_dataset, target_col_name='class_label', target_dtype=torch.long),
        train_batch_size=512,
        valid_batch_size=512,
        #test_batch_size=256,
        train_num_workers=4,
        valid_num_workers=4,
        #test_num_workers=4
    )
    return sup_data

In [6]:
def get_sup_model():
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                'B': {'in': n_states, 'out': 16},
                'A': {'in': n_hidden_states, 'out': 16},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=256,
        type='lstm',
    )

    sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.Accuracy(),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
    )
    return sup_module

In [7]:
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="early_fusion_supervised_" + str(noise_level))
'''
trainer = pl.Trainer(
    max_epochs=13,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
'''
sup_data = get_sup_data(path_to_data)
#sup_model = get_sup_model()

#trainer.fit(sup_model, sup_data)
#clear_output()

#test_sup_model(sup_model, sup_data, "early_fusion_supervised")

# States Only Supervised

In [8]:
def get_states_sup_model():
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                'B': {'in': n_states, 'out': 16},
                #'A': {'in': n_hidden_states, 'out': 16},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=256,
        type='lstm',
    )

    sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.Accuracy(),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.9),
    )
    return sup_module

In [9]:
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="states_only_supervised_" + str(noise_level))
'''
trainer = pl.Trainer(
    max_epochs=13,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
'''
#sup_data = get_sup_data(path_to_data)
#sup_model = get_states_sup_model()

#trainer.fit(sup_model, sup_data)
#clear_output()

#test_sup_model(sup_model, sup_data, "states_only_supervised")

'\ntrainer = pl.Trainer(\n    max_epochs=13,\n    gpus=[gpu_n],\n    enable_progress_bar=False,\n    logger=tb_logger\n)\n'

# Early Fusion

In [10]:
def test_embed_model(model, sup_data, exp_name):
    model.to('cuda:' + str(gpu_n))
    
    xx, yy = list(), list()
    dl = iter(sup_data.train_dataloader())
    for batch in dl:
    
        with torch.no_grad():
            x, y = batch
            yy.append(y.numpy())

            x_list = list()
            if model.trained_models is not None:
                x_list.extend([m(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy() for m in model.trained_models])
            x_list.append(model(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy())
            
            x = np.concatenate(x_list, axis=-1)
            xx.append(x)

    xx = np.concatenate(xx, axis=0)
    yy = np.concatenate(yy, axis=0)
    
    xx_eval, yy_eval = list(), list()
    dl = iter(sup_data.val_dataloader())
    for batch in dl:

        with torch.no_grad():
            x, y = batch
            yy_eval.append(y.numpy())
            
            x_list = list()
            if model.trained_models is not None:
                x_list.extend([m(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy() for m in model.trained_models])
            x_list.append(model(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy())
            
            x = np.concatenate(x_list, axis=-1)
            xx_eval.append(x)

    xx_eval = np.concatenate(xx_eval, axis=0)
    yy_eval = np.concatenate(yy_eval, axis=0)

    n = int(xx_eval.shape[0]/2)
    xx_test = xx_eval[:n]
    yy_test = yy_eval[:n]

    xx_eval = xx_eval[n:]
    yy_eval = yy_eval[n:]
    
    
    clf = LGBMClassifier(max_depth=-1)
    clf.fit(xx, yy, eval_set=(xx_eval, yy_eval), eval_metric="auc")
    
    y_pred = clf.predict_proba(xx)[:,1]
    train_score = roc_auc_score(yy, y_pred)
    
    y_pred = clf.predict_proba(xx_eval)[:,1]
    eval_score = roc_auc_score(yy_eval, y_pred)
    
    y_pred = clf.predict_proba(xx_test)[:,1]
    test_score = roc_auc_score(yy_test, y_pred)

    with open(path_to_log, "a") as f:
        f.writelines([f'{exp_name},{noise_level},{test_score}\n'])
    
    return train_score, eval_score, test_score

In [11]:
def get_coles_data(path):
    train_files = ParquetFiles(os.path.join(path, "train"))
    train_dataset = ParquetDataset(train_files, shuffle_files=True)
    eval_files = ParquetFiles(os.path.join(path, "eval"))
    eval_dataset = ParquetDataset(eval_files, shuffle_files=True)
    test_files = ParquetFiles(os.path.join(path, "test"))
    test_dataset = ParquetDataset(test_files, shuffle_files=True)

    train_dl = PtlsDataModule(
        train_data=ColesIterableDataset(
            train_dataset,
            splitter=SampleSlices(
                split_count=5,
                cnt_min=70,
                cnt_max=100,
            ),
        ),
        valid_data=ColesIterableDataset(
            eval_dataset,
            splitter=SampleSlices(
                split_count=5,
                cnt_min=70,
                cnt_max=100,),
        ),
        train_num_workers=4,
        train_batch_size=512,
        valid_num_workers=4,
        valid_batch_size=512,
    )
    
    return train_dl

In [12]:
def get_coles_model():
    coles_loss = ContrastiveLoss(margin=1,
                                 sampling_strategy=HardNegativePairSelector(neg_count=5))
    club_loss = CLUBLoss(log_var=-2, emb_coef=1, prob_coef=1.)
    metric = MultiBatchRecallTopK(n=2,
                                  K=4,
                                  metric='cosine',)

    discriminator_model = torch.nn.Sequential(torch.nn.Linear(128, 512), torch.nn.BatchNorm1d(512), torch.nn.ReLU(), 
                                              torch.nn.Linear(512, 512), torch.nn.BatchNorm1d(512), torch.nn.ReLU(),
                                              torch.nn.Linear(512, 128), L2NormEncoder())
    
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                'B': {'in': n_states, 'out': 16},
                'A': {'in': n_hidden_states, 'out': 16},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=128,
        type='lstm',
    )
    
    model = MultiCoLESModule(
        head=Head(use_norm_encoder=True),
        #validation_metric = metric,
        loss=coles_loss,
        discriminator_loss=club_loss,
        seq_encoder=seq_encoder,
        discriminator=discriminator_model,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        d_optimizer_partial=partial(torch.optim.Adam, lr=0.01),
        trained_encoders=['first_model.pth'],
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
        coles_coef=1.,
        embed_coef=0.011,
        g_step_every=1
    )
    return model

In [13]:
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="multi_early_fusion_coles_" + str(noise_level))

coles_dl = get_coles_data(path_to_data)
coles_model = get_coles_model()

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
trainer.fit(coles_model, coles_dl)
clear_output()

test_embed_model(coles_model, sup_data, "multi_early_fusion_coles")

[LightGBM] [Info] Number of positive: 128000, number of negative: 128000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.231626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


(0.6643004255981445, 0.6079705287590461, 0.6174298368513145)

In [14]:
test_embed_model(coles_model, sup_data, "multi_early_fusion_coles")

[LightGBM] [Info] Number of positive: 128000, number of negative: 128000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


(0.664540282409668, 0.6125706067170339, 0.6167011075980888)

In [15]:
test_embed_model(coles_model, sup_data, "multi_early_fusion_coles")

[LightGBM] [Info] Number of positive: 128000, number of negative: 128000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.327252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


(0.6646085024414061, 0.6151257985962694, 0.6139756519995754)

In [16]:
test_embed_model(coles_model, sup_data, "multi_early_fusion_coles")

[LightGBM] [Info] Number of positive: 128000, number of negative: 128000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.511083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 256000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


(0.6646171783447266, 0.6149339119727834, 0.6136042424911665)

In [15]:
coles_model.embed_coef = 0.
coles_model.coles_coef = 0.

In [18]:
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="multi_early_fusion_coles_" + str(noise_level))

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
trainer.fit(coles_model, coles_dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 84.4 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | discriminator_loss | CLUBLoss        | 0     
4 | trained_models     | ModuleList      | 84.4 K
5 | discriminator      | Sequential      | 396 K 
6 | _head              | Head            | 0     
-------------------------------------------------------
480 K     Trainable params
84.4 K    Non-trainable params
565 K     Total params
2.260     Total estimated model params size (MB)
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f8bdafe6cb0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-pack

In [15]:
test_embed_model(coles_model, sup_data, "multi_early_fusion_coles")

Exception ignored in: <function _releaseLock at 0x7fb8931835b0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
Exception ignored in: <function _releaseLock at 0x7fb8931835b0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 

KeyboardInterrupt



# States Only Coles

In [14]:
def get_coles_states_model():
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                'B': {'in': n_states, 'out': 16},
                #'A': {'in': n_hidden_states, 'out': 16},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=256,
        type='lstm',
    )
    
    model = CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    )
    return model

In [15]:
tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="states_only_coles_" + str(noise_level))

#coles_dl = get_coles_data(path_to_data)
model_s = get_coles_states_model()

trainer = pl.Trainer(
    max_epochs=50,
    gpus=[gpu_n],
    enable_progress_bar=False,
    logger=tb_logger
)
trainer.fit(model_s, coles_dl)
clear_output()

test_embed_model(model_s, sup_data, "states_only_coles")

ValueError: Expected 2D array, got 1D array instead:
array=[ 0.55528545 -0.06221164  0.16263163 ... -0.20286767 -0.3722022
  0.6936094 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Late Fusion

In [None]:
def test_late_embed_model(models, sup_data, exp_name):
    for m in models:
        m.to('cuda:' + str(gpu_n))
    
    xx, yy = list(), list()
    dl = iter(sup_data.train_dataloader())
    for batch in dl:
    
        with torch.no_grad():
            x, y = batch
            yy.append(y.numpy())
            xi = list()
            for m in models:
                x_m = m(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy()
                xi.append(x_m)
            x = np.concatenate(xi, axis=-1)
            xx.append(x)

    xx = np.concatenate(xx, axis=0)
    yy = np.concatenate(yy, axis=0)
    
    xx_eval, yy_eval = list(), list()
    dl = iter(sup_data.val_dataloader())
    for batch in dl:

        with torch.no_grad():
            x, y = batch
            yy_eval.append(y.numpy())
            xi = list()
            for m in models:
                x_m = m(x.to('cuda:' + str(gpu_n))).detach().cpu().numpy()
                xi.append(x_m)
            x = np.concatenate(xi, axis=-1)
            xx_eval.append(x)

    xx_eval = np.concatenate(xx_eval, axis=0)
    yy_eval = np.concatenate(yy_eval, axis=0)

    n = int(xx_eval.shape[0]/2)
    xx_test = xx_eval[:n]
    yy_test = yy_eval[:n]

    xx_eval = xx_eval[n:]
    yy_eval = yy_eval[n:]
    
    
    clf = LGBMClassifier(max_depth=-1)
    clf.fit(xx, yy, eval_set=(xx_eval, yy_eval), eval_metric="auc")
    
    y_pred = clf.predict_proba(xx)[:,1]
    train_score = roc_auc_score(yy, y_pred)
    
    y_pred = clf.predict_proba(xx_eval)[:,1]
    eval_score = roc_auc_score(yy_eval, y_pred)
    
    y_pred = clf.predict_proba(xx_test)[:,1]
    test_score = roc_auc_score(yy_test, y_pred)

    with open(path_to_log, "a") as f:
        f.writelines([f'{exp_name},{noise_level},{test_score}\n'])
    
    return train_score, eval_score, test_score

In [None]:
def get_h_coles_models(letter, nn):
    seq_encoder_h = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings={
                letter: {'in': nn, 'out': 16},
            },
            embeddings_noise=0.001,
        ),
        hidden_size=256,
        type='lstm',
    )
    
    model_h = CoLESModule(
        seq_encoder=seq_encoder_h,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    )
    return model_h

In [None]:
unimodal_models = [model_s]
for l, ni in zip(['A'], [n_hidden_states]):
    tb_logger = TensorBoardLogger(save_dir="lightning_logs", name="h_states_only_coles_" + str(noise_level) + "_modality_" + l)
    model_h = get_h_coles_models(l, ni)
    
    trainer = pl.Trainer(
        max_epochs=50,
        gpus=[gpu_n],
        enable_progress_bar=False,
        logger=tb_logger
    )
    trainer.fit(model_h, coles_dl)
    clear_output()
    unimodal_models.append(model_h)

test_late_embed_model(unimodal_models, sup_data, "late_fusion_coles")

In [None]:
#noise1 = (0.8007202911224365, 0.783103692626953, 0.780410511779785)

In [None]:
#noise05 = (0.9408758544464111, 0.9335008178710937, 0.9345132614135743)

In [None]:
#noise02 = (0.9177455666351317, 0.9057953903198244, 0.9047988479614257)

In [None]:
#noise0 = (0.9709027706604004, 0.9667222106933593, 0.9655987045288088)