In [None]:
%load_ext autoreload
%autoreload 2
#%pip install prettytable

In [None]:
# import basic package
import os
import pandas as pd
import warnings
import json

warnings.filterwarnings("ignore")
# import the necessary package
from baseline.PyOD import PYOD
from baseline.OE_GPLVM.aeb_gplvm import AEB_GPLVM, NNEncoder
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import VariationalELBO
from gpytorch.distributions import MultivariateNormal
from gpytorch.priors import NormalPrior, MultivariateNormalPrior

from utils.data_generator import DataGenerator
from utils.myutils import Utils
import matplotlib.pyplot as plt

plt.style.use("ggplot")
import torch
import numpy as np
from tqdm import trange

datagenerator = DataGenerator()  # data generator
utils = Utils()  # utils function

In [None]:
# Dataset identifiers available to this notebook; one of them is selected by
# position and assigned to `datagenerator.dataset` in the following cell.
dataset_list = [
    "01_ALOI",
    "02_annthyroid",
    "03_backdoor",
    "04_breastw",
    "05_campaign",
    "06_cardio",
    "07_Cardiotocography",
    "08_celeba",
    "09_census",
    "99_linear",
]

In [39]:
# Pick the dataset by position (index 1 is "02_annthyroid") and let the
# generator build `data`, which is read later through the keys
# "X_train", "X_test", "y_train" and "y_test".
dataset = dataset_list[1]
datagenerator.dataset = dataset
data = datagenerator.generator(la=1.0, realistic_synthetic_mode=None, noise_type=None)

current noise type: None
{'Samples': 7200, 'Features': 6, 'Anomalies': 534, 'Anomalies Ratio(%)': 7.42}


In [40]:
# Convert the generated splits to float32 tensors: the feature matrices become
# Y_* and the label vectors become lb_*.
Y_train, lb_train = (
    torch.tensor(data[key], dtype=torch.float32) for key in ("X_train", "y_train")
)
Y_test, lb_test = (
    torch.tensor(data[key], dtype=torch.float32) for key in ("X_test", "y_test")
)

In [125]:
# Seed the random number generators before anything is initialised, so that a
# fresh "Restart & Run All" reproduces the same initial parameters and samples.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Result containers and hyper-parameters.
model_dict = {}
noise_trace_dict = {}
N = len(Y_train)  # number of training points
data_dim = Y_train.shape[1]  # number of observed features
latent_dim = 2
n_inducing = 25
batch_size = 100
n_epochs = 1000
nn_layers = (10, 5)
lr = 0.01

# Define prior for X, initialise model, likelihood, elbo, optimizer and the latent variable encoder
X_prior_mean = torch.zeros(N, latent_dim)  # shape: N x Q
prior_x = MultivariateNormalPrior(X_prior_mean, torch.eye(X_prior_mean.shape[1]))
encoder = NNEncoder(N, latent_dim, prior_x, data_dim, layers=nn_layers)
model = AEB_GPLVM(N, data_dim, latent_dim, n_inducing, encoder, nn_layers=nn_layers)
likelihood = GaussianLikelihood()
# A single optimizer updates both the model and the likelihood parameters.
optimizer = torch.optim.Adam(
    [{"params": model.parameters()}, {"params": likelihood.parameters()}], lr
)
# combine_terms=False: calling `elbo` yields the individual terms
# (log_likelihood, kl_divergence, log_prior, added_loss) instead of one value.
elbo = VariationalELBO(likelihood, model, num_data=len(Y_train), combine_terms=False)
# Model params
print(f'Training model params for model {"GPLVM"}')
model.get_trainable_param_names()
# Training traces, appended to by the training loop.
loss_list = []
noise_trace = []

Training model params for model GPLVM
+-----------------------------------------------------------------------+------------+
|                                Modules                                | Parameters |
+-----------------------------------------------------------------------+------------+
|                  variational_strategy.inducing_points                 |     50     |
|    variational_strategy._variational_distribution.variational_mean    |    150     |
| variational_strategy._variational_distribution.chol_variational_covar |    3750    |
|                          X.mu_layers.0.weight                         |     60     |
|                           X.mu_layers.0.bias                          |     10     |
|                          X.mu_layers.1.weight                         |     50     |
|                           X.mu_layers.1.bias                          |     5      |
|                          X.mu_layers.2.weight                         |     10     |
|    

In [126]:
# Proof of concept: two hand-picked index ranges of the training set stand in
# for an "anomalous" group (a) and a "normal" group (n); the labels are not
# consulted here. `ratio` is the share of group (a) in the combined batch.
batch_idx_a = np.arange(1, 30)
batch_idx_n = np.arange(32, 100)
batch_index = np.hstack([batch_idx_a, batch_idx_n])
ratio = len(batch_idx_a) / len(batch_index)

In [127]:
# Push each index group, and their union, through the model.
# NOTE(review): sample_latent_variable(Y_train) is evaluated once per line; if it
# draws random samples, the three outputs come from different draws — confirm.
output_batch_a = model(model.sample_latent_variable(Y_train)[batch_idx_a])
output_batch_n = model(model.sample_latent_variable(Y_train)[batch_idx_n])
output_batch_full = model(model.sample_latent_variable(Y_train)[batch_index])

In [128]:
# `elbo` was created with combine_terms=False, so this displays a tuple of the
# individual terms rather than a single combined value.
elbo(output_batch_full, Y_train[batch_index].T)

(tensor([-1.4931, -1.2369, -1.2455, -1.2804, -1.3314, -1.2645],
        grad_fn=<DivBackward0>),
 tensor([3.2168e-09, 2.4599e-09, 1.8922e-09, 2.6491e-09, 2.0814e-09, 2.6491e-09],
        grad_fn=<DivBackward0>),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0.3833, 0.3833, 0.3833, 0.3833, 0.3833, 0.3833],
        grad_fn=<AddBackward0>))

In [129]:
# Observations of the proof-of-concept batch and the size of the full training
# set; both are read as globals by the helper functions defined next.
target = Y_train[batch_index]
n_train = len(Y_train)

In [138]:
def _expected_log_prob(target):
    """Return the likelihood's expected log-probability for ``target``.

    Uses the notebook-level ``model`` and ``likelihood``: ``target`` is mapped to
    latent variables, those are passed through the model, and the resulting
    distribution is scored against ``target.T``.
    """
    latent = model.sample_latent_variable(target)
    return likelihood.expected_log_prob(target.T, model(latent))

def _kl_divergence_variational(target):
    """Broadcast the scaled variational KL term to the shape of ``target.T``.

    Reads the notebook-level ``model``, ``batch_size`` and ``n_train``. The KL
    divergence of the variational strategy is divided by ``batch_size`` and then
    by ``n_train``, and added (by broadcasting) onto a zero tensor shaped like
    ``target.T``.
    """
    scaled_kl = (
        model.variational_strategy.kl_divergence().div(batch_size).div((n_train))
    )
    canvas = torch.zeros_like(target.T)
    # Add on the transposed view so the KL vector broadcasts along the last axis,
    # then transpose back to the layout of ``target.T``.
    return canvas.T.add_(scaled_kl).T

In [139]:
# Shape check: the broadcast KL term has the same shape as `target.T`.
_kl_divergence_variational(target).shape

torch.Size([6, 97])

torch.Size([6, 97])

In [106]:
# NOTE(review): `llk_fn` is not defined in any visible cell of this notebook, so
# this cell will fail on a fresh kernel — presumably it was an earlier version of
# a log-likelihood helper; confirm and replace or remove.
torch.zeros(llk_fn().shape)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.

In [None]:
# VariationalELBO.forward delegates to the superclass
# _ApproximateMarginalLogLikelihood, which returns
# log_likelihood - kl_divergence + log_prior - added_loss when asked to combine
# the terms (the DEFAULT). With combine_terms=False it returns a tuple holding
# each of them instead.
log_likelihood, kl_divergence, log_prior, added_loss = elbo(
    output_batch_full, Y_train[batch_index].T
)
terms = {
    "log_likelihood": log_likelihood,
    "kl_divergence": kl_divergence,
    "log_prior": log_prior,
    "added_loss": added_loss,
}
terms

In [None]:
# My question was whether I could split the likelihoods by group and still get
# back the original result; because of the KL terms this did not look trivial at
# first.
log_likelihood, kl_divergence, log_prior, added_loss = elbo(
    output_batch_n, Y_train[batch_idx_n].T
)
terms = {
    "log_likelihood": log_likelihood,
    "kl_divergence": kl_divergence,
    "log_prior": log_prior,
    "added_loss": added_loss,
}
terms

In [None]:
# Apparently the KL term and the added loss (which is also a KL) are constant
# across elbo calls, so I can simply store them and focus on the log_likelihood.
log_likelihood, kl_divergence, log_prior, added_loss = elbo(
    output_batch_n, Y_train[batch_idx_n].T
)
terms = {
    "log_likelihood": log_likelihood,
    "kl_divergence": kl_divergence,
    "log_prior": log_prior,
    "added_loss": added_loss,
}
terms

In [None]:
# This is the main method of the VariationalELBO class. The problem is that its
# result already comes summed, with shape 1 x DATA_DIM, whereas I need this term
# for each individual point of the batch / test set.
terms = dict(
    _log_likelihood_term=elbo._log_likelihood_term(
        output_batch_full, Y_train[batch_index].T
    )
)
terms

In [None]:
# I can get the full (un-summed) matrix by calling the likelihood directly.
exp_log_prob = elbo.likelihood.expected_log_prob(
    Y_train[batch_index].T, output_batch_full
)
print(exp_log_prob, exp_log_prob.shape)

In [None]:
# REMINDER: exp_log_prob.sum(-1).div(<number of batch points>) ==
#   elbo(target=Y_train[batch_index].T, variational_dist_f=output_batch_full)[0]
# IMPORTANT: watch the gradients — multiplying this expression by 1 is enough to
# see the issue.
# The divisor is kept in its own name so that the notebook-wide `batch_size`
# hyper-parameter (read by other cells and helpers) is not silently overwritten.
n_batch_points = len(batch_index)
exp_log_prob.sum(-1).div(n_batch_points)

In [None]:
# Total log-likelihood of the combined batch
log_likelihood_total = elbo(
    target=Y_train[batch_index].T, variational_dist_f=output_batch_full
)[0]

# After struggling with this for a long time, I realised that I can call the elbo
# once for each group (anomalous and normal) and then recover the total
# log_likelihood by taking a weighted sum of the two log_likelihoods.
# My concern here was that the class must be instantiated with num_data, but
# that argument does not refer to the batch size: it refers to the size of the
# whole training set. In all the GPLVM experiments it is instantiated with
# num_data = len(Y_train), so it does not interfere with the elbo computation.
# It is also worth noting that the KL terms do not change even when the batches
# change.

output_batch_a = model(model.sample_latent_variable(Y_train)[batch_idx_a])
output_batch_n = model(model.sample_latent_variable(Y_train)[batch_idx_n])
target_a = Y_train[batch_idx_a].T
target_n = Y_train[batch_idx_n].T
ll_normal = elbo(output_batch_n, target_n)[0] * (
    1 - ratio
)  # <- This is not the ELBO; it is the log-likelihood term --> note the [0]
ll_anomaly = elbo(output_batch_a, target_a)[0] * ratio

In [None]:
# Log-likelihood term obtained from the single call on the combined batch.
log_likelihood_total

In [None]:
# Ratio-weighted recombination of the two per-group log-likelihood terms, shown
# for comparison with `log_likelihood_total`.
ll_normal + ll_anomaly

In [None]:
def get_loe_index(score, ratio, batch_index):
    """Split a batch into a low-score group and a high-score group.

    The batch elements are ranked by ``score``. The ``ratio`` share with the
    highest scores is returned as the anomalous group and every remaining
    element as the normal group, so the two groups always form an exact,
    non-overlapping partition of the batch (also when scores are tied).

    Args:
        score: 1-D tensor with one score per batch element; larger values are
            treated as more anomalous.
        ratio: fraction of the batch to assign to the anomalous group. Values
            that would yield a count outside ``[0, len(score)]`` are clamped.
        batch_index: indexable (tensor or array) with one entry per batch
            element, mapping batch positions to dataset indices.

    Returns:
        Tuple ``(normal_indices, anomalous_indices)`` taken from ``batch_index``.
    """
    n_points = score.shape[0]
    # Round once and derive the other count by subtraction: truncating both
    # products independently can lose elements to floating-point error.
    n_anomalies = int(round(float(n_points * ratio)))
    n_anomalies = min(max(n_anomalies, 0), n_points)
    n_normal = n_points - n_anomalies

    # A single ascending ranking guarantees the two groups never share an element.
    order = torch.argsort(score)
    idx_n, idx_a = order[:n_normal], order[n_normal:]
    return batch_index[idx_n], batch_index[idx_a]

In [None]:
def get_log_likelihood_LOE(Y_train, lb_train, idx_n, idx_a, ratio, method="blind"):
    """Combine the log-likelihood terms of the normal and anomalous groups.

    Uses the notebook-level ``model`` and ``elbo``. ``elbo`` is expected to
    return its terms separately (it is built with ``combine_terms=False``), so
    ``[0]`` selects the log-likelihood term.

    Args:
        Y_train: training observations; rows are indexed by ``idx_n``/``idx_a``.
        lb_train: unused; kept so existing callers keep working.
        idx_n: indices of the points treated as normal.
        idx_a: indices of the points treated as anomalous.
        ratio: weight of the anomalous group; the normal group gets ``1 - ratio``.
        method: ``"blind"`` adds both weighted terms, ``"hard"`` subtracts the
            anomalous term, ``"soft"`` subtracts half of the sum of both terms
            from the normal term.

    Returns:
        The combined log-likelihood tensor.

    Raises:
        NotImplementedError: if ``method`` is not one of the three above.
    """
    # Fail before doing any model work when the method is unknown.
    if method not in ("blind", "hard", "soft"):
        raise NotImplementedError(f"Unknown LOE method: {method!r}")

    # Encode the training set once and slice it for both groups instead of
    # running the encoder twice.
    latent = model.sample_latent_variable(Y_train)
    output_batch_a = model(latent[idx_a])
    output_batch_n = model(latent[idx_n])

    ll_normal = elbo(output_batch_n, Y_train[idx_n].T)[0] * (1 - ratio)
    ll_anomaly = elbo(output_batch_a, Y_train[idx_a].T)[0] * ratio

    if method == "blind":
        return ll_normal + ll_anomaly
    if method == "hard":
        return ll_normal - ll_anomaly
    # method == "soft"
    return ll_normal - 0.5 * (ll_anomaly + ll_normal)

In [None]:
def elbo_loe(model, elbo, batch_size, Y_train, y_train, method="refine"):
    """Compute the negative ELBO loss for one training batch.

    Args:
        model: model exposing ``_get_normal_batch_idx``,
            ``_get_individual_batch_idx`` and ``sample_latent_variable``.
        elbo: objective that returns the tuple
            ``(log_likelihood, kl_divergence, log_prior, added_loss)`` when called.
        batch_size: number of points requested for the batch.
        Y_train: training observations.
        y_train: training labels, passed to the model's batch selectors.
        method: ``"refine"`` uses the batch from ``_get_normal_batch_idx``
            as-is; ``"hard"``, ``"soft"`` and ``"blind"`` split the batch with
            ``get_loe_index`` and combine the per-group log-likelihoods with
            ``get_log_likelihood_LOE`` using that same method.

    Returns:
        Scalar tensor: ``-(log_likelihood - kl_divergence + log_prior - added_loss).sum()``.

    Raises:
        NotImplementedError: if ``method`` is not one of the four supported values.
    """
    if method not in ("refine", "hard", "soft", "blind"):
        raise NotImplementedError(f"Unknown elbo_loe method: {method!r}")

    if method == "refine":
        batch_index = model._get_normal_batch_idx(batch_size, y_train)
        output_batch = model(model.sample_latent_variable(Y_train)[batch_index])

        log_likelihood, kl_divergence, log_prior, added_loss = elbo(
            output_batch, Y_train[batch_index].T
        )
    else:
        batch_index, ratio = model._get_individual_batch_idx(batch_size, y_train)
        Y_target = Y_train[batch_index]
        output_batch = model(model.sample_latent_variable(Y_train)[batch_index])

        # Only the KL / prior / added-loss terms are kept from this call; the
        # log-likelihood is rebuilt per group below.
        _, kl_divergence, log_prior, added_loss = elbo(output_batch, Y_target.T)
        likelihood_per_point = elbo.likelihood.expected_log_prob(
            Y_target.T, output_batch
        ).sum(0)
        # NOTE(review): get_loe_index treats the largest scores as anomalous, yet a
        # log-likelihood is passed here — confirm whether the sign should be flipped.
        loe_idx_n, loe_idx_a = get_loe_index(likelihood_per_point, ratio, batch_index)
        # Forward `method`; without it every LOE variant falls back to "blind".
        # NOTE(review): get_log_likelihood_LOE reads the notebook-level `model` and
        # `elbo`, not the arguments given to this function.
        log_likelihood = get_log_likelihood_LOE(
            Y_train, y_train, loe_idx_n, loe_idx_a, ratio, method=method
        )

    return -(log_likelihood - kl_divergence + log_prior - added_loss).sum()

In [None]:
# Optimise the model and likelihood parameters on the "refine" objective.
# The number of iterations comes from the `n_epochs` hyper-parameter rather than
# a literal, so the configuration cell stays the single source of truth.
batch_size = 100
iterator = trange(n_epochs, leave=True)
for i in iterator:
    # Put both modules in training mode; an earlier evaluation run may have left
    # them in eval mode.
    model.train()
    likelihood.train()
    optimizer.zero_grad()
    loss = elbo_loe(model, elbo, batch_size, Y_train, lb_train, method="refine")
    loss_value = loss.item()
    loss_list.append(loss_value)
    noise_trace.append(np.round(likelihood.noise_covar.noise.item(), 3))
    if i % 50 == 0:
        # Refresh the progress-bar text every 50 iterations only.
        iterator.set_description(
            "Loss: " + str(float(np.round(loss_value, 2))) + ", iter no: " + str(i)
        )
    loss.backward()
    optimizer.step()

# Persist the training trace on the model and keep the noise trace / latent means.
model.store(loss_list, likelihood)
noise_trace_dict["GPLVM"] = noise_trace
X_train_mean = model.get_X_mean(Y_train)

In [None]:
# Score every test point with its summed expected log-probability under the
# reconstruction distribution.
model.eval()
likelihood.eval()
# Run the whole inference inside no_grad; previously only the .eval() calls were
# inside the context, so the predictions were computed with gradients tracked.
with torch.no_grad():
    y_pred_mean, y_pred_covar = model.reconstruct_y(Y_test)
    dist_pred = MultivariateNormal(y_pred_mean, y_pred_covar)
    # Sum over the first axis to get one value per test point.
    elbo_per_point = elbo.likelihood.expected_log_prob(Y_test.T, dist_pred).sum(0)

In [None]:
# Evaluate the per-point scores against the test labels.
# NOTE(review): `elbo_per_point` is a summed expected log-probability; ranking
# metrics usually expect larger scores for anomalies — confirm the orientation
# that utils.metric expects.
utils.metric(y_true=lb_test, y_score=elbo_per_point.detach().numpy())

In [None]:
## sample -> nn_encoder --> model.decoder --> reconstructed sample
## i.e. the sample becomes a latent variable and is then passed through the
## model to become something in data space again
X_test_mean, X_test_covar = model.predict_latent(Y_test)
Y_test_mean, Y_test_covar = model.reconstruct_y(Y_test)

In [None]:
def scatter_Y_test(Y_test, lb_test):
    """Scatter the first two rows of ``Y_test_mean``, split by label.

    Reads the notebook-level ``Y_test_mean``; the ``Y_test`` argument is accepted
    but not used. Points whose label equals 1 and points whose label equals 0 are
    drawn as two separate, labelled series on the current matplotlib axes.

    Args:
        Y_test: unused; kept so existing callers keep working.
        lb_test: labels, compared against 0 and 1 to select the columns.
    """
    idx_n = np.where(lb_test == 0)
    idx_a = np.where(lb_test == 1)
    # Name each series so the legend tells the two groups apart.
    plt.scatter(Y_test_mean[0, idx_a], Y_test_mean[1, idx_a], label="anomaly")
    plt.scatter(Y_test_mean[0, idx_n], Y_test_mean[1, idx_n], label="normal")
    plt.xlabel("Y_test_mean[0]")
    plt.ylabel("Y_test_mean[1]")
    plt.legend()

In [None]:
# Draw the label-split scatter plot for the test set.
scatter_Y_test(Y_test, lb_test)

In [None]:
# seed for reproducible results
# NOTE(review): nothing after this cell reads `seed`, and the commented-out block
# below refers to names (clf, name, df_AUCROC, df_AUCPR) that are not defined in
# the visible cells — confirm whether it can be removed.
seed = 42

# clf = clf(seed=seed, model_name=name)
# clf = clf.fit(X_train=data["X_train"], y_train=data["y_train"])
# score = clf.predict_score(data["X_test"])
# result = utils.metric(y_true=data["y_test"], y_score=score)
# df_AUCROC.loc[dataset, name] = result["aucroc"]
# df_AUCPR.loc[dataset, name] = result["aucpr"]