<a href="https://colab.research.google.com/github/flywithu/cornac/blob/master/examples/RecVAE_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [216]:
!pip install cornac==1.17 bottleneck



In [217]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

In [218]:
FILE_PREFIX="."
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  FILE_PREFIX="/content/drive/MyDrive/mycornac"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [219]:
import sys
# sys.path.insert(0,'/content/drive/MyDrive/daicon/msr')
if "." not in sys.path:
  sys.path.insert(0,FILE_PREFIX)

In [220]:
# !wget https://raw.githubusercontent.com/flywithu/RecVAE/master/utils.py -O utils.py

In [221]:
!pwd

/content


In [222]:
print (sys.path)

['/content/drive/MyDrive/mycornac', '/content/drive/MyDrive/mycornac', '/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython']


In [260]:


def load_train_data(csv_file, n_items, n_users, global_indexing=False):
    tp = pd.read_csv(csv_file)

    n_users = n_users if global_indexing else tp['uid'].max() + 1

    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float64',
                             shape=(n_users, n_items))
    return data


def load_tr_te_data(csv_file_tr, csv_file_te, n_items, n_users, global_indexing=False):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    if global_indexing:
        start_idx = 0
        end_idx = len(unique_uid) - 1
    else:
        start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
        end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                             (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te


def get_data(dataset, global_indexing=False):
    unique_sid = list()
    with open(os.path.join(dataset, 'unique_sid.txt'), 'r') as f:
        for line in f:
            unique_sid.append(line.strip())

    unique_uid = list()
    with open(os.path.join(dataset, 'unique_uid.txt'), 'r') as f:
        for line in f:
            unique_uid.append(line.strip())

    n_items = len(unique_sid)
    n_users = len(unique_uid)

    train_data = load_train_data(os.path.join(dataset, 'train.csv'), n_items, n_users, global_indexing=global_indexing)


    vad_data_tr, vad_data_te = load_tr_te_data(os.path.join(dataset, 'validation_tr.csv'),
                                               os.path.join(dataset, 'validation_te.csv'),
                                               n_items, n_users,
                                               global_indexing=global_indexing)

    test_data_tr, test_data_te = load_tr_te_data(os.path.join(dataset, 'test_tr.csv'),
                                                 os.path.join(dataset, 'test_te.csv'),
                                                 n_items, n_users,
                                                 global_indexing=global_indexing)

    data = train_data, vad_data_tr, vad_data_te, test_data_tr, test_data_te
    data = (x.astype('float32') for x in data)

    return data


def ndcg(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''

    #X_pred : array
    #heldout_batch : tuple
    # print(f"Xpred :{len(X_pred)} ")
    # print(f"heldout_batch: {len(heldout_batch)} ")
    # print(f"SHAPE : {X_pred.shape}")
    # print(f"Xpred:0 : {np.isnan(X_pred).sum()}")
    # print(f"Xpred:inf : {np.isinf(X_pred).sum()}")

    # print(f"heldout_batch:0 : {heldout_batch.size - heldout_batch.getnnz()}")

    batch_users = X_pred.shape[0]
    # print("BN")
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    # print(f"idx_topk_part {idx_topk_part.shape}")
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])

    # print(f"DCG: {DCG.shape}")
    # print(f"DCG.Nan: {np.isnan(DCG).sum()}")
    # print(f"DCG.INf: {np.isinf(DCG).sum()}")
    # print(f"DCG.0: {np.sum(DCG==0)}")

    # print(f"IDCG: {IDCG.shape}")
    # print(f"IDCG.Nan: {np.isnan(IDCG).sum()}")
    # print(f"IDCG.INf: {np.isinf(IDCG).sum()}")
    # print(f"IDCG.0: {np.sum(IDCG==0)}")


    value = DCG / IDCG

    # print(f"value: {value.shape}")
    # print(f"value.Nan: {np.isnan(value).sum()}")
    # print(f"value.INf: {np.isinf(value).sum()}")

    value = np.nan_to_num(value, nan=0.0)

    return value


def recall(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))

    # print(f"value: {recall.shape}")
    # print(f"value.Nan: {np.isnan(recall).sum()}")
    # print(f"value.INf: {np.isinf(recall).sum()}")

    recall = np.nan_to_num(recall, nan=0.0)

    return recall






In [261]:
import numpy as np
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F


def swish(x):
    return x.mul(torch.sigmoid(x))

def log_norm_pdf(x, mu, logvar):
    return -0.5*(logvar + np.log(2 * np.pi) + (x - mu).pow(2) / logvar.exp())


class CompositePrior(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, mixture_weights=[3/20, 3/4, 1/10]):
        super(CompositePrior, self).__init__()

        self.mixture_weights = mixture_weights

        self.mu_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.mu_prior.data.fill_(0)

        self.logvar_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.logvar_prior.data.fill_(0)

        self.logvar_uniform_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.logvar_uniform_prior.data.fill_(10)

        self.encoder_old = Encoder(hidden_dim, latent_dim, input_dim)
        self.encoder_old.requires_grad_(False)

    def forward(self, x, z):
        post_mu, post_logvar = self.encoder_old(x, 0)

        stnd_prior = log_norm_pdf(z, self.mu_prior, self.logvar_prior)
        post_prior = log_norm_pdf(z, post_mu, post_logvar)
        unif_prior = log_norm_pdf(z, self.mu_prior, self.logvar_uniform_prior)

        gaussians = [stnd_prior, post_prior, unif_prior]
        gaussians = [g.add(np.log(w)) for g, w in zip(gaussians, self.mixture_weights)]

        density_per_gaussian = torch.stack(gaussians, dim=-1)

        return torch.logsumexp(density_per_gaussian, dim=-1)


class Encoder(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, eps=1e-1):
        super(Encoder, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.ln3 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.ln4 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc5 = nn.Linear(hidden_dim, hidden_dim)
        self.ln5 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x, dropout_rate):
        x = F.normalize(x)
        x = F.dropout(x, dropout_rate, training=self.training)

        h1 = self.ln1(swish(self.fc1(x)))
        h2 = self.ln2(swish(self.fc2(h1) + h1))
        h3 = self.ln3(swish(self.fc3(h2) + h1 + h2))
        h4 = self.ln4(swish(self.fc4(h3) + h1 + h2 + h3))
        h5 = self.ln5(swish(self.fc5(h4) + h1 + h2 + h3 + h4))
        return self.fc_mu(h5), self.fc_logvar(h5)


class VAE(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim):
        super(VAE, self).__init__()

        self.encoder = Encoder(hidden_dim, latent_dim, input_dim)
        self.prior = CompositePrior(hidden_dim, latent_dim, input_dim)
        self.decoder = nn.Linear(latent_dim, input_dim)

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5*logvar)
            eps = torch.zeros_like(std).normal_(mean=0, std=0.01)
            return mu + eps * std
        else:
            return mu

    def forward(self, user_ratings, beta=None, gamma=1, dropout_rate=0.5, calculate_loss=True):
        mu, logvar = self.encoder(user_ratings, dropout_rate=dropout_rate)
        z = self.reparameterize(mu, logvar)
        x_pred = self.decoder(z)

        # iszero = torch.sum(torch.isnan(x_pred)).item()
        # print(f"z ::: {iszero}")
        # import time
        # time.sleep(10)


        if calculate_loss:
            if gamma:
                norm = user_ratings.sum(dim=-1)
                kl_weight = gamma * norm
            elif beta:
                kl_weight = beta

            mll = (F.log_softmax(x_pred, dim=-1) * user_ratings).sum(dim=-1).mean()
            kld = (log_norm_pdf(z, mu, logvar) - self.prior(user_ratings, z)).sum(dim=-1).mul(kl_weight).mean()
            negative_elbo = -(mll - kld)



            return (mll, kld), negative_elbo

        else:
            return x_pred

    def update_prior(self):
        self.prior.encoder_old.load_state_dict(deepcopy(self.encoder.state_dict()))

In [262]:
import numpy as np

import torch
from torch import optim

import random
from copy import deepcopy
import os
import pandas as pd
from scipy import sparse

import bottleneck as bn


# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('--dataset', type=str, default='/content/drive/MyDrive/mycornac')
# parser.add_argument('--hidden-dim', type=int, default=600)
# parser.add_argument('--latent-dim', type=int, default=200)
# parser.add_argument('--batch-size', type=int, default=500)
# parser.add_argument('--beta', type=float, default=None)
# parser.add_argument('--gamma', type=float, default=0.005)
# parser.add_argument('--lr', type=float, default=5e-4)
# parser.add_argument('--n-epochs', type=int, default=50)
# parser.add_argument('--n-enc_epochs', type=int, default=3)
# parser.add_argument('--n-dec_epochs', type=int, default=1)
# parser.add_argument('--not-alternating', type=bool, default=False)
# args = parser.parse_args()

seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

data = get_data('/content/drive/MyDrive/mycornac/data/20m/pro_sg')
train_data, valid_in_data, valid_out_data, test_in_data, test_out_data = data
train_data = train_data[:-477]

In [263]:
print(train_data.shape)

(116000, 20101)


In [264]:
print(valid_in_data.shape)

(10000, 20101)


In [265]:
print(valid_out_data.shape)

(10000, 20101)


In [266]:
print(test_in_data.shape)

(10000, 20101)


In [267]:
print(test_out_data.shape)

(10000, 20101)


In [270]:
hidden_dim = 600
latent_dim = 200
batch_size = 500
beta = None
gamma = 0.005
lr = 5e-4
n_epochs = 50
n_enc_epochs = 3
n_dec_epochs = 1
not_alternating = False
def generate(batch_size, device, data_in, data_out=None, shuffle=False, samples_perc_per_epoch=1):
    assert 0 < samples_perc_per_epoch <= 1

    total_samples = data_in.shape[0]
    samples_per_epoch = int(total_samples * samples_perc_per_epoch)

    if shuffle:
        idxlist = np.arange(total_samples)
        np.random.shuffle(idxlist)
        idxlist = idxlist[:samples_per_epoch]
    else:
        idxlist = np.arange(samples_per_epoch)

    for st_idx in range(0, samples_per_epoch, batch_size):
        end_idx = min(st_idx + batch_size, samples_per_epoch)
        idx = idxlist[st_idx:end_idx]

        # idx_len = len(idx)
        # print(f"end_idx:{end_idx}")
        # if idx_len != 500:
        #   print(f"idx:{len(idx)}")
        #   break

        yield Batch(device, idx, data_in, data_out)


class Batch:
    def __init__(self, device, idx, data_in, data_out=None):
        self._device = device
        self._idx = idx
        self._data_in = data_in
        self._data_out = data_out

    def get_idx(self):
        return self._idx

    def get_idx_to_dev(self):
        return torch.LongTensor(self.get_idx()).to(self._device)

    def get_ratings(self, is_out=False):
        data = self._data_out if is_out else self._data_in
        return data[self._idx]

    def get_ratings_to_dev(self, is_out=False):
        return torch.Tensor(
            self.get_ratings(is_out).toarray()
        ).to(self._device)


def evaluate(model, data_in, data_out, metrics, samples_perc_per_epoch=1, batch_size=500):
    metrics = deepcopy(metrics)
    model.eval()

    for m in metrics:
        m['score'] = []

    for batch in generate(batch_size=batch_size,
                          device=device,
                          data_in=data_in,
                          data_out=data_out,
                          samples_perc_per_epoch=samples_perc_per_epoch
                         ):

        ratings_in = batch.get_ratings_to_dev()
        ratings_out = batch.get_ratings(is_out=True)



        ratings_pred = model(ratings_in, calculate_loss=False).cpu().detach().numpy()



        if not (data_in is data_out):
            # print("Pred : INF")
            ratings_pred[batch.get_ratings().nonzero()] = -np.inf

        for m in metrics:
            m['score'].append(m['metric'](ratings_pred, ratings_out, k=m['k']))

    for m in metrics:
        m['score'] = np.concatenate(m['score']).mean()

    return [x['score'] for x in metrics]


def run(model, opts, train_data, batch_size, n_epochs, beta, gamma, dropout_rate):
    model.train()
    for epoch in range(n_epochs):
        # print(f'epoch {epoch}')
        for batch in generate(batch_size=batch_size, device=device, data_in=train_data, shuffle=True):
            ratings = batch.get_ratings_to_dev()
            for optimizer in opts:
                optimizer.zero_grad()

            _, loss = model(ratings, beta=beta, gamma=gamma, dropout_rate=dropout_rate)
            loss.backward()

            for optimizer in opts:
                optimizer.step()


model_kwargs = {
    'hidden_dim': hidden_dim,
    'latent_dim': latent_dim,
    'input_dim': train_data.shape[1]
}
metrics = [{'metric': ndcg, 'k': 100}]

best_ndcg = -np.inf
train_scores, valid_scores = [], []

model = VAE(**model_kwargs).to(device)
model_best = VAE(**model_kwargs).to(device)

learning_kwargs = {
    'model': model,
    'train_data': train_data,
    'batch_size': batch_size,
    'beta': beta,
    'gamma': gamma
}

decoder_params = set(model.decoder.parameters())
encoder_params = set(model.encoder.parameters())

optimizer_encoder = optim.Adam(encoder_params, lr=lr)
optimizer_decoder = optim.Adam(decoder_params, lr=lr)


for epoch in range(n_epochs):

    if not_alternating:
        run(opts=[optimizer_encoder, optimizer_decoder], n_epochs=1, dropout_rate=0.5, **learning_kwargs)
    else:
        run(opts=[optimizer_encoder], n_epochs=n_enc_epochs, dropout_rate=0.5, **learning_kwargs)
        model.update_prior()
        run(opts=[optimizer_decoder], n_epochs=n_dec_epochs, dropout_rate=0, **learning_kwargs)

    # print("TrainScore")
    train_scores.append(
        evaluate(model, train_data, train_data, metrics, 0.01)[0]
    )
    # print("ValidScore")

    valid_scores.append(
        evaluate(model, valid_in_data, valid_out_data, metrics, 1)[0]
    )

    if valid_scores[-1] > best_ndcg:
        best_ndcg = valid_scores[-1]
        model_best.load_state_dict(deepcopy(model.state_dict()))


    print(f'epoch {epoch} | valid ndcg@100: {valid_scores[-1]:.4f} | ' +
          f'best valid: {best_ndcg:.4f} | train ndcg@100: {train_scores[-1]:.4f}')



test_metrics = [{'metric': ndcg, 'k': 100}, {'metric': recall, 'k': 20}, {'metric': recall, 'k': 50}]

final_scores = evaluate(model_best, test_in_data, test_out_data, test_metrics)

for metric, score in zip(test_metrics, final_scores):
    print(f"{metric['metric'].__name__}@{metric['k']}:\t{score:.4f}")

  value = DCG / IDCG


epoch 0 | valid ndcg@100: 0.3092 | best valid: 0.3092 | train ndcg@100: 0.6656
epoch 1 | valid ndcg@100: 0.3784 | best valid: 0.3784 | train ndcg@100: 0.7431
epoch 2 | valid ndcg@100: 0.3944 | best valid: 0.3944 | train ndcg@100: 0.7557
epoch 3 | valid ndcg@100: 0.4015 | best valid: 0.4015 | train ndcg@100: 0.7668
epoch 4 | valid ndcg@100: 0.4060 | best valid: 0.4060 | train ndcg@100: 0.7722
epoch 5 | valid ndcg@100: 0.4079 | best valid: 0.4079 | train ndcg@100: 0.7743
epoch 6 | valid ndcg@100: 0.4114 | best valid: 0.4114 | train ndcg@100: 0.7814
epoch 7 | valid ndcg@100: 0.4124 | best valid: 0.4124 | train ndcg@100: 0.7836
epoch 8 | valid ndcg@100: 0.4128 | best valid: 0.4128 | train ndcg@100: 0.7833
epoch 9 | valid ndcg@100: 0.4135 | best valid: 0.4135 | train ndcg@100: 0.7842
epoch 10 | valid ndcg@100: 0.4158 | best valid: 0.4158 | train ndcg@100: 0.7893
epoch 11 | valid ndcg@100: 0.4159 | best valid: 0.4159 | train ndcg@100: 0.7908
epoch 12 | valid ndcg@100: 0.4170 | best valid: 0.

  recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))


ndcg@100:	0.4282
recall@20:	0.4009
recall@50:	0.5322
