In [75]:
import argparse
from multiprocessing import Process,Manager
import numpy as np
from scipy import sparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

import models
import data
import metric

In [None]:
# Set Configs

In [74]:
##  Set the random seed manually for reproductibility.
seed = 1
torch.manual_seed(seed)

In [4]:
# device = torch.device("cuda")
device = torch.device("cpu")

In [34]:
# Load Data
loader = data.DataLoader('ml-20m')

n_items = loader.load_n_items()
train_data = loader.load_data('train')
vad_data_tr, vad_data_te = loader.load_data('validation')
test_data_tr, test_data_te = loader.load_data('test')

N = train_data.shape[0]
idxlist = list(range(N))

print("# of items:{}".format(n_items))

# of items:20101


In [39]:
# Build the model

p_dims = [200, 600, n_items]
model = models.MultiVAE(p_dims).to(device)

print(f"Model Structure:{model}\n")
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.00)
criterion = models.loss_function

Model Structure:MultiVAE(
  (q_layers): ModuleList(
    (0): Linear(in_features=20101, out_features=600, bias=True)
    (1): Linear(in_features=600, out_features=400, bias=True)
  )
  (p_layers): ModuleList(
    (0): Linear(in_features=200, out_features=600, bias=True)
    (1): Linear(in_features=600, out_features=20101, bias=True)
  )
  (drop): Dropout(p=0.5, inplace=False)
)

Layer: q_layers.0.weight | Size: torch.Size([600, 20101]) | Values : tensor([[-0.0020, -0.0059, -0.0157,  ...,  0.0019,  0.0114,  0.0084],
        [ 0.0109,  0.0066, -0.0101,  ..., -0.0036, -0.0175, -0.0043]],
       grad_fn=<SliceBackward0>) 

Layer: q_layers.0.bias | Size: torch.Size([600]) | Values : tensor([-0.0006, -0.0001], grad_fn=<SliceBackward0>) 

Layer: q_layers.1.weight | Size: torch.Size([400, 600]) | Values : tensor([[-0.0276,  0.0433,  0.0281,  ..., -0.1540,  0.0572, -0.0212],
        [ 0.1012,  0.0527,  0.0555,  ...,  0.0192,  0.0471,  0.0580]],
       grad_fn=<SliceBackward0>) 

Layer: q_layers.

In [9]:
# TensorboardX Writer

writer = SummaryWriter()

In [None]:
# Train

In [24]:
BATCH_SIZE = 500
TOTAL_ANNEAL_STEPS = 200000
ANNEAL_CAP = 0.2
LOG_INTERVAL = 100
EPOCHS = 2
# EPOCHS = 200
SAVE_PATH = 'model.pt'

In [16]:
def sparse2torch_sparse(data):
    """
    Convert scipy sparse matrix to torch sparse tensor with L2 Normalization
    This is much faster than naive use of torch.FloatTensor(data.toarray())
    https://discuss.pytorch.org/t/sparse-tensor-use-cases/22047/2
    """
    samples = data.shape[0]
    features = data.shape[1]
    coo_data = data.tocoo()
    indices = torch.LongTensor([coo_data.row, coo_data.col])
    row_norms_inv = 1 / np.sqrt(data.sum(1))
    row2val = {i : row_norms_inv[i].item() for i in range(samples)}
    values = np.array([row2val[r] for r in coo_data.row])
    t = torch.sparse.FloatTensor(indices, torch.from_numpy(values).float(), [samples, features])
    return t

In [17]:
def naive_sparse2tensor(data):
    return torch.FloatTensor(data.toarray())

In [18]:
def train():
    # Turn on training mode
    model.train()
    train_loss = 0.0
    start_time = time.time()
    global update_count

    np.random.shuffle(idxlist)
    
    for batch_idx, start_idx in enumerate(range(0, N, BATCH_SIZE)):
        end_idx = min(start_idx + BATCH_SIZE, N)
        data = train_data[idxlist[start_idx:end_idx]]
        data = naive_sparse2tensor(data).to(device)

        if TOTAL_ANNEAL_STEPS > 0:
            anneal = min(ANNEAL_CAP, 
                            1. * update_count / TOTAL_ANNEAL_STEPS)
        else:
            anneal = ANNEAL_CAP

        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        
        loss = criterion(recon_batch, data, mu, logvar, anneal)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

        update_count += 1

        if batch_idx % LOG_INTERVAL == 0 and batch_idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                    'loss {:4.2f}'.format(
                        epoch, batch_idx, len(range(0, N, BATCH_SIZE)),
                        elapsed * 1000 / LOG_INTERVAL,
                        train_loss / LOG_INTERVAL))
            
            # Log loss to tensorboard
            n_iter = (epoch - 1) * len(range(0, N, BATCH_SIZE)) + batch_idx
            writer.add_scalars('data/loss', {'train': train_loss / LOG_INTERVAL}, n_iter)

            start_time = time.time()
            train_loss = 0.0

In [71]:
def evaluate(data_tr, data_te):
    # Turn on evaluation mode
    model.eval()
    total_loss = 0.0
    global update_count
    e_idxlist = list(range(data_tr.shape[0]))
    e_N = data_tr.shape[0]
    n100_list = []
    r20_list = []
    r50_list = []
    
    with torch.no_grad():
        for start_idx in range(0, e_N, BATCH_SIZE):
            end_idx = min(start_idx + BATCH_SIZE, N)
            data = data_tr[e_idxlist[start_idx:end_idx]]
            heldout_data = data_te[e_idxlist[start_idx:end_idx]]
    
            # cno : avoid users who have no clicks in heldout_data
            u_idxlist_wo_any_iteracts = [i for i, x in enumerate(heldout_data.toarray().sum(axis=1)) if x >0]
            data = data[u_idxlist_wo_any_iteracts]
            heldout_data = heldout_data[u_idxlist_wo_any_iteracts]
            
            data_tensor = naive_sparse2tensor(data).to(device)

            if TOTAL_ANNEAL_STEPS > 0:
                anneal = min(ANNEAL_CAP, 
                               1. * update_count / TOTAL_ANNEAL_STEPS)
            else:
                anneal = ANNEAL_CAP

            recon_batch, mu, logvar = model(data_tensor)

            loss = criterion(recon_batch, data_tensor, mu, logvar, anneal)
            total_loss += loss.item()

            # Exclude examples from training set
            recon_batch = recon_batch.cpu().numpy()
            recon_batch[data.nonzero()] = -np.inf

            n100 = metric.NDCG_binary_at_k_batch(recon_batch, heldout_data, 100)
            r20 = metric.Recall_at_k_batch(recon_batch, heldout_data, 20)
            r50 = metric.Recall_at_k_batch(recon_batch, heldout_data, 50)

            n100_list.append(n100)
            r20_list.append(r20)
            r50_list.append(r50)
 
    total_loss /= len(range(0, e_N, BATCH_SIZE))
    n100_list = np.concatenate(n100_list)
    r20_list = np.concatenate(r20_list)
    r50_list = np.concatenate(r50_list)

    return total_loss, np.mean(n100_list), np.mean(r20_list), np.mean(r50_list)

In [43]:
best_n100 = -np.inf
update_count = 0

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train()
        val_loss, n100, r20, r50 = evaluate(vad_data_tr, vad_data_te)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:4.2f}s | valid loss {:4.2f} | '
                'n100 {:5.3f} | r20 {:5.3f} | r50 {:5.3f}'.format(
                    epoch, time.time() - epoch_start_time, val_loss,
                    n100, r20, r50))
        print('-' * 89)

        n_iter = epoch * len(range(0, N, BATCH_SIZE))
        writer.add_scalars('data/loss', {'valid': val_loss}, n_iter)
        writer.add_scalar('data/n100', n100, n_iter)
        writer.add_scalar('data/r20', r20, n_iter)
        writer.add_scalar('data/r50', r50, n_iter)

        # Save the model if the n100 is the best we've seen so far.
        if n100 > best_n100:
            with open(SAVE_PATH, 'wb') as f:
                torch.save(model, f)
            best_n100 = n100

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


| epoch   1 |  100/ 233 batches | ms/batch 303.88 | loss 500.64
| epoch   1 |  200/ 233 batches | ms/batch 310.54 | loss 489.75
[[ 2.5297163  3.7215118  3.393741  ... -8.467041  -6.306715  -5.451406 ]
 [ 9.12305    6.7557364  9.549453  ... -5.997225  -3.2053828 -3.043894 ]
 [ 1.2308468  3.624673   3.4980795 ... -7.4248533 -5.880217  -6.5763063]
 ...
 [ 8.72814    7.5815115       -inf ... -6.526969  -3.9660323 -2.9830413]
 [ 7.0993967  8.468171   8.501972  ... -7.9216433 -4.5560913 -4.716062 ]
 [ 5.424831   5.341018   5.2050986 ... -9.023245  -5.535694  -5.905982 ]]
9849490
[[ 2.0144613  1.7551389  3.0682101 ... -6.842961  -5.2082124 -5.1703897]
 [ 2.1583645  5.862558   4.262864  ... -7.9869204 -6.508235  -5.5605593]
 [ 0.9931089       -inf  5.202327  ... -7.002199  -6.327378  -4.776379 ]
 ...
 [ 1.7197454  5.0536904  3.7882335 ... -8.038582  -6.2557335 -4.508086 ]
 [ 3.3442798  3.187984   3.17373   ... -5.679262  -2.9605439 -4.352285 ]
 [ 3.926261   6.18427         -inf ... -8.404306  

In [26]:
# Load the best saved model.
MODEL_PATH = SAVE_PATH
with open(SAVE_PATH, 'rb') as f:
    model = torch.load(f)

In [72]:
# Run on test data.
test_loss, n100, r20, r50 = evaluate(test_data_tr, test_data_te)
print('=' * 89)
print('| End of training | test loss {:4.2f} | n100 {:4.2f} | r20 {:4.2f} | '
        'r50 {:4.2f}'.format(test_loss, n100, r20, r50))
print('=' * 89)

| End of training | test loss 392.09 | n100 0.38 | r20 0.35 | r50 0.48


In [98]:
def gumbel_inverse(x):
    return -np.log(-np.log(x))

def evaluate_expectation(data_tr, data_te, n_sampling=1):
    # Turn on evaluation mode
    model.eval()
    total_loss = 0.0
    global update_count
    e_idxlist = list(range(data_tr.shape[0]))
    e_N = data_tr.shape[0]
    n100_list = []
    r20_list = []
    r50_list = []
    n100_list_per_sampling = []
    r20_list_per_sampling = []
    r50_list_per_sampling = []
    
    with torch.no_grad():
        for start_idx in range(0, e_N, BATCH_SIZE):
            end_idx = min(start_idx + BATCH_SIZE, N)
            data = data_tr[e_idxlist[start_idx:end_idx]]
            heldout_data = data_te[e_idxlist[start_idx:end_idx]]
    
            u_idxlist_wo_any_iteracts = [i for i, x in enumerate(heldout_data.toarray().sum(axis=1)) if x >0]
            data = data[u_idxlist_wo_any_iteracts]
            heldout_data = heldout_data[u_idxlist_wo_any_iteracts]
            
            data_tensor = naive_sparse2tensor(data).to(device)

            if TOTAL_ANNEAL_STEPS > 0:
                anneal = min(ANNEAL_CAP, 
                               1. * update_count / TOTAL_ANNEAL_STEPS)
            else:
                anneal = ANNEAL_CAP

            recon_batch, mu, logvar = model(data_tensor)

            loss = criterion(recon_batch, data_tensor, mu, logvar, anneal)
            total_loss += loss.item()

            # Exclude examples from training set
            recon_batch = recon_batch.cpu().numpy()
            # recon_batch[data.nonzero()] = -np.inf

            for l in range(n_sampling):
                # Add Gumbel samples
                np.random.seed(seed=l)
                recon_batch_gumbel_sampled = np.vectorize(np.log)(recon_batch) + np.vectorize(gumbel_inverse)(np.random.uniform(size=recon_batch.shape))
                # recon_batch_gumbel_sampled = recon_batch + np.vectorize(gumbel_inverse)(np.random.uniform(size=recon_batch.shape))
                recon_batch_gumbel_sampled[data.nonzero()] = -np.inf

                n100_list_per_sampling.append(metric.NDCG_binary_at_k_batch(recon_batch_gumbel_sampled, heldout_data, 100))
                r20_list_per_sampling.append(metric.Recall_at_k_batch(recon_batch_gumbel_sampled, heldout_data, 20))
                r50_list_per_sampling.append(metric.Recall_at_k_batch(recon_batch_gumbel_sampled, heldout_data, 50))

            n100_list.append(np.concatenate(n100_list_per_sampling))
            r20_list.append(np.concatenate(r20_list_per_sampling))
            r50_list.append(np.concatenate(r50_list_per_sampling))
    
    total_loss /= len(range(0, e_N, BATCH_SIZE))
    n100_list = np.concatenate(n100_list)
    r20_list = np.concatenate(r20_list)
    r50_list = np.concatenate(r50_list)

    return total_loss, n100_list, r20_list, r50_list

In [68]:
# Load the best saved model.
MODEL_PATH = SAVE_PATH
with open(SAVE_PATH, 'rb') as f:
    model = torch.load(f)

In [93]:
test_data_tr.size

575323

In [94]:
test_data_te.size

138922

In [73]:
# Run on test data.
test_loss, n100_list, r20_list, r50_list = evaluate_expectation(test_data_tr, test_data_te, n_sampling=10)
print('=' * 89)
print('| End of training | test loss {:4.2f} | n100 {:4.2f}({:4.2f}) | r20 {:4.2f}({:4.2f}) | '
        'r50 {:4.2f}({:4.2f})'.format(test_loss, np.mean(n100_list), np.std(n100_list)/np.sqrt(len(n100_list)), np.mean(r20_list), np.std(r20_list)/np.sqrt(len(r20_list)), np.mean(r50_list), np.std(r50_list)/np.sqrt(len(r50_list))))
print('=' * 89)

| End of training | test loss 392.09 | n100 0.19(0.00) | r20 0.15(0.00) | r50 0.29(0.00)


In [None]:
# Run on test data.
test_loss, n100_list, r20_list, r50_list = evaluate_expectation(test_data_tr, test_data_te, n_sampling=2)
print('=' * 89)
print('| End of training | test loss {:4.2f} | n100 {:4.2f}({:4.2f}) | r20 {:4.2f}({:4.2f}) | '
        'r50 {:4.2f}({:4.2f})'.format(test_loss, np.mean(n100_list), np.std(n100_list)/np.sqrt(len(n100_list)), np.mean(r20_list), np.std(r20_list)/np.sqrt(len(r20_list)), np.mean(r50_list), np.std(r50_list)/np.sqrt(len(r50_list))))
print('=' * 89)

In [97]:
np.mean(n100_list)

0.00028954382097746783

In [78]:
np.std(n100_list)/np.sqrt(len(n100_list))

0.00014261365424167457

In [77]:
np.mean(n100_list)

0.1937028339562343

In [87]:
a = np.random.uniform(size=[10,20])
a

array([[0.16077533, 0.07577172, 0.09345934, 0.87696311, 0.73854203,
        0.97151376, 0.23231911, 0.01203572, 0.79877261, 0.65784319,
        0.54884241, 0.99816819, 0.89532786, 0.03135016, 0.80002473,
        0.10875814, 0.90268886, 0.13096327, 0.46018749, 0.25533866],
       [0.56584901, 0.06728489, 0.55103154, 0.53706195, 0.84778836,
        0.55772548, 0.85846841, 0.11341186, 0.76989669, 0.39311703,
        0.83199332, 0.08058243, 0.38528321, 0.29842911, 0.75579901,
        0.5148304 , 0.1684486 , 0.61322506, 0.30408593, 0.63281014],
       [0.48189487, 0.91280326, 0.05150981, 0.11489613, 0.02540707,
        0.32868316, 0.53711695, 0.70723588, 0.24563607, 0.40671792,
        0.96080528, 0.97333487, 0.30816792, 0.30772908, 0.94006582,
        0.69751954, 0.55233989, 0.92514497, 0.45471685, 0.33267487],
       [0.91547462, 0.79492209, 0.21760611, 0.96670685, 0.83208177,
        0.53495635, 0.66099052, 0.5519144 , 0.11217892, 0.20314909,
        0.85909234, 0.18021768, 0.948474  , 0

In [88]:
np.vectorize(gumbel_inverse)(a)

array([[-6.03084264e-01, -9.47801091e-01, -8.62986511e-01,
         2.03034398e+00,  1.19376752e+00,  3.54391892e+00,
        -3.78192130e-01, -1.48611174e+00,  1.49308273e+00,
         8.70388805e-01,  5.10919089e-01,  6.30153416e+00,
         2.20214896e+00, -1.24200121e+00,  1.50007852e+00,
        -7.96889345e-01,  2.27908977e+00, -7.09433047e-01,
         2.53446483e-01, -3.11274954e-01],
       [ 5.63122921e-01, -9.92814511e-01,  5.17576305e-01,
         4.75391181e-01,  1.80105704e+00,  5.38045392e-01,
         1.87989980e+00, -7.77823433e-01,  1.34132502e+00,
         6.86558720e-02,  1.69319534e+00, -9.23653438e-01,
         4.73258105e-02, -1.89977898e-01,  1.27303785e+00,
         4.09596994e-01, -5.77244964e-01,  7.15345206e-01,
        -1.74327150e-01,  7.81792978e-01],
       [ 3.14670613e-01,  2.39431763e+00, -1.08720853e+00,
        -7.71832094e-01, -1.30093470e+00, -1.06754482e-01,
         4.75555936e-01,  1.06018698e+00, -3.39257098e-01,
         1.05765706e-01,  3.2

In [90]:
-np.log(-np.log(a))

array([[-6.03084264e-01, -9.47801091e-01, -8.62986511e-01,
         2.03034398e+00,  1.19376752e+00,  3.54391892e+00,
        -3.78192130e-01, -1.48611174e+00,  1.49308273e+00,
         8.70388805e-01,  5.10919089e-01,  6.30153416e+00,
         2.20214896e+00, -1.24200121e+00,  1.50007852e+00,
        -7.96889345e-01,  2.27908977e+00, -7.09433047e-01,
         2.53446483e-01, -3.11274954e-01],
       [ 5.63122921e-01, -9.92814511e-01,  5.17576305e-01,
         4.75391181e-01,  1.80105704e+00,  5.38045392e-01,
         1.87989980e+00, -7.77823433e-01,  1.34132502e+00,
         6.86558720e-02,  1.69319534e+00, -9.23653438e-01,
         4.73258105e-02, -1.89977898e-01,  1.27303785e+00,
         4.09596994e-01, -5.77244964e-01,  7.15345206e-01,
        -1.74327150e-01,  7.81792978e-01],
       [ 3.14670613e-01,  2.39431763e+00, -1.08720853e+00,
        -7.71832094e-01, -1.30093470e+00, -1.06754482e-01,
         4.75555936e-01,  1.06018698e+00, -3.39257098e-01,
         1.05765706e-01,  3.2

In [91]:
-np.log(-np.log(0.16077533))

-0.6030842547304646