# Load Data

In [19]:
import numpy as np
import pandas as pd
import os
import sys
import shutil
import datetime
import bottleneck as bn
import matplotlib.pyplot as plt
from scipy import sparse

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

In [20]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [21]:
mainPath = 'C:\\Users\\nizhe\\Desktop\\python code\\ml-20m'

In [22]:
def load_data(data_dir):
    
    unique_sid = list()
    with open(os.path.join(data_dir, 'unique_sid.txt'), 'r') as f:
        for line in f:
            unique_sid.append(line.strip())
    
    n_items = len(unique_sid)
    
    tp = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    tp_tr = pd.read_csv(os.path.join(data_dir, 'vad_tr.csv'))
    tp_te = pd.read_csv(os.path.join(data_dir, 'vad_te.csv'))
    
    n_users = tp['uid'].max() + 1
    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())
    
    rows, cols = tp['uid'], tp['sid']
    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']
    
    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype = 'float64', shape=(n_users, n_items))
    data_tr = sparse.csr_matrix((np.ones_like(rows_tr), (rows_tr, cols_tr)), dtype = 'float64', shape = (end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype = 'float64', shape = (end_idx - start_idx + 1, n_items))
    
    return unique_sid, n_items, data, data_tr, data_te

In [23]:
data_dir = mainPath + '\\data'
unique_sid, n_items, train_data, vad_data_tr, vad_data_te = load_data(data_dir)

# Build the model

In [24]:
class MultiVAE(object):
    def __init__(self, p_dims, q_dims = None, lam = 0.01, lr = 1e-3, random_seed = None):
        self.p_dims = p_dims
        if q_dims is None:
            self.q_dims = p_dims[::-1]
        else:
            assert q_dims[0] == p_dims[-1], "Input and output dimension must equal each other for autoencoders."
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q-network mismatches."
            self.q_dims = q_dims
        self.dims = self.q_dims + self.p_dims[1:]
        self.lam = lam
        self.lr = lr
        self.random_seed = random_seed
        self.construct_placeholders()

    def construct_placeholders(self):
        self.input_ph = tf.placeholder(dtype = tf.float32, shape = [None, self.dims[0]])
        self.keep_prob_ph = tf.placeholder_with_default(1.0, shape = None)
        self.is_training_ph = tf.placeholder_with_default(0., shape = None)
        self.anneal_ph = tf.placeholder_with_default(1., shape = None)
        
    def q_graph(self):
        mu_q, std_q, KL = None, None, None
        h = tf.nn.dropout(tf.nn.l2_normalize(self.input_ph, 1), self.keep_prob_ph)
        
        for i, (w, b) in enumerate(zip(self.weights_q, self.biases_q)):
            h = tf.matmul(h, w) + b
    
            if i != len(self.weights_q) - 1:
                h = tf.nn.tanh(h)
            else:
                mu_q = h[:, :self.q_dims[-1]]
                logvar_q = h[:, self.q_dims[-1]:]
                std_q = tf.exp(0.5 * logvar_q)
                KL = tf.reduce_mean(tf.reduce_sum(0.5 * (-logvar_q + tf.exp(logvar_q) + mu_q**2 - 1), axis = 1))
        return mu_q, std_q, KL

    def p_graph(self, z):
        h = z
        
        for i, (w, b) in enumerate(zip(self.weights_p, self.biases_p)):
            h = tf.matmul(h, w) + b
            
            if i != len(self.weights_p) - 1:
                h = tf.nn.tanh(h)
        return h

    def forward_pass(self):
        # q-network
        mu_q, std_q, KL = self.q_graph()
        epsilon = tf.random_normal(tf.shape(std_q))

        sampled_z = mu_q + self.is_training_ph * epsilon * std_q

        # p-network
        logits = self.p_graph(sampled_z)
        
        return tf.train.Saver(), logits, KL

    def construct_weights(self):
        self.weights_q, self.biases_q = [], []
        
        for i, (d_in, d_out) in enumerate(zip(self.q_dims[:-1], self.q_dims[1:])):
            if i == len(self.q_dims[:-1]) - 1:
                # we need two sets of parameters for mean and variance,
                # respectively
                d_out *= 2
            weight_key = "weight_q_{}to{}".format(i, i+1)
            bias_key = "bias_q_{}".format(i+1)
            
            self.weights_q.append(tf.get_variable(
                name = weight_key, shape = [d_in, d_out],
                initializer = tf.contrib.layers.xavier_initializer(
                    seed = self.random_seed)))
            
            self.biases_q.append(tf.get_variable(
                name = bias_key, shape =[d_out],
                initializer = tf.truncated_normal_initializer(
                    stddev = 0.001, seed = self.random_seed)))
            
            # add summary stats
            tf.summary.histogram(weight_key, self.weights_q[-1])
            tf.summary.histogram(bias_key, self.biases_q[-1])
            
        self.weights_p, self.biases_p = [], []

        for i, (d_in, d_out) in enumerate(zip(self.p_dims[:-1], self.p_dims[1:])):
            weight_key = "weight_p_{}to{}".format(i, i+1)
            bias_key = "bias_p_{}".format(i+1)
            self.weights_p.append(tf.get_variable(
                name = weight_key, shape =[d_in, d_out],
                initializer = tf.contrib.layers.xavier_initializer(
                    seed = self.random_seed)))
            
            self.biases_p.append(tf.get_variable(
                name = bias_key, shape =[d_out],
                initializer = tf.truncated_normal_initializer(
                    stddev = 0.001, seed=self.random_seed)))
            
            # add summary stats
            tf.summary.histogram(weight_key, self.weights_p[-1])
            tf.summary.histogram(bias_key, self.biases_p[-1])
            
    def build_graph(self):
        self.construct_weights()

        saver, logits, KL = self.forward_pass()
        log_softmax_var = tf.nn.log_softmax(logits)

        neg_ll = -tf.reduce_mean(tf.reduce_sum(
            log_softmax_var * self.input_ph,
            axis = -1))
        # apply regularization to weights
        reg = l2_regularizer(self.lam)
        
        reg_var = apply_regularization(reg, self.weights_q + self.weights_p)
        # tensorflow l2 regularization multiply 0.5 to the l2 norm
        # multiply 2 so that it is back in the same scale
        neg_ELBO = neg_ll + self.anneal_ph * KL + 2 * reg_var
        
        train_op = tf.train.AdamOptimizer(self.lr).minimize(neg_ELBO)

        # add summary statistics
        tf.summary.scalar('negative_multi_ll', neg_ll)
        tf.summary.scalar('KL', KL)
        tf.summary.scalar('neg_ELBO_train', neg_ELBO)
        merged = tf.summary.merge_all()

        return saver, logits, neg_ELBO, train_op, merged

# Hyper parameter

In [25]:
N = train_data.shape[0]
idxlist = list(range(N))

# training batch size
batch_size = 500
batches_per_epoch = int(np.ceil(float(N) / batch_size))

N_vad = vad_data_tr.shape[0]
idxlist_vad = list(range(N_vad))

# validation batch size (since the entire validation set might not fit into GPU memory)
batch_size_vad = 2000

# the total number of gradient updates for annealing
total_anneal_steps = 200000
# largest annealing parameter
anneal_cap = 0.2

# Evaluation Metrics

In [26]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k = 100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    
    idx_topk_part = bn.argpartition(-X_pred, k, axis = 1)
    
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    
    idx_part = np.argsort(-topk_part, axis = 1)
    
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk].toarray() * tp).sum(axis = 1)
    IDCG = np.array([(tp[:min(n, k)]).sum() for n in heldout_batch.getnnz(axis = 1)])
    
    return DCG / IDCG

In [27]:
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis = 1)
    
    X_pred_binary = np.zeros_like(X_pred, dtype = bool)
    
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()
    
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis = 1)).astype(np.float32)
    
    recall = tmp / np.minimum(k, X_true_binary.sum(axis = 1))
    
    return recall

# Train the model

In [28]:
p_dims = [200, 600, n_items]

In [29]:
tf.reset_default_graph()

In [30]:
vae = MultiVAE(p_dims, lam = 0.0, random_seed = 98765)

In [31]:
saver, logits_var, loss_var, train_op_var, merged_var = vae.build_graph()

INFO:tensorflow:Scale of 0 disables regularizer.


In [32]:
ndcg_var = tf.Variable(0.0)

ndcg_dist_var = tf.placeholder(dtype = tf.float64, shape = None)

ndcg_summary = tf.summary.scalar('ndcg_at_k_validation', ndcg_var)

ndcg_dist_summary = tf.summary.histogram('ndcg_at_k_hist_validation', ndcg_dist_var)

merged_valid = tf.summary.merge([ndcg_summary, ndcg_dist_summary])

In [33]:
arch_str = "I-%s-I" % ('-'.join([str(d) for d in vae.dims[1:-1]]))

log_dir = mainPath + '\\log\\ml-20m\\VAE_anneal{}K_cap{:1.1E}\\{}'.format(total_anneal_steps / 1000, anneal_cap, arch_str) + str(datetime.datetime.today()).replace(':', '-').replace('.', '-')

if not os.path.isdir(log_dir):
    os.makedirs(log_dir)

print("log directory: %s" % log_dir)

summary_writer = tf.summary.FileWriter(log_dir, graph = tf.get_default_graph())

ckpt_dir = mainPath + '\\chkpt\\ml-20m\\VAE_anneal{}K_cap{:1.1E}\\{}'.format(total_anneal_steps / 1000, anneal_cap, arch_str) + str(datetime.datetime.today()).replace(':', '-').replace('.', '-')

if not os.path.isdir(ckpt_dir):
    os.makedirs(ckpt_dir) 
    
print("ckpt directory: %s" % ckpt_dir)


log directory: C:\Users\nizhe\Desktop\python code\ml-20m\log\ml-20m\VAE_anneal200.0K_cap2.0E-01\I-600-200-600-I2019-03-31 19-52-44-572157
ckpt directory: C:\Users\nizhe\Desktop\python code\ml-20m\chkpt\ml-20m\VAE_anneal200.0K_cap2.0E-01\I-600-200-600-I2019-03-31 19-52-44-633197


In [34]:
n_epochs = 20

In [35]:
train_data

<116677x20108 sparse matrix of type '<class 'numpy.float64'>'
	with 8538846 stored elements in Compressed Sparse Row format>

In [36]:
ndcgs_vad = []

with tf.Session() as sess:

    init = tf.global_variables_initializer()
    sess.run(init)

    best_ndcg = -np.inf

    update_count = 0.0
    
    for epoch in range(n_epochs):
        np.random.shuffle(idxlist)
        print (epoch)
        # train for one epoch
        print ('begin training...')
        for bnum, st_idx in enumerate(range(0, N, batch_size)):
            end_idx = min(st_idx + batch_size, N)
            X = train_data[idxlist[st_idx : end_idx]]
            
            if sparse.isspmatrix(X):
                X = X.toarray()
            X = X.astype('float32')           
            
            if total_anneal_steps > 0:
                anneal = min(anneal_cap, 1. * update_count / total_anneal_steps)
            else:
                anneal = anneal_cap
            
            feed_dict = {vae.input_ph: X, 
                         vae.keep_prob_ph: 0.5, 
                         vae.anneal_ph: anneal,
                         vae.is_training_ph: 1}        
            sess.run(train_op_var, feed_dict = feed_dict)

            if bnum % 100 == 0:
                summary_train = sess.run(merged_var, feed_dict=feed_dict)
                summary_writer.add_summary(summary_train, 
                                           global_step = epoch * batches_per_epoch + bnum) 
            
            update_count += 1
        
        print ('begin evaluating...')
        
        # compute validation NDCG
        ndcg_dist = []
        for bnum, st_idx in enumerate(range(0, N_vad, batch_size_vad)):
            end_idx = min(st_idx + batch_size_vad, N_vad)
            X = vad_data_tr[idxlist_vad[st_idx:end_idx]]

            if sparse.isspmatrix(X):
                X = X.toarray()
            X = X.astype('float32')
        
            pred_val = sess.run(logits_var, feed_dict={vae.input_ph: X} )
            # exclude examples from training and validation (if any)
            pred_val[X.nonzero()] = -np.inf
            ndcg_dist.append(NDCG_binary_at_k_batch(pred_val, vad_data_te[idxlist_vad[st_idx : end_idx]]))
        
        ndcg_dist = np.concatenate(ndcg_dist)
        ndcg_ = ndcg_dist.mean()
        ndcgs_vad.append(ndcg_)
        merged_valid_val = sess.run(merged_valid, feed_dict = {ndcg_var: ndcg_, ndcg_dist_var : ndcg_dist})
        summary_writer.add_summary(merged_valid_val, epoch)

        # update the best model (if necessary)
        if ndcg_ > best_ndcg:
            saver.save(sess, '{}/model'.format(ckpt_dir))
            best_ndcg = ndcg_


0
begin training...
begin evaluating...
1
begin training...
begin evaluating...
2
begin training...
begin evaluating...
3
begin training...
begin evaluating...
4
begin training...
begin evaluating...
5
begin training...
begin evaluating...
6
begin training...
begin evaluating...
7
begin training...
begin evaluating...
8
begin training...
begin evaluating...
9
begin training...


KeyboardInterrupt: 

In [None]:
plt.figure(figsize = (12, 3))
plt.plot(ndcgs_vad)
plt.ylabel("Validation NDCG@100")
plt.xlabel("Epochs")