In [21]:
import numpy as np
import pandas as pd
import os
import sys
import shutil
import datetime
import bottleneck as bn
import matplotlib.pyplot as plt
from scipy import sparse

import tensorflow as tf

In [None]:
# Expose only the GPU with index 1 to CUDA-based libraries in this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

# Load Data

In [None]:
def load_data(data_dir):
    """Load the item vocabulary and the train/validation interactions.

    Reads ``unique_sid.txt`` (one item id per line) plus ``train.csv``,
    ``vad_tr.csv`` and ``vad_te.csv`` from ``data_dir``. Each CSV must provide
    integer ``uid`` and ``sid`` columns, which are used directly as row and
    column indices.

    Returns
    -------
    tuple
        ``(unique_sid, n_items, data, data_tr, data_te)`` where ``unique_sid``
        is the list of item ids, ``n_items`` its length, and the last three
        are float64 CSR matrices with one column per item. ``data`` has
        ``max(uid) + 1`` rows; ``data_tr`` and ``data_te`` share the same row
        count and have their user ids shifted so the smallest validation uid
        becomes row 0.
    """
    def read_csv(file_name):
        # All interaction files live next to each other in data_dir.
        return pd.read_csv(os.path.join(data_dir, file_name))

    def to_csr(row_idx, col_idx, n_rows):
        # One stored entry of value 1 per (user, item) interaction.
        return sparse.csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)),
                                 dtype = 'float64', shape = (n_rows, n_items))

    with open(os.path.join(data_dir, 'unique_sid.txt'), 'r') as f:
        unique_sid = [line.strip() for line in f]
    n_items = len(unique_sid)

    train_df = read_csv('train.csv')
    vad_tr_df = read_csv('vad_tr.csv')
    vad_te_df = read_csv('vad_te.csv')

    n_users = train_df['uid'].max() + 1

    # Validation users occupy a contiguous uid range; re-base it at zero.
    first_uid = min(vad_tr_df['uid'].min(), vad_te_df['uid'].min())
    last_uid = max(vad_tr_df['uid'].max(), vad_te_df['uid'].max())
    n_vad_users = last_uid - first_uid + 1

    data = to_csr(train_df['uid'], train_df['sid'], n_users)
    data_tr = to_csr(vad_tr_df['uid'] - first_uid, vad_tr_df['sid'], n_vad_users)
    data_te = to_csr(vad_te_df['uid'] - first_uid, vad_te_df['sid'], n_vad_users)

    return unique_sid, n_items, data, data_tr, data_te

In [None]:
# Join the path components with the platform separator: the previous literal
# 'ml-20m\\data' only resolves on Windows (elsewhere the backslash is part of
# the file name). On Windows this yields the same string as before.
data_dir = os.path.join('ml-20m', 'data')
unique_sid, n_items, train_data, vad_data_tr, vad_data_te = load_data(data_dir)

# Build the Model

In [1]:
class WAE(object):
    """Wasserstein auto-encoder with an MMD latent penalty (TensorFlow 1.x graph mode).

    The encoder maps a batch of user-item vectors to latent codes, the decoder
    maps latent codes back to one score per item, and ``mmd_loss`` compares the
    encoded codes with samples drawn from a standard normal prior.

    Parameters
    ----------
    p_dims : list of int
        Decoder layer widths, latent size first and number of items last.
        The encoder uses the same widths in reverse order.
    lam : float
        Stored on the instance; not used by the current loss.
    lr : float
        Learning rate handed to the Adam optimizer.
    random_seed : int or None
        Seed passed to the variable initializers and to the prior sampler.
    """

    def __init__(self, p_dims, lam = 0.01, lr = 1e-3, random_seed = None):

        self.p_dims = p_dims
        self.q_dims = p_dims[::-1]
        # Full layer-width sequence input -> ... -> latent -> ... -> output.
        self.dims = self.q_dims + self.p_dims[1:]
        self.lam = lam
        self.lr = lr
        self.random_seed = random_seed
        self.construct_placeholders()

    def construct_placeholders(self):
        """Create the feedable inputs of the graph."""
        self.input_ph = tf.placeholder(dtype = tf.float32, shape = [None, self.dims[0]])
        # Defaults to 1.0, i.e. no dropout unless a smaller value is fed.
        self.keep_prob_ph = tf.placeholder_with_default(1.0, shape = None)
        self.is_training_ph = tf.placeholder_with_default(0., shape = None)
        # Kept so that feed_dicts which feed it keep working. The graph itself
        # takes the batch size from the tensors it operates on, so it can no
        # longer disagree with the number of rows actually fed.
        self.batch_size = tf.placeholder_with_default(500, shape = None)

    def _dense_stack(self, h, weights, biases):
        """Apply affine layers with tanh between them; the last layer stays linear."""
        last = len(weights) - 1
        for i, (w, b) in enumerate(zip(weights, biases)):
            h = tf.matmul(h, w) + b
            if i != last:
                h = tf.nn.tanh(h)
        return h

    def encoder(self):
        """Return latent codes for ``input_ph`` (rows L2-normalised, then dropout)."""
        h = tf.nn.dropout(tf.nn.l2_normalize(self.input_ph, 1), self.keep_prob_ph)
        return self._dense_stack(h, self.weights_q, self.biases_q)

    def decoder(self, z):
        """Map latent codes ``z`` to one unnormalised score per item."""
        return self._dense_stack(z, self.weights_p, self.biases_p)

    def _make_layers(self, tag, dims):
        """Create weight/bias variables (and histogram summaries) for consecutive widths in ``dims``."""
        weights, biases = [], []
        for i, (d_in, d_out) in enumerate(zip(dims[:-1], dims[1:])):
            weight_key = "weight_{}_{}to{}".format(tag, i, i+1)
            bias_key = "bias_{}_{}".format(tag, i+1)
            weights.append(tf.get_variable(name = weight_key, shape = [d_in, d_out],
                initializer = tf.contrib.layers.xavier_initializer(seed = self.random_seed)))
            biases.append(tf.get_variable(name = bias_key, shape = [d_out],
                initializer = tf.truncated_normal_initializer(stddev = 0.001, seed = self.random_seed)))

            tf.summary.histogram(weight_key, weights[-1])
            tf.summary.histogram(bias_key, biases[-1])
        return weights, biases

    def construct_weights(self):
        """Create encoder (``*_q``) and decoder (``*_p``) variables."""
        self.weights_q, self.biases_q = self._make_layers("q", self.q_dims)
        self.weights_p, self.biases_p = self._make_layers("p", self.p_dims)

    def mmd_loss(self, X, Y):
        """MMD-style discrepancy between two equally sized batches of latent codes.

        ``X`` and ``Y`` are 2-D tensors with the same number of rows. A sum of
        inverse multiquadratic kernels ``C / (C + ||a - b||^2)`` over several
        scales is evaluated on within-batch pairs (diagonal excluded) and on
        cross-batch pairs; the result is a scalar tensor.
        """
        n = tf.shape(X)[0]
        n_float = tf.cast(n, tf.float32)

        # Squared norms as column vectors, shape [n, 1].
        sq_x = tf.expand_dims(tf.reduce_sum(tf.square(X), axis = 1), 1)
        sq_y = tf.expand_dims(tf.reduce_sum(tf.square(Y), axis = 1), 1)

        # Pairwise squared distances ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 a_i.b_j.
        # The dot products must come from the full [n, d] matrices; multiplying
        # the per-row sums (as was done before) yields a single 1x1 value and
        # therefore not distances at all.
        dists_x = sq_x + tf.transpose(sq_x) - 2.0 * tf.matmul(X, X, transpose_b = True)
        dists_y = sq_y + tf.transpose(sq_y) - 2.0 * tf.matmul(Y, Y, transpose_b = True)
        dists_c = sq_x + tf.transpose(sq_y) - 2.0 * tf.matmul(X, Y, transpose_b = True)

        off_diag = 1.0 - tf.eye(n)
        stats = 0.0

        for scale in [.1, .2, .5, 1., 2., 5., 10.]:
            # Kernel width tied to the latent size of *this* model rather than
            # to a notebook-level variable.
            C = 2.0 * self.p_dims[0] * scale
            within = C / (C + dists_x) + C / (C + dists_y)
            # NOTE(review): the sums are divided by (n - 1) and n rather than
            # n(n - 1) and n^2, so the value is n times the usual unbiased MMD
            # estimate. Kept as in the original weighting -- confirm intended.
            within = tf.reduce_sum(off_diag * within) / (n_float - 1.0)
            cross = tf.reduce_sum(C / (C + dists_c)) * 2.0 / n_float
            stats += within - cross

        return stats

    def build_graph(self):
        """Build variables, forward pass, loss, optimizer and summaries.

        Returns
        -------
        tuple
            ``(saver, logits, total_loss, train_op, merged)``: a
            ``tf.train.Saver`` over the model variables, the decoder output
            for ``input_ph``, the scalar training loss, the Adam update op and
            the merged summary op.
        """
        self.construct_weights()
        z_real = self.encoder()
        # Prior samples matching the encoded batch in both dimensions.
        z_fake = tf.random_normal(shape = [tf.shape(z_real)[0], self.p_dims[0]],
                                  mean = 0.0, stddev = 1.0, seed = self.random_seed)
        # Reconstruct from the encoded input. Decoding the prior noise would
        # make the predictions independent of what is fed to input_ph.
        logits = self.decoder(z_real)

        saver = tf.train.Saver()

        mmd_loss = self.mmd_loss(z_real, z_fake)
        recon_loss = tf.reduce_mean(tf.squared_difference(self.input_ph, logits))
        # The discrepancy is a penalty, so it is added: minimising the total
        # pulls the encoded codes towards the prior instead of away from it.
        total_loss = recon_loss + mmd_loss
        train_op = tf.train.AdamOptimizer(self.lr).minimize(total_loss)

        tf.summary.scalar('mmd_loss', mmd_loss)
        tf.summary.scalar('total_loss', total_loss)
        merged = tf.summary.merge_all()

        return saver, logits, total_loss, train_op, merged

# Hyperparameters

In [None]:
# Training schedule.
n_epochs = 20
batch_size = 500
batch_size_vad = 500

# Training set: one row of train_data per user.
N = train_data.shape[0]
idxlist = list(range(N))
batches_per_epoch = int(np.ceil(N / float(batch_size)))

# Validation set: row indices into vad_data_tr (same row count as vad_data_te).
N_vad = vad_data_tr.shape[0]
idxlist_vad = list(range(N_vad))

# Evaluation Metrics

In [None]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k = 100):
    '''
    Normalized discounted cumulative gain@k for binary relevance, per row.

    X_pred is a dense 2-D score array and heldout_batch a sparse matrix with
    the same rows; returns a 1-D array with one NDCG value per row.
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    n_rows = X_pred.shape[0]
    row_sel = np.arange(n_rows)[:, np.newaxis]

    # Unordered top-k candidates per row, then order just those k by score.
    candidates = bn.argpartition(-X_pred, k, axis = 1)[:, :k]
    candidate_scores = X_pred[row_sel, candidates]
    order = np.argsort(-candidate_scores, axis = 1)
    ranked_items = candidates[row_sel, order]

    # Positional discount 1 / log2(rank + 1) for ranks 1..k.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    hits = heldout_batch[row_sel, ranked_items].toarray()
    DCG = (hits * discounts).sum(axis = 1)
    # Ideal DCG: all held-out items (at most k) placed at the top of the list.
    IDCG = np.array([discounts[:min(n, k)].sum() for n in heldout_batch.getnnz(axis = 1)])

    return DCG / IDCG

In [None]:
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Recall@k per row: held-out items found among the k highest-scored columns,
    divided by min(k, number of held-out items in that row).

    X_pred is a dense 2-D score array and heldout_batch a sparse matrix with
    the same shape; returns a 1-D array with one value per row.
    '''
    n_rows = X_pred.shape[0]

    # Boolean mask of the k best-scored columns in every row.
    topk_cols = bn.argpartition(-X_pred, k, axis = 1)[:, :k]
    recommended = np.zeros_like(X_pred, dtype = bool)
    recommended[np.arange(n_rows)[:, np.newaxis], topk_cols] = True

    relevant = (heldout_batch > 0).toarray()
    n_hits = np.logical_and(relevant, recommended).sum(axis = 1).astype(np.float32)

    return n_hits / np.minimum(k, relevant.sum(axis = 1))

In [None]:
# Decoder layer widths: latent size 200, hidden size 600, one output per item.
# WAE reverses this list to obtain the encoder widths.
p_dims = [200, 600, n_items]

# Build the model into a fresh default graph.
tf.reset_default_graph()
wae = WAE(p_dims, lam = 0.0, random_seed = 98765)
saver, logits_var, loss_var, train_op_var, merged_var = wae.build_graph()

# Validation summaries. Both tensors are fed from Python after each epoch's
# evaluation: ndcg_var with the mean NDCG, ndcg_dist_var with the per-user values.
ndcg_var = tf.Variable(0.0)
ndcg_dist_var = tf.placeholder(dtype = tf.float64, shape = None)
ndcg_summary = tf.summary.scalar('ndcg_at_k_validation', ndcg_var)
ndcg_dist_summary = tf.summary.histogram('ndcg_at_k_hist_validation', ndcg_dist_var)
# Merged separately from the training summaries so it can be run on its own.
merged_valid = tf.summary.merge([ndcg_summary, ndcg_dist_summary])

# Result Path

In [None]:
# Architecture tag built from the hidden layer widths of the model,
# e.g. "I-600-200-600-I" when p_dims is [200, 600, n_items].
arch_str = "I-%s-I" % ('-'.join([str(d) for d in wae.dims[1:-1]]))

# Take the timestamp once so the log and checkpoint directories of one run
# carry the same name; ':' and '.' are replaced to keep the name valid on
# Windows file systems.
run_stamp = str(datetime.datetime.today()).replace(':', '-').replace('.', '-')
run_name = arch_str + run_stamp

# os.path.join with os.sep reproduces the previous '\\log\\...' paths on
# Windows and gives a valid root-anchored path on other platforms.
log_dir = os.path.join(os.sep, 'log', 'ml-20m', 'wae', run_name)
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)
print("log directory: %s" % log_dir)

summary_writer = tf.summary.FileWriter(log_dir, graph = tf.get_default_graph())

ckpt_dir = os.path.join(os.sep, 'chkpt', 'ml-20m', 'wae', run_name)
if not os.path.isdir(ckpt_dir):
    os.makedirs(ckpt_dir)
print("ckpt directory: %s" % ckpt_dir)

# Train the Model

In [None]:
# Mean validation NDCG per epoch; plotted after training.
ndcgs_vad = []
with tf.Session() as sess:

    init = tf.global_variables_initializer()
    sess.run(init)

    best_ndcg = -np.inf
    update_count = 0.0

    for epoch in range(n_epochs):
        # New user order every epoch.
        np.random.shuffle(idxlist)
        print (epoch)
        # train for one epoch
        print ('begin training...')
        for bnum, st_idx in enumerate(range(0, N, batch_size)):
            end_idx = min(st_idx + batch_size, N)
            X = train_data[idxlist[st_idx : end_idx]]

            if sparse.isspmatrix(X):
                X = X.toarray()
            X = X.astype('float32')

            feed_dict = {wae.input_ph: X,
                         wae.keep_prob_ph: 0.6,
                         wae.is_training_ph: 1,
                         wae.batch_size : X.shape[0]}
            sess.run(train_op_var, feed_dict = feed_dict)

            if bnum % 100 == 0:
                # Progress is reported at the summary cadence instead of once
                # per batch, which flooded the cell output.
                print (bnum)
                try:
                    summary_train = sess.run(merged_var, feed_dict = feed_dict)
                    summary_writer.add_summary(summary_train, global_step = epoch * batches_per_epoch + bnum)
                except tf.errors.InvalidArgumentError as err:
                    # Summaries stay best-effort, but a failure is reported
                    # rather than silently dropped.
                    print ('skipping training summary for batch %d: %s' % (bnum, err))

            update_count += 1

        print ('begin evaluating...')

        # compute validation NDCG
        ndcg_dist = []
        for bnum, st_idx in enumerate(range(0, N_vad, batch_size_vad)):
            end_idx = min(st_idx + batch_size_vad, N_vad)
            X = vad_data_tr[idxlist_vad[st_idx:end_idx]]

            if sparse.isspmatrix(X):
                X = X.toarray()
            X = X.astype('float32')

            # Feed the actual batch size as well: the last validation batch
            # may have fewer rows than the placeholder's default of 500.
            pred_val = sess.run(logits_var, feed_dict = {wae.input_ph: X,
                                                         wae.batch_size: X.shape[0]})
            # exclude examples from training and validation (if any)
            pred_val[X.nonzero()] = -np.inf
            ndcg_dist.append(NDCG_binary_at_k_batch(pred_val, vad_data_te[idxlist_vad[st_idx : end_idx]]))

        ndcg_dist = np.concatenate(ndcg_dist)
        ndcg_ = ndcg_dist.mean()
        ndcgs_vad.append(ndcg_)
        merged_valid_val = sess.run(merged_valid, feed_dict = {ndcg_var: ndcg_, ndcg_dist_var : ndcg_dist})
        summary_writer.add_summary(merged_valid_val, epoch)
        # Push this epoch's events to disk so they survive an interrupted run.
        summary_writer.flush()

        print (ndcg_)
        # update the best model (if necessary)
        if ndcg_ > best_ndcg:
            saver.save(sess, '{}/model'.format(ckpt_dir))
            best_ndcg = ndcg_

# Plot the curves

In [None]:
# Validation NDCG@100 after each epoch.
fig, ax = plt.subplots(figsize = (12, 3))
ax.plot(ndcgs_vad)
ax.set_ylabel("Validation NDCG@100")
ax.set_xlabel("Epochs")