In [None]:
import sys
import os

In [1]:
import pandas as pd
import numpy as np

# Instructions for data pre-processing

### Generate the train, validation and test sets with the pre-processing method described, then point `DATA_DIR` below at the folder containing the resulting files.

In [3]:
# Replace the following two paths before running the notebook.
# DATA_DIR is read by later cells for: train.csv, validation_tr.csv, test_tr.csv,
# validation_te.csv, test_te.csv, show2id.txt, genreyear2id.csv and movies_genres_year.csv.
DATA_DIR = 'input_path'
# EXP_DIR receives the run's outputs under logs/, tensorflow_output/ and tensorflow_models/.
EXP_DIR = 'output_path'

In [5]:
# Training interactions; the constant 'type' column lets the split be recovered after concatenation.
train_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), header=0)
    .assign(type='train')
)

In [6]:
# Validation users' interactions from validation_tr.csv, labelled with type == 'validation'.
validation_tr_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'validation_tr.csv'), header=0)
    .assign(type='validation')
)

In [7]:
# Test users' interactions from test_tr.csv, labelled with type == 'test'.
test_tr_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'test_tr.csv'), header=0)
    .assign(type='test')
)

In [8]:
train_df.uid.min(), train_df.uid.max(), validation_tr_df.uid.min(), validation_tr_df.uid.max(),\
test_tr_df.uid.min(), test_tr_df.uid.max()

(0, 116676, 116677, 126676, 126677, 136676)

In [9]:
# User ids are used directly as row indices of the interaction matrix and of the
# per-user model variables, so the row count must be (largest uid + 1).
# Derived from the loaded splits instead of being hard-coded for one dataset.
num_users = int(max(train_df.uid.max(),
                    validation_tr_df.uid.max(),
                    test_tr_df.uid.max())) + 1

In [10]:
# Stack the three splits into one frame; the 'type' column keeps track of the origin.
# Row labels are not reset here, so index values repeat across the three splits.
all_tr_df = pd.concat([train_df, validation_tr_df, test_tr_df])

In [11]:
all_tr_df.head()

Unnamed: 0,uid,sid,type
0,116149,0,train
1,116149,1,train
2,116149,2,train
3,116149,3,train
4,116149,4,train


In [12]:
# (uid, sid) pairs of the validation users that are later turned into the
# evaluation matrix by load_vad_te_data.
validation_te_df = pd.read_csv(os.path.join(DATA_DIR, 'validation_te.csv'), header = 0)

In [13]:
validation_te_df.head()

Unnamed: 0,uid,sid
0,123737,3
1,123737,135
2,120983,126
3,120983,272
4,120983,245


In [14]:
# (uid, sid) pairs of the test users that are later turned into the
# evaluation matrix by load_vad_te_data.
test_te_df = pd.read_csv(os.path.join(DATA_DIR, 'test_te.csv'), header = 0)

In [15]:
test_te_df.head()

Unnamed: 0,uid,sid
0,134677,239
1,134677,504
2,134677,367
3,128041,125
4,128041,235


In [16]:
# Column names are supplied here, so every line of show2id.txt is read as data
# (no header row is consumed). 'sid' is the item index used throughout the notebook.
show_2_id_df = pd.read_csv(os.path.join(DATA_DIR, 'show2id.txt'), names=['movieId', 'sid'])

In [17]:
# Item ids index columns of the interaction matrix, so the column count is max sid + 1.
num_items = int(show_2_id_df.sid.max()) + 1
# 1 = item available, 0 = censored; every item starts out available.
availability = np.ones(num_items, dtype=np.int32)
# Only the last expression of a cell is displayed, so the echo goes at the end.
num_items

In [18]:
model_censored = False

In [19]:
if model_censored:
    # Fixed seed so that the same 1000 titles are censored on every run.
    np.random.seed(0)
    cold_start_titles = np.random.choice(pd.unique(test_te_df['sid']), 1000, replace=False)
    availability[cold_start_titles] = 0
    availability_df = pd.DataFrame({'sid' : np.arange(num_items), 'availability' : availability})
    # NOTE: this overwrites show_2_id_df and all_tr_df, so the cell is not idempotent;
    # re-run the loading cells before executing it a second time.
    show_2_id_df = pd.merge(show_2_id_df, availability_df, on = 'sid')
    # Inner join against the available items only: interactions with censored titles are dropped.
    all_tr_filtered_df = pd.merge(all_tr_df, show_2_id_df[show_2_id_df.availability == 1], on = 'sid')
    print(all_tr_filtered_df.shape, all_tr_df.shape)
    users_after = pd.unique(all_tr_filtered_df.uid).shape
    users_before = pd.unique(all_tr_df.uid).shape
    print(users_after, users_before)
    # The original compared these shapes and threw the result away; surface it instead.
    if users_after != users_before:
        print('WARNING: censoring removed every interaction of',
              users_before[0] - users_after[0], 'user(s)')
    all_tr_df = all_tr_filtered_df

# Model 1 : With User-Video Tags; Generative model only for Video History

In [20]:
import os
# Restrict CUDA to the device numbered "1"; this runs before TensorFlow is imported
# in the next cell. NOTE(review): the device id is machine-specific — adjust as needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [21]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import gzip
import sys

# pylint: disable=missing-docstring
import time
import tensorflow as tf
import numpy as np
from scipy import sparse
import bottleneck as bn

In [22]:
def load_all_training_data(all_tr_df, num_users, num_items):
    """Build the user-by-item interaction matrix and the per-split user id lists.

    Args:
        all_tr_df: frame with 'uid', 'sid' and 'type' columns, where 'type' is one of
            'train', 'validation' or 'test'.
        num_users: number of rows of the returned matrix.
        num_items: number of columns of the returned matrix.

    Returns:
        A 4-tuple ``(matrix, train_uids, validation_uids, test_uids)``: a float64 CSR
        matrix of shape ``(num_users, num_items)`` with one count per (uid, sid) row of
        the frame, followed by the unique uids of each split in order of appearance.
    """
    user_ids = all_tr_df['uid'].values
    item_ids = all_tr_df['sid'].values
    interactions = sparse.csr_matrix(
        (np.ones_like(user_ids), (user_ids, item_ids)),
        dtype='float64',
        shape=(num_users, num_items))

    def _users_of(split_name):
        # Unique uids of one split, in order of first appearance.
        return pd.unique(all_tr_df[all_tr_df.type == split_name].uid)

    return interactions, _users_of('train'), _users_of('validation'), _users_of('test')

def load_vad_te_data(validation_te_df, num_items):
    """Build the evaluation interaction matrix for one block of users.

    Row ``i`` of the result corresponds to uid ``validation_te_df['uid'].min() + i``,
    so callers must subtract that same offset when indexing by uid.

    Args:
        validation_te_df: frame with 'uid' and 'sid' columns.
        num_items: number of columns of the returned matrix.

    Returns:
        ``(matrix, uids)``: a float64 CSR matrix with ``max uid - min uid + 1`` rows and
        ``num_items`` columns, and the unique uids of the frame in order of appearance.
    """
    uids = validation_te_df['uid']
    first_uid, last_uid = uids.min(), uids.max()
    local_rows = uids - first_uid
    item_cols = validation_te_df['sid']
    row_count = last_uid - first_uid + 1
    heldout = sparse.csr_matrix(
        (np.ones_like(local_rows), (local_rows, item_cols)),
        dtype='float64',
        shape=(row_count, num_items))
    return heldout, pd.unique(validation_te_df.uid)

def get_video_tags_mat(num_items, num_tags, filename):
    """Read an item-to-tag table and return it as a dense 0/1 indicator matrix.

    Args:
        num_items: number of rows of the result.
        num_tags: number of columns of the result.
        filename: CSV file inside ``DATA_DIR`` with a header row and at least the
            columns 'sid' and 'genre_index'.

    Returns:
        A ``(num_items, num_tags)`` float array holding 1 at every (sid, genre_index)
        pair listed in the file and 0 elsewhere.
    """
    tags_path = os.path.join(DATA_DIR, filename)
    tag_table = pd.read_csv(tags_path, header = 0)
    indicator = np.zeros((num_items, num_tags))
    indicator[tag_table.sid, tag_table.genre_index] = 1
    return indicator

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized discounted cumulative gain@k for binary relevance, one value per row.

    X_pred: dense 2-D array of predicted scores (one row per user, one column per item).
    heldout_batch: sparse matrix of the same shape; it must support 2-D fancy
        indexing, .toarray() and .getnnz(axis=1). Non-zero entries are the relevant items.
    k: ranking cut-off.
    Returns a 1-D array with one NDCG@k value per row of X_pred.

    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    NOTE: a row without any held-out item has DCG == IDCG == 0, so its result is
    0/0 = NaN, which in turn makes the mean over users NaN.
    NOTE(review): bn.argpartition presumably requires k to be smaller than the
    number of columns -- confirm against the bottleneck documentation.
    '''
    batch_users = X_pred.shape[0]
    # Partial sort: the first k column indices of each row are the k highest scores,
    # in no particular order.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    # Fully sort only those k candidates by descending score.
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    # Sum of the discounts at the ranks where a held-out item was retrieved.
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    # Best achievable DCG: all of the user's held-out items (at most k) ranked first.
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG

In [23]:
# Interaction matrix over all users plus the user ids belonging to each split.
(all_tr_mat,
 train_idxlist,
 validation_idxlist,
 test_idxlist) = load_all_training_data(all_tr_df, num_users, num_items)

In [24]:
print(all_tr_mat.shape,train_idxlist.shape, validation_idxlist.shape, test_idxlist.shape)

(136677, 20108) (116677,) (10000,) (10000,)


In [25]:
# Rows of vad_data_te are offset by validation_te_df.uid.min() (see load_vad_te_data),
# whereas vad_start_index is the smallest validation uid found in all_tr_df.
# NOTE(review): indexing by `uid - vad_start_index` assumes both minima coincide -- confirm.
vad_data_te, _ = load_vad_te_data(validation_te_df, num_items)
vad_start_index = validation_idxlist.min()

In [26]:
N_validation = validation_idxlist.shape[0]
print(N_validation, vad_start_index)

10000 116677


In [27]:
# Rows of test_data_te are offset by test_te_df.uid.min() (see load_vad_te_data),
# whereas test_start_index is the smallest test uid found in all_tr_df.
# NOTE(review): indexing by `uid - test_start_index` assumes both minima coincide -- confirm.
test_data_te, _ = load_vad_te_data(test_te_df, num_items)
test_start_index = test_idxlist.min()

In [28]:
# Genre/year tag matrix.
# The tag vocabulary size is the number of rows of the genre/year id mapping.
num_tags = len(pd.read_csv(os.path.join(DATA_DIR, 'genreyear2id.csv'),
                           names=['genre', 'genre_index']))
video_tags_mat = get_video_tags_mat(num_items=num_items,
                                    num_tags=num_tags,
                                    filename='movies_genres_year.csv')
(video_tags_mat.shape, num_tags)

((20108, 137), 137)

In [30]:
# Number of tags attached to each video (row sums of the 0/1 indicator matrix).
num_tags_per_video = video_tags_mat.sum(axis = 1)

In [31]:
num_tags_per_video

array([5., 2., 3., ..., 2., 3., 2.])

In [32]:
# Videos without any tag get a divisor of 1 so the normalisation below does not
# divide by zero; their rows stay all-zero. (Modifies num_tags_per_video in place.)
num_tags_per_video[num_tags_per_video == 0] = 1

In [33]:
num_tags_per_video

array([5., 2., 3., ..., 2., 3., 2.])

In [34]:
# Row-normalise the indicator matrix: each tagged video's weights sum to 1.
video_metadata_array = video_tags_mat / (num_tags_per_video[:, np.newaxis])

In [35]:
print(video_metadata_array.shape)
print(num_tags)

(20108, 137)
137


In [41]:
class Model1(object):
    """Latent-factor model of user play histories with optional tag side-information.

    Built with the TensorFlow graph API (placeholders + sessions). Each user and each
    item owns a ``num_factors``-dimensional vector; the score of an item for a user is
    the inner product of the two, turned into a distribution over items by a
    (possibly availability-masked) softmax.

    Args:
        num_users: number of rows of the per-user variables.
        num_items: number of items (width of the play-history input).
        num_tags: number of tags (rows of the tag factor matrix).
        num_factors: latent dimensionality.
        var_prior: prior variance used in the KL term.
        video_metadata_array: ``(num_items, num_tags)`` array of per-item tag weights.
        availability_mask: ``num_items`` values, reshaped to ``[1, num_items]``;
            multiplies the softmax normaliser when ``model_censored`` is true.
        model_variational: sample user factors and add the KL term when true.
        model_censored: use the availability-masked normaliser when true.
        model_user_tags: add a tag-based component to the user vector (needs
            ``model_video_tags`` as well).
        model_video_tags: add a tag-based component to the item vectors.
        reg: optional L2 coefficient. When left as ``None`` the notebook-level
            variable ``reg`` is read at graph-construction time, which is what the
            original code always did.
    """

    def __init__(self,
                 num_users,
                 num_items,
                 num_tags,
                 num_factors,
                 var_prior,
                 video_metadata_array,
                 availability_mask,
                 model_variational,
                 model_censored,
                 model_user_tags,
                 model_video_tags,
                 reg=None):
        self.num_users = num_users
        self.num_items = num_items
        self.num_tags = num_tags
        self.num_factors = num_factors
        self.var_prior = var_prior
        self.model_variational = model_variational
        self.model_user_tags = model_user_tags
        self.model_video_tags = model_video_tags
        self.model_censored = model_censored
        self.reg = reg
        self.video_metadata_array_const = tf.constant(video_metadata_array, dtype = tf.float32)
        self.availability_mask = tf.constant(availability_mask, dtype = tf.float32, shape=[1, num_items])
        self.construct_placeholders()

    def construct_placeholders(self):
        """Create the feed placeholders: user ids, play histories and (optionally) user tags."""
        self.users_ph = tf.placeholder(dtype=tf.int32, shape=[None])
        self.played_videos_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items])
        if self.model_user_tags:
            self.played_tags_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.num_tags])

    def construct_model_variables(self):
        """Create the trainable variables, all initialised from a standard normal."""
        # Per-user mean vector and a single log standard deviation per user.
        self.Mu_Zu = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_users, self.num_factors]),
                                 name = 'mean_latent_factors_zu')
        self.lsdev_Zu = tf.Variable(dtype=tf.float32,
                                    initial_value=tf.random_normal(shape=[self.num_users, 1]),
                                    name='lsdev_Zu')

        # Per-item vectors.
        self.Mu_Zv = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_items, self.num_factors]),
                                 name = 'mean_latent_factors_zv')

        # Per-tag vectors, shared by the user-tag and video-tag components.
        self.Mu_Zt = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_tags, self.num_factors]),
                                 name = 'mean_latent_factors_zt')

    def compute_kl_div(self, lsdev_Zu_batch, Mu_Zu_batch):
        """Return KL( N(mu, sigma^2 I) || N(0, var_prior I) ) for every user in the batch.

        Args:
            lsdev_Zu_batch: ``[batch, 1]`` log standard deviations.
            Mu_Zu_batch: ``[batch, num_factors]`` means.

        Returns:
            A ``[batch, 1]`` tensor of KL divergences.
        """
        # Uses self.num_factors throughout; the original read the notebook global of
        # the same name, which silently breaks once the two values differ.
        sdev_Zu_batch = tf.exp(lsdev_Zu_batch)
        comp1 = self.num_factors * (0.5 * np.log(self.var_prior) - lsdev_Zu_batch)
        comp2 = (self.num_factors / (2 * self.var_prior)) * (tf.pow(sdev_Zu_batch, 2))
        comp3 = (1.0 / (2 * self.var_prior)) * tf.reduce_sum(tf.pow(Mu_Zu_batch, 2), axis=1, keep_dims = True)
        comp4 = (self.num_factors / 2.0)

        return comp1 + comp2 + comp3 - comp4

    def construct_graph(self):
        """Create the variables and wire up logits, log-softmax and the training loss.

        Returns:
            ``(batch_logits, batch_logits_validation, log_softmax, avg_loss,
            batch_conditional_log_likelihood, num_items_per_document)`` where
            ``batch_logits`` uses the (possibly sampled) user vectors and
            ``batch_logits_validation`` always uses the user means.

        Raises:
            ValueError: if ``model_user_tags`` is set without ``model_video_tags``;
                no logits are defined for that combination.
        """
        self.construct_model_variables()

        Mu_Zu_batch = tf.gather(self.Mu_Zu, self.users_ph)
        lsdev_Zu_batch = tf.gather(self.lsdev_Zu, self.users_ph)
        Eps_u_ph = tf.random_normal(shape = [tf.size(self.users_ph), self.num_factors],
                                    mean=0.0, stddev=1.0, dtype=tf.float32, seed=None, name="eps")
        if self.model_variational:
            print('modeling variational, adding noise ...')
            # Reparameterised sample: mean + eps * sigma.
            Zu_batch = Mu_Zu_batch + Eps_u_ph * tf.exp(lsdev_Zu_batch)
        else:
            print('modeling non-variational, Zu_batch = Mu_Zu_batch')
            Zu_batch = Mu_Zu_batch

        if self.model_user_tags and self.model_video_tags:
            print('modeling user and video tags ...')
            Mu_Zv_hat = tf.matmul(self.video_metadata_array_const, self.Mu_Zt)
            Mu_Zu_tag_batch = tf.matmul(self.played_tags_ph, self.Mu_Zt)
            batch_logits = tf.matmul(Zu_batch + Mu_Zu_tag_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)
            batch_logits_validation = tf.matmul(Mu_Zu_batch + Mu_Zu_tag_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)
        elif self.model_video_tags:
            print('modeling video tags ...')
            Mu_Zv_hat = tf.matmul(self.video_metadata_array_const, self.Mu_Zt)
            batch_logits = tf.matmul(Zu_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)
            batch_logits_validation = tf.matmul(Mu_Zu_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)
        elif not self.model_user_tags:
            print('modeling no tags ...')
            batch_logits = tf.matmul(Zu_batch, self.Mu_Zv, transpose_b=True)
            batch_logits_validation = tf.matmul(Mu_Zu_batch, self.Mu_Zv, transpose_b=True)
        else:
            # Previously this combination matched no branch and failed further down
            # with an unbound-local error on batch_logits.
            raise ValueError('model_user_tags=True is only supported together with '
                             'model_video_tags=True')

        if self.model_censored:
            print('modeling censored...')
            # Log-sum-exp restricted to available items; the row maximum is subtracted
            # for numerical stability and 1e-8 keeps the log finite.
            max_logits = tf.reduce_max(batch_logits, axis=1, keep_dims=True)
            logsum_exp_masked = max_logits +\
            tf.log(tf.reduce_sum(self.availability_mask * tf.exp(batch_logits - max_logits), axis=1, keep_dims=True) + 1e-8)
            log_softmax = batch_logits - logsum_exp_masked
        else:
            print('modeling non-censored')
            log_softmax = tf.nn.log_softmax(batch_logits)

        num_items_per_document = tf.reduce_sum(self.played_videos_ph, axis=1, keep_dims=True)
        # A user with an empty history would otherwise give 0 * (1 / 0) = NaN and
        # poison the averaged loss; clamp the normaliser only, and return the raw count.
        safe_items_per_document = tf.maximum(num_items_per_document, 1.0)

        batch_conditional_log_likelihood = tf.reduce_sum(self.played_videos_ph * log_softmax, axis = 1, keep_dims=True)
        # Decide on the instance flag (the original consulted the notebook global).
        if self.model_variational:
            print('computing elbo')
            batch_kl_div = self.compute_kl_div(lsdev_Zu_batch, Mu_Zu_batch)
            batch_elbo = (1.0 / safe_items_per_document) * (batch_conditional_log_likelihood - batch_kl_div)
        else:
            print('only using likelihood')

            batch_elbo = ((1.0 / safe_items_per_document) * batch_conditional_log_likelihood)

        # An explicit constructor value wins; otherwise fall back to the notebook-level
        # `reg` (a global lookup here, since this method has no local of that name).
        l2_coeff = self.reg if self.reg is not None else reg

        avg_loss = -1 * tf.reduce_mean(batch_elbo) + l2_coeff * (tf.nn.l2_loss(self.Mu_Zv) +
                                                                 tf.nn.l2_loss(self.Mu_Zt))
        if not self.model_variational:
            print('using l2-loss on Mu_Zu_batch')
            avg_loss += l2_coeff * tf.nn.l2_loss(Mu_Zu_batch)

        return batch_logits, batch_logits_validation, log_softmax, avg_loss, batch_conditional_log_likelihood,  num_items_per_document

In [42]:
# Prior variance used in Model1.compute_kl_div.
var_prior = 1.0
# Learning rate passed to both Adam optimizers.
lr = 4e-3
# L2 coefficient; Model1.construct_graph reads this name from the notebook namespace.
reg = 1e-9
# Dimensionality of the user / item / tag vectors.
num_factors = 100
# Number of users per batch in every loop of the training cell.
batch_size = 1000
num_epochs = 1000
# Model switches forwarded to Model1 (see its constructor arguments).
model_video_tags = False
model_user_tags = False
model_variational = False

In [43]:
# Users whose vectors are fine-tuned each epoch and on whom NDCG is reported.
# NOTE(review): the training cell saves a checkpoint whenever NDCG on this set improves,
# so pointing it at the test split selects the model on test data. The validation
# counterparts are validation_idxlist, vad_start_index and vad_data_te.
heldout_idxlist = test_idxlist
heldout_start_index = test_start_index
heldout_data_te = test_data_te

In [44]:
# Logging info: experiment naming, output folders and the per-epoch log file.
attempt = 8
# The name encodes the full configuration; it is also the folder name under which
# checkpoints are stored, so the pattern must stay stable across runs.
experiment_name = (
    'censored-{model_censored}-variational-{model_variational}-testmetrics'
    '-user=id-tags-{model_user_tags}-video=id-tags-{model_video_tags}'
    '-learningrate={lr}-regularization={reg}-numfactors={num_factors}'
    '-prior_var-{prior_var}-epochs-{epochs}-attempt={attempt}'
).format(model_censored=model_censored, model_user_tags=model_user_tags,
         model_video_tags=model_video_tags, lr=lr, reg=reg, num_factors=num_factors,
         attempt=attempt, prior_var=var_prior, epochs=num_epochs,
         model_variational=model_variational)

print('experiment name :', experiment_name)
print('output dir :', EXP_DIR)

# Create the output folders from Python rather than through `!mkdir -p`, so paths
# containing spaces or shell metacharacters are handled correctly.
for _subdir in ('logs', 'tensorflow_output', 'tensorflow_models'):
    _target_dir = os.path.join(EXP_DIR, _subdir, experiment_name)
    if not os.path.isdir(_target_dir):
        os.makedirs(_target_dir)

output_line_template = '{epoch_ind},ndcg,{ndcg_mean}+/-{ndcg_se},batch_loss,{batch_loss}'
tf.reset_default_graph()
# Text mode: the training cell writes str lines, which a binary-mode handle
# rejects with a TypeError on Python 3.
fw = open(os.path.join(EXP_DIR, 'logs', experiment_name,'_logs.txt'), 'w')

experiment name : censored-False-variational-False-testmetrics-user=id-tags-False-video=id-tags-False-learningrate=0.004-regularization=1e-09-numfactors=100-prior_var-1.0-epochs-1000-attempt=8
output dir : /data/ml20m/ml-20m/exp20181002-final


In [None]:
def _iter_user_batches(idxlist, size):
    """Yield consecutive slices of `idxlist` holding at most `size` user ids each."""
    for start in range(0, idxlist.shape[0], size):
        yield idxlist[start:start + size]


def _dense_history(user_indices):
    """Return the given users' rows of `all_tr_mat` as a dense float32 array."""
    return all_tr_mat[user_indices].toarray().astype('float32')


def _user_tag_features(X):
    """Turn dense play histories into row-normalised binary tag indicators.

    A tag is set for a user when at least one played video carries it; every row is
    then divided by its number of set tags (rows without tags stay all-zero).
    """
    M_u = np.dot(X, video_tags_mat)
    M_u[M_u > 0] = 1
    num_tags_per_user = M_u.sum(axis = 1)
    num_tags_per_user[num_tags_per_user == 0] = 1
    return M_u / (num_tags_per_user[:, np.newaxis])


def _make_feed(model, user_indices, X, with_history):
    """Assemble the feed dict for one batch.

    The play history is only fed when `with_history` is true (the loss needs it, the
    validation logits do not); tag features are added when user tags are modelled.
    """
    feed_dict = {model.users_ph : user_indices}
    if with_history:
        feed_dict[model.played_videos_ph] = X
    if model_user_tags:
        feed_dict[model.played_tags_ph] = _user_tag_features(X)
    return feed_dict


tf.reset_default_graph()
with tf.Graph().as_default():
    model1 = Model1(num_users,
                    num_items,
                    num_tags,
                    num_factors,
                    var_prior,
                    video_metadata_array,
                    availability,
                    model_variational,
                    model_censored,
                    model_user_tags,
                    model_video_tags)
    (batch_logits, batch_logits_validation, log_softmax, avg_loss,
     batch_conditional_log_likelihood, num_items_per_document) = model1.construct_graph()
    # Optimizer over every trainable variable, used on the training users.
    train_op = tf.train.AdamOptimizer(learning_rate=lr)\
    .minimize(avg_loss, global_step=tf.Variable(0, name='global_step_1', trainable=False))

    ####Tensors for validation####
    # Second optimizer restricted to the per-user variables, so fitting held-out
    # users leaves the item and tag factors untouched.
    train_op_validation = tf.train.AdamOptimizer(learning_rate=lr)\
    .minimize(avg_loss,
              var_list = [model1.Mu_Zu, model1.lsdev_Zu],
              global_step=tf.Variable(0, name='global_step_1_validation', trainable=False))

    ####Summary####
    # Summaries are fed through placeholders because the values are computed in Python.
    avg_loss_summary_ph = tf.placeholder(dtype = tf.float32)
    tf.summary.scalar('avg_loss', avg_loss_summary_ph)

    ndcg_summary_ph = tf.placeholder(dtype=tf.float32)
    tf.summary.scalar('ndcg_100', ndcg_summary_ph)
    summary = tf.summary.merge_all()

    ####Start####
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        summary_writer = tf.summary.FileWriter('{EXP_DIR}/tensorflow_output/{experiment_name}'.format(
            EXP_DIR = EXP_DIR,
            experiment_name = experiment_name), sess.graph)
        sess.run(init)
        ndcgs_vad = []
        best_ndcg_sofar = -1000
        for epoch_ind in np.arange(num_epochs):

            ###Optimize parameters for Validation users ####
            avg_loss_validation = 0
            num_batches = 0
            for user_indices in _iter_user_batches(heldout_idxlist, batch_size):
                X = _dense_history(user_indices)
                feed_dict = _make_feed(model1, user_indices, X, with_history=True)
                _, loss_val = sess.run([train_op_validation, avg_loss], feed_dict=feed_dict)
                avg_loss_validation += loss_val
                num_batches += 1
            # Same guard as the training average below: no batches means no division.
            avg_loss_validation /= max(num_batches, 1)

            ####Metrics Computation####
            ndcg_dist = []
            for heldout_user_indices in _iter_user_batches(heldout_idxlist, batch_size):
                X = _dense_history(heldout_user_indices)
                feed_dict = _make_feed(model1, heldout_user_indices, X, with_history=False)
                logit_heldout = sess.run(batch_logits_validation, feed_dict=feed_dict)
                # Items already present in the user's history must never be ranked.
                logit_heldout[X.nonzero()] = -np.inf
                ndcg_dist.append(NDCG_binary_at_k_batch(logit_heldout,
                                                        heldout_data_te[heldout_user_indices - heldout_start_index]))
            ndcg_dist = np.concatenate(ndcg_dist)
            ndcg_ = ndcg_dist.mean()
            ndcg_serr = ndcg_dist.std() / np.sqrt(ndcg_dist.shape[0])
            ndcgs_vad.append((ndcg_, ndcg_serr))

            if ndcg_ >= best_ndcg_sofar:
                print('writing out model with ndcg: ', ndcg_, ' better than best ndcg so far: ', best_ndcg_sofar)
                saver.save(sess, os.path.join(EXP_DIR, 'tensorflow_models', experiment_name, 'files'))
                best_ndcg_sofar = ndcg_

            ####Training####
            avg_loss_dataset = 0
            num_batches = 0
            # In-place shuffle: the order of train_idxlist changes for later cells as well.
            np.random.shuffle(train_idxlist)
            for user_indices in _iter_user_batches(train_idxlist, batch_size):
                X = _dense_history(user_indices)
                feed_dict = _make_feed(model1, user_indices, X, with_history=True)
                _, loss_val = sess.run([train_op, avg_loss], feed_dict=feed_dict)
                avg_loss_dataset += loss_val
                num_batches += 1
            avg_loss_dataset = avg_loss_dataset / max(num_batches, 1)

            ####Summary####
            output_line = output_line_template.format(epoch_ind = epoch_ind,
                                                      ndcg_mean = ndcg_,
                                                      ndcg_se='%.3f' % ndcg_serr,
                                                      batch_loss='%6.3f' % avg_loss_dataset)
            print(output_line)
            sys.stdout.flush()
            fw.write(output_line + '\n')
            # Flush every epoch so the log survives an interrupted kernel.
            fw.flush()
            summary_str = sess.run(summary, feed_dict={avg_loss_summary_ph : avg_loss_dataset,
                                                       ndcg_summary_ph : ndcg_})
            summary_writer.add_summary(summary_str, epoch_ind)
            summary_writer.flush()
        summary_writer.close()

modeling non-variational, Zu_batch = Mu_Zu_batch
modeling no tags ...
only using likelihood
using l2-loss on Mu_Zu_batch
writing out model with ndcg:  0.0020123824366537015  better than best ndcg so far:  -1000
0,ndcg,0.00201238243665+/-0.000,batch_loss,39.346
writing out model with ndcg:  0.002296468290578545  better than best ndcg so far:  0.0020123824366537015
1,ndcg,0.00229646829058+/-0.000,batch_loss,35.053
writing out model with ndcg:  0.0026049899666055914  better than best ndcg so far:  0.002296468290578545
2,ndcg,0.00260498996661+/-0.000,batch_loss,31.454
writing out model with ndcg:  0.003012113335054301  better than best ndcg so far:  0.0026049899666055914
3,ndcg,0.00301211333505+/-0.000,batch_loss,28.332
writing out model with ndcg:  0.0035174542357283654  better than best ndcg so far:  0.003012113335054301
4,ndcg,0.00351745423573+/-0.000,batch_loss,25.511
writing out model with ndcg:  0.004223357383472369  better than best ndcg so far:  0.0035174542357283654
5,ndcg,0.00422

54,ndcg,0.186884407984+/-0.002,batch_loss, 5.498
writing out model with ndcg:  0.18742830146044037  better than best ndcg so far:  0.18688440798401001
55,ndcg,0.18742830146+/-0.002,batch_loss, 5.472
writing out model with ndcg:  0.18828833188806096  better than best ndcg so far:  0.18742830146044037
56,ndcg,0.188288331888+/-0.002,batch_loss, 5.447
writing out model with ndcg:  0.1888381697152494  better than best ndcg so far:  0.18828833188806096
57,ndcg,0.188838169715+/-0.002,batch_loss, 5.422
writing out model with ndcg:  0.18964373010554128  better than best ndcg so far:  0.1888381697152494
58,ndcg,0.189643730106+/-0.002,batch_loss, 5.397
writing out model with ndcg:  0.18975185023882934  better than best ndcg so far:  0.18964373010554128
59,ndcg,0.189751850239+/-0.002,batch_loss, 5.372
writing out model with ndcg:  0.19059810873975452  better than best ndcg so far:  0.18975185023882934
60,ndcg,0.19059810874+/-0.002,batch_loss, 5.348
writing out model with ndcg:  0.19106844803093026

112,ndcg,0.220231751964+/-0.002,batch_loss, 4.590
writing out model with ndcg:  0.2205663405648123  better than best ndcg so far:  0.2202317519638169
113,ndcg,0.220566340565+/-0.002,batch_loss, 4.583
writing out model with ndcg:  0.22099160146242244  better than best ndcg so far:  0.2205663405648123
114,ndcg,0.220991601462+/-0.002,batch_loss, 4.577
writing out model with ndcg:  0.22182425227395897  better than best ndcg so far:  0.22099160146242244
115,ndcg,0.221824252274+/-0.002,batch_loss, 4.570
writing out model with ndcg:  0.22226912390317202  better than best ndcg so far:  0.22182425227395897
116,ndcg,0.222269123903+/-0.002,batch_loss, 4.564
writing out model with ndcg:  0.22303551802862157  better than best ndcg so far:  0.22226912390317202
117,ndcg,0.223035518029+/-0.002,batch_loss, 4.558
writing out model with ndcg:  0.2235768190246872  better than best ndcg so far:  0.22303551802862157
118,ndcg,0.223576819025+/-0.002,batch_loss, 4.552
writing out model with ndcg:  0.2242565543

writing out model with ndcg:  0.25630182377314237  better than best ndcg so far:  0.25527109027906325
169,ndcg,0.256301823773+/-0.002,batch_loss, 4.387
writing out model with ndcg:  0.25644961183156295  better than best ndcg so far:  0.25630182377314237
170,ndcg,0.256449611832+/-0.002,batch_loss, 4.385
writing out model with ndcg:  0.2577095919211061  better than best ndcg so far:  0.25644961183156295
171,ndcg,0.257709591921+/-0.002,batch_loss, 4.383
writing out model with ndcg:  0.2584005580973035  better than best ndcg so far:  0.2577095919211061
172,ndcg,0.258400558097+/-0.002,batch_loss, 4.381
writing out model with ndcg:  0.2592068438787746  better than best ndcg so far:  0.2584005580973035
173,ndcg,0.259206843879+/-0.002,batch_loss, 4.380
174,ndcg,0.258989745895+/-0.002,batch_loss, 4.379
writing out model with ndcg:  0.25965277211764487  better than best ndcg so far:  0.2592068438787746
175,ndcg,0.259652772118+/-0.002,batch_loss, 4.377
writing out model with ndcg:  0.260418566848

229,ndcg,0.287219627193+/-0.002,batch_loss, 4.324
230,ndcg,0.287081922591+/-0.002,batch_loss, 4.323
231,ndcg,0.287115232688+/-0.002,batch_loss, 4.323
writing out model with ndcg:  0.2887599522442193  better than best ndcg so far:  0.2872196271930664
232,ndcg,0.288759952244+/-0.002,batch_loss, 4.322
233,ndcg,0.288473170992+/-0.002,batch_loss, 4.321
writing out model with ndcg:  0.2892966755772428  better than best ndcg so far:  0.2887599522442193
234,ndcg,0.289296675577+/-0.002,batch_loss, 4.320
writing out model with ndcg:  0.28971099386225024  better than best ndcg so far:  0.2892966755772428
235,ndcg,0.289710993862+/-0.002,batch_loss, 4.320
writing out model with ndcg:  0.28997532850649776  better than best ndcg so far:  0.28971099386225024
236,ndcg,0.289975328506+/-0.002,batch_loss, 4.319
writing out model with ndcg:  0.2905351140807155  better than best ndcg so far:  0.28997532850649776
237,ndcg,0.290535114081+/-0.002,batch_loss, 4.319
writing out model with ndcg:  0.29106222136052

299,ndcg,0.31083629009+/-0.002,batch_loss, 4.293
writing out model with ndcg:  0.311157652655782  better than best ndcg so far:  0.31083629008992336
300,ndcg,0.311157652656+/-0.002,batch_loss, 4.292
writing out model with ndcg:  0.31123673327233037  better than best ndcg so far:  0.311157652655782
301,ndcg,0.311236733272+/-0.002,batch_loss, 4.292
writing out model with ndcg:  0.31128030423824965  better than best ndcg so far:  0.31123673327233037
302,ndcg,0.311280304238+/-0.002,batch_loss, 4.291
writing out model with ndcg:  0.31160360193912173  better than best ndcg so far:  0.31128030423824965
303,ndcg,0.311603601939+/-0.002,batch_loss, 4.291
writing out model with ndcg:  0.31222648107824086  better than best ndcg so far:  0.31160360193912173
304,ndcg,0.312226481078+/-0.002,batch_loss, 4.291
305,ndcg,0.311585057726+/-0.002,batch_loss, 4.291
writing out model with ndcg:  0.31280387954481736  better than best ndcg so far:  0.31222648107824086
306,ndcg,0.312803879545+/-0.002,batch_loss,

387,ndcg,0.321198338344+/-0.002,batch_loss, 4.271
388,ndcg,0.321027928541+/-0.002,batch_loss, 4.270
389,ndcg,0.321126315021+/-0.002,batch_loss, 4.271
390,ndcg,0.321524337071+/-0.002,batch_loss, 4.270
writing out model with ndcg:  0.32161363876325566  better than best ndcg so far:  0.3215648642883124
391,ndcg,0.321613638763+/-0.002,batch_loss, 4.270
392,ndcg,0.321112071144+/-0.002,batch_loss, 4.270
393,ndcg,0.321326217838+/-0.002,batch_loss, 4.270
394,ndcg,0.32151215077+/-0.002,batch_loss, 4.270
writing out model with ndcg:  0.3222008850017651  better than best ndcg so far:  0.32161363876325566
395,ndcg,0.322200885002+/-0.002,batch_loss, 4.269
396,ndcg,0.321847757022+/-0.002,batch_loss, 4.269
397,ndcg,0.320720412341+/-0.002,batch_loss, 4.269
398,ndcg,0.321624935893+/-0.002,batch_loss, 4.269
writing out model with ndcg:  0.3224280499736297  better than best ndcg so far:  0.3222008850017651
399,ndcg,0.322428049974+/-0.002,batch_loss, 4.268
400,ndcg,0.321495198215+/-0.002,batch_loss, 4.268

535,ndcg,0.321819316741+/-0.002,batch_loss, 4.251
536,ndcg,0.32183806307+/-0.002,batch_loss, 4.251
537,ndcg,0.322510008049+/-0.002,batch_loss, 4.250
538,ndcg,0.322540762355+/-0.002,batch_loss, 4.250
539,ndcg,0.322223347298+/-0.002,batch_loss, 4.250
540,ndcg,0.321835795942+/-0.002,batch_loss, 4.250
541,ndcg,0.32174893345+/-0.002,batch_loss, 4.249
542,ndcg,0.322212029332+/-0.002,batch_loss, 4.250
543,ndcg,0.322006057461+/-0.002,batch_loss, 4.250
544,ndcg,0.322324201808+/-0.002,batch_loss, 4.249
545,ndcg,0.322234036905+/-0.002,batch_loss, 4.249
546,ndcg,0.321493541354+/-0.002,batch_loss, 4.249
547,ndcg,0.322455733711+/-0.002,batch_loss, 4.249
548,ndcg,0.322130046728+/-0.002,batch_loss, 4.249
549,ndcg,0.321880083194+/-0.002,batch_loss, 4.249
550,ndcg,0.322445236611+/-0.002,batch_loss, 4.249
551,ndcg,0.321427905402+/-0.002,batch_loss, 4.249
552,ndcg,0.322177569403+/-0.002,batch_loss, 4.249
553,ndcg,0.321786296672+/-0.002,batch_loss, 4.249
554,ndcg,0.321925289242+/-0.002,batch_loss, 4.248
55

# Analysis

In [None]:
# cold_start_titles is only created by the censoring cell when model_censored is True;
# without the guard this cell raises NameError on a non-censored run.
np.sort(cold_start_titles) if model_censored else None

In [None]:
def run_video_level_metric_analysis(experiment_name,
                                    EXP_DIR,
                                    heldout_idxlist,
                                    heldout_data_te,
                                    heldout_start_index,
                                    model_censored,
                                    model_user_tags,
                                    model_video_tags):
    """Restore a saved model and compute per-item rank statistics on held-out users.

    The function rebuilds the ``Model1`` graph, restores the checkpoint stored at
    ``<EXP_DIR>/tensorflow_models/<experiment_name>/files`` and scores the held-out
    users batch by batch. For every held-out positive (user, item) pair it records the
    0-based rank the model gives that item after masking the items present in
    ``all_tr_mat`` for that user.

    Besides its arguments, the function reads these notebook globals: ``tf``, ``Model1``,
    ``NDCG_binary_at_k_batch``, ``num_users``, ``num_items``, ``num_tags``,
    ``num_factors``, ``var_prior``, ``video_metadata_array``, ``availability``,
    ``batch_size``, ``all_tr_mat`` and ``train_df``.

    Parameters
    ----------
    experiment_name : str
        Name of the experiment sub-directory that holds the checkpoint.
    EXP_DIR : str
        Root directory of the experiments.
    heldout_idxlist : array-like with ``.shape``
        User indices to evaluate (presumably a 1-D integer NumPy array -- TODO confirm).
    heldout_data_te : matrix supporting row indexing and ``.toarray()``
        Held-out interactions; row ``u - heldout_start_index`` belongs to user ``u``.
    heldout_start_index : int
        Offset subtracted from a user index to address ``heldout_data_te``.
    model_censored, model_user_tags, model_video_tags : bool
        Flags forwarded unchanged to ``Model1``.

    Returns
    -------
    video_playcount_mrr_sorted_df : pandas.DataFrame
        Columns ``sid``, ``playcount`` (rows of ``train_df`` per item), ``ranks`` and
        ``mrr`` (mean over the held-out positives of that item), sorted by
        ``playcount`` in descending order.
    video_ranks_df : pandas.DataFrame
        One row per held-out positive with columns ``sid``, ``ranks`` and ``mrr``.
    ndcg_dist : numpy.ndarray
        Concatenation of the per-batch results of ``NDCG_binary_at_k_batch``
        (presumably one value per user -- TODO confirm).

    Raises
    ------
    ValueError
        If ``heldout_idxlist`` is empty.
    """
    num_heldout = heldout_idxlist.shape[0]
    if num_heldout == 0:
        # Fail before the expensive graph build; np.concatenate would raise the same
        # exception type later, but with a far less helpful message.
        raise ValueError('heldout_idxlist is empty: there are no users to evaluate')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        model1 = Model1(num_users,
                        num_items,
                        num_tags,
                        num_factors,
                        var_prior,
                        video_metadata_array,
                        availability,
                        model_censored,
                        model_user_tags,
                        model_video_tags)
        # Only the validation logits are needed here; the other six outputs are unused.
        _, batch_logits_validation, _, _, _, _, _ = model1.construct_graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, os.path.join(EXP_DIR, 'tensorflow_models', experiment_name, 'files'))

            ### Metrics computation ###
            positives_list = []
            positives_ranks_list = []
            ndcg_dist = []
            for st_index in range(0, num_heldout, batch_size):
                end_index = min(st_index + batch_size, num_heldout)
                heldout_user_indices = heldout_idxlist[st_index:end_index]
                logit_validation = sess.run(batch_logits_validation,
                                            feed_dict={model1.users_ph: heldout_user_indices})

                # Items present in all_tr_mat for a user are pushed to the bottom of the ranking.
                seen = all_tr_mat[heldout_user_indices].toarray()
                logit_validation[seen.nonzero()] = -np.inf

                heldout_batch = heldout_data_te[heldout_user_indices - heldout_start_index]
                labels = heldout_batch.toarray()

                # ranking[u, r] is the item at rank r; invert that permutation so that
                # videos_with_ranks[u, item] is the 0-based rank of `item` for user u.
                ranking = np.argsort(-logit_validation, axis=1)
                videos_with_ranks = np.zeros(ranking.shape, dtype=np.int32)
                row_ids = np.arange(ranking.shape[0])[:, np.newaxis]
                videos_with_ranks[row_ids, ranking] = np.arange(ranking.shape[1])

                rows, positives = np.where(labels)
                positives_list.append(positives)
                positives_ranks_list.append(videos_with_ranks[rows, positives])
                ndcg_dist.append(NDCG_binary_at_k_batch(logit_validation, heldout_batch))

            all_positives = np.concatenate(positives_list)
            all_positives_ranks = np.concatenate(positives_ranks_list)
            ndcg_dist = np.concatenate(ndcg_dist)
            ndcg_ = ndcg_dist.mean()
            ndcg_serr = ndcg_dist.std() / np.sqrt(ndcg_dist.shape[0])
            print(ndcg_, '+/-', ndcg_serr)

            video_ranks_df = pd.DataFrame({'sid': all_positives, 'ranks': all_positives_ranks})
            video_ranks_df['mrr'] = 1.0 / (video_ranks_df['ranks'] + 1.0)
            video_playcount_df = train_df.groupby('sid').size().reset_index(name='playcount')
            # Select the columns with a list: tuple-style `['ranks', 'mrr']` selection on a
            # GroupBy is rejected by current pandas.
            video_avg_mrr_df = video_ranks_df.groupby('sid')[['ranks', 'mrr']].mean().reset_index()
            video_playcount_mrr_sorted_df = pd.merge(
                video_playcount_df, video_avg_mrr_df, on='sid'
            ).sort_values(by='playcount', ascending=False)
            return video_playcount_mrr_sorted_df, video_ranks_df, ndcg_dist

In [None]:
# Evaluate the checkpoint of the censored model that uses video (document) tags only.
(video_playcount_mrr_sorted_withtags_df,
 video_ranks_withtags_df,
 ndcg_withtags_dist) = run_video_level_metric_analysis(
    experiment_name='censored-testmetrics-user=id-video=id_genreyear-learningrate=0.004-regularization=1e-09-numfactors=100-prior_var-1.0-epochs-1000-attempt=2',
    EXP_DIR='/data/ml20m/ml-20m/exp',
    heldout_idxlist=heldout_idxlist,
    heldout_data_te=heldout_data_te,
    heldout_start_index=heldout_start_index,
    model_censored=True,
    model_user_tags=False,
    model_video_tags=True,
)

In [None]:
# Evaluate the checkpoint of the censored model that uses neither user nor video tags.
(video_playcount_mrr_sorted_withouttags_df,
 video_ranks_withouttags_df,
 ndcg_withouttags_dist) = run_video_level_metric_analysis(
    experiment_name='censored-testmetrics-user=id-video=id-learningrate=0.004-regularization=1e-09-numfactors=100-prior_var-1.0-epochs-1000-attempt=2',
    EXP_DIR='/data/ml20m/ml-20m/exp',
    heldout_idxlist=heldout_idxlist,
    heldout_data_te=heldout_data_te,
    heldout_start_index=heldout_start_index,
    model_censored=True,
    model_user_tags=False,
    model_video_tags=False,
)

In [None]:
# Evaluate the current experiment's checkpoint with both user and video tags enabled.
(video_playcount_mrr_sorted_with_uv_tags_df,
 video_ranks_with_uv_tags_df,
 ndcg_with_uv_tags_dist) = run_video_level_metric_analysis(
    experiment_name=experiment_name,
    EXP_DIR=EXP_DIR,
    heldout_idxlist=heldout_idxlist,
    heldout_data_te=heldout_data_te,
    heldout_start_index=heldout_start_index,
    model_censored=True,
    model_user_tags=True,
    model_video_tags=True,
)

In [None]:
# Tag each result frame with the model variant it came from; the label is used
# further down as a grouping key and as the plot hue.
video_playcount_mrr_sorted_withtags_df['label'] = 'with document tags'
video_playcount_mrr_sorted_withouttags_df['label'] = 'without tags'
video_playcount_mrr_sorted_with_uv_tags_df['label'] = 'with user, document tags'

In [None]:
# Boundaries of the play-count buckets: the 0%, 25%, 50%, 75% and 100% quantiles of the
# per-item play counts, truncated to integers.
levels = video_playcount_mrr_sorted_withtags_df['playcount'].quantile(np.linspace(0, 1, 5)).values.astype(np.int32)
# For every item, the 1-based position of the first boundary that is >= its play count.
# The comparison is done on the underlying ndarray because pandas no longer supports
# multi-dimensional indexing (`series[:, np.newaxis]`) on a Series.
num_plays_ranks = np.argmax(
    video_playcount_mrr_sorted_withtags_df['playcount'].values[:, np.newaxis] <= levels,
    axis=1,
) + 1

In [None]:
# Attach the play-count bucket to every result frame.
# NOTE(review): num_plays_ranks was derived from the with-tags frame only and appears to
# be a plain NumPy array, so it is assigned by position; this assumes all three frames
# contain the same items in the same order -- TODO confirm.
video_playcount_mrr_sorted_withtags_df['rank'] = num_plays_ranks
video_playcount_mrr_sorted_withouttags_df['rank'] = num_plays_ranks
video_playcount_mrr_sorted_with_uv_tags_df['rank'] = num_plays_ranks

In [None]:
# Stack the three per-variant frames into one long frame; the 'label' column
# identifies which model variant a row belongs to.
video_playcount_mrr_df = pd.concat([video_playcount_mrr_sorted_withouttags_df,
                                    video_playcount_mrr_sorted_withtags_df,
                                    video_playcount_mrr_sorted_with_uv_tags_df])

In [None]:
# Add the per-item availability flag (join on 'sid').
# NOTE(review): availability_df is only created inside the `if model_censored:` cell, so
# this cell needs that branch to have run. The cell also rebinds video_playcount_mrr_df,
# so re-running it merges the availability column again.
video_playcount_mrr_df = pd.merge(video_playcount_mrr_df, availability_df, on = 'sid')

In [None]:
%pylab inline

In [None]:
import seaborn as sns

In [None]:
 # Use seaborn's "whitegrid" style for the plots that follow.
 sns.set(style="whitegrid")

In [None]:
# Average of the per-item 'ranks' column for every combination of availability flag,
# play-count bucket ('rank') and model variant ('label').
video_playcount_mrr_df.groupby(['availability', 'rank', 'label'])['ranks'].mean()

In [None]:
# Same average as above, but pooled over the play-count buckets.
video_playcount_mrr_df.groupby(['availability', 'label'])['ranks'].mean()

In [None]:
# Bar chart of the average rank per availability flag, one bar colour per model variant.
fig1, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=video_playcount_mrr_df, x="availability", y="ranks", hue='label', ax=ax)
ax.set(xlabel='Available', ylabel='Average Rank');

In [None]:
# Display the notebook-level model flags that the next cell passes to Model1.
model_censored, model_user_tags, model_video_tags

In [None]:
# Sanity check: rebuild the model graph with the notebook-level flags, restore the
# checkpoint of `experiment_name` and print the validation logits for users 0, 1 and 2.
tf.reset_default_graph()
with tf.Graph().as_default():
    model1 = Model1(num_users,
                num_items,
                num_tags,
                num_factors,
                var_prior,
                video_metadata_array,
                availability,
                model_censored,
                model_user_tags,
                model_video_tags)
    # construct_graph() returns seven tensors; only batch_logits_validation is used below.
    batch_logits, batch_logits_validation, log_softmax, avg_loss, batch_conditional_log_likelihood,\
    batch_kl_div, num_items_per_document = model1.construct_graph()
    config = tf.ConfigProto()
    # Let TensorFlow grow its GPU memory allocation on demand.
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        # Checkpoint location: <EXP_DIR>/tensorflow_models/<experiment_name>/files
        saver.restore(sess, os.path.join(EXP_DIR, 'tensorflow_models', experiment_name, 'files'))
        logit_validation = sess.run(batch_logits_validation,
                                    feed_dict={model1.users_ph : [0,1,2]})
        print(logit_validation)

In [None]:
# Inspect the played_tags_ph attribute of the model built in the previous cell
# (presumably a TensorFlow placeholder, judging by the `_ph` suffix -- TODO confirm).
model1.played_tags_ph