In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import time
import random
import tensorflow as tf
from scipy import spatial
import os
import json
import argparse
import scipy
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import copy
from joblib import *
SEED = int(open("SEED.txt", "r").readlines()[0])

In [2]:
# In[2]:


parser = argparse.ArgumentParser()
parser.add_argument('--npos', type=str, default=0,help='neg or not pos')
parser.add_argument('--beta', type=str, default='',help='beta')
parser.add_argument('--iter', type=str, default=0,help='max iteration')
parser.add_argument('--dist', type=str, default='',help='distant type')
parser.add_argument('--nov', type=str, default='',help='novelty type')
parser.add_argument('--baseline', type=str, default=0,help='whether to run baseline')

FLAGS, _ = parser.parse_known_args()

DATA_DIR='../../data/ml-1m/'
MODEL_DIR='./'


NPOS_FLG=1#int(FLAGS.npos)
DISTANT_TYPE=0#int(FLAGS.dist)
NOVELTY_TYPE=1#int(FLAGS.nov)
BASELINE=1#int(FLAGS.baseline)

assert(DISTANT_TYPE==0 or DISTANT_TYPE==1)
assert(NOVELTY_TYPE==0 or NOVELTY_TYPE==1)

DATASET_NAME="ml"

DATASETOBJ_PATH = '%s_dataset_%d.pkl'%(DATASET_NAME,SEED)

In [3]:
class MovieLens:
    def load_raw_data(self):
        f=tf.gfile.Open(DATA_DIR + 'ratings.dat',"r")
        self.df_rating = pd.read_csv(
            f,
            sep='::',
            names=['uid', 'itemid', 'rating', 'time'])
        
        f=tf.gfile.Open(DATA_DIR + 'users.dat',"r")
        self.df_userinfo = pd.read_csv(
            f,
            sep='::',
            names=['uid', 'sex', 'age', 'occupation', 'zip_code'])
        
        list_item_attr = [
            'itemid', 'title', "Action", "Adventure", "Animation", "Children's",
            "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
            "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
            "War", "Western"
        ]
        item_data=[]
        for line in open(DATA_DIR + 'movies.dat',"r").readlines():
            tmp=line.strip().split('::')
            itemid=tmp[0]
            title=tmp[1]
            tmp=tmp[2].split('|')
            genres=[]
            for g in (list_item_attr[2:]):
                if g in tmp:
                    genres.append(1)
                else:
                    genres.append(0)
            item_data.append([itemid,title]+genres)
        self.df_iteminfo=pd.DataFrame(data=item_data,columns=list_item_attr)
        self.df_userinfo = self.df_userinfo.fillna(0)
        self.df_iteminfo = self.df_iteminfo.fillna(0)
        self.df_iteminfo["itemid"]=self.df_iteminfo["itemid"].astype("int64")

    def feature_engineering(self):
        
        le=LabelEncoder()
        le.fit(self.df_iteminfo["itemid"])
        self.df_iteminfo["itemid"]=le.transform(self.df_iteminfo["itemid"])
        self.df_rating["itemid"]=le.transform(self.df_rating["itemid"])

        le=LabelEncoder()
        le.fit(self.df_userinfo["uid"])
        self.df_userinfo["uid"]=le.transform(self.df_userinfo["uid"])
        self.df_rating["uid"]=le.transform(self.df_rating["uid"])
        self.df_iteminfo.drop([ "title"],axis=1,inplace=True)
        
        ##iteminfo
        for index,(df_all,num_feat) in enumerate([(self.df_iteminfo,self.item_numerical_attr),\
                                (self.df_userinfo,self.user_numerical_attr)]):
            df_obj = df_all.drop(num_feat,axis=1)
            df_numeric=df_all[num_feat]
            for c in df_obj:
                df_obj[c] = (pd.factorize(df_obj[c])[0])
            df_all = pd.concat([df_obj,df_numeric], axis=1)
            if index==0:
                self.df_iteminfo=df_all
            else:
                self.df_userinfo=df_all

    def __init__(self):
        self.rating_threshold = 3
        self.load_raw_data()
        self.df_iteminfo["padding"]=0
        self.user_numerical_attr =  ["age"]
        self.item_numerical_attr = ["padding"]
        
        self.feature_engineering()



# In[5]:

In [4]:
movielens=MovieLens()

  import sys
  del sys.path[0]


In [7]:
# In[3]:


class DataSetProcesser():
    def calculate_data(self):
        self.list_uid = self.df_userinfo.uid
        self.list_itemid = self.df_iteminfo.itemid

        self.all_rateduser_byitemid = [[] for itemid in self.list_itemid]
        self.all_posuser_byitemid = [[] for itemid in self.list_itemid]
        self.all_rateditem_byuid = [[] for uid in self.list_uid]
        self.all_positem_byuid = [[] for uid in self.list_uid]
        self.all_neguser_byitemid = [ [] for itemid in self.list_itemid]
        self.all_negitem_byuid = [[] for uid in self.list_uid]
        
        self.all_ratings_byitemid=[[] for itemid in self.list_itemid]
        self.all_ratings_byitemid_sparse=[[] for itemid in self.list_itemid]
        self.all_israted_byitemid_sparse=[[] for itemid in self.list_itemid]
        self.all_ratings_uid_byitemid=[[] for itemid in self.list_itemid]

        sz = len(self.df_rating)
        
        rating_mat=self.df_rating.values

        for (index, row) in enumerate(rating_mat):
            if index % 10000 == 0:
                print('Preprocessing Dataset', index, '/', sz)
                
            uid,itemid,rating=row[0],row[1],row[2]
           
            self.all_rateduser_byitemid[itemid].append(uid)
            self.all_rateditem_byuid[uid].append(itemid)

            self.all_ratings_byitemid[itemid].append(rating)
            self.all_ratings_uid_byitemid[itemid].append(uid)
            
            if rating > self.rating_threshold:
                self.all_posuser_byitemid[itemid].append(uid)
                self.all_positem_byuid[uid].append(itemid)
            else:
                self.all_neguser_byitemid[itemid].append(uid)
                self.all_negitem_byuid[uid].append(itemid)
                
        self._USER_SIZE_ONLY_NUM = len(self.user_numerical_attr)
        self._USER_SIZE_OF_FIELDS = []
        for feat in self.df_userinfo.columns:
            if feat not in self.user_numerical_attr:
                self._USER_SIZE_OF_FIELDS.append(
                    len(np.unique(self.df_userinfo[feat])))
        for feat in self.user_numerical_attr:
            self._USER_SIZE_OF_FIELDS.append(1)

        self._USER_SIZE = len(self._USER_SIZE_OF_FIELDS)
        self._USER_SIZE_OF_MASK_FIELDS = self._USER_SIZE_OF_FIELDS[:-self.
                                                                   _USER_SIZE_ONLY_NUM]
        self._USER_SIZE_BIN = sum(self._USER_SIZE_OF_FIELDS)

        self._ITEM_SIZE_ONLY_NUM = len(self.item_numerical_attr)

        self._ITEM_SIZE_OF_FIELDS = []
        for feat in self.df_iteminfo.columns:
            if feat in self.item_numerical_attr:
                self._ITEM_SIZE_OF_FIELDS.append(1)
            else:
                self._ITEM_SIZE_OF_FIELDS.append(
                    len(np.unique(self.df_iteminfo[feat])))

        self._ITEM_SIZE = len(self._ITEM_SIZE_OF_FIELDS)
        self._ITEM_SIZE_OF_MASK_FIELDS = self._ITEM_SIZE_OF_FIELDS[:-self.
                                                                   _ITEM_SIZE_ONLY_NUM]
        self._ITEM_SIZE_BIN = sum(self._ITEM_SIZE_OF_FIELDS)

    def split_history(self, dic,ratio):
        seed = self.seed
        dic1 = [[] for k in range(len(dic))]
        dic2 = [[] for k in range(len(dic))]
        for index in range(len(dic)):
            lst=dic[index]
            lenoflist = len(lst)
            if lenoflist != 0:
                random.Random(seed).shuffle(lst)
                dic1[index] = lst[:int(ratio * lenoflist)]
                dic2[index] = lst[int(ratio * lenoflist):]
            else:
                dic1[index] = []
                dic2[index] = []
        return dic1, dic2

    def merge_history(self, dic1, dic2):
        return [ dic1[ky] + dic2[ky] for ky in range(len(dic1))]

    def reverse_user_and_item(self, dict_byuid):
        result = [[] for itemid in self.list_itemid]
        for uid in range(len(dict_byuid)):
            for itemid in dict_byuid[uid]:
                result[itemid].append(uid)
        return result

    def split_data(self):
        #print(self.all_positem_byuid[1])
        self.train_positem_byuid, self.test_positem_byuid = self.split_history(
            self.all_positem_byuid,self.ratio)
        #print(self.all_positem_byuid[1])

        self.train_posuser_byitemid, self.test_posuser_byitemid = self.reverse_user_and_item(
            self.train_positem_byuid), self.reverse_user_and_item(
                self.test_positem_byuid)

        self.train_negitem_byuid, self.test_negitem_byuid = self.split_history(
            self.all_negitem_byuid,self.ratio)

        self.train_neguser_byitemid, self.test_neguser_byitemid = self.reverse_user_and_item(
            self.train_negitem_byuid), self.reverse_user_and_item(
                self.test_negitem_byuid)

        self.train_rateduser_byitemid = self.merge_history(
            self.train_posuser_byitemid, self.train_neguser_byitemid)

        self.test_rateduser_byitemid = self.merge_history(
            self.test_posuser_byitemid, self.test_neguser_byitemid)

        self.train_rateditem_byuid = self.merge_history(self.train_positem_byuid,
                                                     self.train_negitem_byuid)

        self.test_rateditem_byuid = self.merge_history(self.test_positem_byuid,
                                                    self.test_negitem_byuid)
        
        self.train_ratings_byitemid=[[] for itemid in self.list_itemid]
        self.train_ratings_byitemid_sparse=[[] for itemid in self.list_itemid]
        self.train_israted_byitemid_sparse=[[] for itemid in self.list_itemid]
        self.train_ratings_uid_byitemid=[[] for itemid in self.list_itemid]
        sz=len(self.list_itemid)
        for itemid in range(len(self.list_itemid)):
            if itemid % 50 == 0:
                print('dense rating',itemid,'/',sz)
            for index in range(len(self.all_ratings_uid_byitemid[itemid])):
                uid,rating=self.all_ratings_uid_byitemid[itemid][index],self.all_ratings_byitemid[itemid][index]
                if uid in self.train_rateduser_byitemid[itemid]:
                    self.train_ratings_byitemid[itemid].append(rating)
                    self.train_ratings_uid_byitemid[itemid].append(uid)
                
        usz=len(self.list_uid)
        isz=len(self.list_itemid)
        for itemid in range(len(self.list_itemid)):
            if itemid % 50 == 0:
                print('sparse all rating',itemid,'/',isz)     
            vec=self.all_ratings_byitemid[itemid]
            vec=vec/np.linalg.norm(vec)
            self.all_ratings_byitemid_sparse[itemid]=sparse.coo_matrix((vec,
                                                                       ([0.0 for k in range(len(vec))],
                                                                       self.all_ratings_uid_byitemid[itemid])),shape=(1,usz))
            self.all_israted_byitemid_sparse[itemid]=sparse.coo_matrix(([1.0 for k in range(len(vec))],
                                                                       ([0.0 for k in range(len(vec))],
                                                                       self.all_ratings_uid_byitemid[itemid])),shape=(1,usz))
            
            vec=self.train_ratings_byitemid[itemid]
            vec=vec/np.linalg.norm(vec)
            self.train_ratings_byitemid_sparse[itemid]=sparse.coo_matrix((vec,
                                                                       ([0.0 for k in range(len(vec))],
                                                                       self.train_ratings_uid_byitemid[itemid])),shape=(1,usz))
            self.train_israted_byitemid_sparse[itemid]=sparse.coo_matrix(([1.0 for k in range(len(vec))],
                                                                       ([0.0 for k in range(len(vec))],
                                                                       self.train_ratings_uid_byitemid[itemid])),shape=(1,usz))
            
            
        

    def __init__(self, movielens, split_ratio, seed=SEED):
        self.seed = seed
        self.rating_threshold = movielens.rating_threshold
        self.ratio = split_ratio
        self.df_rating = movielens.df_rating
        self.df_userinfo = movielens.df_userinfo
        self.df_iteminfo = movielens.df_iteminfo
        self.user_numerical_attr = movielens.user_numerical_attr
        self.item_numerical_attr = movielens.item_numerical_attr
        self.calculate_data()
        self.split_data()

In [9]:
try:
    f=open(DATASETOBJ_PATH,"rb")
    dataset=pickle.load(f)
    f.close()
except:
    movielens=MovieLens()
    dataset=DataSetProcesser(movielens,0.7)
    f=open(DATASETOBJ_PATH,"wb")
    pickle.dump(dataset,f)
    f.close()


# In[6]:

In [10]:
def check():
    rating=movielens.df_rating
    user=movielens.df_userinfo
    item=movielens.df_iteminfo

    uid=10
    itemid=12

    tmp1=[0 for k in range(len(dataset.list_uid))]
    for row in rating[rating.itemid==itemid].values:
        tmp1[row[0]]=row[2]
    tmp1=np.array(tmp1)/np.linalg.norm(tmp1)

    tmp2=dataset.all_ratings_byitemid_sparse[itemid].toarray()[0]

    print(sum(tmp1)==sum(tmp2))

    tmp1=[0 for k in range(len(dataset.list_uid))]
    for row in rating[rating.itemid==itemid].values:
        if row[0] in dataset.train_rateduser_byitemid[row[1]]:
            tmp1[row[0]]=row[2]
    tmp1=np.array(tmp1)/np.linalg.norm(tmp1)

    tmp2=dataset.train_ratings_byitemid_sparse[itemid].toarray()[0]

    sum(tmp1==tmp2)==len(tmp1)

    tmp1=sorted(dataset.all_positem_byuid[uid])
    tmp2=sorted(rating[rating.rating>3][rating.uid==uid].itemid.tolist())
    print(tmp1==tmp2)

    tmp1=sorted(dataset.all_posuser_byitemid[itemid])
    tmp2=sorted(rating[rating.rating>3][rating.itemid==itemid].uid.tolist())
    print(tmp1==tmp2)
    
    tmp1=sorted(dataset.all_negitem_byuid[uid])
    tmp2=sorted(rating[rating.rating<=3][rating.uid==uid].itemid.tolist())
    print(tmp1==tmp2)


    tmp1=sorted(dataset.all_rateditem_byuid[uid])
    tmp2=sorted(rating[rating.rating!=0][rating.uid==uid].itemid.tolist())
    print(tmp1==tmp2)

    tmp1=sorted(dataset.all_rateduser_byitemid[itemid])
    tmp2=sorted(rating[rating.rating!=0][rating.itemid==itemid].uid.tolist())
    print(tmp1==tmp2)

    print(set(dataset.train_positem_byuid[uid])|set(dataset.test_positem_byuid[uid])==set(dataset.all_positem_byuid[uid]))
    print(set(dataset.train_rateditem_byuid[uid])|set(dataset.test_rateditem_byuid[uid])==set(dataset.all_rateditem_byuid[uid]))
    print(set(dataset.train_negitem_byuid[uid])|set(dataset.test_negitem_byuid[uid])==set(dataset.all_negitem_byuid[uid]))
    print(set(dataset.all_positem_byuid[uid])|set(dataset.all_negitem_byuid[uid])==set(dataset.all_rateditem_byuid[uid]))
    print(set(dataset.train_positem_byuid[uid])|set(dataset.train_negitem_byuid[uid])==set(dataset.train_rateditem_byuid[uid]))
    #print(dataset.train_rateditem_byuid[1])
check()

True




True
True




True
True
True
True
True
True
True
True


In [11]:
class RecommendSysUtil():

    def distant(self, lst1,lst2,is_test=0,dis_type=DISTANT_TYPE):
        if is_test==0:
            mat1=sparse.vstack([self.dataset.train_ratings_byitemid_sparse[i] if dis_type==1 \
                                   else self.dataset.train_israted_byitemid_sparse[i]\
                                   for i in lst1])
        else:
            mat1=sparse.vstack([self.dataset.all_ratings_byitemid_sparse[i] if dis_type==1 \
                                   else self.dataset.all_israted_byitemid_sparse[i]\
                                   for i in lst1])
        if is_test==0:
            mat2=sparse.vstack([self.dataset.train_ratings_byitemid_sparse[i] if dis_type==1 \
                                   else self.dataset.train_israted_byitemid_sparse[i]\
                                   for i in lst2])
        else:
            mat2=sparse.vstack([self.dataset.all_ratings_byitemid_sparse[i] if dis_type==1 \
                                   else self.dataset.all_israted_byitemid_sparse[i]\
                                   for i in lst2])
        if dis_type==1:
            dis_mat=1-mat1.dot(mat2.transpose()).toarray()
        else:
            dis_mat=mat1.dot(mat2.transpose()).toarray()
            #print(dis_mat)
            for i in range(len(lst1)):
                for j in range(len(lst2)):
                    sz=len(self.dataset.train_rateduser_byitemid[lst2[j]]) if is_test==0 \
                    else len(self.dataset.all_rateduser_byitemid[lst2[j]])
                    #print(dis_mat[index],sz,1.0-1.0*dis_mat[index]/sz)
                    dis_mat[i][j]=1.0-1.0*dis_mat[i][j]/sz
                #print(dis_mat[index])
        return dis_mat
    

    def novelty(self,n_jobs=1,is_test=0,novelty_type=NOVELTY_TYPE):  
        if novelty_type==1:
            def novelty_dist(self,uid,is_test=0):
                if (is_test==0 and len(self.dataset.train_positem_byuid[uid])==0) or (
                    is_test==1 and len(self.dataset.all_positem_byuid[uid])==0):
                    return []
                #start = time.clock()
                if is_test==0:
                    pos_lst=list(self.dataset.train_positem_byuid[uid])
                else:
                    pos_lst=list(self.dataset.all_positem_byuid[uid])
                if is_test==0:
                    rated_lst=list(self.dataset.train_rateditem_byuid[uid])
                else:
                    rated_lst=list(self.dataset.all_rateditem_byuid[uid])
                dis_mat=self.distant(pos_lst,rated_lst,is_test)
                nov_lst=[]
                for index_i,i in enumerate(pos_lst):
                    nov_lst.append(np.mean([dis_mat[index_i][index_j] for index_j,j in enumerate(rated_lst)]))
                return nov_lst
            results = Parallel(n_jobs=n_jobs,verbose=100,pre_dispatch='all',batch_size=int(len(self.dataset.list_uid)/n_jobs))(
            delayed(novelty_dist)(self,uid,is_test) for uid in dataset.list_uid)
        else:
            def novelty_popular(self,uid,is_test=0):
                if is_test==0:
                    nov_lst=[
                        -np.log2(1.0*len(self.dataset.train_rateduser_byitemid[itemid]) \
                                 / len(self.dataset.list_uid) + pow(10, -9)) \
                             for itemid in self.dataset.train_positem_byuid[uid]
                    ]
                else:
                    nov_lst=[
                        -np.log2(1.0*len(self.dataset.all_rateduser_byitemid[itemid]) \
                                 / len(self.dataset.list_uid) + pow(10, -9)) \
                             for itemid in self.dataset.all_positem_byuid[uid]
                    ]
                #if uid==0:
                    #print(uid,nov_lst)
                return nov_lst
            results = Parallel(n_jobs=n_jobs,verbose=100,pre_dispatch='all',batch_size=int(len(self.dataset.list_uid)/n_jobs))(
            delayed(novelty_popular)(self,uid,is_test) for uid in dataset.list_uid)
        return results  
    
    def item_vectorize(self, itemid):
        return [ x for x in self.dataset.df_iteminfo.iloc[itemid,:].values] 

    def user_vectorize(self, uid):
        return [ x for x in self.dataset.df_userinfo.iloc[uid,:].values]  
                
    def __init__(self, dataset):
        self.dataset = dataset

In [12]:
util=RecommendSysUtil(dataset)

In [13]:
def check_dist():
    print(1-len(set(dataset.all_rateduser_byitemid[1])&set(dataset.all_rateduser_byitemid[2]))/len(set(dataset.all_rateduser_byitemid[2])))
    print(1-len(set(dataset.train_rateduser_byitemid[1])&set(dataset.train_rateduser_byitemid[2]))/len(set(dataset.train_rateduser_byitemid[2])))
    
    print(util.distant([1,2],[2,1],dis_type=0))

    print(util.distant([1,2],[2,1],dis_type=0,is_test=1))

    print(util.distant([1],[1,2,3],dis_type=1,is_test=0))

    print(util.distant([1],[1,2,3],dis_type=1,is_test=1))
    def check(item1,item2,is_test=0):
        vec1=[0 for k in range(len(dataset.list_uid))]
        vec2=[0 for k in range(len(dataset.list_uid))]

        if is_test==0:
            for index,uid in enumerate(dataset.train_ratings_uid_byitemid[item1]):
                vec1[uid]=dataset.train_ratings_byitemid[item1][index]

            for index,uid in enumerate(dataset.train_ratings_uid_byitemid[item2]):
                vec2[uid]=dataset.train_ratings_byitemid[item2][index]
        else:
            for index,uid in enumerate(dataset.all_ratings_uid_byitemid[item1]):
                vec1[uid]=dataset.all_ratings_byitemid[item1][index]
               # print(vec1[uid])

            for index,uid in enumerate(dataset.all_ratings_uid_byitemid[item2]):
                vec2[uid]=dataset.all_ratings_byitemid[item2][index]

        vec1=np.array(vec1)/np.linalg.norm(vec1)
        vec2=np.array(vec2)/np.linalg.norm(vec2)
        return 1-vec1.dot(vec2)
#     return vec1,vec2
    print(check(1,2,is_test=0),check(1,3,is_test=0),check(1,2,is_test=1),check(1,3,is_test=1))
check_dist()

0.6799163179916319
0.805732484076433
[[0.80573248 0.        ]
 [0.         0.86593407]]
[[0.67991632 0.        ]
 [0.         0.78174037]]
[[-6.66133815e-16  8.66107392e-01  9.20543707e-01]]
[[-6.43929354e-15  7.59053552e-01  8.44543398e-01]]
0.866107391548371 0.9205437065370933 0.7590535515158896 0.8445433982798506


In [43]:
# def check_novelty():
#     mat=util.novelty(is_test=0,novelty_type=0)

#     -np.log2(1.0*len(self.dataset.train_rateduser_byitemid[itemid]) \
#                                      / len(self.dataset.list_uid) + pow(10, -9)) \
#                                  for itemid in self.dataset.train_positem_byuid[uid]

#     -np.log2(1.0*len(dataset.train_rateduser_byitemid[2722]) \
#                                      / len(dataset.list_uid) + pow(10, -9))

#     dataset.train_positem_byuid[0]

#     mat[0]

#     mat2=util.novelty(is_test=1,novelty_type=0)

#     mat2[0]

#     dataset.all_positem_byuid[0]

#      -np.log2(1.0*len(dataset.all_rateduser_byitemid[2722]) \
#                                      / len(dataset.list_uid) + pow(10, -9))

#     mat3=util.novelty(is_test=0,novelty_type=1)

#     mat4=util.novelty(is_test=1,novelty_type=1)

#     mat3[0]

#     rated_lst=dataset.train_positem_byuid[0]
#     dis_lst=[]
#     for item in rated_lst:
#         dis_lst.append(util.distant([2722],[item],is_test=0,dis_type=0)[0][0])
#     np.mean(dis_lst)

#     mat4[0]

#     rated_lst=dataset.all_positem_byuid[0]
#     dis_lst=[]
#     for item in rated_lst:
#         dis_lst.append(util.distant([2722],[item],is_test=1,dis_type=0)[0][0])
#     np.mean(dis_lst)

In [44]:
class RecommendSys():

    def get_novelty_distribution(self):
        print('Calculating novelty distribution...')
        try:
            f=open(self.nov_distri_path,"rb")
            train_distri,all_distri=pickle.load(f)
        except:
            train_distri=self.util.novelty(n_jobs=1,is_test=0)
            all_distri=self.util.novelty(n_jobs=1,is_test=1)
            f=open(self.nov_distri_path,"wb")
            pickle.dump((train_distri,all_distri),f)
        
        train_distri_pow=[ [pow(nov,self.beta) for nov in lst ] for lst in train_distri ]
        train_distri_pow=[ (np.array(lst)/sum(lst)).tolist() for lst in train_distri_pow ]

        return train_distri_pow,all_distri

    def predict(self, list_uid, list_itemid):
        user_batch = [self.util.user_vectorize(uid) for uid in list_uid]
        item_batch = [self.util.item_vectorize(itemid) for itemid in list_itemid]
        label_batch = [[1] * len(list_itemid) for uid in list_uid]
        prob_matrix = self.prob.eval(
            feed_dict={
                self.user_input: user_batch,
                self.item_input: item_batch,
                self.label: label_batch
            })
        return prob_matrix

    def predict_by_queue(self, list_uid, list_itemid,n_jobs=1):
        sz = len(list_uid)
        batch_sz = 5000
        bins = int(sz / batch_sz)
        def predict_batch(self,idx,list_uid,list_itemid):
            return  self.predict(list_uid, list_itemid)
        results = Parallel(n_jobs=n_jobs,verbose=100,pre_dispatch='all',batch_size=int(len(self.dataset.list_uid)/n_jobs))(
            delayed(predict_batch)(self,idx,list_uid[idx * batch_sz:(idx + 1) * batch_sz],list_itemid) for idx in range(bins))
        
        results=np.concatenate(results,axis=0)
        tmp = self.predict(list_uid[bins * batch_sz:], list_itemid)
        #print(np.shape(results),np.shape(tmp))
        if results != []:
            results = np.concatenate((results, tmp), axis=0)
        else:
            results = tmp
        return results

    def eval_performance(self):

        list_uid = dataset.list_uid
        list_itemid = dataset.list_itemid
        self.prob_by_uitem = self.predict_by_queue(list_uid, list_itemid)
        self.uid_to_recomm = self.base_recommend(self.prob_by_uitem,
                                                 self.top_N)
        ndcg,reward0, reward1, agg_div, entro_div = self.measure(
            self.uid_to_recomm)
        return ndcg, reward0, reward1, agg_div, entro_div

    def base_recommend(self, prob_by_uitem,top_N,n_jobs=1):
        #print(time.clock())
        self.base_uid_to_reomm={}
        def recommend_single_user(uid,rated, prob_lst,list_item,top_N):
            #start=time.clock()
            prob_arr = list(zip(list_item, prob_lst))
            prob_arr = sorted(prob_arr, key=lambda d: -d[1])
            cnt = 0
            recomm= []
            for pair in prob_arr:
                itemid = pair[0]
                rel=pair[1]
                if itemid not in rated:
                    recomm.append(itemid)
                    cnt += 1
                    if cnt == top_N:
                        break
            #print(time.clock()-start)
            return (uid,recomm)
        uid_to_recomm = Parallel(n_jobs=n_jobs,verbose=100,pre_dispatch='all',batch_size=int(len(self.dataset.list_uid)/n_jobs))(
            delayed(recommend_single_user)(uid,dataset.train_rateditem_byuid[uid],prob_by_uitem[uid],self.dataset.list_itemid,top_N) 
            for uid in list(filter(
                lambda uid: len(self.dataset.test_positem_byuid[uid]) >= self.top_N,self.dataset.list_uid)))
        self.base_uid_to_recomm=dict(uid_to_recomm)
        return self.base_uid_to_recomm

    def measure(self, uid_to_recomm):
        
        ndcg=0.0
        #print("NDCG: %.4f"%(ndcg))


        #### novelty metric
        avg_reward0 = 0.0
        avg_reward1 = 0.0
        agg_div = 0.0
        enp_div = 0.0

        cnt = 0
        for uid in uid_to_recomm:
            reward0 = 0.0
            reward1 = 0.0
            sz=len(self.dataset.train_positem_byuid[uid])
            for itemid in uid_to_recomm[uid]:
                if (itemid in self.dataset.test_positem_byuid[uid]):
                    nov = self.all_distri[uid][sz+self.dataset.test_positem_byuid[uid].index(itemid)]
                    #print(nov)
                    nov0 = pow(nov, 0)
                    nov1 = pow(nov, 1)
                    reward0 = max(reward0, nov0)
                    reward1 = max(reward1, nov1)
            avg_reward0 += reward0
            if reward1!=np.inf and reward1!=-np.inf and reward1 != 0.0 :
                avg_reward1 += reward1
                #print(reward1)
                cnt+=1
                

        if avg_reward0 != 0:
            avg_reward0 /= len(uid_to_recomm)
        if avg_reward1 != 0:
            avg_reward1 /= cnt
        print(
            'Novelty: reward(β=0)=%.5f reward(β=1)=%.5f'
            % (avg_reward0, avg_reward1))


        #### diversity metric
        recomm_set = set()
        for uid in uid_to_recomm:
            recomm_set = recomm_set | set(uid_to_recomm[uid])
        agg_div = 1.0*len(recomm_set) / len(uid_to_recomm) / self.top_N

        itemid_to_recomuser = {}

        for uid in uid_to_recomm:
            for itemid in uid_to_recomm[uid]:
                if itemid not in itemid_to_recomuser:
                    itemid_to_recomuser[itemid] = 0
                itemid_to_recomuser[itemid] += 1

        s = 0
        for itemid in itemid_to_recomuser:
            s += itemid_to_recomuser[itemid]

        for itemid in itemid_to_recomuser:
            probb = 1.0*itemid_to_recomuser[itemid] / s + pow(10, -9)
            enp_div += -(np.log2(probb) * probb)

        print(
            'Diversity: aggdiv=%.5f entropydiv=%.5f'
            % (agg_div, enp_div))
        return ndcg, avg_reward0, avg_reward1, agg_div, enp_div

    # def print_recommend(uid):
        
    def train_a_batch(self, iter, session):
        
        loss_all = 0

        user_batch = []
        item_batch = []
        label_batch = []
        list_positemid = []
        list_uid = []
        list_label = []
        list_negitemid = []

        for i in range(self.batch_size):
            uid = 0
            while (True):
                uid = self.rng.randint(1, self.NUM_USERS)
                dataset = self.dataset
                if ((uid in dataset.list_uid)
                        and len(dataset.train_positem_byuid[uid]) != 0)#and len(dataset.train_negitem_byuid[uid]) != 0):
                    break
            list_uid.append(uid)

        for uid in list_uid:
            pos_itemid = self.rng.choice(
                self.dataset.train_positem_byuid[uid], p=self.pos_distri[uid])
            list_positemid.append(pos_itemid)
            list_label.append(1)
            user_batch.append(self.util.user_vectorize(uid))
            pos_itemvec = self.util.item_vectorize(pos_itemid)
            item_batch.append(pos_itemvec)

        prob_by_uitem = self.predict(list_uid, list_positemid)

        neg_itemset = set()
        neg_index = {}
        for uid in list_uid:
            if NPOS_FLG==1:
                neg_itemset = neg_itemset | (
                    set(self.dataset.list_itemid)
                    -set(self.dataset.train_positem_byuid[uid])
                )
            else:
                neg_itemset = neg_itemset | set(dataset.train_negitem_byuid[uid])
        
        neg_itemset=list(neg_itemset)
        for index, neg_item in enumerate(neg_itemset):
            neg_index[neg_item] = index
        neg_prob_by_uitem = self.predict(list_uid, neg_itemset)

        violator_cnt = 0
        for i, uid in enumerate(list_uid):
            neg_itemid = -1
            pos_itemid = list_positemid[i]
            pos_prob = prob_by_uitem[i][i]
            for k in range(self.LIMIT):
                
                if NPOS_FLG==1:
                    neg_item_list=list(
                    set(self.dataset.list_itemid)
                    -set(self.dataset.train_positem_byuid[uid])
                    )
                    neg_itemid = self.rng.choice(
                        neg_item_list)
                else:
                    neg_itemid = self.rng.choice(
                        self.dataset.train_negitem_byuid[uid])
                neg_prob = neg_prob_by_uitem[i][neg_index[neg_itemid]]
                if neg_prob >= pos_prob and neg_prob != 0:
                    break
                else:
                    neg_itemid = -1

            if neg_itemid != -1:
                violator_cnt += 1
                list_label.append(-1)
                user_batch.append(self.util.user_vectorize(uid))
                neg_itemvec = self.util.item_vectorize(neg_itemid)
                item_batch.append(neg_itemvec)

        label_batch = [[1] * len(user_batch) for j in range(len(user_batch))]
        for i, label in enumerate(list_label):
            label_batch[i][i] = label

        feed_dict = {
            self.user_input: user_batch,
            self.item_input: item_batch,
            self.label: label_batch
        }
        if iter != 0:
            [_optimize, _loss] = session.run(
                [self.optimize, self.loss], feed_dict=feed_dict)
        else:
            writer = tf.summary.FileWriter("log",session.graph)
            [_loss] = session.run([self.loss], feed_dict=feed_dict)
            writer.close()

        return _loss
                
    def train(self,
              nov_distri_path,
              model_path,
              beta=0.0,
              batch_size=128,
              learning_rate=0.006,
              nu=0.0001,
              embedding_size=600,
              EVERY_N_ITERATIONS=50,
              MAX_ITERATIONS=0,
              predict_pair=[]):
        self.rng=np.random.RandomState(SEED)
        self.beta = beta
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.nu = nu
        self.embedding_size = embedding_size
        self.EVERY_N_ITERATIONS = EVERY_N_ITERATIONS
        self.MAX_ITERATIONS = MAX_ITERATIONS

        self.nov_distri_path = "./"+ nov_distri_path
        self.model_path = "./"+ model_path
        
        self.pos_distri,self.all_distri=self.get_novelty_distribution()

        
        graph = tf.Graph()
        dataset = self.util.dataset
        with graph.as_default():
            tf.set_random_seed(SEED)
            with tf.name_scope("input"):
                self.user_input = tf.placeholder(
                    tf.int32, shape=[None, dataset._USER_SIZE], name='user_info')
                self.item_input = tf.placeholder(
                    tf.int32, shape=[None, dataset._ITEM_SIZE], name='item_info')
                self.label = tf.placeholder(
                    tf.int32, shape=[None, None], name='label')
                
            with tf.name_scope("intercept"):
                 b = tf.Variable(
                    initial_value=tf.truncated_normal(
                        (self.embedding_size, 1),
                        stddev=1.0 / np.sqrt(self.embedding_size)))

            with tf.name_scope("user_embedding"):
                W = tf.Variable(
                    initial_value=tf.truncated_normal(
                        (self.embedding_size, dataset._USER_SIZE_BIN),
                        stddev=1.0 / np.sqrt(self.embedding_size)))
                w_offsets = [0] + [
                    sum(dataset._USER_SIZE_OF_MASK_FIELDS[:i + 1])
                    for i, j in enumerate(dataset._USER_SIZE_OF_MASK_FIELDS[:-1])
                ]
                w_offsets = tf.matmul(
                    tf.ones(
                        shape=(tf.shape(self.user_input)[0], 1), dtype=tf.int32),
                    tf.convert_to_tensor([w_offsets]))
                w_columns = self.user_input[:, :-dataset.
                                            _USER_SIZE_ONLY_NUM] + w_offsets  # last column is not an index
                w_selected = tf.gather(W, w_columns, axis=1)
            # age * corresponding column of W
            
                aux = tf.matmul(
                    W[:, -dataset._USER_SIZE_ONLY_NUM:],
                    tf.transpose(
                        tf.to_float(
                            (self.user_input[:, -dataset._USER_SIZE_ONLY_NUM:]))))
                user_batch_num = tf.reshape(
                    aux,
                    shape=(self.embedding_size, tf.shape(self.user_input)[0], 1))
                w_with_num = tf.concat([w_selected, user_batch_num], axis=2)
                w_result = tf.reduce_sum(w_with_num, axis=2)
            with tf.name_scope("item_embedding"):
                A = tf.Variable(
                    initial_value=tf.truncated_normal(
                        (self.embedding_size, dataset._ITEM_SIZE_BIN),
                        stddev=1.0 / np.sqrt(self.embedding_size)))
                # select and sum the columns of A depending on the input
                a_offsets = [0] + [
                    sum(dataset._ITEM_SIZE_OF_MASK_FIELDS[:i + 1])
                    for i, j in enumerate(dataset._ITEM_SIZE_OF_MASK_FIELDS[:-1])
                ]
                a_offsets = tf.matmul(
                    tf.ones(
                        shape=(tf.shape(self.item_input)[0], 1), dtype=tf.int32),
                    tf.convert_to_tensor([a_offsets]))
                a_columns = self.item_input[:, :-dataset.
                                            _ITEM_SIZE_ONLY_NUM] + a_offsets  # last two columns are not indices
                a_selected = tf.gather(A, a_columns, axis=1)
                # dates * corresponding last two columns of A
                aux = tf.matmul(
                    A[:, -dataset._ITEM_SIZE_ONLY_NUM:],
                    tf.transpose(
                        tf.to_float(
                            self.item_input[:, -dataset._ITEM_SIZE_ONLY_NUM:])))
                item_batch_num = tf.reshape(
                    aux,
                    shape=(self.embedding_size, tf.shape(self.item_input)[0], 1))
                # ... and the intercept
                intercept = tf.gather(
                    b,
                    tf.zeros(
                        shape=(tf.shape(self.item_input)[0], 1), dtype=tf.int32),
                    axis=1)
                a_with_num = tf.concat(
                    [a_selected, item_batch_num, intercept], axis=2)
                a_result = tf.reduce_sum(a_with_num, axis=2)

            with tf.name_scope("output"):
                g = tf.matmul(tf.transpose(w_result), a_result)

                x = tf.to_float(self.label) * g
                self.prob = tf.nn.sigmoid(x)

                loss = tf.reduce_mean(tf.nn.softplus(tf.diag_part(-x)))

                # Regularization
                reg = self.nu * (tf.nn.l2_loss(W) + tf.nn.l2_loss(A))
                # Loss function with regularization (what we want to minimize)
                loss_to_minimize = loss + reg
                
                self.loss= loss_to_minimize

                self.optimize = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate).minimize(
                        loss=loss_to_minimize)
        # Once thep graph is created, let's probgram the training loop
        
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        
        with tf.Session(config=config,graph=graph) as session:
            tf.set_random_seed(SEED)
            session.run(tf.global_variables_initializer())
            saver = tf.train.Saver()

            try:
                saver.restore(session, self.model_path)
            except:
                pass


            average_loss = 0.0
            for iter in range(self.MAX_ITERATIONS + 1):
                train_loss = self.train_a_batch(iter, session)
                average_loss += train_loss
                print('Iteration', iter, 'Train_loss', train_loss)

                if iter % self.EVERY_N_ITERATIONS == 0:
                    ndcg,reward0, reward1, agg_div, entro_div = self.eval_performance()
                    saver.save(session, self.model_path)
                        
            result = {}
            return result, ndcg, reward1, agg_div, entro_div
         
# In[ ]:


    def run_baseline(self,pref,dis_type,n_jobs=1):
        if BASELINE==1:
            s1="%s_nov_distri_beta%.1f"%(pref,0.0)
        if NPOS_FLG==1:
            s1+="_npos"
        else:
            s1+="_normal"
        s2="%s_K_600_beta_%.1f_vald2"%(pref,0.0)
        if NPOS_FLG==1:
            s2+="_npos"
        else:
            s2+="_normal"
        result,ndcg, reward1, agg_div, entro_div=self.train(
            s1,s2,beta=0.0,predict_pair=[],MAX_ITERATIONS=0)

        RESORT_LEN=500
        tmp=self.base_recommend(self.prob_by_uitem, RESORT_LEN)
        
        sz=len(tmp)
        
        self.uid_to_dismat={}
        for uid in tmp:
            recommend=tmp[uid]
            dis_mat=self.util.distant(recommend,recommend,is_test=0,dis_type=dis_type)
            self.uid_to_dismat[uid]=dis_mat
            #if uid==1:
                #print(dis_mat)
        result_list = []
        k_list=[]
        if dis_type==0:
            k_list=[0.0,0.02,0.04,0.06,0.08,0.1,0.3,0.5,0.7,
            0.9,1.0]
        else:
            k_list=[0.04,0.08,0.1,0.12,0.14,0.16,0.18,0.2,0.22,0.24]

        def MMR(self,uid,k,dis_type):
            R, S = self.base_uid_to_recomm[uid], []
            #start=time.clock()
            dis_mat=self.uid_to_dismat[uid]
            #print('dis use time',time.clock()-start)
            for index,itemid in enumerate(self.base_uid_to_recomm[uid]):
                    R[index]=(itemid,index)
            #print(R[:20])
            #start=time.clock()
            #print('precise recommend uid',uid)
            for iter in range(self.top_N):
                #print(iter)
                fobj_set = []
                for itemid_r,index_r in R:
                    #print(itemid_r,index_r)
                    rel = self.prob_by_uitem[uid][itemid_r]
                    min_dist = 1.0
                    for itemid_s,index_s in S:
                        dist = dis_mat[index_r][index_s]
                        min_dist = min(dist, min_dist)
                    fobj = (1 - k) * rel + k * min_dist
                    fobj_set.append((itemid_r, index_r,fobj))

                pair = max(fobj_set, key=lambda x: x[2])
                best = (pair[0],pair[1])
                R.remove(best)
                S.append(best)
            #print('single recommend time',time.clock()-start)
            return (uid,[x[0] for x in S])

        for k in k_list:
            print("lambda=%f" % (k))
            #start=time.clock()
            self.base_uid_to_recomm=copy.deepcopy(tmp)
            #print(self.base_uid_to_recomm[1],self.base_uid_to_recomm[2])
            #print('use time',time.clock()-start)
            start=time.clock()
            results = Parallel(n_jobs=n_jobs,verbose=100,pre_dispatch='all',batch_size=int(len(dataset.list_uid)/n_jobs))(
            delayed(MMR)(self,uid,k,dis_type) for uid in self.base_uid_to_recomm)
            print('use time',time.clock()-start)
            print('Baseline Performance')
            ndcg, avg_reward0,avg_reward1, agg_div, enp_div = self.measure(
                dict(results))
            result_list.append((k, ndcg, avg_reward1, agg_div, enp_div))
        pd.DataFrame(
             result_list,
             columns=[
                 "lambda", "ndcg", "avg_reward", "agg_div", "entropy_div"
             ]).to_csv(
                 "%s_baseline_result_"%(pref) + str(SEED) + "_eq%d_%s.csv"%(14 if dis_type==1 else 15,"normal" if NPOS_FLG==0 else "npos" ), index=False)


    def __init__(self, util):
        self.top_N = 10
        self.LIMIT = 100
        self.util = util
        self.dataset = util.dataset
        self.NUM_USERS = len(self.util.dataset.df_userinfo)
        self.NUM_ITEMS = len(self.util.dataset.df_iteminfo)
        self.beta = 0.0
        

In [45]:
util=RecommendSysUtil(dataset)
rec_sys=RecommendSys(util)

beta_list=[0.0]#,0.5,1.0,1.5,2.0,5.0]#[ float(x) for x in FLAGS.beta.split(',')]
max_iter=0 #int(FLAGS.iter)
result_list=[]

pref="ml"


for beta in list(beta_list):
    print('beta',beta)
    s1="%s_nov_distri"%(pref)
    if beta>=0.1 or beta == 0.0 :
         s2="%s_K_600_beta_%.1f_vald2"%(pref,beta)
    else:
         s2="%s_K_600_beta_%.3f_vald2"%(pref,beta)
    
    if NPOS_FLG==1:
        s1+="_npos"
        s2+="_npos"
    else:
        s1+="_normal"
        s2+="_normal"
        
    result,ndcg, reward1, agg_div, entro_div=rec_sys.train(
    s1,s2,beta=beta, predict_pair=[],MAX_ITERATIONS=max_iter)
    result_list.append((beta,ndcg,reward1,agg_div,entro_div))
pd.DataFrame(result_list,columns=["beta","ndcg","avg_reward","agg_div","entropy_div"]).to_csv("ml_newmethod_result_%s_%s.csv"%(str(SEED),"normal" if NPOS_FLG==0 else "npos"),index=False)
print(time.asctime())
rec_sys.run_baseline(pref,0)

beta 0.0
Calculating novelty distribution...
INFO:tensorflow:Restoring parameters from ./ml_K_600_beta_0.0_vald2_npos


INFO:tensorflow:Restoring parameters from ./ml_K_600_beta_0.0_vald2_npos


Iteration 0 Train_loss 0.8484484
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s finished




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 4324 out of 4324 | elapsed:   11.1s finished
Novelty: reward(β=0)=0.77613 reward(β=1)=0.44924
Diversity: aggdiv=0.01462 entropydiv=6.51331
Tue Nov 27 16:43:07 2018
Calculating novelty distribution...
INFO:tensorflow:Restoring parameters from ./ml_K_600_beta_0.0_vald2_npos


INFO:tensorflow:Restoring parameters from ./ml_K_600_beta_0.0_vald2_npos


Iteration 0 Train_loss 0.8484484
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 4324 out of 4324 | elapsed:   11.2s finished
Novelty: reward(β=0)=0.77613 reward(β=1)=0.44924
Diversity: aggdiv=0.01462 entropydiv=6.51331
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 