In [72]:
'''
@author: Jason.F
@date: 2019.07.09
@function: Implement NCF on the pinterest-20 dataset as a baseline, evaluated by hit ratio and NDCG
           https://arxiv.org/pdf/1708.05031.pdf
           https://github.com/hexiangnan/neural_collaborative_filtering
'''
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import heapq
import scipy.sparse as sp
import theano
import theano.tensor as T
import keras
from keras import backend 
from keras.regularizers import l1, l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout, Multiply, concatenate
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras import initializers
import heapq # for retrieval topK
import multiprocessing

#1.Loading the pinterest-20 dataset
def load_rating_file_as_list(filename):
    """Parse a tab-separated rating file into a list of ``[user, item]`` pairs.

    Only the first two columns of every line are used; both are converted
    with ``int``. Any further columns are ignored.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one ``[user, item]`` entry per line, in file order.
    """
    with open(filename, "r") as f:
        rows = (raw.split("\t") for raw in f)
        return [[int(cols[0]), int(cols[1])] for cols in rows]
    
def load_negative_file(filename):
    """Parse a tab-separated negatives file into per-line lists of item ids.

    The first column of every line is skipped; every remaining column is
    converted with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one list of ints per line, in file order.
    """
    with open(filename, "r") as f:
        return [[int(tok) for tok in raw.split("\t")[1:]] for raw in f]
    
def load_rating_file_as_matrix(filename):
    """Read a tab-separated rating file and return a binary DOK matrix.

    Every line must provide at least three columns: user id, item id and
    rating (extra columns are ignored). The matrix dimensions are derived
    from the data itself: ``(max user id + 1, max item id + 1)`` over all
    lines, including lines whose rating is not positive. The file does not
    need a header line.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``scipy.sparse.dok_matrix`` of dtype float32 in which entry
        ``[user, item]`` is 1.0 for every line with ``rating > 0``.
    """
    num_users, num_items = 0, 0
    positives = []
    # Single pass: track the largest ids and remember the positive pairs, so
    # the file does not have to be opened and parsed a second time.
    with open(filename, "r") as f:
        for line in f:
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            num_users = max(num_users, user)
            num_items = max(num_items, item)
            if rating > 0:
                positives.append((user, item))
    # Ids are used directly as indices, hence the +1 on both dimensions.
    mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
    for user, item in positives:
        mat[user, item] = 1.0
    return mat

# Load the pinterest-20 training interactions, the test (user, item) pairs and
# the per-test-row negative item lists.
trainMatrix = load_rating_file_as_matrix("/data/fjsdata/ctKngBase/ml/pinterest-20.train.rating")
testRatings = load_rating_file_as_list("/data/fjsdata/ctKngBase/ml/pinterest-20.test.rating")
testNegatives = load_negative_file("/data/fjsdata/ctKngBase/ml/pinterest-20.test.negative")
# Evaluation pairs testRatings[idx] with testNegatives[idx], so both files must
# have the same number of rows. Raise explicitly instead of using `assert`,
# which is removed when Python runs with -O.
if len(testRatings) != len(testNegatives):
    raise ValueError("test ratings (%d rows) and test negatives (%d rows) do not line up"
                     % (len(testRatings), len(testNegatives)))
#2.NCF Class
class NCF():
    """Two-branch neural collaborative filtering model plus top-K evaluation helpers.

    The class builds a Keras model that combines an element-wise-product
    (MF) branch with an MLP branch, samples negative training instances
    from an interaction matrix, and computes hit ratio / NDCG for ranked
    test items.
    """

    def __init__(self, K):
        """Store ``K``, the embedding size of the MF branch."""
        self.K = K  # num_factors

    def get_model(self, num_users, num_items, layers=(64, 32, 16, 8), reg_layers=(0, 0, 0, 0), reg_mf=0):
        """Build the (uncompiled) Keras model.

        Args:
            num_users: size of the user embedding tables (``input_dim``).
            num_items: size of the item embedding tables (``input_dim``).
            layers: MLP sizes. ``layers[0]`` is split in half between the
                user and the item MLP embeddings; ``layers[1:]`` are the
                unit counts of the ReLU Dense layers.
            reg_layers: L2 coefficients; ``reg_layers[0]`` applies to the
                MLP embeddings, ``reg_layers[idx]`` to Dense layer ``idx``.
            reg_mf: L2 coefficient of the MF embeddings.

        Returns:
            A ``keras.models.Model`` taking ``[user_input, item_input]`` and
            producing one sigmoid output.

        The defaults are tuples rather than lists so that no mutable object
        is shared between calls; lists are still accepted.
        """
        mf_dim = self.K  # num_factors
        num_layer = len(layers)  # Number of layers in the MLP
        # Input variables
        user_input = Input(shape=(1,), dtype='int32', name='user_input')
        item_input = Input(shape=(1,), dtype='int32', name='item_input')

        # Embedding layers (Keras 2 argument names are used throughout).
        MF_Embedding_User = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_embedding_user',
                                      embeddings_initializer='normal', embeddings_regularizer=l2(reg_mf), input_length=1)
        MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_embedding_item',
                                      embeddings_initializer='normal', embeddings_regularizer=l2(reg_mf), input_length=1)

        MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=int(layers[0]/2), name="mlp_embedding_user",
                                       embeddings_initializer='normal', embeddings_regularizer=l2(reg_layers[0]), input_length=1)
        MLP_Embedding_Item = Embedding(input_dim=num_items, output_dim=int(layers[0]/2), name='mlp_embedding_item',
                                       embeddings_initializer='normal', embeddings_regularizer=l2(reg_layers[0]), input_length=1)

        # MF part: element-wise product of the user and item embeddings.
        mf_user_latent = Flatten()(MF_Embedding_User(user_input))
        mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
        mf_vector = Multiply()([mf_user_latent, mf_item_latent])

        # MLP part: concatenated embeddings fed through the Dense stack.
        mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
        mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
        mlp_vector = concatenate([mlp_user_latent, mlp_item_latent], axis=-1)

        # layers[0] only sizes the embeddings, so the Dense stack starts at 1.
        for idx in range(1, num_layer):
            layer = Dense(layers[idx], kernel_regularizer=l2(reg_layers[idx]), activation='relu', name="layer%d" % idx)
            mlp_vector = layer(mlp_vector)

        # Concatenate MF and MLP parts
        predict_vector = concatenate([mf_vector, mlp_vector], axis=-1)

        # Final prediction layer
        prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(predict_vector)

        model = Model(inputs=[user_input, item_input], outputs=prediction)

        return model

    def get_train_instances(self, train, num_negatives):
        """Create training triples with sampled negatives.

        Args:
            train: interaction matrix supporting ``.shape``, ``.keys()``
                yielding ``(user, item)`` tuples and ``(user, item) in train``
                (the script passes a ``scipy.sparse.dok_matrix``).
            num_negatives: number of negative items drawn per stored pair.

        Returns:
            ``(user_input, item_input, labels)`` lists of equal length. Each
            stored pair contributes one instance labelled 1 followed by
            ``num_negatives`` instances labelled 0.

        Note:
            Negatives are re-drawn until an item not stored for the user is
            found, so a user with every item stored would never terminate.
        """
        user_input, item_input, labels = [], [], []
        # Take the item count from the matrix itself instead of relying on a
        # module-level `num_items` global being defined by the caller.
        num_items = train.shape[1]
        for (u, i) in train.keys():
            # positive instance
            user_input.append(u)
            item_input.append(i)
            labels.append(1)
            # negative instances
            for _ in range(num_negatives):
                j = np.random.randint(num_items)
                while (u, j) in train:
                    j = np.random.randint(num_items)
                user_input.append(u)
                item_input.append(j)
                labels.append(0)
        return user_input, item_input, labels

    def evaluate_model(self, model, testRatings, testNegatives, topK, num_thread):
        """Evaluate top-K Hit Ratio and NDCG for every test rating.

        Args:
            model: object exposing ``predict([users, items], batch_size, verbose)``.
            testRatings: list of ``[user, item]`` test pairs.
            testNegatives: list of negative item lists, aligned by index
                with ``testRatings``.
            topK: length of the ranked list used for both metrics.
            num_thread: when greater than 1, rows are evaluated through a
                ``multiprocessing.Pool`` of that many processes.

        Returns:
            ``(hits, ndcgs)``: two lists with one score per test rating.
        """
        self.model = model
        self.testRatings = testRatings
        self.testNegatives = testNegatives
        self.topK = topK

        if num_thread > 1:  # Multi-process
            # NOTE(review): mapping a bound method sends `self` (including
            # self.model) to the workers; confirm the model can be pickled
            # with the backend in use before relying on this path.
            pool = multiprocessing.Pool(processes=num_thread)
            try:
                res = pool.map(self.eval_one_rating, range(len(self.testRatings)))
            finally:
                # Always release the workers, even if a task raised.
                pool.close()
                pool.join()
            hits = [r[0] for r in res]
            ndcgs = [r[1] for r in res]
            return (hits, ndcgs)
        # Single thread
        hits, ndcgs = [], []
        for idx in range(len(self.testRatings)):
            (hr, ndcg) = self.eval_one_rating(idx)
            hits.append(hr)
            ndcgs.append(ndcg)
        return (hits, ndcgs)

    def eval_one_rating(self, idx):
        """Rank the test item of row ``idx`` against its negatives.

        Reads ``self.model``, ``self.testRatings``, ``self.testNegatives``
        and ``self.topK`` as set by ``evaluate_model``.

        Returns:
            ``(hr, ndcg)`` for this test row.
        """
        rating = self.testRatings[idx]
        u = rating[0]
        gtItem = rating[1]
        # Build a new candidate list instead of appending to (and later
        # popping from) the shared testNegatives entry, so the caller's data
        # is never modified, not even when predict() raises.
        items = list(self.testNegatives[idx]) + [gtItem]
        # Get prediction scores
        users = np.full(len(items), u, dtype='int32')
        predictions = self.model.predict([users, np.array(items)],
                                         batch_size=100, verbose=0)
        # Flatten so every item maps to a scalar score rather than to a
        # one-element array.
        scores = np.asarray(predictions).reshape(-1)
        map_item_score = dict(zip(items, scores))

        # Evaluate top rank list
        ranklist = heapq.nlargest(self.topK, map_item_score, key=map_item_score.get)
        hr = self.getHitRatio(ranklist, gtItem)
        ndcg = self.getNDCG(ranklist, gtItem)
        return (hr, ndcg)

    def getHitRatio(self, ranklist, gtItem):
        """Return 1 if ``gtItem`` occurs in ``ranklist``, otherwise 0."""
        return 1 if gtItem in ranklist else 0

    def getNDCG(self, ranklist, gtItem):
        """Return ``log(2) / log(position + 2)`` for the 0-based position of
        ``gtItem`` in ``ranklist``, or 0 when it is absent."""
        for position, item in enumerate(ranklist):
            if item == gtItem:
                return math.log(2) / math.log(position + 2)
        return 0
#3.Training and Evaluating
# Train one model per embedding size K and print HitRatio@10 / NDCG@10 after
# each round of training.
print ("%3s%20s%20s%20s" % ('K', 'NegNum', 'HitRatio', 'NDCG'))
# Embedding table sizes come from the training matrix and do not depend on K.
num_users, num_items = trainMatrix.shape
for K in [2,10,50,100]:#latent factors
    ncf = NCF(K=K)
    model = ncf.get_model(num_users, num_items)# Build model
    model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy')
    # NOTE: the model is built once per K and is not re-initialised between
    # NegNum values, so every row after the first reflects the training of
    # the previous rows as well.
    for NegNum in [4,10,20,50]:# negatives sampled per positive in this round
        user_input, item_input, labels = ncf.get_train_instances(trainMatrix, num_negatives=NegNum)# Generate training instances
        # `epochs` is the Keras 2 name of the legacy `nb_epoch` argument.
        hist = model.fit([np.array(user_input), np.array(item_input)], #input
                          np.array(labels), # labels 
                          batch_size=256, epochs=1, verbose=0, shuffle=True)
        (hits, ndcgs) = ncf.evaluate_model(model=model, testRatings=testRatings, \
                                           testNegatives=testNegatives, topK=10, num_thread=1)
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        print ("%3d%20d%20.6f%20.6f" % (K, NegNum, hr, ndcg))

  K              NegNum            HitRatio                NDCG




  2                   4            0.737293            0.421269
  2                  10            0.809085            0.487186
  2                  20            0.807437            0.493963
  2                  50            0.774295            0.470568




 10                   4            0.783917            0.470333
 10                  10            0.835432            0.514143
 10                  20            0.824524            0.513770
 10                  50            0.760560            0.453226




 50                   4            0.816225            0.495922
 50                  10            0.843532            0.525368
 50                  20            0.826318            0.522346
 50                  50            0.713375            0.419657




100                   4            0.825774            0.504620
100                  10            0.840615            0.525899
100                  20            0.826227            0.523378
100                  50            0.707649            0.409568


In [9]:
'''
@author: Jason.F
@date: 2019.07.09
@function: Implement NCF on the MovieLens dataset (ml-1m) as a baseline, evaluated by hit ratio and NDCG
           https://arxiv.org/pdf/1708.05031.pdf
           https://github.com/hexiangnan/neural_collaborative_filtering
'''
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import heapq
import scipy.sparse as sp

#1.Loading the MovieLens dataset, ml-1m
def load_rating_file_as_list(filename):
    """Parse a tab-separated rating file into a list of ``[user, item]`` pairs.

    Only the first two columns of every line are used; both are converted
    with ``int``. Any further columns are ignored.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one ``[user, item]`` entry per line, in file order.
    """
    with open(filename, "r") as f:
        rows = (raw.split("\t") for raw in f)
        return [[int(cols[0]), int(cols[1])] for cols in rows]
    
def load_negative_file(filename):
    """Parse a tab-separated negatives file into per-line lists of item ids.

    The first column of every line is skipped; every remaining column is
    converted with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one list of ints per line, in file order.
    """
    with open(filename, "r") as f:
        return [[int(tok) for tok in raw.split("\t")[1:]] for raw in f]
    
def load_rating_file_as_matrix(filename):
    """Read a tab-separated rating file and return a binary DOK matrix.

    Every line must provide at least three columns: user id, item id and
    rating (extra columns are ignored). The matrix dimensions are derived
    from the data itself: ``(max user id + 1, max item id + 1)`` over all
    lines, including lines whose rating is not positive. The file does not
    need a header line.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``scipy.sparse.dok_matrix`` of dtype float32 in which entry
        ``[user, item]`` is 1.0 for every line with ``rating > 0``.
    """
    num_users, num_items = 0, 0
    positives = []
    # Single pass: track the largest ids and remember the positive pairs, so
    # the file does not have to be opened and parsed a second time.
    with open(filename, "r") as f:
        for line in f:
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            num_users = max(num_users, user)
            num_items = max(num_items, item)
            if rating > 0:
                positives.append((user, item))
    # Ids are used directly as indices, hence the +1 on both dimensions.
    mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
    for user, item in positives:
        mat[user, item] = 1.0
    return mat

# Load the ml-1m training interactions, the test (user, item) pairs and the
# per-test-row negative item lists.
trainMatrix = load_rating_file_as_matrix("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating")
testRatings = load_rating_file_as_list("/data/fjsdata/ctKngBase/ml/ml-1m.test.rating")
testNegatives = load_negative_file("/data/fjsdata/ctKngBase/ml/ml-1m.test.negative")
# Evaluation pairs testRatings[idx] with testNegatives[idx], so both files must
# have the same number of rows. Raise explicitly instead of using `assert`,
# which is removed when Python runs with -O.
if len(testRatings) != len(testNegatives):
    raise ValueError("test ratings (%d rows) and test negatives (%d rows) do not line up"
                     % (len(testRatings), len(testNegatives)))

In [71]:
#2. NCF class
import theano
import theano.tensor as T
import keras
from keras import backend 
from keras.regularizers import l1, l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout, Multiply, concatenate
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras import initializers
import heapq # for retrieval topK
import multiprocessing
class NCF():
    """Two-branch neural collaborative filtering model plus top-K evaluation helpers.

    The class builds a Keras model that combines an element-wise-product
    (MF) branch with an MLP branch, samples negative training instances
    from an interaction matrix, and computes hit ratio / NDCG for ranked
    test items.
    """

    def __init__(self, K):
        """Store ``K``, the embedding size of the MF branch."""
        self.K = K  # num_factors

    def get_model(self, num_users, num_items, layers=(64, 32, 16, 8), reg_layers=(0, 0, 0, 0), reg_mf=0):
        """Build the (uncompiled) Keras model.

        Args:
            num_users: size of the user embedding tables (``input_dim``).
            num_items: size of the item embedding tables (``input_dim``).
            layers: MLP sizes. ``layers[0]`` is split in half between the
                user and the item MLP embeddings; ``layers[1:]`` are the
                unit counts of the ReLU Dense layers.
            reg_layers: L2 coefficients; ``reg_layers[0]`` applies to the
                MLP embeddings, ``reg_layers[idx]`` to Dense layer ``idx``.
            reg_mf: L2 coefficient of the MF embeddings.

        Returns:
            A ``keras.models.Model`` taking ``[user_input, item_input]`` and
            producing one sigmoid output.

        The defaults are tuples rather than lists so that no mutable object
        is shared between calls; lists are still accepted.
        """
        mf_dim = self.K  # num_factors
        num_layer = len(layers)  # Number of layers in the MLP
        # Input variables
        user_input = Input(shape=(1,), dtype='int32', name='user_input')
        item_input = Input(shape=(1,), dtype='int32', name='item_input')

        # Embedding layers (Keras 2 argument names are used throughout).
        MF_Embedding_User = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_embedding_user',
                                      embeddings_initializer='normal', embeddings_regularizer=l2(reg_mf), input_length=1)
        MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_embedding_item',
                                      embeddings_initializer='normal', embeddings_regularizer=l2(reg_mf), input_length=1)

        MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=int(layers[0]/2), name="mlp_embedding_user",
                                       embeddings_initializer='normal', embeddings_regularizer=l2(reg_layers[0]), input_length=1)
        MLP_Embedding_Item = Embedding(input_dim=num_items, output_dim=int(layers[0]/2), name='mlp_embedding_item',
                                       embeddings_initializer='normal', embeddings_regularizer=l2(reg_layers[0]), input_length=1)

        # MF part: element-wise product of the user and item embeddings.
        mf_user_latent = Flatten()(MF_Embedding_User(user_input))
        mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
        mf_vector = Multiply()([mf_user_latent, mf_item_latent])

        # MLP part: concatenated embeddings fed through the Dense stack.
        mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
        mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
        mlp_vector = concatenate([mlp_user_latent, mlp_item_latent], axis=-1)

        # layers[0] only sizes the embeddings, so the Dense stack starts at 1.
        for idx in range(1, num_layer):
            layer = Dense(layers[idx], kernel_regularizer=l2(reg_layers[idx]), activation='relu', name="layer%d" % idx)
            mlp_vector = layer(mlp_vector)

        # Concatenate MF and MLP parts
        predict_vector = concatenate([mf_vector, mlp_vector], axis=-1)

        # Final prediction layer
        prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(predict_vector)

        model = Model(inputs=[user_input, item_input], outputs=prediction)

        return model

    def get_train_instances(self, train, num_negatives):
        """Create training triples with sampled negatives.

        Args:
            train: interaction matrix supporting ``.shape``, ``.keys()``
                yielding ``(user, item)`` tuples and ``(user, item) in train``
                (the script passes a ``scipy.sparse.dok_matrix``).
            num_negatives: number of negative items drawn per stored pair.

        Returns:
            ``(user_input, item_input, labels)`` lists of equal length. Each
            stored pair contributes one instance labelled 1 followed by
            ``num_negatives`` instances labelled 0.

        Note:
            Negatives are re-drawn until an item not stored for the user is
            found, so a user with every item stored would never terminate.
        """
        user_input, item_input, labels = [], [], []
        # Take the item count from the matrix itself instead of relying on a
        # module-level `num_items` global being defined by the caller.
        num_items = train.shape[1]
        for (u, i) in train.keys():
            # positive instance
            user_input.append(u)
            item_input.append(i)
            labels.append(1)
            # negative instances
            for _ in range(num_negatives):
                j = np.random.randint(num_items)
                while (u, j) in train:
                    j = np.random.randint(num_items)
                user_input.append(u)
                item_input.append(j)
                labels.append(0)
        return user_input, item_input, labels

    def evaluate_model(self, model, testRatings, testNegatives, topK, num_thread):
        """Evaluate top-K Hit Ratio and NDCG for every test rating.

        Args:
            model: object exposing ``predict([users, items], batch_size, verbose)``.
            testRatings: list of ``[user, item]`` test pairs.
            testNegatives: list of negative item lists, aligned by index
                with ``testRatings``.
            topK: length of the ranked list used for both metrics.
            num_thread: when greater than 1, rows are evaluated through a
                ``multiprocessing.Pool`` of that many processes.

        Returns:
            ``(hits, ndcgs)``: two lists with one score per test rating.
        """
        self.model = model
        self.testRatings = testRatings
        self.testNegatives = testNegatives
        self.topK = topK

        if num_thread > 1:  # Multi-process
            # NOTE(review): mapping a bound method sends `self` (including
            # self.model) to the workers; confirm the model can be pickled
            # with the backend in use before relying on this path.
            pool = multiprocessing.Pool(processes=num_thread)
            try:
                res = pool.map(self.eval_one_rating, range(len(self.testRatings)))
            finally:
                # Always release the workers, even if a task raised.
                pool.close()
                pool.join()
            hits = [r[0] for r in res]
            ndcgs = [r[1] for r in res]
            return (hits, ndcgs)
        # Single thread
        hits, ndcgs = [], []
        for idx in range(len(self.testRatings)):
            (hr, ndcg) = self.eval_one_rating(idx)
            hits.append(hr)
            ndcgs.append(ndcg)
        return (hits, ndcgs)

    def eval_one_rating(self, idx):
        """Rank the test item of row ``idx`` against its negatives.

        Reads ``self.model``, ``self.testRatings``, ``self.testNegatives``
        and ``self.topK`` as set by ``evaluate_model``.

        Returns:
            ``(hr, ndcg)`` for this test row.
        """
        rating = self.testRatings[idx]
        u = rating[0]
        gtItem = rating[1]
        # Build a new candidate list instead of appending to (and later
        # popping from) the shared testNegatives entry, so the caller's data
        # is never modified, not even when predict() raises.
        items = list(self.testNegatives[idx]) + [gtItem]
        # Get prediction scores
        users = np.full(len(items), u, dtype='int32')
        predictions = self.model.predict([users, np.array(items)],
                                         batch_size=100, verbose=0)
        # Flatten so every item maps to a scalar score rather than to a
        # one-element array.
        scores = np.asarray(predictions).reshape(-1)
        map_item_score = dict(zip(items, scores))

        # Evaluate top rank list
        ranklist = heapq.nlargest(self.topK, map_item_score, key=map_item_score.get)
        hr = self.getHitRatio(ranklist, gtItem)
        ndcg = self.getNDCG(ranklist, gtItem)
        return (hr, ndcg)

    def getHitRatio(self, ranklist, gtItem):
        """Return 1 if ``gtItem`` occurs in ``ranklist``, otherwise 0."""
        return 1 if gtItem in ranklist else 0

    def getNDCG(self, ranklist, gtItem):
        """Return ``log(2) / log(position + 2)`` for the 0-based position of
        ``gtItem`` in ``ranklist``, or 0 when it is absent."""
        for position, item in enumerate(ranklist):
            if item == gtItem:
                return math.log(2) / math.log(position + 2)
        return 0
#3.Training and Evaluating
# Train one model per embedding size K and print HitRatio@10 / NDCG@10 after
# each round of training.
print ("%3s%20s%20s%20s" % ('K', 'NegNum', 'HitRatio', 'NDCG'))
# Embedding table sizes come from the training matrix and do not depend on K.
num_users, num_items = trainMatrix.shape
for K in [2,10,50,100]:#latent factors
    ncf = NCF(K=K)
    model = ncf.get_model(num_users, num_items)# Build model
    model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy')
    # NOTE: the model is built once per K and is not re-initialised between
    # NegNum values, so every row after the first reflects the training of
    # the previous rows as well.
    for NegNum in [4,10,20,50]:# negatives sampled per positive in this round
        user_input, item_input, labels = ncf.get_train_instances(trainMatrix, num_negatives=NegNum)# Generate training instances
        # `epochs` is the Keras 2 name of the legacy `nb_epoch` argument.
        hist = model.fit([np.array(user_input), np.array(item_input)], #input
                          np.array(labels), # labels 
                          batch_size=256, epochs=1, verbose=0, shuffle=True)
        (hits, ndcgs) = ncf.evaluate_model(model=model, testRatings=testRatings, \
                                           testNegatives=testNegatives, topK=10, num_thread=1)
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        print ("%3d%20d%20.6f%20.6f" % (K, NegNum, hr, ndcg))

  K              NegNum            HitRatio                NDCG




  2                   4            0.544702            0.306534
  2                  10            0.614238            0.353439
  2                  20            0.611093            0.354414
  2                  50            0.582781            0.333956




 10                   4            0.601987            0.340144
 10                  10            0.645530            0.376645
 10                  20            0.638742            0.374999
 10                  50            0.561755            0.326713




 50                   4            0.629636            0.366642
 50                  10            0.680132            0.405432
 50                  20            0.665397            0.400873
 50                  50            0.576325            0.329987




100                   4            0.650166            0.374791
100                  10            0.672682            0.405173
100                  20            0.662417            0.398848
100                  50            0.582285            0.339298
