In [1]:
#适用版本paddlepaddle1.6.0

import paddle 
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph import FC, Conv2D, Embedding, Pool2D
import numpy as np
import random
from PIL import Image


class MovieLen(object):
    """Load the MovieLens-1M files and serve them as fixed-size mini-batches.

    Construction reads the user, movie and rating files under ``./ml-1m/``,
    joins them into a flat list of samples and splits that list 90% / 10%
    (in file order, without shuffling) into ``train_dataset`` and
    ``valid_dataset``.

    Args:
        use_poster: when true, ratings are read from ``new_rating.txt``
            instead of ``ratings.dat`` and ``load_data`` also loads one
            64x64 poster image per sample.
    """

    def __init__(self, use_poster):
        self.use_poster = use_poster
        # Locations of the raw data files.
        usr_info_path = "./ml-1m/users.dat"
        if not use_poster:
            rating_path = "./ml-1m/ratings.dat"
        else:
            rating_path = "./ml-1m/new_rating.txt"

        movie_info_path = "./ml-1m/movies.dat"
        self.poster_path = "./ml-1m/posters/"
        # Movie records plus the category / title-word vocabularies.
        self.movie_info, self.movie_cat, self.movie_title = self.get_movie_info(movie_info_path)
        # Largest vocabulary indices and largest movie ID.
        self.max_mov_cat = np.max(list(self.movie_cat.values()))
        self.max_mov_tit = np.max(list(self.movie_title.values()))
        self.max_mov_id = np.max(list(map(int, self.movie_info.keys())))
        # Largest user attribute values; updated by get_usr_info().
        self.max_usr_id = 0
        self.max_usr_age = 0
        self.max_usr_job = 0
        # User records.
        self.usr_info = self.get_usr_info(usr_info_path)
        # Ratings, grouped per user.
        self.rating_info = self.get_rating_info(rating_path)
        # One sample per (user, movie, score) rating.
        self.dataset = self.get_dataset(usr_info=self.usr_info,
                                        rating_info=self.rating_info,
                                        movie_info=self.movie_info)
        # Split the samples: first 90% for training, the rest for validation.
        split_at = int(len(self.dataset) * 0.9)
        self.train_dataset = self.dataset[:split_at]
        self.valid_dataset = self.dataset[split_at:]
        print("##Total dataset instances: ", len(self.dataset))
        print("##MovieLens dataset information: \nusr num: {}\n"
              "movies num: {}".format(len(self.usr_info), len(self.movie_info)))

    def get_movie_info(self, path):
        """Parse the movies file.

        Each line is ``id::title (year)::cat1|cat2|...``. Title words and
        categories are mapped to 1-based indices in order of first appearance;
        index 0 is reserved for padding.

        Returns:
            ``(movie_info, movie_cat, movie_titles)`` where ``movie_info`` maps
            the movie ID string to a dict with keys ``mov_id``, ``title``
            (word indices zero-padded to at least 15), ``category`` (indices
            zero-padded to at least 6) and ``years``; the other two are the
            category and title-word vocabularies.
        """
        with open(path, 'r', encoding="ISO-8859-1") as f:
            data = f.readlines()
        movie_info, movie_titles, movie_cat = {}, {}, {}
        # Next free vocabulary index for title words / categories.
        t_count, c_count = 1, 1

        for line in data:
            item = line.strip().split("::")
            v_id = item[0]
            # The title field ends with " (YYYY)": 7 characters.
            v_title = item[1][:-7]
            v_year = item[1][-5:-1]
            cats = item[2].split('|')
            titles = v_title.split()

            for t in titles:
                if t not in movie_titles:
                    movie_titles[t] = t_count
                    t_count += 1
            for cat in cats:
                if cat not in movie_cat:
                    movie_cat[cat] = c_count
                    c_count += 1

            # Zero-pad to the fixed lengths load_data() reshapes to.
            v_tit = [movie_titles[k] for k in titles]
            v_tit += [0] * (15 - len(v_tit))
            v_cat = [movie_cat[k] for k in cats]
            v_cat += [0] * (6 - len(v_cat))

            movie_info[v_id] = {'mov_id': int(v_id),
                                'title': v_tit,
                                'category': v_cat,
                                'years': int(v_year)}
        return movie_info, movie_cat, movie_titles

    def get_usr_info(self, path):
        """Parse the users file (``id::gender::age::job::...`` per line).

        Also raises ``self.max_usr_id`` / ``max_usr_age`` / ``max_usr_job`` to
        the largest values seen.

        Returns:
            dict mapping the user ID string to a dict with keys ``usr_id``,
            ``gender`` (1 for ``'F'``, 0 otherwise), ``age`` and ``job``.
        """
        def gender2num(gender):
            return 1 if gender == 'F' else 0

        with open(path, 'r') as f:
            data = f.readlines()

        use_info = {}
        for line in data:
            item = line.strip().split("::")
            usr_id = item[0]
            age, job = int(item[2]), int(item[3])
            use_info[usr_id] = {'usr_id': int(usr_id),
                                'gender': gender2num(item[1]),
                                'age': age,
                                'job': job}
            self.max_usr_id = max(self.max_usr_id, int(usr_id))
            self.max_usr_age = max(self.max_usr_age, age)
            self.max_usr_job = max(self.max_usr_job, job)
        return use_info

    def get_rating_info(self, path):
        """Parse the ratings file (``user::movie::score::...`` per line).

        Returns:
            nested dict ``{user_id: {movie_id: float(score)}}`` keyed by the
            ID strings as they appear in the file.
        """
        with open(path, 'r') as f:
            data = f.readlines()

        rating_info = {}
        for line in data:
            item = line.strip().split("::")
            usr_id, movie_id, score = item[0], item[1], item[2]
            rating_info.setdefault(usr_id, {})[movie_id] = float(score)
        return rating_info

    def get_dataset(self, usr_info, rating_info, movie_info):
        """Join ratings with user and movie records into a flat sample list.

        Each sample is ``{'usr_info': ..., 'mov_info': ..., 'scores': ...}``.
        A rating whose user or movie ID is missing from the corresponding
        dict raises ``KeyError``.
        """
        return [{'usr_info': usr_info[usr_id],
                 'mov_info': movie_info[movie_id],
                 'scores': score}
                for usr_id, usr_ratings in rating_info.items()
                for movie_id, score in usr_ratings.items()]

    def load_data(self, dataset=None, mode='train', batch_size=256):
        """Build a batch generator function over ``dataset``.

        Args:
            dataset: list of samples as produced by ``get_dataset``.
            mode: ``'train'`` shuffles the sample order at the start of every
                pass; any other value keeps the original order.
            batch_size: number of samples per batch (default 256).

        Returns:
            A zero-argument function; calling it returns a generator yielding
            ``([usr_id, usr_gender, usr_age, usr_job],
            [mov_id, mov_cat, mov_tit, mov_poster], scores)``. Only full
            batches are produced: a trailing partial batch is dropped.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")

        data_length = len(dataset)
        index_list = list(range(data_length))

        def load_poster(mov_id):
            # PIL's Image.size is always (width, height), so the former
            # `len(poster.size) <= 2` guard was always true; every poster is
            # resized and then converted to RGB.
            poster = Image.open(self.poster_path + 'mov_id{}.jpg'.format(str(mov_id)))
            return np.array(poster.resize([64, 64]).convert("RGB"))

        def column(values):
            # (batch,) -> (batch, 1)
            return np.expand_dims(np.array(values), axis=-1)

        def data_generator():
            if mode == 'train':
                random.shuffle(index_list)

            for start in range(0, data_length - batch_size + 1, batch_size):
                samples = [dataset[i] for i in index_list[start:start + batch_size]]
                usrs = [s['usr_info'] for s in samples]
                movs = [s['mov_info'] for s in samples]

                usr_id_arr = column([u['usr_id'] for u in usrs])
                usr_gender_arr = column([u['gender'] for u in usrs])
                usr_age_arr = column([u['age'] for u in usrs])
                usr_job_arr = column([u['job'] for u in usrs])

                mov_id_arr = column([m['mov_id'] for m in movs])
                mov_cat_arr = np.reshape(np.array([m['category'] for m in movs]),
                                         [batch_size, 1, 6, 1]).astype(np.int64)
                mov_tit_arr = np.reshape(np.array([m['title'] for m in movs]),
                                         [batch_size, 1, 15, 1]).astype(np.int64)

                if self.use_poster:
                    # Posters are only read when poster features are enabled.
                    posters = np.array([load_poster(m['mov_id']) for m in movs])
                    # Scale pixels to [-1, 1] and move channels first
                    # (B, H, W, C) -> (B, C, H, W). A plain reshape would
                    # scramble pixels across channels instead of reordering
                    # the axes.
                    mov_poster_arr = (posters / 127.5 - 1).transpose(0, 3, 1, 2).astype(np.float32)
                else:
                    # Placeholder so the batch tuple keeps the same layout.
                    mov_poster_arr = np.array([0.])

                scores_arr = np.reshape(np.array([int(s['scores']) for s in samples]),
                                        [-1, 1]).astype(np.float32)

                yield [usr_id_arr, usr_gender_arr, usr_age_arr, usr_job_arr], \
                      [mov_id_arr, mov_cat_arr, mov_tit_arr, mov_poster_arr], scores_arr

        return data_generator

class Model(dygraph.layers.Layer):
    """Rating predictor that compares a user feature with a movie feature.

    User attributes and movie attributes are each embedded and reduced to a
    200-unit feature by separate layers; the predicted score is the cosine
    similarity of the two features scaled by 5.

    Constructing the model also constructs a ``MovieLen`` dataset and stores
    its train / validation batch generators on the instance.
    """
    def __init__(self, name_scope, use_poster, use_mov_title, use_mov_cat, use_age_job):
        super(Model, self).__init__(name_scope)
        name = self.full_name()
        
        # Keep the boolean feature switches on the instance.
        self.use_mov_poster = use_poster
        self.use_mov_title = use_mov_title
        self.use_usr_age_job = use_age_job
        self.use_mov_cat = use_mov_cat
        
        # Load the dataset and build the train / validation batch generators.
        Dataset = MovieLen(self.use_mov_poster)
        self.Dataset = Dataset
        self.trainset = self.Dataset.train_dataset
        self.valset = self.Dataset.valid_dataset
        self.train_loader = self.Dataset.load_data(dataset=self.trainset, mode='train')
        self.valid_loader = self.Dataset.load_data(dataset=self.valset, mode='valid')

        """ define network layer for embedding usr info """
        USR_ID_NUM = Dataset.max_usr_id + 1
        # User ID: embedding followed by an FC layer.
        self.usr_emb = Embedding(name, [USR_ID_NUM, 32], is_sparse=False)
        self.usr_fc = FC(name, size=32)
        
        # User gender: embedding followed by an FC layer.
        USR_GENDER_DICT_SIZE = 2
        self.usr_gender_emb = Embedding(name, [USR_GENDER_DICT_SIZE, 16])
        self.usr_gender_fc = FC(name, 16)
        
        # User age: embedding followed by an FC layer.
        USR_AGE_DICT_SIZE = Dataset.max_usr_age + 1
        self.usr_age_emb = Embedding(name, [USR_AGE_DICT_SIZE, 16])
        self.usr_age_fc = FC(name, 16)
        
        # User job: embedding followed by an FC layer.
        USR_JOB_DICT_SIZE = Dataset.max_usr_job + 1
        self.usr_job_emb = Embedding(name, [USR_JOB_DICT_SIZE, 16])
        self.usr_job_fc = FC(name, 16)
        
        # FC layer that merges all user features.
        self.usr_combined = FC(name, 200, act='tanh')
        
        # NOTE: the string below says "usr info", but the layers that follow
        # are the movie-side layers.
        """ define network layer for embedding usr info """
        # Movie ID: embedding followed by an FC layer.
        MOV_DICT_SIZE = Dataset.max_mov_id + 1
        self.mov_emb = Embedding(name, [MOV_DICT_SIZE, 32])
        self.mov_fc = FC(name, 32)
        
        # Movie categories: embedding followed by an FC layer.
        CATEGORY_DICT_SIZE = len(Dataset.movie_cat) + 1
        self.mov_cat_emb = Embedding(name, [CATEGORY_DICT_SIZE, 32], is_sparse=False)
        self.mov_cat_fc = FC(name, 32)
        
        # Movie title words: embedding followed by two convolutions.
        MOV_TITLE_DICT_SIZE = len(Dataset.movie_title) + 1
        self.mov_title_emb = Embedding(name, [MOV_TITLE_DICT_SIZE, 32], is_sparse=False)
        self.mov_title_conv = Conv2D(name, 1, filter_size=(3, 1), stride=(2,1), padding=0, act='relu')
        self.mov_title_conv2 = Conv2D(name, 1, filter_size=(3, 1), stride=1, padding=0, act='relu')

        # Movie poster: conv -> max-pool -> conv -> max-pool -> FC.
        self.mov_poster_conv= Conv2D(name,50,filter_size=10,stride=(1,1),padding=0,act='relu' )
        self.mov_poster_pool= Pool2D(name,pool_size=5,pool_type='max', pool_stride=1,pool_padding=0)
        self.mov_poster_conv2=Conv2D(name,50,filter_size=3,stride=1,padding=0,act='relu' )
        self.mov_poster_pool2= Pool2D(name,pool_size=2,pool_type='max', pool_stride=1,pool_padding=0)
        self.mov_poster_fc = FC(name, 32)
        # FC layer that merges all movie features.
        self.mov_concat_embed = FC(name, size=200, act='tanh')
        
    # Forward computation of the user feature.
    def get_usr_feat(self, usr_var):
        """Compute the combined user feature.

        ``usr_var`` is unpacked as ``(usr_id, usr_gender, usr_age, usr_job)``.
        ID and gender are always used; age and job only when
        ``self.use_usr_age_job`` is set. Returns the output of
        ``self.usr_combined`` applied to the concatenated features.
        """
        # Unpack the user inputs.
        usr_id, usr_gender, usr_age, usr_job = usr_var
        # User ID: embedding -> FC -> ReLU, collected in feats_collect.
        feats_collect = []
        usr_id = self.usr_emb(usr_id)
        usr_id = self.usr_fc(usr_id)
        usr_id = fluid.layers.relu(usr_id)
        feats_collect.append(usr_id)
        
        # Gender: embedding -> FC -> ReLU.
        usr_gender = self.usr_gender_emb(usr_gender)
        usr_gender = self.usr_gender_fc(usr_gender)
        usr_gender = fluid.layers.relu(usr_gender)
        feats_collect.append(usr_gender)
        # Age and job features are optional.
        if self.use_usr_age_job:
            # Age: embedding -> FC -> ReLU.
            usr_age = self.usr_age_emb(usr_age)
            usr_age = self.usr_age_fc(usr_age)
            usr_age = fluid.layers.relu(usr_age)
            feats_collect.append(usr_age)
            # Job: embedding -> FC -> ReLU.
            usr_job = self.usr_job_emb(usr_job)
            usr_job = self.usr_job_fc(usr_job)
            usr_job = fluid.layers.relu(usr_job)
            feats_collect.append(usr_job)
        
        # Concatenate along axis 1 and merge with the final FC layer.
        usr_feat = fluid.layers.concat(feats_collect, axis=1)
        usr_feat = self.usr_combined(usr_feat)
        return usr_feat

        # Forward computation of the movie feature.
    def get_mov_feat(self, mov_var):
        """Compute the combined movie feature.

        ``mov_var`` is unpacked as ``(mov_id, mov_cat, mov_title, mov_poster)``.
        The ID is always used; category, title and poster features are added
        only when the corresponding ``use_*`` switch is set. ``mov_poster`` is
        only touched when ``self.use_mov_poster`` is set.
        """
        # Unpack the movie inputs.
        mov_id, mov_cat, mov_title, mov_poster = mov_var
        feats_collect = []
        # Batch size, read from the first dimension of the ID input.
        batch_size = mov_id.shape[0]
        # Movie ID: embedding -> FC -> ReLU.
        mov_id = self.mov_emb(mov_id)
        mov_id = self.mov_fc(mov_id)
        mov_id = fluid.layers.relu(mov_id)
        feats_collect.append(mov_id)
        
        # Optional category feature.
        if self.use_mov_cat:
            # Embed the category indices, sum over dim 1, then apply an FC
            # layer (no ReLU on this branch).
            mov_cat = self.mov_cat_emb(mov_cat)
            mov_cat = fluid.layers.reduce_sum(mov_cat, dim=1, keep_dim=False)

            mov_cat = self.mov_cat_fc(mov_cat)
            feats_collect.append(mov_cat)

        if self.use_mov_title:
            # Embed the title word indices, apply the two convolutions, sum
            # over dim 2, apply ReLU and flatten to (batch_size, -1).
            mov_title = self.mov_title_emb(mov_title)
            mov_title = self.mov_title_conv2(self.mov_title_conv(mov_title))
            mov_title = fluid.layers.reduce_sum(mov_title, dim=2, keep_dim=False)
            mov_title = fluid.layers.relu(mov_title)
            mov_title = fluid.layers.reshape(mov_title, [batch_size, -1])
            feats_collect.append(mov_title)

        if self.use_mov_poster:

            # Poster image: conv -> pool -> conv -> pool -> FC.
            mov_poster=self.mov_poster_conv(mov_poster)
            mov_poster=self.mov_poster_pool(mov_poster)
            mov_poster=self.mov_poster_conv2(mov_poster)
            mov_poster=self.mov_poster_pool2(mov_poster)
            mov_poster = self.mov_poster_fc(mov_poster)
            feats_collect.append(mov_poster)

        # Concatenate along axis 1 and merge with an FC layer of size 200.
        mov_feat = fluid.layers.concat(feats_collect, axis=1)
        mov_feat = self.mov_concat_embed(mov_feat)
        return mov_feat
    
    # Forward computation of the recommender.
    def forward(self, usr_var, mov_var):
        """Return ``(usr_feat, mov_feat, res)`` where ``res`` is 5 * cosine similarity."""
        # Compute the user and movie features.
        usr_feat = self.get_usr_feat(usr_var)
        mov_feat = self.get_mov_feat(mov_var)
        # Similarity between the two features.
        res = fluid.layers.cos_sim(usr_feat, mov_feat)
        # Scale the similarity by 5 to bring it towards the rating range.
        res = fluid.layers.scale(res, scale=5)
        return usr_feat, mov_feat, res

In [17]:
def train(model):
    """Train ``model`` for 10 epochs on CPU with Adam (learning rate 0.01).

    Batches come from ``model.train_loader``. The loss is the mean squared
    error between predicted and labelled scores; it is printed every 500
    batches, and the model state is saved to ``./checkpoint/epoch<N>`` at the
    end of every epoch.
    """
    # Training configuration.
    use_gpu = False
    lr = 0.01
    num_epochs = 10

    device = fluid.CPUPlace() if not use_gpu else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(device):
        # Switch the model to training mode.
        model.train()
        # Batch generator function for the training split.
        reader = model.train_loader
        optimizer = fluid.optimizer.Adam(learning_rate=lr)

        for epoch in range(num_epochs):
            for batch_id, (usr, mov, score) in enumerate(reader()):
                # Convert the numpy batch to dygraph variables: everything is
                # int64 except the last movie input (poster), which is float32.
                usr_v = [dygraph.to_variable(np.int64(arr)) for arr in usr]
                mov_v = [dygraph.to_variable(np.int64(arr)) for arr in mov[:-1]]
                mov_v.append(dygraph.to_variable(np.float32(mov[-1])))
                scores_label = dygraph.to_variable(score)

                # Forward pass; only the predicted score is needed here.
                scores_predict = model(usr_v, mov_v)[2]

                # Mean squared error over the batch.
                avg_loss = fluid.layers.mean(
                    fluid.layers.square_error_cost(scores_predict, scores_label))
                if batch_id % 500 == 0:
                    print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, avg_loss.numpy()))

                # Backward pass, parameter update, then reset the gradients.
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                model.clear_gradients()

            # Save a checkpoint after every epoch.
            fluid.save_dygraph(model.state_dict(), './checkpoint/epoch' + str(epoch))

In [4]:
def evaluation(model, params_file_path):
    """Evaluate a saved checkpoint on the validation batches.

    Loads the parameters at ``params_file_path`` into ``model`` and runs every
    batch from ``model.valid_loader``.

    Returns:
        ``(acc, mae)``: both are means over the batches. ``mae`` is the mean
        absolute error. ``acc`` is ``1 - mean(err)`` where every absolute
        error above 0.5 is replaced by 1 and smaller errors keep their value.
    """
    use_gpu = False
    device = fluid.CPUPlace() if not use_gpu else fluid.CUDAPlace(0)

    with fluid.dygraph.guard(device):
        state, _ = fluid.load_dygraph(params_file_path)
        model.load_dict(state)
        model.eval()

        accuracies, maes = [], []
        for usr, mov, score_label in model.valid_loader():
            # int64 inputs, except the last movie input which is float32.
            usr_v = [dygraph.to_variable(np.int64(arr)) for arr in usr]
            mov_v = [dygraph.to_variable(np.int64(arr)) for arr in mov[:-1]]
            mov_v.append(dygraph.to_variable(np.float32(mov[-1])))

            scores_predict = model(usr_v, mov_v)[2]

            abs_err = np.abs(scores_predict.numpy() - score_label)
            # Record the MAE before the errors are clipped in place below.
            maes.append(np.mean(abs_err))

            abs_err[abs_err > 0.5] = 1
            accuracies.append(1 - np.mean(abs_err))
        return np.mean(accuracies), np.mean(maes)

In [6]:
# Start training with every feature except the poster.
with dygraph.guard():
    use_poster, use_mov_title, use_mov_cat, use_age_job = False, True, True, True
    model = Model('Recommend', use_poster, use_mov_title, use_mov_cat, use_age_job)
    train(model)

##Total dataset instances:  1000209
##MovieLens dataset information: 
usr num: 6040
movies num: 3883
epoch: 0, batch_id: 0, loss is: [15.340841]
epoch: 0, batch_id: 500, loss is: [1.0508535]
epoch: 0, batch_id: 1000, loss is: [1.1642591]
epoch: 0, batch_id: 1500, loss is: [1.1351136]
epoch: 0, batch_id: 2000, loss is: [0.96807575]
epoch: 0, batch_id: 2500, loss is: [0.9790999]
epoch: 0, batch_id: 3000, loss is: [1.0027442]
epoch: 0, batch_id: 3500, loss is: [0.9665835]
epoch: 1, batch_id: 0, loss is: [0.9519994]
epoch: 1, batch_id: 500, loss is: [1.0032053]
epoch: 1, batch_id: 1000, loss is: [1.039664]
epoch: 1, batch_id: 1500, loss is: [1.216085]
epoch: 1, batch_id: 2000, loss is: [0.95938915]
epoch: 1, batch_id: 2500, loss is: [1.0065184]
epoch: 1, batch_id: 3000, loss is: [1.0903273]
epoch: 1, batch_id: 3500, loss is: [0.92516017]
epoch: 2, batch_id: 0, loss is: [0.9524243]
epoch: 2, batch_id: 500, loss is: [1.032253]
epoch: 2, batch_id: 1000, loss is: [0.92622983]
epoch: 2, batch_i

In [7]:
# Evaluate the checkpoint saved after each of the 10 training epochs.
param_path = "./checkpoint/epoch"
for i in range(10):
    acc, mae = evaluation(model, param_path+str(i))
    print("ACC:", acc, "MAE:", mae)

ACC: 0.26040655542642643 MAE: 0.85908407
ACC: 0.2693785063731365 MAE: 0.84108776
ACC: 0.2783773402372996 MAE: 0.83288324
ACC: 0.27654058077396493 MAE: 0.82881063
ACC: 0.2784864398149344 MAE: 0.82926965
ACC: 0.2767193344923166 MAE: 0.82750934
ACC: 0.28125518262386323 MAE: 0.8246155
ACC: 0.2790882761661823 MAE: 0.8232499
ACC: 0.2773896784354479 MAE: 0.8260784
ACC: 0.2735202075579228 MAE: 0.8306713


In [None]:
# Train again with every feature enabled, including the poster.
with dygraph.guard():
    use_poster, use_mov_title, use_mov_cat, use_age_job = True, True, True, True
    model = Model('Recommend', use_poster, use_mov_title, use_mov_cat, use_age_job)
    train(model)

##Total dataset instances:  382499
##MovieLens dataset information: 
usr num: 6040
movies num: 3883
epoch: 0, batch_id: 0, loss is: [17.017052]


In [8]:
from PIL import Image
# 加载第三方库Pickle，用来保存Python数据到本地
import pickle
# 定义特征保存函数
def get_usr_mov_features(model, params_file_path, poster_path):
    """Compute and save one feature per user and per movie in the dataset.

    Loads the checkpoint at ``params_file_path`` into ``model``, walks over
    ``model.Dataset.dataset`` and, for every user / movie seen for the first
    time, stores the output of ``model.get_usr_feat`` / ``model.get_mov_feat``
    as a numpy array. The two dicts (keyed by ID string) are pickled to
    ``./usr_feat.pkl`` and ``./mov_feat.pkl``.

    Args:
        model: the trained ``Model`` instance.
        params_file_path: checkpoint path passed to ``fluid.load_dygraph``.
        poster_path: accepted for interface compatibility; not used here.
    """
    use_gpu = False
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    usr_pkl = {}
    mov_pkl = {}

    def list2variable(inputs, shape):
        # Convert a scalar / list to an int64 dygraph variable of `shape`.
        inputs = np.reshape(np.array(inputs).astype(np.int64), shape)
        return fluid.dygraph.to_variable(inputs)

    with fluid.dygraph.guard(place):
        # Load the trained parameters and switch to evaluation mode.
        model_state_dict, _ = fluid.load_dygraph(params_file_path)
        model.load_dict(model_state_dict)
        model.eval()

        # Only users and movies that appear in a rating sample are exported.
        for sample in model.Dataset.dataset:
            usr_info, mov_info = sample['usr_info'], sample['mov_info']
            usrid = str(usr_info['usr_id'])
            movid = str(mov_info['mov_id'])

            if usrid not in usr_pkl:
                usr_id_v = list2variable(usr_info['usr_id'], [1, 1])
                usr_age_v = list2variable(usr_info['age'], [1, 1])
                usr_gender_v = list2variable(usr_info['gender'], [1, 1])
                usr_job_v = list2variable(usr_info['job'], [1, 1])

                usr_in = [usr_id_v, usr_gender_v, usr_age_v, usr_job_v]
                usr_pkl[usrid] = model.get_usr_feat(usr_in).numpy()

            if movid not in mov_pkl:
                mov_id_v = list2variable(mov_info['mov_id'], [1, 1])
                mov_tit_v = list2variable(mov_info['title'], [1, 1, 15, 1])
                mov_cat_v = list2variable(mov_info['category'], [1, 1, 6, 1])

                # NOTE(review): no poster is supplied, so this presumably only
                # works for a model built with use_poster=False — confirm.
                mov_in = [mov_id_v, mov_cat_v, mov_tit_v, None]
                mov_pkl[movid] = model.get_mov_feat(mov_in).numpy()

    print(len(mov_pkl))
    # Write the features to disk; `with` guarantees the files are flushed and
    # closed even if pickling fails.
    with open('./usr_feat.pkl', 'wb') as f:
        pickle.dump(usr_pkl, f)
    with open('./mov_feat.pkl', 'wb') as f:
        pickle.dump(mov_pkl, f)
    print("usr / mov features saved!!!")

        
# Export the user / movie features using the checkpoint saved after epoch 7.
param_path = "./checkpoint/epoch7"
poster_path = "./ml-1m/posters/"
get_usr_mov_features(model, param_path, poster_path) 

3706
usr / mov features saved!!!


In [10]:
#第一个推荐系统，即课程中的那个
# 定义根据用户兴趣推荐电影
def recommend_mov_for_usr(usr_id, top_k, pick_num, usr_feat_dir, mov_feat_dir, mov_info_path):
    """Print ``pick_num`` movies drawn at random from the user's ``top_k`` matches.

    Movies are ranked by cosine similarity between the saved user feature and
    every saved movie feature. The random draw is without replacement, so the
    printed recommendations vary between calls.

    Args:
        usr_id: ID of the user to recommend for.
        top_k: size of the candidate pool (most similar movies).
        pick_num: number of movies to print; must not exceed ``top_k``.
        usr_feat_dir: path of the pickled user features.
        mov_feat_dir: path of the pickled movie features.
        mov_info_path: path of the movies file used to print movie details.

    NOTE: the feature files are read with ``pickle``; only load files that
    were produced by a trusted source.
    """
    assert pick_num <= top_k
    # Load the saved features, closing the files right after reading.
    with open(usr_feat_dir, 'rb') as f:
        usr_feats = pickle.load(f)
    with open(mov_feat_dir, 'rb') as f:
        mov_feats = pickle.load(f)

    # Freeze the key order once so similarity positions map back to IDs.
    mov_ids = list(mov_feats)
    cos_sims = []

    with dygraph.guard():
        # The query feature is loop-invariant: convert it once.
        usr_var = dygraph.to_variable(usr_feats[str(usr_id)])
        for mov_id in mov_ids:
            mov_var = dygraph.to_variable(mov_feats[mov_id])
            sim = fluid.layers.cos_sim(usr_var, mov_var)
            cos_sims.append(sim.numpy()[0][0])

    # Positions of the top_k most similar movies (`[-0:]` would select all).
    index = np.argsort(cos_sims)[-top_k:] if top_k > 0 else np.array([], dtype=np.int64)

    mov_info = {}
    # Index the movies file by movie ID.
    with open(mov_info_path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            item = line.strip().split("::")
            mov_info[str(item[0])] = item

    print("当前的用户是：")
    print("usr_id:", usr_id)
    print("推荐可能喜欢的电影是：")

    # Sample without replacement; capping at the pool size avoids the endless
    # retry loop that occurred when fewer than pick_num candidates existed.
    num_pick = max(0, min(pick_num, len(index)))
    if num_pick:
        chosen = np.random.choice(index, size=num_pick, replace=False)
    else:
        chosen = []
    res = [mov_ids[i] for i in chosen]

    for mov_id in res:
        print("mov_id:", mov_id, mov_info[str(mov_id)])

# Recommend 5 of the 10 best-matching movies for user 2.
movie_data_path = "./ml-1m/movies.dat"
top_k, pick_num = 10, 5

recommend_mov_for_usr(2, top_k, pick_num, 'usr_feat.pkl', 'mov_feat.pkl', movie_data_path)

当前的用户是：
usr_id: 2
推荐可能喜欢的电影是：
mov_id: 913 ['913', 'Maltese Falcon, The (1941)', 'Film-Noir|Mystery']
mov_id: 1198 ['1198', 'Raiders of the Lost Ark (1981)', 'Action|Adventure']
mov_id: 1898 ['1898', 'Land Girls, The (1998)', 'Drama|War']
mov_id: 260 ['260', 'Star Wars: Episode IV - A New Hope (1977)', 'Action|Adventure|Fantasy|Sci-Fi']
mov_id: 50 ['50', 'Usual Suspects, The (1995)', 'Crime|Thriller']


In [2]:
#第二个推荐系统，根据相似用户推荐电影（user-based）

# 给定一个用户ID，找到评分最高的topk个电影
def gen_movie_with_usr(usr_a, topk, rating_path="./ml-1m/ratings.dat"):
    """Return the ``topk`` highest-rated movies of one user.

    Args:
        usr_a: user ID (int or str); compared as a string.
        topk: maximum number of ratings to return.
        rating_path: ratings file with ``user::movie::score::...`` lines.

    Returns:
        List of ``(movie_id, score)`` tuples in ascending score order (the
        best-rated movie is last). Empty when ``topk <= 0`` or the user has
        no ratings.
    """
    usr_key = str(usr_a)
    usr_rating_info = {}
    with open(rating_path, 'r') as f:
        for line in f:
            fields = line.strip().split("::")
            # Skip blank or malformed lines instead of failing on them.
            if len(fields) < 3:
                continue
            usr_id, movie_id, score = fields[:3]
            if usr_id == usr_key:
                usr_rating_info[movie_id] = float(score)

    # `[-0:]` would return every rating, so handle non-positive topk first.
    if topk <= 0:
        return []
    return sorted(usr_rating_info.items(), key=lambda item: item[1])[-topk:]





#根据相似用户推荐电影

import pickle
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
import numpy as np

#以下参数的top_k是参考最为相似的用户个数，在最为相似的几个用户中获得相应看过的电影抽取pick_num部电影推荐给用户
def recommend_mov_user_based(usr_id, top_k, pick_num, usr_feat_dir, mov_feat_dir,usr_info_path,mov_info_path):
    """Print movies liked by the users most similar to ``usr_id`` (user-based).

    The ``top_k`` users whose saved features have the highest cosine
    similarity to the query user are found; the top 5 rated movies of each
    (via ``gen_movie_with_usr``) form the candidate pool, from which up to
    ``pick_num`` distinct movies are drawn at random and printed.

    Args:
        usr_id: ID of the user to recommend for.
        top_k: number of similar users to consult.
        pick_num: number of movies to print.
        usr_feat_dir: path of the pickled user features.
        mov_feat_dir: accepted for interface compatibility; not used here.
        usr_info_path: path of the users file.
        mov_info_path: path of the movies file.

    NOTE: the feature file is read with ``pickle``; only load files that were
    produced by a trusted source.
    """
    # Index the movies file by movie ID.
    mov_info = {}
    with open(mov_info_path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            item = line.strip().split("::")
            mov_info[str(item[0])] = item

    # Index the users file by user ID.
    usr_info = {}
    with open(usr_info_path, 'r') as f:
        for line in f:
            item = line.strip().split("::")
            usr_info[str(item[0])] = item

    # Load the saved user features, closing the file right after reading.
    with open(usr_feat_dir, 'rb') as f:
        usr_feats = pickle.load(f)

    query_key = str(usr_id)
    # Freeze the key order once so similarity positions map back to IDs.
    usr_keys = list(usr_feats)
    cos_sims = []

    with dygraph.guard():
        # The query feature is loop-invariant: convert it once.
        usr_query_feat = dygraph.to_variable(usr_feats[query_key])
        for key in usr_keys:
            usr_feat = dygraph.to_variable(usr_feats[key])
            sim = fluid.layers.cos_sim(usr_query_feat, usr_feat)
            # sim has shape (1, 1); keep the scalar.
            cos_sims.append(sim.numpy()[0][0])

    # Rank by similarity (ascending) and drop the query user explicitly
    # rather than assuming it is always the single best match.
    ranked = [i for i in np.argsort(cos_sims) if usr_keys[i] != query_key]
    neighbours = ranked[-top_k:] if top_k > 0 else []

    # Look the query user up by ID, not by position in the pickle's key order.
    print('要推荐的用户是：user:{}'.format(usr_info[query_key]))
    recommend_mov = []
    for i in neighbours:
        neighbour = usr_info[usr_keys[i]]
        print("与其相似的用户分别是：user:{}".format(neighbour))
        for mov_id, _score in gen_movie_with_usr(neighbour[0], 5):
            recommend_mov.append(mov_id)

    # De-duplicate while keeping first-seen order.
    candidates = list(dict.fromkeys(recommend_mov))

    # Sample without replacement; capping at the pool size avoids the endless
    # retry loop that occurred when fewer than pick_num candidates existed.
    num_pick = max(0, min(pick_num, len(candidates)))
    if num_pick:
        chosen = np.random.choice(len(candidates), size=num_pick, replace=False)
    else:
        chosen = []
    res = [candidates[i] for i in chosen]

    for mov_id in res:
        print("要推荐的电影为 mov_id:", mov_id, mov_info[str(mov_id)])

        
# User-based recommendation for user 1: consult 5 similar users, print 5 movies.
usr_feat_dir='usr_feat.pkl'
mov_feat_dir='mov_feat.pkl'
mov_info_path="./ml-1m/movies.dat"
usr_info_path="./ml-1m/users.dat"
recommend_mov_user_based(1, 5, 5, usr_feat_dir, mov_feat_dir,usr_info_path, mov_info_path)

要推荐的用户是：user:['1', 'F', '1', '10', '48067']
与其相似的用户分别是：user:['1411', 'M', '35', '1', '08107']
与其相似的用户分别是：user:['1936', 'M', '35', '0', '90026']
与其相似的用户分别是：user:['1996', 'M', '35', '0', '85621']
与其相似的用户分别是：user:['2418', 'F', '1', '10', '06074']
与其相似的用户分别是：user:['896', 'M', '18', '15', '94015']
要推荐的电影为 mov_id: 3173 ['3173', 'Any Given Sunday (1999)', 'Drama']
要推荐的电影为 mov_id: 3743 ['3743', 'Boys and Girls (2000)', 'Comedy|Romance']
要推荐的电影为 mov_id: 306 ['306', 'Three Colors: Red (1994)', 'Drama']
要推荐的电影为 mov_id: 1240 ['1240', 'Terminator, The (1984)', 'Action|Sci-Fi|Thriller']
要推荐的电影为 mov_id: 1234 ['1234', 'Sting, The (1973)', 'Comedy|Crime']


In [3]:
#第三个推荐系统，根据相似电影推荐电影（item-based）
#tok_k是用户曾经看过的并且评价最高的电影部数，从中选出一部将与该部相似的pick_num部电影推荐给用户
def recommend_mov_item_based(usr_id, top_k, pick_num, usr_feat_dir, mov_feat_dir, usr_info_path,mov_info_path):
    """Print the movies most similar to one the user rated highly (item-based).

    One of the user's 5 best-rated movies (via ``gen_movie_with_usr``) is
    chosen at random as the seed; the ``top_k`` movies whose saved features
    have the highest cosine similarity to the seed are printed.

    Args:
        usr_id: ID of the user to recommend for.
        top_k: number of similar movies to print.
        pick_num: accepted for interface compatibility; currently unused
            (all ``top_k`` similar movies are printed).
        usr_feat_dir: accepted for interface compatibility; not used here.
        mov_feat_dir: path of the pickled movie features.
        usr_info_path: accepted for interface compatibility; not read.
        mov_info_path: path of the movies file.

    Raises:
        ValueError: if the user has no ratings to seed the recommendation.

    NOTE: the feature file is read with ``pickle``; only load files that were
    produced by a trusted source.
    """
    # Index the movies file by movie ID.
    mov_info = {}
    with open(mov_info_path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            item = line.strip().split("::")
            mov_info[str(item[0])] = item

    # The user's best-rated movies are the possible seeds.
    watched_mov = [mov_id for mov_id, _score in gen_movie_with_usr(usr_id, 5)]
    if not watched_mov:
        raise ValueError("user {} has no ratings to base a recommendation on".format(usr_id))

    # Pick one seed movie at random.
    based_mov = watched_mov[np.random.choice(len(watched_mov), 1)[0]]
    based_key = str(based_mov)

    # Load the saved movie features, closing the file right after reading.
    with open(mov_feat_dir, 'rb') as f:
        mov_feats = pickle.load(f)

    # Freeze the key order once so similarity positions map back to IDs.
    mov_keys = list(mov_feats)
    cos_sims = []

    with dygraph.guard():
        # The seed feature is loop-invariant: convert it once.
        mov_query_feat = dygraph.to_variable(mov_feats[based_key])
        for key in mov_keys:
            mov_feat = dygraph.to_variable(mov_feats[key])
            sim = fluid.layers.cos_sim(mov_query_feat, mov_feat)
            # sim has shape (1, 1); keep the scalar.
            cos_sims.append(sim.numpy()[0][0])

    # Rank by similarity (ascending) and drop the seed movie explicitly
    # rather than assuming it is always the single best match.
    ranked = [i for i in np.argsort(cos_sims) if mov_keys[i] != based_key]
    similar = ranked[-top_k:] if top_k > 0 else []

    print('用户{}评分高的电影是：{}'.format(usr_id, mov_info[based_mov]))
    for i in similar:
        print("与其相似的电影分别是：{}".format(mov_info[mov_keys[i]]))
# Item-based recommendation for user 1: print the 5 movies most similar to
# one of the user's best-rated movies.
usr_feat_dir='usr_feat.pkl'
mov_feat_dir='mov_feat.pkl'
mov_info_path="./ml-1m/movies.dat"
usr_info_path="./ml-1m/users.dat"

recommend_mov_item_based(1, 5, 5, usr_feat_dir, mov_feat_dir,usr_info_path, mov_info_path)

用户1评分高的电影是：['1029', 'Dumbo (1941)', "Animation|Children's|Musical"]
与其相似的电影分别是：['914', 'My Fair Lady (1964)', 'Musical|Romance']
与其相似的电影分别是：['244', 'Gumby: The Movie (1995)', "Animation|Children's"]
与其相似的电影分别是：['2294', 'Antz (1998)', "Animation|Children's"]
与其相似的电影分别是：['3606', 'On the Town (1949)', 'Musical']
与其相似的电影分别是：['2139', 'Secret of NIMH, The (1982)', "Animation|Children's"]


In [4]:
# Mixed recommendation: popular, newly released and personalized movies,
# contributing 20%, 30% and 50% of the requested picks respectively.
# "Popular" means rated by the most users, "new" means the latest release
# year, and "personalized" means the highest cosine similarity between the
# user embedding and each movie embedding.
def _pick_random(candidates, count, exclude=()):
    """Return up to `count` distinct items drawn at random from `candidates`.

    Items equal to anything in `exclude` are never returned, and duplicates
    inside `candidates` are collapsed. When fewer than `count` eligible items
    exist, all of them are returned (in random order), so the call always
    terminates.
    """
    seen = set(exclude)
    pool = []
    for item in candidates:
        if item not in seen:
            seen.add(item)
            pool.append(item)
    count = min(count, len(pool))
    if count <= 0:
        return []
    # Sampling without replacement guarantees the picks are distinct.
    positions = np.random.choice(len(pool), size=count, replace=False)
    return [pool[pos] for pos in positions]


def recommend_mov_for_usr_mix(usr_id, pick_num, usr_feat_dir, mov_feat_dir, mov_info_path,
                              rating_path="./ml-1m/ratings.dat"):
    """Print a mixed list of movie recommendations for one user.

    Three sections are printed, each sampled at random from a candidate pool
    three times the size of the section:

    * personalized -- int(pick_num * 0.5) movies taken from those whose
      feature vector has the highest cosine similarity to the user's;
    * popular      -- int(pick_num * 0.2) movies taken from those with the
      most rating records in `rating_path`;
    * new          -- int(pick_num * 0.3) movies taken from those with the
      latest release year parsed from the title.

    A movie is printed at most once across all sections. If a candidate pool
    holds fewer eligible movies than requested, the section is simply shorter.

    Args:
        usr_id: ID of the user; looked up as str(usr_id) in the user features.
        pick_num: total number of recommendations requested; the per-section
            counts are truncated by int(), so they may sum to less.
        usr_feat_dir: path of the pickled {user_id: feature} mapping.
        mov_feat_dir: path of the pickled {movie_id: feature} mapping.
        mov_info_path: path of the "::"-separated movie metadata file.
        rating_path: path of the "::"-separated ratings file used to count
            how often each movie was rated.

    Returns:
        None. The recommendations are only printed.

    Raises:
        KeyError: if str(usr_id) is missing from the user feature mapping.
    """
    # ---------- personalized section ----------
    person_num = int(pick_num * 0.5)
    top_k = 3 * person_num

    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # these paths at feature files produced by a trusted source.
    with open(usr_feat_dir, 'rb') as f:
        usr_feats = pickle.load(f)
    with open(mov_feat_dir, 'rb') as f:
        mov_feats = pickle.load(f)
    usr_feat = usr_feats[str(usr_id)]
    # Freeze the key order once so similarity positions map back to movie IDs.
    mov_keys = list(mov_feats.keys())

    cos_sims = []
    with dygraph.guard():
        # The user vector never changes, so convert it a single time.
        usr_var = dygraph.to_variable(usr_feat)
        for key in mov_keys:
            mov_var = dygraph.to_variable(mov_feats[key])
            sim = fluid.layers.cos_sim(usr_var, mov_var)
            cos_sims.append(sim.numpy()[0][0])
    # argsort is ascending, so the best matches sit at the end. A plain
    # [-0:] slice would select everything, hence the explicit guard.
    ranked = np.argsort(cos_sims)
    top_positions = ranked[-top_k:] if top_k > 0 else ranked[:0]

    # Load the movie metadata, keyed by movie ID.
    mov_info = {}
    with open(mov_info_path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split("::")
            mov_info[str(fields[0])] = fields

    print("当前的用户是：")
    print("usr_id:", usr_id)
    print("推荐可能喜欢的电影是：")

    # Random sampling keeps repeated calls from always printing the same list.
    res = _pick_random([mov_keys[pos] for pos in top_positions], person_num)
    print("个性化推荐部分")
    for mov_id in res:
        print("mov_id:", mov_id, mov_info[str(mov_id)])
    # Movies already shown; compared as str because the IDs read from the
    # data files below are strings.
    taken = [str(mov_id) for mov_id in res]

    # ---------- popular section: movies with the most ratings ----------
    hot_num = int(pick_num * 0.2)
    hot_top_k = 3 * hot_num

    mov_rating_count = {}
    with open(rating_path, 'r') as f:
        for line in f:
            fields = line.strip().split("::")
            if len(fields) < 2:
                continue
            # Only the movie ID (second field) matters for the count.
            movie_id = fields[1]
            mov_rating_count[movie_id] = mov_rating_count.get(movie_id, 0) + 1

    by_count = sorted(mov_rating_count.items(), key=lambda kv: kv[1])
    hot_candidates = [k for k, _ in by_count[-hot_top_k:]] if hot_top_k > 0 else []
    hot_res = _pick_random(hot_candidates, hot_num, exclude=taken)
    print('热门推荐部分')
    for mov_id in hot_res:
        print("要推荐的电影为 mov_id:", mov_id, mov_info[str(mov_id)])

    # ---------- new section: movies with the latest release year ----------
    new_num = int(pick_num * 0.3)
    new_top_k = 3 * new_num

    # Titles end with "(YYYY)", so the year is the four characters before
    # the closing parenthesis.
    mov_year = {}
    for mov_key, info in mov_info.items():
        mov_year[mov_key] = int(str(info[1])[-5:-1])

    by_year = sorted(mov_year.items(), key=lambda kv: kv[1])
    new_candidates = [k for k, _ in by_year[-new_top_k:]] if new_top_k > 0 else []
    new_res = _pick_random(new_candidates, new_num, exclude=taken + hot_res)
    print('新品推荐部分')
    for mov_id in new_res:
        print("要推荐的电影为 mov_id:", mov_id, mov_info[str(mov_id)])


# Movie metadata file used to turn recommended IDs into readable records.
movie_data_path = "./ml-1m/movies.dat"

# Total number of recommendations to request across all three sections.
pick_num = 10

# Print the mixed (personalized / popular / new) recommendations for user 1.
recommend_mov_for_usr_mix(
    usr_id=1,
    pick_num=pick_num,
    usr_feat_dir='usr_feat.pkl',
    mov_feat_dir='mov_feat.pkl',
    mov_info_path=movie_data_path,
)

当前的用户是：
usr_id: 1
推荐可能喜欢的电影是：
个性化推荐部分
mov_id: 1898 ['1898', 'Land Girls, The (1998)', 'Drama|War']
mov_id: 1198 ['1198', 'Raiders of the Lost Ark (1981)', 'Action|Adventure']
mov_id: 858 ['858', 'Godfather, The (1972)', 'Action|Crime|Drama']
mov_id: 1136 ['1136', 'Monty Python and the Holy Grail (1974)', 'Comedy']
mov_id: 50 ['50', 'Usual Suspects, The (1995)', 'Crime|Thriller']
热门推荐部分
要推荐的电影为 mov_id: 480 ['480', 'Jurassic Park (1993)', 'Action|Adventure|Sci-Fi']
要推荐的电影为 mov_id: 1210 ['1210', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Action|Adventure|Romance|Sci-Fi|War']
新品推荐部分
要推荐的电影为 mov_id: 3946 ['3946', 'Get Carter (2000)', 'Action|Drama|Thriller']
要推荐的电影为 mov_id: 3949 ['3949', 'Requiem for a Dream (2000)', 'Drama']
要推荐的电影为 mov_id: 3950 ['3950', 'Tigerland (2000)', 'Drama']
