In [136]:
import pandas as pd 
import tensorflow as tf
import numpy as np
import os
import pickle

## 数据预处理

In [137]:
def load_data():
    """
    Load Dataset from File
    """
    #读取Movie数据集
    movies = pd.read_csv('./data/ml-latest-small/movies.csv')
    movies_orig = movies.values
    
    #将Title中的年份去掉
    pattern = re.compile(r'^(.*)\((\d+)\)$')
    title_set = set(movies['title'])
    title_map = {}
    
    for ii,val in enumerate(title_set):
        if(pattern.match(val.strip())):
            title_map[val]=pattern.match(val.strip()).group(1)
        else:
            title_map[val]=val
            
    movies['title'] = movies['title'].map(title_map)
    
    #电影类型转数字字典
    genres_set = set()
    for val in movies['genres'].str.split('|'):
        genres_set.update(val)
        
    genres_set.add('<PAD>')
    genres2int = {val:ii for ii, val in enumerate(genres_set)}
    
    #将电影类型转成等长数字列表，长度是20
    genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['genres']))}
    
    for key in genres_map:
        for cnt in range(max(genres2int.values()) - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
            
    movies['genres'] = movies['genres'].map(genres_map)
    
    #电影Title转数字字典
    title_set = set()
    for val in movies['title'].str.split():
        title_set.update(val)
        
    title_set.add('<PAD>')
    title2int = {val:ii for ii, val in enumerate(title_set)}
    
    #将电影Title转成等长数字列表，长度是15
    title_count = 15
    title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movies['title']))}
    
    for key in title_map:
        #不足填充<PAD>
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
        #超过截断
        if(len(title_map[key])>title_count):
            title_map[key]=title_map[key][0:title_count]
                    
    movies['title'] = movies['title'].map(title_map)
    
    #读取评分数据集
    ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
    ratings = ratings.filter(regex='userId|movieId|rating')
    #合并两个表
    data = pd.merge(ratings,movies)
    
    #将数据分成X和y两张表
    target_fields = ['rating']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    return title_count, title_set, genres2int, features, targets_values, ratings, movies, data, movies_orig
    

- title_count：Title字段的长度（15）
- title_set：Title文本的集合
- genres2int：电影类型转数字的字典
- features：是输入X
- targets_values：是学习目标y
- ratings：评分数据集的Pandas对象
- movies：电影数据的Pandas对象
- data：电影和评分两个数据集组合在一起的Pandas对象
- movies_orig：没有做数据处理的原始电影数据

In [138]:
title_count, title_set, genres2int, features, targets_values, ratings, movies, data, movies_orig = load_data()

pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, movies, data, movies_orig), open('preprocess0.p', 'wb'))


In [144]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,"[1880, 4817, 1728, 1728, 1728, 1728, 1728, 172...","[9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, ..."
1,2,"[2227, 1728, 1728, 1728, 1728, 1728, 1728, 172...","[9, 6, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."
2,3,"[10731, 6208, 9713, 1728, 1728, 1728, 1728, 17...","[16, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
3,4,"[4091, 10634, 1453, 1728, 1728, 1728, 1728, 17...","[16, 12, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
4,5,"[5187, 6839, 4673, 7467, 6445, 1286, 1728, 172...","[16, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."


In [145]:
movies.values[0]

array([1,
       list([1880, 4817, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728]),
       list([9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])],
      dtype=object)

In [146]:
#从本地读取预处理后的数据
title_count, title_set, genres2int, features, targets_values, ratings, movies, data, movies_orig = pickle.load(open('preprocess0.p', mode='rb'))

## 模型设计

In [149]:
import tensorflow as tf
import os
import pickle

def save_params(params):
    """
    Save parameters to file
    """
    pickle.dump(params, open('params.p', 'wb'))


def load_params():
    """
    Load parameters from file
    """
    return pickle.load(open('params.p', mode='rb'))


In [157]:
#嵌入矩阵的维度
embed_dim = 32
#用户ID个数
uid_max = max(features.take(0,1)) + 1  #611

#电影ID个数
movie_id_max = max(features.take(1,1)) + 1 # 193610
#电影类型个数
movie_categories_max = max(genres2int.values()) + 1 # 21
#电影名单词个数
movie_title_max = len(title_set) # 11504

#对电影类型嵌入向量做加和操作的标志
combiner = "sum"

#电影名长度
sentences_size = title_count # = 15
#文本卷积滑动窗口，分别滑动2, 3, 4, 5个单词
window_sizes = {2, 3, 4, 5}
#文本卷积核数量
filter_num = 8

#电影ID转下标的字典，数据集中电影ID跟下标不一致，比如第5行的数据电影ID不一定是5
movieid2idx = {val[0]:i for i, val in enumerate(movies.values)}

In [158]:
print(uid_max)
print(movie_id_max)
print(movie_categories_max)
print(movie_title_max)
print(sentences_size)
print(movieid2idx)

611
193610
21
11504
15
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 34: 32, 36: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 45: 41, 46: 42, 47: 43, 48: 44, 49: 45, 50: 46, 52: 47, 53: 48, 54: 49, 55: 50, 57: 51, 58: 52, 60: 53, 61: 54, 62: 55, 63: 56, 64: 57, 65: 58, 66: 59, 68: 60, 69: 61, 70: 62, 71: 63, 72: 64, 73: 65, 74: 66, 75: 67, 76: 68, 77: 69, 78: 70, 79: 71, 80: 72, 81: 73, 82: 74, 83: 75, 85: 76, 86: 77, 87: 78, 88: 79, 89: 80, 92: 81, 93: 82, 94: 83, 95: 84, 96: 85, 97: 86, 99: 87, 100: 88, 101: 89, 102: 90, 103: 91, 104: 92, 105: 93, 106: 94, 107: 95, 108: 96, 110: 97, 111: 98, 112: 99, 113: 100, 116: 101, 117: 102, 118: 103, 119: 104, 121: 105, 122: 106, 123: 107, 125: 108, 126: 109, 128: 110, 129: 111, 132: 112, 135: 113, 137: 114, 140: 115, 141: 116, 144: 117, 145

In [159]:
# Number of Epochs
num_epochs = 5
# Batch Size
batch_size = 256

dropout_keep = 0.5
# Learning Rate
learning_rate = 0.0001
# Show stats for every n number of batches
show_every_n_batches = 20

save_dir = './save'

In [None]:
def get_inputs():
    uid = tf.keras.layers.Input(shape=(1,), dtype='int32', name='uid')  

    movie_id = tf.keras.layers.Input(shape=(1,), dtype='int32', name='movie_id') 
    movie_categories = tf.keras.layers.Input(shape=(18,), dtype='int32', name='movie_categories') 
    movie_titles = tf.keras.layers.Input(shape=(15,), dtype='int32', name='movie_titles') 
    return uid, movie_id, movie_categories, movie_titles


def get_user_embedding(uid):
    uid_embed_layer = tf.keras.layers.Embedding(uid_max, embed_dim, input_length=1, name='uid_embed_layer')(uid)
    return uid_embed_layer


### 将User的嵌入矩阵一起全连接生成User的特征

In [None]:
def get_user_feature_layer(uid_embed_layer):
    #第一层全连接
    uid_fc_layer = tf.keras.layers.Dense(embed_dim, name="uid_fc_layer", activation='relu')(uid_embed_layer)
    
    #第二层全连接
    user_combine_layer = tf.keras.layers.Dense(200, activation='tanh')(uid_fc_layer)  #(?, 1, 200)

    user_combine_layer_flat = tf.keras.layers.Reshape([200], name="user_combine_layer_flat")(user_combine_layer)
    return user_combine_layer, user_combine_layer_flat

In [None]:
def get_movie_id_embed_layer(movie_id):
    movie_id_embed_layer = tf.keras.layers.Embedding(movie_id_max, embed_dim, input_length=1, name='movie_id_embed_layer')(movie_id)
    return movie_id_embed_layer

In [None]:
def get_movie_categories_layers(movie_categories):
    movie_categories_embed_layer = tf.keras.layers.Embedding(movie_categories_max, embed_dim, input_length=18, name='movie_categories_embed_layer')(movie_categories)
    movie_categories_embed_layer = tf.keras.layers.Lambda(lambda layer: tf.reduce_sum(layer, axis=1, keepdims=True))(movie_categories_embed_layer)
#     movie_categories_embed_layer = tf.keras.layers.Reshape([1, 18 * embed_dim])(movie_categories_embed_layer)

    return movie_categories_embed_layer

In [None]:
def get_movie_cnn_layer(movie_titles):
    #从嵌入矩阵中得到电影名对应的各个单词的嵌入向量
    movie_title_embed_layer = tf.keras.layers.Embedding(movie_title_max, embed_dim, input_length=15, name='movie_title_embed_layer')(movie_titles)
    sp=movie_title_embed_layer.shape
    movie_title_embed_layer_expand = tf.keras.layers.Reshape([sp[1], sp[2], 1])(movie_title_embed_layer)
    #对文本嵌入层使用不同尺寸的卷积核做卷积和最大池化
    pool_layer_lst = []
    for window_size in window_sizes:
        conv_layer = tf.keras.layers.Conv2D(filter_num, (window_size, embed_dim), 1, activation='relu')(movie_title_embed_layer_expand)
        maxpool_layer = tf.keras.layers.MaxPooling2D(pool_size=(sentences_size - window_size + 1 ,1), strides=1)(conv_layer)
        pool_layer_lst.append(maxpool_layer)
    #Dropout层
    pool_layer = tf.keras.layers.concatenate(pool_layer_lst, 3, name ="pool_layer")  
    max_num = len(window_sizes) * filter_num
    pool_layer_flat = tf.keras.layers.Reshape([1, max_num], name = "pool_layer_flat")(pool_layer)

    dropout_layer = tf.keras.layers.Dropout(dropout_keep, name = "dropout_layer")(pool_layer_flat)
    return pool_layer_flat, dropout_layer

In [None]:
def get_movie_feature_layer(movie_id_embed_layer, movie_categories_embed_layer, dropout_layer):
    #第一层全连接
    movie_id_fc_layer = tf.keras.layers.Dense(embed_dim, name="movie_id_fc_layer", activation='relu')(movie_id_embed_layer)
    movie_categories_fc_layer = tf.keras.layers.Dense(embed_dim, name="movie_categories_fc_layer", activation='relu')(movie_categories_embed_layer)

    #第二层全连接
    movie_combine_layer = tf.keras.layers.concatenate([movie_id_fc_layer, movie_categories_fc_layer, dropout_layer], 2)  
    movie_combine_layer = tf.keras.layers.Dense(200, activation='tanh')(movie_combine_layer)

    movie_combine_layer_flat = tf.keras.layers.Reshape([200], name="movie_combine_layer_flat")(movie_combine_layer)
    return movie_combine_layer, movie_combine_layer_flat

In [None]:
import tensorflow as tf
import datetime
from tensorflow import keras
from tensorflow.python.ops import summary_ops_v2
import time

MODEL_DIR = "./models"


class mv_network(object):
    def __init__(self, batch_size=256):
        self.batch_size = batch_size
        self.best_loss = 9999
        self.losses = {'train': [], 'test': []}

        # 获取输入占位符
        uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles = get_inputs()
        # 获取User的4个嵌入向量
        uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer = get_user_embedding(uid, user_gender,
                                                                                                   user_age, user_job)
        # 得到用户特征
        user_combine_layer, user_combine_layer_flat = get_user_feature_layer(uid_embed_layer, gender_embed_layer,
                                                                             age_embed_layer, job_embed_layer)
        # 获取电影ID的嵌入向量
        movie_id_embed_layer = get_movie_id_embed_layer(movie_id)
        # 获取电影类型的嵌入向量
        movie_categories_embed_layer = get_movie_categories_layers(movie_categories)
        # 获取电影名的特征向量
        pool_layer_flat, dropout_layer = get_movie_cnn_layer(movie_titles)
        # 得到电影特征
        movie_combine_layer, movie_combine_layer_flat = get_movie_feature_layer(movie_id_embed_layer,
                                                                                movie_categories_embed_layer,
                                                                                dropout_layer)
        # 计算出评分
        # 将用户特征和电影特征做矩阵乘法得到一个预测评分的方案
        inference = tf.keras.layers.Lambda(lambda layer: 
            tf.reduce_sum(layer[0] * layer[1], axis=1), name="inference")((user_combine_layer_flat, movie_combine_layer_flat))
        inference = tf.keras.layers.Lambda(lambda layer: tf.expand_dims(layer, axis=1))(inference)
        
        # 将用户特征和电影特征作为输入，经过全连接，输出一个值的方案
#         inference_layer = tf.keras.layers.concatenate([user_combine_layer_flat, movie_combine_layer_flat],
#                                                       1)  # (?, 400)
        # 你可以使用下面这个全连接层，试试效果
        #inference_dense = tf.keras.layers.Dense(64, kernel_regularizer=tf.nn.l2_loss, activation='relu')(
        #    inference_layer)
#         inference = tf.keras.layers.Dense(1, name="inference")(inference_layer)  # inference_dense

        self.model = tf.keras.Model(
            inputs=[uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles],
            outputs=[inference])

        self.model.summary()

        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        # MSE损失，将计算值回归到评分
        self.ComputeLoss = tf.keras.losses.MeanSquaredError()
        self.ComputeMetrics = tf.keras.metrics.MeanAbsoluteError()

        if tf.io.gfile.exists(MODEL_DIR):
            #             print('Removing existing model dir: {}'.format(MODEL_DIR))
            #             tf.io.gfile.rmtree(MODEL_DIR)
            pass
        else:
            tf.io.gfile.makedirs(MODEL_DIR)

        train_dir = os.path.join(MODEL_DIR, 'summaries', 'train')
        test_dir = os.path.join(MODEL_DIR, 'summaries', 'eval')

        #         self.train_summary_writer = summary_ops_v2.create_file_writer(train_dir, flush_millis=10000)
        #         self.test_summary_writer = summary_ops_v2.create_file_writer(test_dir, flush_millis=10000, name='test')

        checkpoint_dir = os.path.join(MODEL_DIR, 'checkpoints')
        self.checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
        self.checkpoint = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)

        # Restore variables on creation if a checkpoint exists.
        self.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    def compute_loss(self, labels, logits):
        return tf.reduce_mean(tf.keras.losses.mse(labels, logits))

    def compute_metrics(self, labels, logits):
        return tf.keras.metrics.mae(labels, logits)  #

    @tf.function
    def train_step(self, x, y):
        # Record the operations used to compute the loss, so that the gradient
        # of the loss with respect to the variables can be computed.
        #         metrics = 0
        with tf.GradientTape() as tape:
            logits = self.model([x[0],
                                 x[1],
                                 x[2],
                                 x[3],
                                 x[4],
                                 x[5],
                                 x[6]], training=True)
            loss = self.ComputeLoss(y, logits)
            # loss = self.compute_loss(labels, logits)
            self.ComputeMetrics(y, logits)
            # metrics = self.compute_metrics(labels, logits)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss, logits

    def training(self, features, targets_values, epochs=5, log_freq=50):

        for epoch_i in range(epochs):
            # 将数据集分成训练集和测试集，随机种子不固定
            train_X, test_X, train_y, test_y = train_test_split(features,
                                                                targets_values,
                                                                test_size=0.2,
                                                                random_state=0)

            train_batches = get_batches(train_X, train_y, self.batch_size)
            batch_num = (len(train_X) // self.batch_size)

            train_start = time.time()
            #             with self.train_summary_writer.as_default():
            if True:
                start = time.time()
                # Metrics are stateful. They accumulate values and return a cumulative
                # result when you call .result(). Clear accumulated values with .reset_states()
                avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
                #                 avg_mae = tf.keras.metrics.Mean('mae', dtype=tf.float32)

                # Datasets can be iterated over like any other Python iterable.
                for batch_i in range(batch_num):
                    x, y = next(train_batches)
                    categories = np.zeros([self.batch_size, 18])
                    for i in range(self.batch_size):
                        categories[i] = x.take(6, 1)[i]

                    titles = np.zeros([self.batch_size, sentences_size])
                    for i in range(self.batch_size):
                        titles[i] = x.take(5, 1)[i]

                    loss, logits = self.train_step([np.reshape(x.take(0, 1), [self.batch_size, 1]).astype(np.float32),
                                                    np.reshape(x.take(2, 1), [self.batch_size, 1]).astype(np.float32),
                                                    np.reshape(x.take(3, 1), [self.batch_size, 1]).astype(np.float32),
                                                    np.reshape(x.take(4, 1), [self.batch_size, 1]).astype(np.float32),
                                                    np.reshape(x.take(1, 1), [self.batch_size, 1]).astype(np.float32),
                                                    categories.astype(np.float32),
                                                    titles.astype(np.float32)],
                                                   np.reshape(y, [self.batch_size, 1]).astype(np.float32))
                    avg_loss(loss)
                    #                     avg_mae(metrics)
                    self.losses['train'].append(loss)

                    if tf.equal(self.optimizer.iterations % log_freq, 0):
                        #                         summary_ops_v2.scalar('loss', avg_loss.result(), step=self.optimizer.iterations)
                        #                         summary_ops_v2.scalar('mae', self.ComputeMetrics.result(), step=self.optimizer.iterations)
                        # summary_ops_v2.scalar('mae', avg_mae.result(), step=self.optimizer.iterations)

                        rate = log_freq / (time.time() - start)
                        print('Step #{}\tEpoch {:>3} Batch {:>4}/{}   Loss: {:0.6f} mae: {:0.6f} ({} steps/sec)'.format(
                            self.optimizer.iterations.numpy(),
                            epoch_i,
                            batch_i,
                            batch_num,
                            loss, (self.ComputeMetrics.result()), rate))
                        # print('Step #{}\tLoss: {:0.6f} mae: {:0.6f} ({} steps/sec)'.format(
                        #     self.optimizer.iterations.numpy(), loss, (avg_mae.result()), rate))
                        avg_loss.reset_states()
                        self.ComputeMetrics.reset_states()
                        # avg_mae.reset_states()
                        start = time.time()

            train_end = time.time()
            print(
                '\nTrain time for epoch #{} ({} total steps): {}'.format(epoch_i + 1, self.optimizer.iterations.numpy(),
                                                                         train_end - train_start))
            #             with self.test_summary_writer.as_default():
            self.testing((test_X, test_y), self.optimizer.iterations)
            # self.checkpoint.save(self.checkpoint_prefix)
        self.export_path = os.path.join(MODEL_DIR, 'export')
        tf.saved_model.save(self.model, self.export_path)

    def testing(self, test_dataset, step_num):
        test_X, test_y = test_dataset
        test_batches = get_batches(test_X, test_y, self.batch_size)

        """Perform an evaluation of `model` on the examples from `dataset`."""
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        #         avg_mae = tf.keras.metrics.Mean('mae', dtype=tf.float32)

        batch_num = (len(test_X) // self.batch_size)
        for batch_i in range(batch_num):
            x, y = next(test_batches)
            categories = np.zeros([self.batch_size, 18])
            for i in range(self.batch_size):
                categories[i] = x.take(6, 1)[i]

            titles = np.zeros([self.batch_size, sentences_size])
            for i in range(self.batch_size):
                titles[i] = x.take(5, 1)[i]

            logits = self.model([np.reshape(x.take(0, 1), [self.batch_size, 1]).astype(np.float32),
                                 np.reshape(x.take(2, 1), [self.batch_size, 1]).astype(np.float32),
                                 np.reshape(x.take(3, 1), [self.batch_size, 1]).astype(np.float32),
                                 np.reshape(x.take(4, 1), [self.batch_size, 1]).astype(np.float32),
                                 np.reshape(x.take(1, 1), [self.batch_size, 1]).astype(np.float32),
                                 categories.astype(np.float32),
                                 titles.astype(np.float32)], training=False)
            test_loss = self.ComputeLoss(np.reshape(y, [self.batch_size, 1]).astype(np.float32), logits)
            avg_loss(test_loss)
            # 保存测试损失
            self.losses['test'].append(test_loss)
            self.ComputeMetrics(np.reshape(y, [self.batch_size, 1]).astype(np.float32), logits)
            # avg_loss(self.compute_loss(labels, logits))
            # avg_mae(self.compute_metrics(labels, logits))

        print('Model test set loss: {:0.6f} mae: {:0.6f}'.format(avg_loss.result(), self.ComputeMetrics.result()))
        # print('Model test set loss: {:0.6f} mae: {:0.6f}'.format(avg_loss.result(), avg_mae.result()))
        #         summary_ops_v2.scalar('loss', avg_loss.result(), step=step_num)
        #         summary_ops_v2.scalar('mae', self.ComputeMetrics.result(), step=step_num)
        # summary_ops_v2.scalar('mae', avg_mae.result(), step=step_num)

        if avg_loss.result() < self.best_loss:
            self.best_loss = avg_loss.result()
            print("best loss = {}".format(self.best_loss))
            self.checkpoint.save(self.checkpoint_prefix)

    def forward(self, xs):
        predictions = self.model(xs)
        # logits = tf.nn.softmax(predictions)

        return predictions

In [None]:
def get_batches(Xs, ys, batch_size):
    for start in range(0, len(Xs), batch_size):
        end = min(start + batch_size, len(Xs))
        yield Xs[start:end], ys[start:end]

## 训练网络

In [None]:
#将用户特征和电影特征作为输入，经过全连接，输出一个值的训练
mv_net=mv_network()
mv_net.training(features, targets_values, epochs=5)
#将用户特征和电影特征做矩阵乘法得到一个预测评分的训练

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(mv_net.losses['train'], label='Training loss')
plt.legend()
_ = plt.ylim()

In [None]:
plt.plot(mv_net.losses['test'], label='Test loss')
plt.legend()
_ = plt.ylim()

In [None]:
def rating_movie(mv_net, user_id_val, movie_id_val):
    categories = np.zeros([1, 18])
    categories[0] = movies.values[movieid2idx[movie_id_val]][2]
    
    titles = np.zeros([1, sentences_size])
    titles[0] = movies.values[movieid2idx[movie_id_val]][1]
    
    inference_val = mv_net.model([np.reshape(users.values[user_id_val-1][0], [1, 1]),
              np.reshape(users.values[user_id_val-1][1], [1, 1]),
              np.reshape(users.values[user_id_val-1][2], [1, 1]),
              np.reshape(users.values[user_id_val-1][3], [1, 1]),
              np.reshape(movies.values[movieid2idx[movie_id_val]][0], [1, 1]),
              categories,  
              titles])

    return (inference_val.numpy())

In [None]:
rating_movie(mv_net, 234, 1401)

## 生成Movie特征矩阵
将训练好的电影特征组合成电影特征矩阵并保存到本地

In [None]:
movie_layer_model = keras.models.Model(inputs=[mv_net.model.input[4], mv_net.model.input[5], mv_net.model.input[6]], 
                                 outputs=mv_net.model.get_layer("movie_combine_layer_flat").output)
movie_matrics = []

for item in movies.values:
    categories = np.zeros([1, 18])
    categories[0] = item.take(2)

    titles = np.zeros([1, sentences_size])
    titles[0] = item.take(1)

    movie_combine_layer_flat_val = movie_layer_model([np.reshape(item.take(0), [1, 1]), categories, titles])  
    movie_matrics.append(movie_combine_layer_flat_val)

pickle.dump((np.array(movie_matrics).reshape(-1, 200)), open('movie_matrics.p', 'wb'))
movie_matrics = pickle.load(open('movie_matrics.p', mode='rb'))

In [None]:
movie_matrics = pickle.load(open('movie_matrics.p', mode='rb'))


## 生成User特征矩阵
将训练好的用户特征组合成用户特征矩阵并保存到本地

In [None]:
user_layer_model = keras.models.Model(inputs=[mv_net.model.input[0], mv_net.model.input[1], mv_net.model.input[2], mv_net.model.input[3]], 
                                 outputs=mv_net.model.get_layer("user_combine_layer_flat").output)
users_matrics = []

for item in users.values:

    user_combine_layer_flat_val = user_layer_model([np.reshape(item.take(0), [1, 1]), 
                                                    np.reshape(item.take(1), [1, 1]), 
                                                    np.reshape(item.take(2), [1, 1]), 
                                                    np.reshape(item.take(3), [1, 1])])  
    users_matrics.append(user_combine_layer_flat_val)

pickle.dump((np.array(users_matrics).reshape(-1, 200)), open('users_matrics.p', 'wb'))
users_matrics = pickle.load(open('users_matrics.p', mode='rb'))

In [None]:
users_matrics = pickle.load(open('users_matrics.p', mode='rb'))

## 开始推荐电影
使用生产的用户特征矩阵和电影特征矩阵做电影推荐

In [None]:
### 推荐同类型的电影
#思路是计算当前看的电影特征向量与整个电影特征矩阵的余弦相似度，取相似度最大的top_k个，这里加了些随机选择在里面，保证每次的推荐稍稍有些不同。
def recommend_same_type_movie(movie_id_val, top_k = 20):
   
    norm_movie_matrics = tf.sqrt(tf.reduce_sum(tf.square(movie_matrics), 1, keepdims=True))
    normalized_movie_matrics = movie_matrics / norm_movie_matrics

    #推荐同类型的电影
    probs_embeddings = (movie_matrics[movieid2idx[movie_id_val]]).reshape([1, 200])
    probs_similarity = tf.matmul(probs_embeddings, tf.transpose(normalized_movie_matrics))
    sim = (probs_similarity.numpy())
    #     results = (-sim[0]).argsort()[0:top_k]
    #     print(results)
        
    print("您看的电影是：{}".format(movies_orig[movieid2idx[movie_id_val]]))
    print("以下是给您的推荐：")
    p = np.squeeze(sim)
    p[np.argsort(p)[:-top_k]] = 0
    p = p / np.sum(p)
    results = set()
    while len(results) != 5:
        c = np.random.choice(3883, 1, p=p)[0]
        results.add(c)
    for val in (results):
        print(val)
        print(movies_orig[val])
        
    return results

recommend_same_type_movie(1401, 20)

### 推荐您喜欢的电影
#思路是使用用户特征向量与电影特征矩阵计算所有电影的评分，取评分最高的top_k个，同样加了些随机选择部分。
def recommend_your_favorite_movie(user_id_val, top_k = 10):

    #推荐您喜欢的电影
    probs_embeddings = (users_matrics[user_id_val-1]).reshape([1, 200])

    probs_similarity = tf.matmul(probs_embeddings, tf.transpose(movie_matrics))
    sim = (probs_similarity.numpy())
    #     print(sim.shape)
    #     results = (-sim[0]).argsort()[0:top_k]
    #     print(results)
        
    #     sim_norm = probs_norm_similarity.eval()
    #     print((-sim_norm[0]).argsort()[0:top_k])
    
    print("以下是给您的推荐：")
    p = np.squeeze(sim)
    p[np.argsort(p)[:-top_k]] = 0
    p = p / np.sum(p)
    results = set()
    while len(results) != 5:
        c = np.random.choice(3883, 1, p=p)[0]
        results.add(c)
    for val in (results):
        print(val)
        print(movies_orig[val])

    return results

recommend_your_favorite_movie(234, 10)

### 看过这个电影的人还看了（喜欢）哪些电影
- 首先选出喜欢某个电影的top_k个人，得到这几个人的用户特征向量。
- 然后计算这几个人对所有电影的评分
- 选择每个人评分最高的电影作为推荐
- 同样加入了随机选择

In [None]:
import random

def recommend_other_favorite_movie(movie_id_val, top_k = 20):

    probs_movie_embeddings = (movie_matrics[movieid2idx[movie_id_val]]).reshape([1, 200])
    probs_user_favorite_similarity = tf.matmul(probs_movie_embeddings, tf.transpose(users_matrics))
    favorite_user_id = np.argsort(probs_user_favorite_similarity.numpy())[0][-top_k:]
    #     print(normalized_users_matrics.numpy().shape)
    #     print(probs_user_favorite_similarity.numpy()[0][favorite_user_id])
    #     print(favorite_user_id.shape)
    
    print("您看的电影是：{}".format(movies_orig[movieid2idx[movie_id_val]]))
        
    print("喜欢看这个电影的人是：{}".format(users_orig[favorite_user_id-1]))
    probs_users_embeddings = (users_matrics[favorite_user_id-1]).reshape([-1, 200])
    probs_similarity = tf.matmul(probs_users_embeddings, tf.transpose(movie_matrics))
    sim = (probs_similarity.numpy())
    #     results = (-sim[0]).argsort()[0:top_k]
    #     print(results)
    
    #     print(sim.shape)
    #     print(np.argmax(sim, 1))
    p = np.argmax(sim, 1)
    print("喜欢看这个电影的人还喜欢看：")

    if len(set(p)) < 5:
        results = set(p)
    else:
        results = set()
        while len(results) != 5:
            c = p[random.randrange(top_k)]
            results.add(c)
    for val in (results):
        print(val)
        print(movies_orig[val])
        
    return results

recommend_other_favorite_movie(1401, 20)

In [153]:
uid_max = max(features.take(0,1)) + 1 
print(uid_max)

611


In [155]:
movie_id_max = max(features.take(1,1))
print(movie_id_max)

193609


In [156]:
movies

Unnamed: 0,movieId,title,genres
0,1,"[1880, 4817, 1728, 1728, 1728, 1728, 1728, 172...","[9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, ..."
1,2,"[2227, 1728, 1728, 1728, 1728, 1728, 1728, 172...","[9, 6, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."
2,3,"[10731, 6208, 9713, 1728, 1728, 1728, 1728, 17...","[16, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
3,4,"[4091, 10634, 1453, 1728, 1728, 1728, 1728, 17...","[16, 12, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
4,5,"[5187, 6839, 4673, 7467, 6445, 1286, 1728, 172...","[16, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
...,...,...,...
9737,193581,"[6529, 10929, 2029, 6839, 4673, 3106, 1728, 17...","[0, 3, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14,..."
9738,193583,"[10273, 8404, 10273, 8547, 5389, 1728, 1728, 1...","[3, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
9739,193585,"[1879, 1728, 1728, 1728, 1728, 1728, 1728, 172...","[12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1..."
9740,193587,"[669, 7117, 4406, 7303, 444, 1728, 1728, 1728,...","[0, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."


In [53]:
movies = pd.read_csv('./data/ml-latest-small/movies.csv')

In [54]:
movies['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [55]:
#将Title中的年份去掉
pattern = re.compile(r'^(.*)\((\d+)\)$')

title_set = set(movies['title'])
title_map = {}

for ii,val in enumerate(title_set):
    if(pattern.match(val.strip())):
        title_map[val]=pattern.match(val.strip()).group(1)
    else:
        print(val)
        title_map[val]=val

movies['title'] = movies['title'].map(title_map)

Cosmos
Babylon 5
Hyena Road
Maria Bamford: Old Baby
Black Mirror
Paterson
Nocturnal Animals
Death Note: Desu nôto (2006–2007)
Ready Player One
Generation Iron 2
Moonlight
The Adventures of Sherlock Holmes and Doctor Watson
The OA


In [56]:
movies['title']

0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

In [72]:
#电影类型转数字字典
genres_set = set()
for val in movies['genres'].str.split('|'):
    genres_set.update(val)

genres_set.add('<PAD>')
genres2int = {val:ii for ii, val in enumerate(genres_set)}
    
#将电影类型转成等长数字列表，长度是20
genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['genres']))}

for key in genres_map:
    for cnt in range(max(genres2int.values()) - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
movies['genres'] = movies['genres'].map(genres_map)


In [105]:
#电影Title转数字字典
title_set = set()
for val in movies['title'].str.split():
    title_set.update(val)

title_set.add('<PAD>')
title2int = {val:ii for ii, val in enumerate(title_set)}

#将电影Title转成等长数字列表，长度是15
title_count = 15
title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movies['title']))}
    
for key in title_map:
    #不足填充
    for cnt in range(title_count - len(title_map[key])):
        title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
    #超过截断
    if(len(title_map[key])>title_count):
        title_map[key]=title_map[key][0:title_count]
                    
movies['title'] = movies['title'].map(title_map)

In [127]:
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
ratings = ratings.filter(regex='userId|movieId|rating')

#合并两个表
data = pd.merge(ratings,movies)

#将数据分成X和y两张表
target_fields = ['rating']
features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
features = features_pd.values
targets_values = targets_pd.values

In [131]:
features

array([[1, 1,
        list([1880, 4817, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728]),
        list([9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])],
       [5, 1,
        list([1880, 4817, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728]),
        list([9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])],
       [7, 1,
        list([1880, 4817, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728]),
        list([9, 3, 6, 16, 19, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])],
       ...,
       [610, 160836,
        list([10697, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728]),
        list([0, 12, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])],
       [610, 163937,
        list([1472, 699, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728, 1728

In [113]:
movies['title'][100]

[2510,
 7992,
 5086,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728,
 1728]

In [98]:
title_map

{'Guns of Navarone, The ': [6713,
  6839,
  8236,
  7507,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 'Loose Cannons ': [3715,
  4493,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 'Mr. Deeds Goes to Town ': [11108,
  9027,
  8415,
  10634,
  1776,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 'Hoodlum ': [10848,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 "It's Kind of a Funny Story ": [3666,
  1726,
  6839,
  8536,
  10146,
  4817,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 'Adventures of Rocky and Bullwinkle, The ': [2169,
  6839,
  5017,
  7992,
  8655,
  7507,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728,
  1728],
 'Pat Garrett and Billy the Kid ': [6284,
  11314,
  7992,
  7909,
  4673,
  9283,
  1728,
  1728,
  1728,
  172

In [80]:
len(movies['genres'][18])

20

In [70]:
for key in genres_map:
    print(key,genres_map[key],len(genres_map[key]))

Crime|Drama|Sci-Fi|Thriller [20, 12, 17, 10] 4
Adventure|Children|Sci-Fi [9, 6, 17] 3
Children|Comedy|Musical|Romance [6, 16, 15, 7] 4
Adventure|Children|Comedy|Drama [9, 6, 16, 12] 4
Fantasy|Horror|Mystery|Thriller [19, 5, 11, 10] 4
Drama|Romance|Thriller|War [12, 7, 10, 13] 4
Animation|Children|Comedy|Drama [3, 6, 16, 12] 4
Action|Comedy|Crime|Drama|Thriller [0, 16, 20, 12, 10] 5
Action|Crime|Drama|Mystery|Thriller [0, 20, 12, 11, 10] 5
Action|Adventure|Horror|Sci-Fi [0, 9, 5, 17] 4
Drama|Horror|Mystery|Sci-Fi|Thriller [12, 5, 11, 17, 10] 5
Adventure|Animation|Children|Comedy [9, 3, 6, 16] 4
Comedy|Crime|Horror|Mystery|Thriller [16, 20, 5, 11, 10] 5
Drama|Fantasy|Horror|Thriller|War [12, 19, 5, 10, 13] 5
Children|Comedy|Mystery [6, 16, 11] 3
Action|Adventure|Comedy|War [0, 9, 16, 13] 4
Adventure|Animation|Children|Comedy|Fantasy|War [9, 3, 6, 16, 19, 13] 6
Animation|Mystery|Sci-Fi [3, 11, 17] 3
Adventure|Drama [9, 12] 2
Adventure|Drama|Fantasy [9, 12, 19] 3
Action|Adventure|Animation

Adventure|Western [9, 4] 2
Children|Comedy|Sci-Fi [6, 16, 17] 3
Children|Comedy|Drama|Fantasy [6, 16, 12, 19] 4
Adventure|Comedy|Mystery [9, 16, 11] 3
Drama|Film-Noir|Thriller [12, 2, 10] 3
Adventure|Drama|Horror|Thriller [9, 12, 5, 10] 4
Action|Adventure|Mystery|Sci-Fi [0, 9, 11, 17] 4
Comedy|Fantasy|Horror [16, 19, 5] 3
Animation|Children|Fantasy|Musical|Romance [3, 6, 19, 15, 7] 5
Crime|Film-Noir|Thriller [20, 2, 10] 3
Crime|Documentary|War [20, 1, 13] 3
Action|Drama|Thriller|War [0, 12, 10, 13] 4
Documentary|Drama|Musical [1, 12, 15] 3
Adventure|Documentary|Western [9, 1, 4] 3
Action|Sci-Fi|Thriller|Western|IMAX [0, 17, 10, 4, 8] 5
Adventure|Comedy|Crime|Romance [9, 16, 20, 7] 4
Action|Adventure|Animation|Children|Comedy [0, 9, 3, 6, 16] 5
Adventure|War [9, 13] 2
Adventure|Children|Drama [9, 6, 12] 3
Action|Romance|Western [0, 7, 4] 3
Drama|Sci-Fi|War [12, 17, 13] 3
Animation|Drama|Sci-Fi [3, 12, 17] 3
Action|Animation|Crime|Sci-Fi|Thriller [0, 3, 20, 17, 10] 5
Adventure|Animation|

Comedy|Drama|Romance|Sci-Fi [16, 12, 7, 17] 4
Action|Adventure|Sci-Fi [0, 9, 17] 3
Drama|Sci-Fi|Thriller [12, 17, 10] 3
Fantasy|Horror [19, 5] 2
Adventure|Animation|Comedy|Fantasy|Musical [9, 3, 16, 19, 15] 5
Comedy|Drama|Sci-Fi|War [16, 12, 17, 13] 4
Action|Comedy|Drama|War [0, 16, 12, 13] 4
Children|Comedy|Romance [6, 16, 7] 3
Animation|Children|Comedy|Crime [3, 6, 16, 20] 4
Film-Noir|Mystery|Thriller [2, 11, 10] 3
Animation|Comedy|Musical [3, 16, 15] 3
Comedy|Crime|Mystery|Thriller [16, 20, 11, 10] 4
Action|Romance|Sci-Fi|Thriller [0, 7, 17, 10] 4
Drama|Horror [12, 5] 2
Children|Comedy|Crime|Musical [6, 16, 20, 15] 4
Crime|Drama|Fantasy|Thriller [20, 12, 19, 10] 4
Comedy|Crime|Romance [16, 20, 7] 3
Adventure|Children|Fantasy|Sci-Fi|Thriller [9, 6, 19, 17, 10] 5
Comedy|Crime|Fantasy [16, 20, 19] 3
Comedy|Fantasy|Thriller [16, 19, 10] 3
Animation|Children|Fantasy|War [3, 6, 19, 13] 4
Action|Adventure|Crime|IMAX [0, 9, 20, 8] 4
Animation|Documentary|Drama|War [3, 1, 12, 13] 4
Action|Ch

In [66]:
for i,val in enumerate(genres_set):
    print(i,val)

0 Action
1 Documentary
2 Film-Noir
3 Animation
4 Western
5 Horror
6 Children
7 Romance
8 IMAX
9 Adventure
10 Thriller
11 Mystery
12 Drama
13 War
14 <PAD>
15 Musical
16 Comedy
17 Sci-Fi
18 (no genres listed)
19 Fantasy
20 Crime


In [64]:
genres_map

{'Crime|Drama|Sci-Fi|Thriller': [20, 12, 17, 10],
 'Adventure|Children|Sci-Fi': [9, 6, 17],
 'Children|Comedy|Musical|Romance': [6, 16, 15, 7],
 'Adventure|Children|Comedy|Drama': [9, 6, 16, 12],
 'Fantasy|Horror|Mystery|Thriller': [19, 5, 11, 10],
 'Drama|Romance|Thriller|War': [12, 7, 10, 13],
 'Animation|Children|Comedy|Drama': [3, 6, 16, 12],
 'Action|Comedy|Crime|Drama|Thriller': [0, 16, 20, 12, 10],
 'Action|Crime|Drama|Mystery|Thriller': [0, 20, 12, 11, 10],
 'Action|Adventure|Horror|Sci-Fi': [0, 9, 5, 17],
 'Drama|Horror|Mystery|Sci-Fi|Thriller': [12, 5, 11, 17, 10],
 'Adventure|Animation|Children|Comedy': [9, 3, 6, 16],
 'Comedy|Crime|Horror|Mystery|Thriller': [16, 20, 5, 11, 10],
 'Drama|Fantasy|Horror|Thriller|War': [12, 19, 5, 10, 13],
 'Children|Comedy|Mystery': [6, 16, 11],
 'Action|Adventure|Comedy|War': [0, 9, 16, 13],
 'Adventure|Animation|Children|Comedy|Fantasy|War': [9, 3, 6, 16, 19, 13],
 'Animation|Mystery|Sci-Fi': [3, 11, 17],
 'Adventure|Drama': [9, 12],
 'Adven

In [52]:
exp_set={'Fatal Instinct (1993)',
 'Bachelor, The (1999)',
 'Bone Tomahawk (2015)',
 'Superman/Doomsday (2007) ',
 'From Dusk Till Dawn 2: Texas Blood Money (1999) ',
 'Dawn of the Dead (2004)',
 'csdds ',
 'Expendables 2, The (2012)',
 'Only the Lonely (1991)'}

pattern = re.compile(r'^(.*)\((\d+)\)$')

di={}
for ii,val in enumerate(exp_set):
    if(pattern.match(val.strip())):
        s = pattern.match(val.strip()).group(1)
        di[val]=s
    else:
        print(val)
        di[val]=val


print(di)
#exp_map = {val:pattern.match(val).group(1) for ii,val in enumerate(exp_set)}

csdds 
{'Fatal Instinct (1993)': 'Fatal Instinct ', 'Bachelor, The (1999)': 'Bachelor, The ', 'Bone Tomahawk (2015)': 'Bone Tomahawk ', 'From Dusk Till Dawn 2: Texas Blood Money (1999) ': 'From Dusk Till Dawn 2: Texas Blood Money ', 'csdds ': 'csdds ', 'Superman/Doomsday (2007) ': 'Superman/Doomsday ', 'Dawn of the Dead (2004)': 'Dawn of the Dead ', 'Expendables 2, The (2012)': 'Expendables 2, The ', 'Only the Lonely (1991)': 'Only the Lonely '}


In [37]:
di={}
for ii,val in enumerate(exp_set):
    print(ii,val)
    s = pattern.search(val).group(1)
    print(s)
    di[val]=s

print(di)
print(exp_map)

0 Fatal Instinct (1993)
Fatal Instinct 
1 Bachelor, The (1999)
Bachelor, The 
2 Bone Tomahawk (2015)
Bone Tomahawk 
3 Dawn of the Dead (2004)
Dawn of the Dead 
4 Expendables 2, The (2012)
Expendables 2, The 
5 Only the Lonely (1991)
Only the Lonely 
{'Fatal Instinct (1993)': 'Fatal Instinct ', 'Bachelor, The (1999)': 'Bachelor, The ', 'Bone Tomahawk (2015)': 'Bone Tomahawk ', 'Dawn of the Dead (2004)': 'Dawn of the Dead ', 'Expendables 2, The (2012)': 'Expendables 2, The ', 'Only the Lonely (1991)': 'Only the Lonely '}
{'Fatal Instinct (1993)': 'Fatal Instinct ', 'Bachelor, The (1999)': 'Bachelor, The ', 'Bone Tomahawk (2015)': 'Bone Tomahawk ', 'Dawn of the Dead (2004)': 'Dawn of the Dead ', 'Expendables 2, The (2012)': 'Expendables 2, The ', 'Only the Lonely (1991)': 'Only the Lonely '}


In [35]:
import re 

#将Title中的年份去掉
pattern = re.compile(r'^(.*)\((\d+)\)$')



title_map = {val:pattern.search(val).group(1) for ii,val in enumerate(set(movies['title']))}
movies['title'] = movies['title'].map(title_map)

AttributeError: 'NoneType' object has no attribute 'group'