In [88]:
import numpy as np
import pandas as pd

In [18]:
# Read the saved array from disk and cast it to 32-bit integers in one step.
matrix = np.load("matrix.npy").astype(np.int32)

#### 余弦相似度

In [27]:
def cosine_similarity(userid1, userid2, ratings=None):
    """Return the cosine similarity between two users' rating vectors.

    Args:
        userid1: Row index of the first user.
        userid2: Row index of the second user.
        ratings: Optional 2-D array with one row per user. When omitted, the
            notebook-level ``matrix`` is used, which matches the original
            two-argument behaviour.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (for example a row that is all zeros), where the quotient
        would otherwise be 0/0 = NaN.
    """
    if ratings is None:
        ratings = matrix
    # Work in float64 so the dot product is not accumulated in the array's
    # integer dtype, which could wrap around for large values.
    v1 = np.asarray(ratings[userid1], dtype=np.float64)
    v2 = np.asarray(ratings[userid2], dtype=np.float64)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Cosine similarity is undefined for a zero vector; treat it as
        # "no similarity" so NaN never reaches later sorting steps.
        return 0.0
    return np.dot(v1, v2) / denominator

#### 计算用户相似度矩阵

In [37]:
# Build the full user-by-user cosine similarity matrix in one vectorized pass
# instead of a Python loop over every pair of users (which recomputed both
# norms for each pair).
ratings_f64 = matrix.astype(np.float64)
row_norms = np.linalg.norm(ratings_f64, axis=1)
# Rows with zero norm would divide by zero and yield NaN; dividing by 1
# instead leaves them all-zero, so their similarity to every other user is 0.
safe_norms = np.where(row_norms > 0, row_norms, 1.0)
unit_rows = ratings_f64 / safe_norms[:, np.newaxis]
# The dot product of two unit-length rows is their cosine similarity.
user_similarity = np.dot(unit_rows, unit_rows.T)
# Every user is defined to be fully similar to itself, including zero rows.
np.fill_diagonal(user_similarity, 1.0)

#### 获取与指定用户最相似的topK个用户

In [61]:
def simlar_users(userid, topK=10, similarity=None):
    """Return the indices of the ``topK`` users most similar to ``userid``.

    Args:
        userid: Row index of the target user.
        topK: Maximum number of neighbours to return.
        similarity: Optional square user-by-user similarity array. When
            omitted, the notebook-level ``user_similarity`` is used, which
            matches the original two-argument behaviour.

    Returns:
        A 1-D integer array of user indices ordered from most to least
        similar. The target user is never included, and ties are broken by
        ascending user index.
    """
    if similarity is None:
        similarity = user_similarity
    # Copy the target user's row so the NaN replacement below cannot modify
    # the caller's matrix.
    sim = np.array(similarity[userid], dtype=np.float64)
    # argsort places NaN at the end, so a reversed order would rank it first;
    # push undefined similarities to the bottom instead.
    sim[np.isnan(sim)] = -np.inf
    # Negating gives a descending order; the stable sort makes ties
    # deterministic.
    order = np.argsort(-sim, kind="stable")
    # Remove the target user by index rather than assuming it sorts first:
    # another user with similarity 1.0 can tie with it. Indexing a range
    # resolves negative indices to their positive equivalent.
    self_index = range(len(sim))[userid]
    order = order[order != self_index]
    return order[:topK]

#### 推荐电影

In [95]:
# Recommendation settings: index of the target user, and how many movies to
# recommend.
userid, topN = 0, 10

In [102]:
# Accumulate a similarity-weighted rating total for every movie.
scores = np.zeros(matrix.shape[1])
# Which movies the target user has not rated; this does not change between
# neighbours, so compute it once.
unseen = matrix[userid] == 0
for uid in simlar_users(userid):
    # Movies this similar user rated and the target user has not.
    rated_new = (matrix[uid] > 0) & unseen
    # Add rating * similarity for all of those movies at once; this is the
    # same arithmetic, in the same neighbour order, as a per-movie loop.
    scores[rated_new] += matrix[uid, rated_new] * user_similarity[userid, uid]

In [103]:
# 获取相似度最高的电影编号
candidates = scores.argsort()[::-1]
# 获取得分最高的前topN部电影
candidates = candidates[:topN]

In [104]:
# Tabulate each recommended movie index next to its score. Building the frame
# column-wise from the arrays avoids materializing a list of Python tuples and
# keeps numeric column dtypes even when there are no candidates.
df = pd.DataFrame({"movieid": candidates, "score": scores[candidates]})
df

Unnamed: 0,movieid,score
0,2080,15.778746
1,2077,13.278529
2,2095,11.528618
3,363,11.521609
4,1281,10.743407
5,592,10.638346
6,595,10.211102
7,2136,9.5265
8,2084,9.516367
9,2079,8.861642
