In [6]:
import numpy as np
import pandas as pd
from itertools import permutations

In [2]:
# Load the saved array from disk and cast it to 32-bit integers in one step.
matrix = np.load("matrix.npy").astype(np.int32)

#### 统计每部电影被观看（评分大于0）的次数, 用一维向量P存储

In [20]:
# Column-wise tally: for each movie, how many rows hold a positive entry.
P = np.sum(matrix > 0, axis=0)

#### 统计同时喜欢电影i和j的用户数, 用矩阵C存储

In [7]:
# For every ordered pair of distinct movies (a, b), count the users (rows of
# `matrix`) that have a non-zero entry for both. Both (a, b) and (b, a) are
# stored, so the resulting mapping is symmetric.
co_occur = {}
for user_row in matrix:
    # Column indices with a non-zero entry in this row, in ascending order.
    watched = np.flatnonzero(user_row).tolist()
    for pair in permutations(watched, 2):
        co_occur[pair] = co_occur.get(pair, 0) + 1

In [13]:
# Dense symmetric co-occurrence matrix; the diagonal is left at zero.
n_movies = len(matrix[0])
C = np.zeros((n_movies, n_movies), dtype=np.int32)
for (a, b), count in co_occur.items():
    # Only the (larger, smaller) key of each pair is read; its count is
    # mirrored into both triangles.
    if a > b:
        C[a, b] = C[b, a] = count

#### 相似度计算（函数名为 Jaccard, 实际采用余弦形式: $C_{ij} / \sqrt{P_i P_j}$）

In [29]:
def jaccard_similarity(movieid1, movieid2):
    """Return the co-occurrence similarity between two movies.

    Reads the module-level ``P`` (per-movie counts) and ``C`` (pairwise
    co-occurrence counts).

    NOTE: despite the name, the value computed is the cosine-style
    normalisation ``C[i][j] / sqrt(P[i] * P[j])``, not the Jaccard index.

    Parameters
    ----------
    movieid1, movieid2 : int
        Indices into ``P`` and ``C``.

    Returns
    -------
    number
        ``1`` when both ids are equal, ``0`` when either movie has a zero
        count in ``P``, otherwise ``C[i][j] / sqrt(P[i] * P[j])``.
    """
    if movieid1 == movieid2:
        return 1
    # A movie with a zero count would make the denominator zero.
    if P[movieid1] == 0 or P[movieid2] == 0:
        return 0
    # np.sqrt instead of np.math.sqrt: the np.math alias was deprecated in
    # NumPy 1.25 and removed in NumPy 2.0.
    return C[movieid1][movieid2] / np.sqrt(P[movieid1] * P[movieid2])

In [32]:
# Spot-check: similarity between movie 0 and movie 1.
jaccard_similarity(0, 1)

0.11168448404723388

#### 计算物品相似度矩阵

In [35]:
# Pairwise movie-to-movie similarity matrix. Start from the identity (each
# movie has similarity 1 with itself), then fill both triangles from a single
# similarity evaluation per unordered pair.
n_movies = len(matrix[0])
item_similarity = np.eye(n_movies)
for row in range(1, n_movies):
    for col in range(row):
        pair_score = jaccard_similarity(row, col)
        item_similarity[row, col] = pair_score
        item_similarity[col, row] = pair_score

In [36]:
# Display the movie-to-movie similarity matrix.
item_similarity

array([[1.        , 0.11168448, 0.08143705, ..., 0.02526993, 0.01035242,
        0.04810651],
       [0.11168448, 1.        , 0.08505452, ..., 0.0255038 , 0.00845077,
        0.03913198],
       [0.08143705, 0.08505452, 1.        , ..., 0.02245774, 0.00632523,
        0.03437734],
       ...,
       [0.02526993, 0.0255038 , 0.02245774, ..., 1.        , 0.05120916,
        0.06679668],
       [0.01035242, 0.00845077, 0.00632523, ..., 0.05120916, 1.        ,
        0.05225919],
       [0.04810651, 0.03913198, 0.03437734, ..., 0.06679668, 0.05225919,
        1.        ]])

#### 获取与指定电影最相似的topK个电影

In [49]:
def similar_movies(movieid, topK=10):
    """Return the indices of the ``topK`` movies most similar to ``movieid``.

    Reads the module-level ``item_similarity`` matrix.

    Parameters
    ----------
    movieid : int
        Row index of the query movie in ``item_similarity``.
    topK : int, optional
        Maximum number of neighbours to return (default 10).

    Returns
    -------
    numpy.ndarray
        Movie indices ordered by decreasing similarity; the query movie
        itself is never included.
    """
    # Similarity of the query movie to every movie.
    sim = item_similarity[movieid, :]
    # Rank all movies from most to least similar.
    ranked = sim.argsort()[::-1]
    # Drop the query movie explicitly. Slicing off position 0 is not enough:
    # when another movie ties at similarity 1.0 the tie order is arbitrary,
    # so the query movie is not guaranteed to come first.
    self_index = movieid if movieid >= 0 else movieid + sim.shape[0]
    ranked = ranked[ranked != self_index]
    return ranked[:topK]

#### 推荐电影

In [50]:
userid = 0   # id (row index) of the target user
topN = 10   # number of movies to recommend

In [51]:
# Recommendation score per movie for the target user: each movie the user has
# rated contributes rating * similarity to every one of its nearest neighbours
# (default topK of similar_movies) that the user has not rated yet.
scores = np.zeros(len(matrix[0]))
user_ratings = matrix[userid]
for rated in np.flatnonzero(user_ratings).tolist():
    for neighbour in similar_movies(rated):
        if user_ratings[neighbour] != 0:
            # Already rated by the target user; not a candidate.
            continue
        scores[neighbour] += user_ratings[rated] * item_similarity[rated, neighbour]

In [54]:
# Rank movie indices by score, highest first, and keep the best topN.
candidates = np.argsort(scores)[::-1][:topN]

In [55]:
# Tabulate the recommended movie ids next to their scores.
df = pd.DataFrame({"movieid": candidates, "score": scores[candidates]})
df

Unnamed: 0,movieid,score
0,363,5.59803
1,2715,5.522734
2,2173,5.075565
3,1195,4.558809
4,2086,4.405349
5,1579,3.987923
6,1281,3.923969
7,2079,3.877979
8,595,3.380017
9,592,3.241676
