In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the demo rating table: one row per user, one rating column per item.
df = pd.read_csv(
    "demo_data.csv",
    header=0,        # the first line of the file holds the column names
    index_col=None,  # keep the default integer row index
)
df

Unnamed: 0,name,item1,item2,item3,item4,item5
0,Alice,5,3,4,4,-1
1,user1,3,1,2,3,3
2,user2,4,3,4,3,5
3,user3,3,3,1,5,4
4,user4,1,5,5,2,1


## Alice对item5的评价未知(这里先用-1代替), 接下来用UserCF进行分析
1. 根据item1、2、3、4,找到和Alice最相似的n个User
2. 根据这 n 个用户对 item5 的评分，以及他们与 Alice 的相似度，预测 Alice 对 item5 的评分

### Step1: 相似度计算
+ method1: 余弦相似度
+ method2: 皮尔逊相关系数

#### method1: 余弦相似度

In [3]:
# Scratch check: element-wise square of the row-1 user's item1..item4 ratings.
user1_vec = df.iloc[1, 1:-1].values
user1_vec * user1_vec

array([9, 1, 4, 9], dtype=object)

In [4]:
# Cosine similarity between Alice (row 0) and every other user over item1..item4.
alice_vec = df.iloc[0, 1:-1].values
user_num = len(df) - 1  # every row except Alice's own
sqrt_sum_alice_vec_2 = np.sqrt(np.sum(alice_vec * alice_vec))  # Euclidean norm of Alice's vector

sim_list = []  # sim_list[k] is Alice's similarity to the user in row k + 1
sim_score = .0
for row in range(1, user_num + 1):
    user_vec = df.iloc[row, 1:-1].values
    dot_product = np.sum(user_vec * alice_vec)
    user_norm = np.sqrt(np.sum(user_vec * user_vec))
    sim_score = dot_product / (user_norm * sqrt_sum_alice_vec_2)
    sim_list.append(sim_score)
sim_list

[0.9753213044447562,
 0.9922426389474776,
 0.8907235428302466,
 0.7966873563711506]

#### method2: 皮尔逊相关系数(本质上是一个"中心化"了的余弦相似度)


In [5]:
# Centre Alice's item1..item4 ratings on her own mean rating.
alice_ratings = df.iloc[0, 1:-1]
Alice_average = alice_ratings.mean()
Alice_sub_vec = alice_ratings.sub(Alice_average).values
# Norm of the centred vector; it is the Alice-side factor of the Pearson denominator.
Alice_sqrt_sum_sub_vec_2 = np.sqrt(np.sum(Alice_sub_vec * Alice_sub_vec))


In [6]:
# Pearson correlation by hand: cosine similarity of the mean-centred rating vectors.
pearson_sim_list = []
for row in range(1, user_num + 1):
    user_ratings = df.iloc[row, 1:-1]
    user_average = user_ratings.mean()
    user_sub_vec = user_ratings.sub(user_average).values  # deviations from this user's own mean
    centred_dot = np.sum(Alice_sub_vec * user_sub_vec)
    user_sqrt_sum_sub_vec_2 = np.sqrt(np.sum(user_sub_vec * user_sub_vec))
    pearson_score = centred_dot / (Alice_sqrt_sum_sub_vec_2 * user_sqrt_sum_sub_vec_2)
    pearson_sim_list.append(pearson_score)
pearson_sim_list

[0.8528028654224417, 0.7071067811865475, 0.0, -0.7921180343813393]

> 上面是手动计算；也可以直接调用 scipy.stats.pearsonr()，结果相同（仅有浮点误差级别的差异）

In [7]:
from scipy.stats import pearsonr

# Cross-check of the hand-computed coefficients with SciPy. pearsonr centres both
# inputs itself, so the raw (uncentred) ratings are passed in.
# The row slices are object-dtype arrays, so they are cast to float up front instead
# of relying on pearsonr to accept object-dtype input.
alice_ratings = alice_vec.astype(float)
pearson_sim_list = []  # pearson_sim_list[k] is Alice's correlation with the user in row k + 1
for row in range(1, user_num + 1):
    user_vec = df.iloc[row, 1:-1].values.astype(float)
    pearson_score = pearsonr(user_vec, alice_ratings)
    pearson_sim_list.append(pearson_score[0])  # element 0 is r; the p-value is not used
pearson_sim_list

[0.8528028654224415, 0.7071067811865475, 0.0, -0.7921180343813393]

> 利用sklearn计算所有用户之间的余弦相似度和皮尔逊相关系数

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all users in one call.
# Rows and columns of the result follow the row order of df (row 0 is Alice).
users = df.iloc[:, 1:-1].values  # item1..item4 only: the name and item5 columns are left out
cosine_sim = cosine_similarity(users)
cosine_sim

array([[1.        , 0.9753213 , 0.99224264, 0.89072354, 0.79668736],
       [0.9753213 , 1.        , 0.94362852, 0.91160719, 0.67478587],
       [0.99224264, 0.94362852, 1.        , 0.85280287, 0.85811633],
       [0.89072354, 0.91160719, 0.85280287, 1.        , 0.67082039],
       [0.79668736, 0.67478587, 0.85811633, 0.67082039, 1.        ]])

In [9]:
# Pairwise Pearson correlation between all users; each row of `users` is treated
# as one variable (rowvar=True is NumPy's default, spelled out here for clarity).
pearson = np.corrcoef(users, rowvar=True)
pearson

array([[ 1.        ,  0.85280287,  0.70710678,  0.        , -0.79211803],
       [ 0.85280287,  1.        ,  0.30151134,  0.42640143, -0.88662069],
       [ 0.70710678,  0.30151134,  1.        , -0.70710678, -0.14002801],
       [ 0.        ,  0.42640143, -0.70710678,  1.        , -0.59408853],
       [-0.79211803, -0.88662069, -0.14002801, -0.59408853,  1.        ]])

**根据皮尔逊相关系数，与 Alice 最相似的两个用户是 user1（0.853）和 user2（0.707），因此用这两个用户对 item5 的评分来预测 Alice 对 item5 的评分。**

In [10]:
# Predict Alice's missing item5 rating with user-based collaborative filtering:
#
#   prediction = mean(Alice) + sum_u w_u * (r_u,item5 - mean(r_u)) / sum_u |w_u|
#
# where u runs over the TOP_N users most similar to Alice, w_u is the Pearson
# similarity and mean(r_u) is the neighbour's mean over all items (item5 included).
TOP_N = 2

# Cast every rating column to float in one astype call and rebind df. Assigning the
# cast columns back one by one through df.iloc[:, i] writes into the existing
# integer columns on pandas >= 2.0, which silently leaves the dtype unchanged.
df = df.astype({col: "float" for col in df.columns[1:]})

# Alice's ratings for item1..item4; the -1 placeholder in item5 is excluded.
Alice_item124_score = df.iloc[0, 1:-1].values * 1.0

# Rank the other users by similarity instead of assuming the best ones come first.
# pearson_sim_list[k] belongs to the user stored in df row k + 1.
similarities = np.asarray(pearson_sim_list, dtype=float)
neighbor_idx = [int(k) for k in np.argsort(-similarities, kind="stable")[:TOP_N]]
weight_list = [pearson_sim_list[k] for k in neighbor_idx]

# Mean-centre each neighbour's item5 rating on that neighbour's own average.
centered_scores = []
for k in neighbor_idx:
    neighbor_item5 = df.iloc[k + 1, -1]
    neighbor_mean = np.average(df.iloc[k + 1, 1:].values)
    print(neighbor_item5, neighbor_mean)
    centered_scores.append(neighbor_item5 - neighbor_mean)

print("np.sum(weight_list)", np.sum(weight_list))

# Normalise by the sum of absolute weights so that negative similarities cannot
# shrink or flip the denominator; fall back to Alice's mean if all weights are zero.
weight_norm = np.sum(np.abs(weight_list))
alice_mean = np.average(Alice_item124_score)
if weight_norm == 0:
    Alice_item5_score = alice_mean
else:
    Alice_item5_score = alice_mean + np.dot(weight_list, centered_scores) / weight_norm
Alice_item5_score

3.0 2.4
5.0 3.8
np.sum(weight_list) 1.559909646608989


4.871979899370592