# 基于物品的过滤
基于物品的过滤先计算物品之间的相似度，找出与目标物品最相似的物品，然后根据用户已有的评价给出推荐结果。

In [1]:
# 引入相关的包
import numpy as np
import pandas as pd
import os
import zipfile

In [2]:
def load_data(path, file):
    """Extract a MovieLens archive and load its movies and ratings tables.

    Args:
        path: Directory that contains the archive. It is concatenated directly
            with ``file``, so it must end with a path separator.
        file: Archive file name, e.g. ``'ml-1m.zip'``. The text before the
            first ``.`` is used as the name of the folder inside the archive.

    Returns:
        A ``(movies, ratings)`` tuple of DataFrames, or ``None`` when ``path``
        is not a directory or the archive does not exist.
    """
    if not os.path.isdir(path):
        return None
    archive = path + file
    if not os.path.isfile(archive):
        return None
    # Unpack next to the archive; the context manager closes the zip handle
    # even if extraction fails.
    with zipfile.ZipFile(archive) as movielens:
        movielens.extractall(path)
    inpath = file.split('.')[0] + '/'
    # The .dat files have no header row and use '::' as separator, which needs
    # the python parsing engine.
    movies = pd.read_csv(path + inpath + 'movies.dat',
                         names=['movieId', 'title', 'genres'],
                         sep='::', engine='python')
    ratings = pd.read_csv(path + inpath + 'ratings.dat',
                          names=['userId', 'movieId', 'rating', 'timestamp'],
                          sep='::', engine='python')
    return movies, ratings
    

In [3]:
# Load the movies and ratings tables from ./dataset/ml-1m.zip.
movies, ratings = load_data('./dataset/', 'ml-1m.zip')

In [4]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Index the ratings by (userId, movieId) so that ratings.loc[userId] yields
# one user's ratings keyed by movieId.
ratings = ratings.set_index(['userId', 'movieId'])
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291


# 相似度计算
## 修正的余弦相似度
不同用户的评分尺度不同（有的用户习惯打高分，有的习惯打低分），直接使用余弦相似度会受到这种差异的影响。所以需要对用户的评价做归一化，使得分数分布更合理、不同用户的评分具有可比性：用用户对某个物品的评价减去他评价过的所有物品的评分均值：
$$s(i,j) = \frac{\sum_{u \in U}(R_{u,i}-\overline{R_u})(R_{u,j}-\overline{R_u})}{\sqrt{\sum_{u \in U}(R_{u,i}-\overline{R_u})^2} \sqrt{\sum_{u \in U}(R_{u,j}- \overline{R_u})^2}}$$
其中：

$R_{u,i}$ 表示用户 $u$ 对物品 $i$ 的评分，$\overline{R_u}$ 表示用户 $u$ 对其评价过的所有物品评分的平均值

$U$表示评价过物品 $i$和$j$ 的用户集合

In [7]:
def sim(item1, item2, ratings=ratings):
    """Return the adjusted cosine similarity between two items.

    Each rating is first centred on the rating user's mean, then the cosine
    of the two items' centred rating vectors is taken over the users who
    rated both items.

    Args:
        item1: Identifier of the first item (second index level).
        item2: Identifier of the second item (second index level).
        ratings: DataFrame indexed by (user, item) with a ``rating`` column.
            The default is bound to the module-level table that exists when
            this function is defined.

    Returns:
        The similarity as a float, or ``0`` when no user rated both items or
        the numerator or either norm is zero.
    """
    scores = ratings['rating']
    # Subtract every user's mean rating so generous and harsh raters become
    # comparable.
    centered = scores - scores.groupby(level=0).transform('mean')
    item_ids = centered.index.get_level_values(1)
    # Centred ratings of each item, indexed by user only.
    dev_1 = centered[item_ids == item1].reset_index(level=1, drop=True)
    dev_2 = centered[item_ids == item2].reset_index(level=1, drop=True)
    # Keep only the users who rated both items (the set U in the formula).
    dev_1, dev_2 = dev_1.align(dev_2, join='inner')
    num = (dev_1 * dev_2).sum()  # numerator
    de_1 = np.square(dev_1).sum()  # squared norm for item1
    de_2 = np.square(dev_2).sum()  # squared norm for item2
    if de_1 != 0 and de_2 != 0 and num != 0:
        return num / (np.sqrt(de_1) * np.sqrt(de_2))
    return 0

In [8]:
# Adjusted cosine similarity between movies 1 and 4.
sim(1,4)

-0.19196048381477623

In [9]:
def predict(userId, item):
    """Predict a user's rating for an item from the items they already rated.

    The prediction is the similarity-weighted average of the user's existing
    ratings, using ``sim`` as the weight. If the user has already rated
    ``item``, that stored rating is returned instead.

    Args:
        userId: User identifier (first level of the ``ratings`` index).
        item: Identifier of the item to predict.

    Returns:
        The stored rating, the weighted prediction, or ``0`` when the
        similarities sum to zero and no prediction can be formed.
    """
    user_ratings = ratings.loc[userId]['rating']
    if item in user_ratings.index:
        return user_ratings.loc[item]
    similar = 0
    dem = 0
    for movieId, rating in user_ratings.items():
        # sim() is expensive, so evaluate it once per rated movie.
        weight = sim(item, movieId)
        similar += weight * rating
        dem += weight
    # NOTE(review): the weights can be negative, so the denominator may be
    # zero or small; summing absolute similarities is a common alternative.
    if dem == 0:
        return 0
    return similar / dem

In [10]:
# Predict the rating of user 1 for movie 234.
predict(1, 234)

4.193735445493792

# Slope One算法
Slope One 算法出自论文《Slope One Predictors for Online Rating-Based Collaborative Filtering》（Slope One：基于在线评分系统的协同过滤算法），其主要可以分为两步：
1. 首先计算出两个物品之间的差值
2. 进行预测

计算物品之间差异的公式为：
$$dev_{i,j} = \sum_{u \in S_{i,j}(X)} \frac{u_i-u_j}{card(S_{i,j}(X))}$$

$card(S)$表示$S$有多少个元素

$X$表示所有评分值的集合

$card(S_{i,j}(X))$表示同时评价过$i$和$j$的用户数，$u_i$表示用户$u$对物品$i$的评分

In [11]:
# MovieLens is too large for quick experiments, so the Slope One code is
# checked against a small toy data set instead.
data = {
    'T': [4, 5, np.nan, 5],
    'P': [3, 2, 3.5, np.nan],
    'W': [4, np.nan, 4, 3],
}
movies = {
    'movieId': ['T', 'P', 'W'],
}
# np.float was removed in NumPy 1.24; the builtin float is the same dtype.
ratings = pd.DataFrame(data, columns=['T', 'P', 'W'], index=['a', 'b', 'c', 'd'], dtype=float)
# stack() reshapes the user x item table into one row per (user, item) pair;
# dropna() keeps unrated pairs out regardless of how stack() treats NaN.
ratings = ratings.stack().dropna().to_frame('rating')
ratings

Unnamed: 0,Unnamed: 1,rating
a,T,4.0
a,P,3.0
a,W,4.0
b,T,5.0
b,P,2.0
c,P,3.5
c,W,4.0
d,T,5.0
d,W,3.0


In [12]:
def slope_one():
    """Compute the pairwise Slope One deviations between all items.

    Reads the module-level ``ratings`` table (indexed by user and item, with
    a ``rating`` column) and ``movies['movieId']`` (the item identifiers).
    Prints a start and end timestamp plus a ``user*item`` progress counter.

    Returns:
        A ``(dev, num)`` tuple of item x item DataFrames. ``dev.loc[i, j]``
        is the mean of ``rating(i) - rating(j)`` over the users who rated
        both items, and is NaN when nobody did. ``num.loc[i, j]`` is the
        number of such users.
    """
    import datetime
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    items = movies['movieId']
    # float64 accumulators: float16 cannot count past 2048 and loses
    # precision when summing rating differences.
    dev = pd.DataFrame(0.0, index=items, columns=items)
    num = dev.copy()
    for i, userId in enumerate(ratings.index.levels[0], start=1):  # users
        # Fetch this user's (item, rating) pairs once, not on every inner step.
        rated = list(ratings.loc[userId]['rating'].items())
        for j, (movieId_1, rating_1) in enumerate(rated, start=1):
            for movieId_2, rating_2 in rated:
                print('{}*{}'.format(i, j), end='\r')
                if movieId_1 != movieId_2:
                    # .at writes into the frame itself; chained
                    # df.loc[a][b] += ... may only update a temporary copy.
                    dev.at[movieId_1, movieId_2] += rating_1 - rating_2
                    num.at[movieId_1, movieId_2] += 1
    # 0 / 0 yields NaN for pairs without common raters (and the diagonal).
    mean_dev = dev.div(num, fill_value=0)
    print()
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return mean_dev, num

In [13]:
# Build the pairwise deviation table (card) and the co-rating counts (num).
card, num = slope_one()


2019-03-23 11:29:46
1*11*11*11*21*21*21*31*31*32*12*12*22*23*13*13*23*24*14*14*24*2
2019-03-23 11:29:46


In [14]:
# Display the first table returned by slope_one().
card

Unnamed: 0,T,P,W
T,,2.0,1.0
P,-2.0,,-0.75
W,-1.0,0.75,


# 使用加权的Slope One算法进行预测
$$P^{wS1}(u)_j = \frac{\sum_{i \in S(u) - \{j\}}(dev_{j,i} + u_i)c_{j,i}}{\sum_{i \in S(u)-\{j\}}c_{j,i}}$$
其中
$$c_{j,i} = card(S_{j,i}(\chi))$$

$P^{wS1}(u)_j$表示预测的用户$u$对物品$j$的评分

$\sum_{i \in S(u) - \{j\}}$表示对用户$u$评价过的除了$j$以外的所有物品求和。整个分子表示：对于用户评价过的每个物品$i$，取待预测物品$j$与它的差值$dev_{j,i}$，加上用户对物品$i$的评分$u_i$，然后乘以同时评价过这两个物品的用户数$c_{j,i}$

In [15]:
def slope_one_predict(user, predict_item):
    """Predict a user's rating for an item with weighted Slope One.

    Reads the module-level ``ratings`` table and the ``card`` (mean
    deviation) and ``num`` (co-rating count) tables returned by
    ``slope_one``.

    Args:
        user: User identifier (first level of the ``ratings`` index).
        predict_item: Identifier of the item whose rating is predicted.

    Returns:
        The weighted prediction, or ``0`` when none of the user's rated
        items shares a rater with ``predict_item``.
    """
    up = 0  # numerator: sum of (deviation + rating) * count
    de = 0  # denominator: sum of counts
    for item, rating in ratings.loc[user]['rating'].items():
        if item == predict_item:
            continue
        count = num.at[predict_item, item]
        if count == 0:
            # Nobody rated both items, so the deviation is NaN and would turn
            # the whole prediction into NaN; such pairs carry no information.
            continue
        up += (card.at[predict_item, item] + rating) * count
        de += count
    if up != 0 and de != 0:
        return up / de
    return 0

In [16]:
# Weighted Slope One prediction of user 'b' for item 'W'.
slope_one_predict('b', 'W')

3.375