# 基于物品的过滤
基于物品的过滤是计算找出最相似的两个物品，然后根据用户的评价来给出推荐结果

In [4]:
# 引入相关的包
import numpy as np
import pandas as pd
import os
import zipfile

In [9]:
def load_data(path, file):
    if not os.path.isdir(path):
        return
    if not os.path.isfile(path+file):
        return
    # 解压压缩包
    movielens = zipfile.ZipFile(path+file)
    inpath = file.split('.')[0]+'/' 
    movielens.extractall(path)
    #读取movielens数据
    links = pd.read_csv(path+inpath+'links.csv')
    movies = pd.read_csv(path+inpath+'movies.csv')
    ratings = pd.read_csv(path+inpath+'ratings.csv')
    tags = pd.read_csv(path+inpath+'tags.csv')
    return links, movies, ratings, tags
    

In [97]:
links, movies, ratings, tags = load_data('./dataset/', 'ml-latest-small.zip')

In [98]:
links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [99]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [100]:
ratings = ratings.set_index(['userId', 'movieId'])
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931


In [67]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# 相似度计算
## 修正的余弦相似度
余弦相似度会遇到评分范围过大的问题，所以需要对用户的评价归一化，将其归一到一个更小的范围，使得分数分布更合理，因此需要用用户的评价减去他评价的所有物品的均值：
$$s(i,j) = \frac{\sum_{u \in U}(R_{u,i}-\overline{R_u})(R_{u,j}-\overline{R_u})}{\sqrt{\sum_{u \in U}(R_{u,i}-\overline{R_u})^2} \sqrt{\sum_{u \in U}(R_{u,j}- \overline{R_u})^2}}$$
其中：

$\overline{R_u}$ 表示用户对所有物品评价的平均值

$U$表示评价过物品 $i$和$j$ 的用户集合

In [159]:
def sim(item1, item2, ratings=ratings):
    average = ratings.groupby('userId')['rating'].mean()
    num = 0 # 分子
    de_1 = 0 # 分母1
    de_2 = 0 # 分母2
    for userId in ratings.index.levels[0]:
        if item1 in ratings.loc[userId].index and item2 in ratings.loc[userId].index:
            avg = average[userId]
            num += (ratings.loc[userId].loc[item1]['rating'] - avg)*(ratings.loc[userId].loc[item2]['rating'] - avg)
            de_1 += np.square(ratings.loc[userId].loc[item1]['rating'] - avg)
            de_2 += np.square(ratings.loc[userId].loc[item2]['rating'] - avg)
    if de_1 != 0 and de_2 != 0 and num != 0:
        return num / (np.sqrt(de_1)*np.sqrt(de_2))
    return 0

In [160]:
sim(1,4)

-0.4133586855464322

In [165]:
def predict(userId, item):
    similar = 0
    dem = 0
    if item not in ratings.loc[userId].index:
        for movieId in ratings.loc[userId].index:
            similar += sim(item, movieId) * ratings.loc[userId].loc[movieId]['rating']
            dem += sim(item, movieId)
        return similar / dem
    return ratings.loc[userId].loca[item]['rating']

In [166]:
predict(1, 234)

4.568401173458102

# Slope One算法
Slope One 算法是在一篇名为《Slope One: 基于在线评分系统的协同过滤算法》提出，其主要可以分为两步：
1. 首先计算出两个物品之间的差值
2. 进行预测

计算物品之间差异的公式为：
$$dev_{i,j} = \sum_{u \in S_{i,j}(X)} \frac{u_i-u_j}{card(S_{i,j}(X))}$$

$card(S)$表示$S$有多少个元素

$X$表示所有评分值的集合

$card(S_{j,i}(X))$表示同时评价过$i$和$j$的用户数

In [177]:
def slope_one():
    card_ij = pd.DataFrame(index=movies['movieId'],columns=movies['movieId'])
    dev = pd.DataFrame(index=movies['movieId'],columns=movies['movieId'])
    num = pd.DataFrame(index=movies['movieId'],columns=movies['movieId'])
    for userId in ratings.index[0]:#用户
        for movieId_1 in ratings.loc[userId].index:#用户对应的电影
            for movieId_2 in ratings.loc[userId].index:
                if movieId_1 != movieId_2:
                    dev.loc[movieId_1][movieId_2] += ratings.loc[userId].loc[movieId_1]['rating'] \
                    - ratings.loc[userId].loc[movieId_2]['rating']
                    num.loc[movieId_1][movieId_2] += 1
    
    card_ij = dev/num
    return card_ij

In [178]:
slope_one()

KeyboardInterrupt: 