### 构建数据集

In [2]:
import pandas as pd
import numpy as np

In [3]:
users = ["User1", "User2", "User3", "User4", "User5"]
items = ["Item A", "Item B", "Item C", "Item D", "Item E"]
# 用户购买记录数据集
datasets = [
    [1,0,1,1,0],
    [1,0,0,1,1],
    [1,0,1,0,0],
    [0,1,0,1,1],
    [1,1,1,0,1],
]

In [4]:
df = pd.DataFrame(datasets,
                  columns=items,
                  index=users)
print(df)

       Item A  Item B  Item C  Item D  Item E
User1       1       0       1       1       0
User2       1       0       0       1       1
User3       1       0       1       0       0
User4       0       1       0       1       1
User5       1       1       1       0       1


### 计算相似度

In [5]:
# 杰卡德相似系数
from sklearn.metrics import jaccard_similarity_score

In [6]:
#计算ItemA ItemB的相似度

jaccard_similarity_score_AB = jaccard_similarity_score(df['Item A'],df['Item B'])
print(jaccard_similarity_score_AB)



0.2


#### 基于用户

In [7]:
#杰卡德距离
from sklearn.metrics.pairwise import pairwise_distances

In [8]:
#计算用户间相似度,1-相似距离
#单个
user_similar = 1 - pairwise_distances(df,metric='jaccard')
#矩阵
user_similar =pd.DataFrame(user_similar,columns=users,index=users)

print("用户之间的两两相似度：")
print(user_similar)


用户之间的两两相似度：
          User1  User2     User3  User4  User5
User1  1.000000   0.50  0.666667    0.2    0.4
User2  0.500000   1.00  0.250000    0.5    0.4
User3  0.666667   0.25  1.000000    0.0    0.5
User4  0.200000   0.50  0.000000    1.0    0.4
User5  0.400000   0.40  0.500000    0.4    1.0




#### 基于物品

In [9]:
#计算物品间相似度
item_similar =1 - pairwise_distances(df.T,metric='jaccard')
item_similar =pd.DataFrame(item_similar,columns=items,index=items)
print('物品之间两两相似度:\n',item_similar)


物品之间两两相似度:
         Item A    Item B  Item C  Item D    Item E
Item A    1.00  0.200000    0.75    0.40  0.400000
Item B    0.20  1.000000    0.25    0.25  0.666667
Item C    0.75  0.250000    1.00    0.20  0.200000
Item D    0.40  0.250000    0.20    1.00  0.500000
Item E    0.40  0.666667    0.20    0.50  1.000000




### 推荐
- 有了两两的相似度，接下来就可以筛选TOP-N相似结果，并进行推荐了
#### User-Based CF

In [10]:
import pandas as pd
import numpy as np
from pprint import pprint

In [14]:
topN_users = {}
#遍历每一行数据
for i in user_similar.index:
    #取出每一列数据,并删除自身,然后排序数据
    _df = user_similar.loc[i].drop([i])
    #sort_values 按照相似度降序排序
    _df_sorted =_df.sort_values(ascending =False)
    #从排序之后的结果中切片,取出前两条(相似度最高的两个)
    top2 =list(_df_sorted.index[:2])
#     top2 =list(_df_sorted.index)
    topN_users[i] =top2


In [15]:
print('Top2相似用户:\n',topN_users)

Top2相似用户:
 {'User1': ['User3', 'User2'], 'User2': ['User4', 'User1'], 'User3': ['User1', 'User5'], 'User4': ['User2', 'User5'], 'User5': ['User3', 'User4']}


In [16]:
#推荐字典
rs_results ={}
#遍历所有的最相似用户
for user,sim_users in topN_users.items():
    rs_result =set()  #存储推荐结果
    for sim_user in sim_users:
        #构建初始的推荐结果
        rs_result =rs_result.union(set(df.ix[sim_user].replace(0,np.nan).dropna().index))
    #过滤掉用户已购物品
    rs_result -= set(df.ix[user].replace(0,np.nan).dropna().index)
    rs_results[user] =rs_result
print('最终推荐结果:\n',rs_results)


最终推荐结果:
 {'User1': {'Item E'}, 'User2': {'Item B', 'Item C'}, 'User3': {'Item B', 'Item D', 'Item E'}, 'User4': {'Item A', 'Item C'}, 'User5': {'Item D'}}


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


#### Item-Based CF

In [18]:
topN_items = {}
# 遍历每一行数据
for i in item_similar.index:
    # 取出每一列数据，并删除自身，然后排序数据
    _df = item_similar.loc[i].drop([i])
    _df_sorted = _df.sort_values(ascending=False)

    top2 = list(_df_sorted.index[:2])
    topN_items[i] = top2

print("Top2相似物品：")
pprint(topN_items)

Top2相似物品：
{'Item A': ['Item C', 'Item E'],
 'Item B': ['Item E', 'Item D'],
 'Item C': ['Item A', 'Item B'],
 'Item D': ['Item E', 'Item A'],
 'Item E': ['Item B', 'Item D']}


In [19]:
rs_results = {}
# 构建推荐结果
for user in df.index:    # 遍历所有用户
    rs_result = set()
    for item in df.ix[user].replace(0,np.nan).dropna().index:   # 取出每个用户当前已购物品列表
        # 根据每个物品找出最相似的TOP-N物品，构建初始推荐结果
        rs_result = rs_result.union(topN_items[item])
    # 过滤掉用户已购的物品
    rs_result -= set(df.ix[user].replace(0,np.nan).dropna().index)
    # 添加到结果中
    rs_results[user] = rs_result

print("最终推荐结果：")
pprint(rs_results)

最终推荐结果：
{'User1': {'Item B', 'Item E'},
 'User2': {'Item B', 'Item C'},
 'User3': {'Item B', 'Item E'},
 'User4': {'Item A'},
 'User5': {'Item D'}}


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
