# 1. 计算物品之间的相似度

In [1]:
from collections import defaultdict
import math

def ItemSimilarity(train):
    # calculate co-rated users between items
    C = defaultdict(int)
    N = defaultdict(int)
    for u, items in train.items():
        for i in items:
            N[i] += 1
            if i not in C:
                C[i] = defaultdict(int)
            for j in items:
                if i == j:
                    continue
                C[i][j] += 1
    print(C)
    # calcuate final similarity matrix W
    W = dict()
    for i, related_items in C.items():
        W[i] = dict()
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W

In [2]:
# Items: A, B, C, D
# Users: a, b, c, d, e
train = {"A":["a", "b", "d"],
        "B":["b", "c", "e"],
        "C":["c", "d"],
        "D":["b", "c", "d"],
        "E":["a", "d"]}

In [3]:
ItemSimilarity(train)

defaultdict(<class 'int'>, {'a': defaultdict(<class 'int'>, {'b': 1, 'd': 2}), 'b': defaultdict(<class 'int'>, {'a': 1, 'd': 2, 'c': 2, 'e': 1}), 'd': defaultdict(<class 'int'>, {'a': 2, 'b': 2, 'c': 2}), 'c': defaultdict(<class 'int'>, {'b': 2, 'e': 1, 'd': 2}), 'e': defaultdict(<class 'int'>, {'b': 1, 'c': 1})})


{'a': {'b': 0.4082482904638631, 'd': 0.7071067811865475},
 'b': {'a': 0.4082482904638631,
  'd': 0.5773502691896258,
  'c': 0.6666666666666666,
  'e': 0.5773502691896258},
 'd': {'a': 0.7071067811865475,
  'b': 0.5773502691896258,
  'c': 0.5773502691896258},
 'c': {'b': 0.6666666666666666,
  'e': 0.5773502691896258,
  'd': 0.5773502691896258},
 'e': {'b': 0.5773502691896258, 'c': 0.5773502691896258}}

# 1.1 ItemCF + IUF

In [4]:
from collections import defaultdict
import math

def ItemSimilarity(train):
    # calculate co-rated users between items
    C = defaultdict(int)
    N = defaultdict(int)
    for u, items in train.items():
        for i in items:
            N[i] += 1
            if i not in C:
                C[i] = defaultdict(int)
            for j in items:
                if i == j:
                    continue
                C[i][j] += 1 / math.log(1 + len(items) * 1.0)
    print(C)
    # calcuate final similarity matrix W
    W = dict()
    for i, related_items in C.items():
        W[i] = dict()
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W

In [5]:
ItemSimilarity(train)

defaultdict(<class 'int'>, {'a': defaultdict(<class 'int'>, {'b': 0.7213475204444817, 'd': 1.631586747071319}), 'b': defaultdict(<class 'int'>, {'a': 0.7213475204444817, 'd': 1.4426950408889634, 'c': 1.4426950408889634, 'e': 0.7213475204444817}), 'd': defaultdict(<class 'int'>, {'a': 1.631586747071319, 'b': 1.4426950408889634, 'c': 1.631586747071319}), 'c': defaultdict(<class 'int'>, {'b': 1.4426950408889634, 'e': 0.7213475204444817, 'd': 1.631586747071319}), 'e': defaultdict(<class 'int'>, {'b': 0.7213475204444817, 'c': 0.7213475204444817})})


{'a': {'b': 0.2944888920518062, 'd': 0.576853026474115},
 'b': {'a': 0.2944888920518062,
  'd': 0.4164701851078906,
  'c': 0.4808983469629878,
  'e': 0.4164701851078906},
 'd': {'a': 0.576853026474115,
  'b': 0.4164701851078906,
  'c': 0.470998523813926},
 'c': {'b': 0.4808983469629878,
  'e': 0.4164701851078906,
  'd': 0.470998523813926},
 'e': {'b': 0.4164701851078906, 'c': 0.4164701851078906}}

# 2. 将用户历史行为物品中的相似产品进行推荐列表

In [6]:
def recommend(train, user_id, W, K):
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items:
        for j, wij in sorted(W[i].items, key = itemgetter(1), reverse=True)[0:K]:
            if j in ru:
                continue
            rank[j] += pi * wij
    return rank

# 3. CF算法比较

|              | **UserCF**                                                   | **ItemCF**                                                   |
| ------------ | :----------------------------------------------------------- | ------------------------------------------------------------ |
| **性能**     | **适用于用户较少的场合**，如果用户很多，计算用户相似度矩阵代价很大 | **适用于物品数明显小于用户数的场合**，如果物品很多(eg: 网页)， 计算物品相似度矩阵代价很大 |
| **领域**     | 时效性较强，**用户个性化兴趣不太明显的领域** （UserCF适用于新闻推荐） | 长尾物品丰富，**用户个性化需求强烈的领域**                   |
| **实时性**   | 用户有新行为，**不一定造成推荐结果的立即变化**               | 用户有新行为，**一定会导致推荐结果的实时变化**               |
| **冷启动**   | 1. 在新用户对很少的物品产生行为后，不能立即对他进行个性化推荐，因为用户相似度表是每隔一段时间离线计算的<br /><br /> 2. 新物品 上线后一段时间，一旦有用户对产品产生行为，就可以将新物品推荐给和它产生行为的用户兴趣相似的其他用户 | 1. 新用户只要对一个物品产生行为，就可以给他推荐和该物品相关的其他物品<br /><br />2. 但没有办法在不离线更新物品相似度表的情况下将新物品推荐给用户 |
| **推荐理由** | 很难提供令用户信服的推荐解释                                 | 利用用户的历史行为给用户做推荐解释，可以令用户比较信服       |


