In [1]:
# NOTE(review): %pylab star-imports numpy and matplotlib into the interactive
# namespace (see the "Populating the interactive namespace" message printed
# below); nothing in the visible code uses either library. Per the IPython
# docs this can shadow builtins such as `sum` -- TODO confirm and consider
# replacing it with explicit imports.
%pylab

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [6]:
# Two kinds of recommender systems
# Possible next step: improve further by incorporating topic classification of the terms
import math
from collections import Counter, defaultdict
def dot(v, w):
    """Return the inner product of two numeric sequences.

    Elements are paired positionally with ``zip``, so any surplus tail of the
    longer sequence is ignored.
    """
    total = 0
    for v_i, w_i in zip(v, w):
        total += v_i * w_i
    return total

# dataset: one list of interest names per user. A user's id is their position
# in this list (the functions below index it with that integer).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

# (interest, count) pairs across all users, most frequent first.
popular_interests = Counter(
    interest
    for interests in users_interests
    for interest in interests
).most_common()

def freq_new_recommen(user, num_of_rec=5):
    """Suggest the most popular interests that ``user`` does not already have.

    Args:
        user: collection of interest names the user already has (the
            interests themselves, not a user index).
        num_of_rec: maximum number of suggestions to return.

    Returns:
        List of ``(interest, frequency)`` tuples, kept in the order in which
        they appear in ``popular_interests``.
    """
    unseen = []
    for interest, frequency in popular_interests:
        if interest in user:
            continue
        unseen.append((interest, frequency))
    return unseen[:num_of_rec]
# freq_new_recommen(users_interests[0])

# user-based recommendation
def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    If either vector has zero magnitude (for example a user with no
    interests at all) the angle is undefined; 0.0 is returned instead of
    raising ``ZeroDivisionError``, so such a vector is treated as similar to
    nothing.
    """
    norm_product = dot(v, v) * dot(w, w)
    if norm_product == 0:
        # At least one all-zero vector: avoid dividing by zero.
        return 0.0
    return dot(v, w) / math.sqrt(norm_product)

# Users with no identical interests will have similarity 0
# Sorted list of every distinct interest; an interest's position here is its
# column index in the 0/1 interest vectors.
unique_interests = sorted({interest
                           for interests in users_interests
                           for interest in interests})
# unique_interests

# Build the user-interest (item) matrix
# -----------------------------------
#         item_0 item_1 item_2 ...
# user_0    1     0       1    ...
# user_1    0     1       0    ...
# -----------------------------------
def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector aligned with unique_interests.

    Position i of the result is 1 when unique_interests[i] occurs in
    ``user_interests`` and 0 when it does not.
    """
    vector = []
    for interest in unique_interests:
        vector.append(1 if interest in user_interests else 0)
    return vector

# One 0/1 interest vector per user, in the same order as users_interests.
user_interest_matrix = [make_user_interest_vector(interests)
                        for interests in users_interests]

# User-user similarity matrix: entry [i][j] is the cosine similarity between
# the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_i, row_j) for row_j in user_interest_matrix]
    for row_i in user_interest_matrix
]
# user_similarities[0][0]
# user_similarities[0][1]

# Find the users most similar to `user`
def most_similar_user(user):
    """Rank the other users by their similarity to ``user``.

    Args:
        user: integer index of the user (selects a row of
            ``user_similarities``).

    Returns:
        List of ``(other_user_index, similarity)`` tuples sorted by
        similarity, highest first. The user themself and any user whose
        similarity is not greater than 0 are left out.
    """
    ranked = []
    for other, simi in enumerate(user_similarities[user]):
        if other == user or not simi > 0:
            continue
        ranked.append((other, simi))
    # Order by similarity, descending.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
# most_similar_user(0)

# Recommendation based on user similarity; by default only interests that are
# new to the user are recommended.
def user_based_recommendation(user, include_familiar = False):
    """Recommend interests to ``user`` drawn from similar users.

    Each interest held by a similar user is weighted by the sum of the
    similarities of all the similar users who hold it.

    Args:
        user: integer index of the user.
        include_familiar: when true, interests the user already has are kept
            in the result; otherwise they are filtered out.

    Returns:
        List of ``(interest, weight)`` tuples, heaviest first.
    """
    weights = {}
    for other, simi in most_similar_user(user):
        for interest in users_interests[other]:
            weights[interest] = weights.get(interest, 0.0) + simi

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_familiar:
        return ranked
    return [(interest, weight)
            for interest, weight in ranked
            if interest not in users_interests[user]]
    
# user_based_recommendation(0)


# ------------------ Item-based recommender system ---------------------
# items - users
# Transpose of the user-interest matrix:
# row j records which users are interested in item j.
interest_user_matrix = [[row[j] for row in user_interest_matrix]
                        for j in range(len(unique_interests))]
# interest_user_matrix    
# Item-item similarity: cosine similarity between every pair of rows of
# interest_user_matrix.
interest_similarities = [
    [cosine_similarity(item_row_i, item_row_j) for item_row_j in interest_user_matrix]
    for item_row_i in interest_user_matrix
]

# Find the items most similar to the given item
def most_similar_item(item):
    """Rank the other items by their similarity to ``item``.

    Args:
        item: integer index of the item (selects a row of
            ``interest_similarities``).

    Returns:
        List of ``(interest_name, similarity)`` tuples sorted by similarity,
        highest first. The item itself and items whose similarity is not
        greater than 0 are left out.
    """
    ranked = []
    for other_item, similarity in enumerate(interest_similarities[item]):
        if other_item != item and similarity > 0:
            ranked.append((unique_interests[other_item], similarity))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# most_similar_item(0)

# Recommendation based on similar items
def item_based_suggestions(user, include_familiar = False):
    """Recommend interests to ``user`` based on item-item similarity.

    For every item the user is interested in, the items similar to it are
    looked up and their similarities are summed per interest.

    Args:
        user: integer index of the user.
        include_familiar: when true, interests the user already has are kept
            in the result; otherwise they are filtered out.

    Returns:
        List of ``(interest, weight)`` tuples, heaviest first.
    """
    weights = {}
    for item, is_interested in enumerate(user_interest_matrix[user]):
        if not is_interested:
            continue
        # Accumulate the similarity of every item similar to this one.
        for interest, simi in most_similar_item(item):
            weights[interest] = weights.get(interest, 0.0) + simi

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_familiar:
        return ranked
    return [(interest, weight)
            for interest, weight in ranked
            if interest not in users_interests[user]]

# Demo: item-based suggestions for user 0; the value of this last expression
# is what the cell displays.
item_based_suggestions(0)
    

[('MapReduce', 1.861807319565799),
 ('MongoDB', 1.3164965809277263),
 ('Postgres', 1.3164965809277263),
 ('NoSQL', 1.2844570503761732),
 ('MySQL', 0.5773502691896258),
 ('databases', 0.5773502691896258),
 ('Haskell', 0.5773502691896258),
 ('programming languages', 0.5773502691896258),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('C++', 0.4082482904638631),
 ('Python', 0.2886751345948129),
 ('R', 0.2886751345948129)]