Recommender Systems
=====

Most of the code contained within this notebook is from Ch. 22 of *Data Science from Scratch* by J. Grus.



For each user, we have a list of topics of interest.  We'd like to recommend additional topics that are likely to interest each user.

In [1]:
# One list of stated interests per user; a user's id is its position in this list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

# Valid user ids: 0 .. len(users_interests) - 1, used to index users_interests.
user_ids = range(len(users_interests))

For each user, we could recommend the most popular interests that the user has not already stated as an interest.

In [3]:
from collections import defaultdict, Counter

# (interest, count) pairs over every user's interest list, ordered from the
# most frequently listed interest to the least.
popular_interests = Counter(
    topic
    for interests in users_interests
    for topic in interests
).most_common()

def most_popular_new_interests(user_interests, max_results=5):
    """Return up to max_results globally popular interests that are not
    already in user_interests, most popular first.

    The ranking comes from the module-level popular_interests list of
    (interest, count) pairs.
    """
    # Walk the popularity ranking in order, keeping only unfamiliar interests.
    unseen = [name
              for name, _ in popular_interests
              if name not in user_interests]
    return unseen[:max_results]

# Show the global popularity ranking as (interest, count) pairs.
print popular_interests

# For every user, show the 3 most popular interests they have not listed.
for i in user_ids:
    print "User: ", i, most_popular_new_interests(users_interests[i], 3)

[('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3), ('probability', 3), ('HBase', 3), ('Big Data', 3), ('neural networks', 2), ('Hadoop', 2), ('deep learning', 2), ('pandas', 2), ('artificial intelligence', 2), ('libsvm', 2), ('C++', 2), ('Postgres', 2), ('MongoDB', 2), ('scikit-learn', 2), ('machine learning', 2), ('statsmodels', 2), ('Cassandra', 2), ('NoSQL', 1), ('Mahout', 1), ('Storm', 1), ('MySQL', 1), ('programming languages', 1), ('Haskell', 1), ('mathematics', 1), ('Spark', 1), ('numpy', 1), ('theory', 1), ('decision trees', 1), ('MapReduce', 1), ('scipy', 1), ('databases', 1), ('support vector machines', 1)]
User:  0 ['Python', 'R', 'regression']
User:  1 ['Python', 'R', 'Java']
User:  2 ['R', 'Java', 'regression']
User:  3 ['Java', 'HBase', 'Big Data']
User:  4 ['Python', 'R', 'Java']
User:  5 ['regression', 'statistics', 'probability']
User:  6 ['Python', 'R', 'Java']
User:  7 ['Python', 'R', 'Java']
User:  8 ['Python', 'R', 'Java']
User:  9 ['Pytho

In [8]:
#
# user-based filtering
#
import math

def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n.

    Elements are paired with zip, so if the vectors differ in length the
    extra trailing elements of the longer one are ignored.
    """
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors v and w.

    Computed as dot(v, w) / sqrt(dot(v, v) * dot(w, w)).  If either vector
    has zero magnitude (e.g. a user with no interests) the angle is
    undefined; 0.0 is returned in that case instead of raising
    ZeroDivisionError, so such a vector is treated as similar to nothing.
    """
    norm_product = math.sqrt(dot(v, v) * dot(w, w))
    if norm_product == 0:
        # At least one vector is all zeros: no meaningful similarity.
        return 0.0
    return dot(v, w) / norm_product

# Alphabetically sorted list of every distinct interest across all users;
# the set comprehension removes duplicates before sorting.
unique_interests = sorted({topic
                           for interests in users_interests
                           for topic in interests})

def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector aligned with
    unique_interests: position i holds 1 when unique_interests[i] appears
    in user_interests, and 0 when it does not."""
    # int() turns the membership test's True/False into 1/0.
    return [int(topic in user_interests) for topic in unique_interests]

# One 0/1 interest vector per user, in the same order as users_interests.
# Built with a list comprehension rather than map() so the result is a real
# list on every Python version: the matrix is iterated more than once and is
# indexed by user id, which a one-shot map iterator would not support.
user_interest_matrix = [make_user_interest_vector(interests)
                        for interests in users_interests]

# Pairwise user-to-user cosine similarities: user_similarities[i][j] compares
# user i's interest vector with user j's.
user_similarities = [[cosine_similarity(row_vector, column_vector)
                      for column_vector in user_interest_matrix]
                     for row_vector in user_interest_matrix]

def most_similar_users_to(user_id, min_similarity = 0):
    """Return the users most similar to user_id, most similar first.

    The result is a list of (other_user_id, similarity) pairs taken from
    row user_id of user_similarities.  The user itself is excluded, as is
    any user whose similarity is not strictly greater than min_similarity.
    """
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity in
                enumerate(user_similarities[user_id])
             if user_id != other_user_id and similarity > min_similarity]

    # Sort on the similarity (second element of each pair).  An index-based
    # key is used instead of a tuple-unpacking lambda, which is a syntax
    # error on Python 3.
    return sorted(pairs,
                  key=lambda pair: pair[1],
                  reverse=True)

# Display the most similar users for each user, keeping only those whose
# similarity is greater than 0.5
for i in user_ids:
    print "User: ", i, most_similar_users_to(i, 0.5)

User:  0 [(9, 0.5669467095138409)]
User:  1 [(13, 0.6)]
User:  2 []
User:  3 [(10, 0.5163977794943222), (12, 0.5163977794943222)]
User:  4 [(14, 0.5773502691896258)]
User:  5 []
User:  6 []
User:  7 []
User:  8 []
User:  9 [(0, 0.5669467095138409)]
User:  10 [(3, 0.5163977794943222)]
User:  11 []
User:  12 [(3, 0.5163977794943222)]
User:  13 [(1, 0.6)]
User:  14 [(4, 0.5773502691896258)]


To get suggestions for each user, figure out which users are similar and weight each similar user's interests by that user's similarity.  Optionally exclude current interests.  

In [10]:
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for user_id based on what similar users like.

    Every interest of every user returned by most_similar_users_to(user_id)
    is credited with that user's similarity; the credits are summed per
    interest.  Returns a list of (interest, weight) pairs sorted by weight,
    highest first.  Unless include_current_interests is true, interests
    already listed in users_interests[user_id] are left out.
    """
    # sum up the similarities
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity

    # convert them to a list sorted by weight (second element of each pair);
    # an index-based key avoids the tuple-unpacking lambda, which is a syntax
    # error on Python 3
    suggestions = sorted(suggestions.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    # and (maybe) exclude interests the user already has
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]

# For every user, show their current interests followed by the user-based
# suggestions (current interests excluded by default).
for i in user_ids:
    print "User: ", i
    print "Current Interests: ", users_interests[i]
    print user_based_suggestions(i)
    print "---"

 User:  0
Current Interests:  ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
[('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066), ('neural networks', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('artificial intelligence', 0.1889822365046136), ('databases', 0.1690308509457033), ('MySQL', 0.1690308509457033), ('programming languages', 0.1543033499620919), ('Python', 0.1543033499620919), ('Haskell', 0.1543033499620919), ('C++', 0.1543033499620919), ('R', 0.1543033499620919)]
---
User:  1
Current Interests:  ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('databases', 0.6), ('MySQL', 0.6), ('Java', 0.3380617018914066), ('Big Data', 0.3380617018914066), ('Hadoop', 0.3380617018914066), ('Storm', 0.3380617018914066), ('Spark', 0.3380617018914066)]
---
User:  2
Current Interests:  ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
[('R', 1.05634763368

Now determine the similarity between items and then make recommendations based on similar items.

In [None]:
#
# Item-Based Collaborative Filtering
#

# Transpose of user_interest_matrix: row j describes interest j, and its
# i-th entry is user i's 0/1 flag for that interest.
interest_user_matrix = [[user_vector[interest_index]
                         for user_vector in user_interest_matrix]
                        for interest_index in range(len(unique_interests))]

# Pairwise interest-to-interest cosine similarities:
# interest_similarities[i][j] compares the user vectors of interests i and j.
interest_similarities = [[cosine_similarity(row_vector, column_vector)
                          for column_vector in interest_user_matrix]
                         for row_vector in interest_user_matrix]

def most_similar_interests_to(interest_id):
    """Return the interests most similar to the one at index interest_id.

    The result is a list of (interest_name, similarity) pairs taken from row
    interest_id of interest_similarities, sorted by similarity with the
    highest first.  The interest itself and any interest whose similarity
    is not greater than 0 are excluded.
    """
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    # Sort on the similarity (second element of each pair).  An index-based
    # key is used instead of a tuple-unpacking lambda, which is a syntax
    # error on Python 3.
    return sorted(pairs,
                  key=lambda pair: pair[1],
                  reverse=True)

def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for user_id based on item-to-item similarity.

    For each interest the user has (a 1 in their row of
    user_interest_matrix), every interest returned by
    most_similar_interests_to is credited with its similarity; the credits
    are summed per interest.  Returns a list of (interest, weight) pairs
    sorted by weight, highest first.  Unless include_current_interests is
    true, interests already listed in users_interests[user_id] are left out.
    """
    # add up the similarities of interests similar to the user's own
    suggestions = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                suggestions[interest] += similarity

    # sort by weight (second element of each pair); an index-based key avoids
    # the tuple-unpacking lambda, which is a syntax error on Python 3
    suggestions = sorted(suggestions.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight)
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]



Compare the three sets of recommendations.

In [None]:
for i in user_ids:
    print "User:", i

    print "-----"