In [42]:
from ch22_recommender import cosine_similarity

In [43]:
users_interests = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]

# Recommending What’s Popular

One easy approach is to simply recommend what’s popular

In [44]:
from collections import Counter

# initialize
overall_ranking = Counter()

# interate through list of lists
for l in users_interests:
    partial_ranking = Counter(l)
    overall_ranking = overall_ranking + partial_ranking

print(overall_ranking)

# print the n most common subjects and their counts
for key, value in overall_ranking.most_common(6):
    print(key, value)
    

Counter({'Python': 4, 'R': 4, 'Big Data': 3, 'HBase': 3, 'Java': 3, 'statistics': 3, 'regression': 3, 'probability': 3, 'Hadoop': 2, 'Cassandra': 2, 'MongoDB': 2, 'Postgres': 2, 'scikit-learn': 2, 'statsmodels': 2, 'pandas': 2, 'machine learning': 2, 'libsvm': 2, 'C++': 2, 'neural networks': 2, 'deep learning': 2, 'artificial intelligence': 2, 'Spark': 1, 'Storm': 1, 'NoSQL': 1, 'scipy': 1, 'numpy': 1, 'decision trees': 1, 'Haskell': 1, 'programming languages': 1, 'mathematics': 1, 'theory': 1, 'Mahout': 1, 'MapReduce': 1, 'databases': 1, 'MySQL': 1, 'support vector machines': 1})
Python 4
R 4
Big Data 3
HBase 3
Java 3
statistics 3


In [45]:
def most_popular_new_interests(user_interests, max_results=5):
    recommended_new_interests = [interest for interest in overall_ranking if interest not in user_interests]
    recommended_new_interests = recommended_new_interests[:max_results]
    return recommended_new_interests

suggestions = most_popular_new_interests(users_interests[1], 5)
print('User ', 1, ' likes: ', users_interests[1], ' and we will be recommending: ', suggestions)

suggestions = most_popular_new_interests(users_interests[3], 5)
print('User ', 3, ' likes: ', users_interests[3], ' and we will be recommending: ', suggestions)

User  1  likes:  ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']  and we will be recommending:  ['Hadoop', 'Big Data', 'Java', 'Spark', 'Storm']
User  3  likes:  ['R', 'Python', 'statistics', 'regression', 'probability']  and we will be recommending:  ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark']


# User-Based Collaborative Filtering

One way of taking a user’s interests into account is to look for users who are somehow similar to him, and then suggest the things that those users are interested in.

In [55]:
from typing import Dict, List, Tuple

# list of unique interests
unique_interests = sorted(list(overall_ranking.keys()))
#print(unique_interests)


def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose ith element is 1
    if unique_interests[i] is in the list, 0 otherwise"""
    user_vector = [(1 if interest in user_interests else 0)for interest in unique_interests]
    return user_vector

user_vector = make_user_interest_vector(users_interests[1])
#print(user_vector)


user_interest_matrix = [make_user_interest_vector(user_interests) for user_interests in users_interests]

user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
                     for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]

#print(user_similarities)
#print(type(user_similarities))

def most_similar_users_to(user_id: int) -> List[Tuple[int, float]]:
    pairs = [(other_user_id, similarity)                      # Find other
             for other_user_id, similarity in                 # users with
                enumerate(user_similarities[user_id])         # nonzero
             if user_id != other_user_id and similarity > 0]  # similarity.

    return sorted(pairs,                                      # Sort them
                  key=lambda pair: pair[-1],                  # most similar
                  reverse=True)                               # first.


most_similar_to_zero = most_similar_users_to(0)
print(most_similar_to_zero)
user, score = most_similar_to_zero[0]
assert user == 9
assert 0.56 < score < 0.57
user, score = most_similar_to_zero[1]
assert user == 1
assert 0.33 < score < 0.34

[(9, 0.5669467095138409), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]
