Recommender System
=====

**Objective**: Implement a recommender system using collaborative filtering.

In [None]:
MY_NAME = "Chris Phillips" # <-- Your name here

1) Complete the implementation to add item-based collaborative filtering.  Display predictions for each user by user-based and item-based recommendations.

In [11]:
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
    ["R"]
]

user_ids = range(len(users_interests))

import math
from collections import defaultdict, Counter

def dot(v, w):
    """v_1 * w_1 + ... + v_n * w_n"""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v, w):
    return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w))

# Get a list of all interests, remove duplicates
unique_interests = sorted(list({ interest 
                                 for user_interests in users_interests
                                 for interest in user_interests }))

def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose i-th element is 1
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0
            for interest in unique_interests]

# For each user's interest, apply the 'make_user_interest_vector' to create a matrix 
user_interest_matrix = map(make_user_interest_vector, users_interests)


# For each user, determine the similarity with the other users
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]

### User-based Collaborative Filtering ###
def most_similar_users_to(user_id, min_similarity = 0):
    pairs = [(other_user_id, similarity)                      # find other
             for other_user_id, similarity in                 # users with
                enumerate(user_similarities[user_id])         # nonzero 
             if user_id != other_user_id and similarity > min_similarity]  # similarity

    return sorted(pairs,                                      # sort them
                  key=lambda (_, similarity): similarity,     # most similar
                  reverse=True)    

def user_based_suggestions(user_id, include_current_interests=False):
    # sum up the similarities
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity

    # convert them to a sorted list
    suggestions = sorted(suggestions.items(),
                         key=lambda (_, weight): weight,
                         reverse=True)

    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]

### Item-based Collaborative Filtering ###
    
interest_user_matrix = [[user_interest_vector[j]
                         for user_interest_vector in user_interest_matrix]
                        for j, _ in enumerate(unique_interests)]

interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j)
                          for user_vector_j in interest_user_matrix]
                         for user_vector_i in interest_user_matrix]
    
    
# Implement item_based_suggestions and most_similiar_items_to
def most_similar_interests_to(interest_id):
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    return sorted(pairs,
                  key=lambda (_, similarity): similarity,
                  reverse=True)

def item_based_suggestions(user_id, include_current_interests=False):
    suggestions = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                suggestions[interest] += similarity

    suggestions = sorted(suggestions.items(),
                         key=lambda (_, similarity): similarity,
                         reverse=True)

    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]


# Save suggestions here
suggestions_cosine_user_based = map(user_based_suggestions, user_ids)
suggestions_cosine_item_based = map(item_based_suggestions, user_ids)
    
# Display current iterests and top 4 user- and item-based recommendations
print "---"
for i in user_ids:
    print "User:", i
    print "Current Interests:", users_interests[i]
    print "User-based recommendations:", user_based_suggestions(i, False)[:4]
    print "Item-based recommendations:", item_based_suggestions(i, False)[:4]
    print "User-based recommendations:", suggestions_cosine_user_based[i][:4]
    print "Item-based recommendations:", suggestions_cosine_item_based[i][:4]
    print "---"
    
    

---
User: 0
Current Interests: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
User-based recommendations: [('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066)]
Item-based recommendations: [('MapReduce', 1.861807319565799), ('Postgres', 1.3164965809277263), ('MongoDB', 1.3164965809277263), ('NoSQL', 1.2844570503761732)]
User-based recommendations: [('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066)]
Item-based recommendations: [('MapReduce', 1.861807319565799), ('Postgres', 1.3164965809277263), ('MongoDB', 1.3164965809277263), ('NoSQL', 1.2844570503761732)]
---
User: 1
Current Interests: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
User-based recommendations: [('databases', 0.6), ('MySQL', 0.6), ('Java', 0.3380617018914066), ('Big Data', 0.3380617018914066)]
Item-based recommendations: [('databases', 1.

2) Now go back and add an additional user that has an interest of 'R' only.  Compare the recommendations for this new user.  

i) Are they reasonable?  

The results for these recommendations seem to be reasonable, but more relevant with the user-based recommendations. Item-based recommendations seem to be a bit more lacking with one recommendation for Haskell, simply because there was only one peice of data to predict on.

ii) Speaking of 'reasonable', how would you quantity the 'reasonableness' of the recommendations (i.e, how could you measure success)?

Measuring success would involve determining how likely the user is to actually use the recommendation provided. In terms of actually quantifying this, we look at the similarity score. We see that the user-based recommendations have significantly higher similarity scores than the item-based recommendations, and this intuitively makes sense. These recommendations seemed more reasonable overall.  


3) Add a method to calculate the manhattan_similarity then redo the recommendations.  Compare with the previous user-based suggestions made with cosine similiarity.  

iii) Do you notice a difference?  If so, what is it?

Yes, it appears that the manhattan similarity tends to recommend less relevant interests for the particular user. For example, with user 3, their interests seem to be data science and statistics. The manhattan recommendation seems to be recommending more general mathematical topics. The cosine recommendation recommends things specific to this user, like pandas and statsmodels.

In [13]:
# Update the similarities making use of manhatten distance

def manhattan_similarity(v,w):
    return sum(abs(a-b) for a,b in zip(v,w))

# RECALCULATE USER SIMILARITY
user_similarities = [[manhattan_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]

# Display current interests and top 4 new user-based recommendations and the prior user- and item-based recommendations
#  You will probably need to go back and save the suggestions_cosine_user_based 
print "---"
for i in user_ids:
    print "User:", i
    print "Current Interests:", users_interests[i]
    # print new manhattan based similarity with cosine based similarity
    print "User-based recommendations (manhattan):", user_based_suggestions(i)[:4]
    print "Item-based recommendations (manhattan):", item_based_suggestions(i)[:4]
    print "User-based recommendations (cosine):", suggestions_cosine_user_based[i][:4]
    print "Item-based recommendations (cosine):", suggestions_cosine_item_based[i][:4]
    print "---"  


---
User: 0
Current Interests: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
User-based recommendations (manhattan): [('R', 51.0), ('Python', 46.0), ('probability', 34.0), ('regression', 33.0)]
Item-based recommendations (manhattan): [('MapReduce', 1.861807319565799), ('Postgres', 1.3164965809277263), ('MongoDB', 1.3164965809277263), ('NoSQL', 1.2844570503761732)]
User-based recommendations (cosine): [('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066)]
Item-based recommendations (cosine): [('MapReduce', 1.861807319565799), ('Postgres', 1.3164965809277263), ('MongoDB', 1.3164965809277263), ('NoSQL', 1.2844570503761732)]
---
User: 1
Current Interests: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
User-based recommendations (manhattan): [('R', 43.0), ('Python', 40.0), ('Java', 28.0), ('probability', 28.0)]
Item-based recommendations (manhattan): [('databases', 1.9915638315627207)

4) Create an item-based recommender system using the movie data from [Movielens](https://grouplens.org/datasets/movielens/latest/).  The `ratings.csv` file contains four columns with a header.  The columns are userId, movieId, rating, and timestamp.  The `movies.csv` file contains three columns with a header.  The columns are movieId, title and genres.  

In [14]:
from zipfile import ZipFile
import numpy as np
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
from collections import defaultdict

N = 7500

resp = urlopen("http://files.grouplens.org/datasets/movielens/ml-latest-small.zip")
zipObj = ZipFile(StringIO(resp.read()))

ratings_file = zipObj.open('ml-latest-small/ratings.csv')
ratings = np.genfromtxt(ratings_file, delimiter=',', skip_header=1)

# Pull off just the userId and MovieId and convert to integers.  
ratings = np.matrix(ratings[:N,:2], dtype=np.int)
ratings = [(str(user_id), str(movie_id))  
           for (user_id, movie_id) in ratings.tolist()]

user_ids = list(set([user_id for (user_id, _) in ratings]))
movie_ids =list(set([movie_id for (_, movie_id) in ratings]))
                      
# Create a map for the titles
movie_file = zipObj.open('ml-latest-small/movies.csv')
moviesByID = dict()
for line in movie_file:
    fields = line.split(',')
    id, title = fields[0], fields[1]

    moviesByID[id] = title
    


Attempt to reuse code from previous example.  Using the movie data, create a list of movie titles by user.

In [15]:
# Reformat data to mirror the users_interests from the first example. 
# i.e., a list of lists with each nest list being the movie titles.
interestByUserID = defaultdict(list)

for (user_id, movie_id) in ratings:
    interestByUserID[user_id].append(moviesByID[movie_id])

users_interests = [interests for interests in interestByUserID.values()] 


Make at most 10 recommendations for each user that has rated less than 25 movies.   

In [19]:
# Create a user-item matrix
# For each user's interest, apply the 'make_user_interest_vector' to create a matrix 
# Get a list of all interests, remove duplicates
unique_interests = sorted(list({ interest 
                                 for user_interests in users_interests
                                 for interest in user_interests }))

def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose i-th element is 1
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0
            for interest in unique_interests]

user_interest_matrix = map(make_user_interest_vector, users_interests)

# For each user, determine the similarity with the other users
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]

for i in range(len(users_interests)):
    if len(users_interests[i]) < 25:
        print "User:", i, 
        print "Current Interests:", users_interests[i][:3]
        print "Suggestsions:", user_based_suggestions(i)[:3]
        print "---"

User: 3 Current Interests: ['Braveheart (1995)', '"Shawshank Redemption', 'Forrest Gump (1994)']
Suggestsions: [('Pulp Fiction (1994)', 2.82554758663331), ('"Silence of the Lambs', 2.442635000154732), ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 2.1845457766952117)]
---
User: 6 Current Interests: ['GoldenEye (1995)', 'Babe (1995)', 'Seven (a.k.a. Se7en) (1995)']
Suggestsions: [('"Shawshank Redemption', 4.961645843525836), ('Dances with Wolves (1990)', 4.361511105142404), ('Braveheart (1995)', 4.061392381422561)]
---
User: 44 Current Interests: ['Apollo 13 (1995)', 'Batman Forever (1995)', 'Die Hard: With a Vengeance (1995)']
Suggestsions: [('Braveheart (1995)', 4.0917399653900715), ('"Usual Suspects', 3.7295069125148497), ('"Mask', 3.3494269502184624)]
---
User: 46 Current Interests: ['"American President', 'Get Shorty (1995)', 'Clueless (1995)']
Suggestsions: [('Pulp Fiction (1994)', 3.6953595126157155), ('"Shawshank Redemption', 3.571262451794194

iv) Summarize the process the that is used to make user-based recommendations with collaborative filtering.  

The CF process makes recommendations by looking at other users, who have similar tastes, and looks for anything that the current user hasn't seen. The more similar users that have seen the movie, the higher the weight of the recommendation.