In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
%matplotlib inline

In [2]:
# Load the cleaned movie and review tables, discarding the 'Unnamed: 0' column
# (presumably a row index written out when the CSVs were saved - TODO confirm)
movies = pd.read_csv('./data/movies_clean.csv').drop(columns='Unnamed: 0')
reviews = pd.read_csv('./data/reviews_clean.csv').drop(columns='Unnamed: 0')

# Peek at the first few review rows
print(reviews.head())

   user_id  movie_id  rating   timestamp                 date  month_1  \
0        1     68646      10  1381620027  2013-10-12 23:20:27        0   
1        1    113277      10  1379466669  2013-09-18 01:11:09        0   
2        2    422720       8  1412178746  2014-10-01 15:52:26        0   
3        2    454876       8  1394818630  2014-03-14 17:37:10        0   
4        2    790636       7  1389963947  2014-01-17 13:05:47        0   

   month_2  month_3  month_4  month_5  ...  month_9  month_10  month_11  \
0        0        0        0        0  ...        0         1         0   
1        0        0        0        0  ...        0         0         0   
2        0        0        0        0  ...        0         1         0   
3        0        0        0        0  ...        0         0         0   
4        0        0        0        0  ...        0         0         0   

   month_12  year_2013  year_2014  year_2015  year_2016  year_2017  year_2018  
0         0          1  

### Measures of Similarity

When using **neighborhood** based collaborative filtering, it is important to understand how to measure the similarity of users or items to one another.  

There are a number of ways in which we might measure the similarity between two vectors (which might be two users or two items).  In this notebook, we will look specifically at two measures used to compare vectors:

* **Pearson's correlation coefficient**

Pearson's correlation coefficient is a measure of the strength and direction of a linear relationship. The coefficient takes a value between -1 and 1, where -1 indicates a perfect negative linear relationship, 1 indicates a perfect positive linear relationship, and values near 0 indicate little or no linear relationship.

If we have two vectors x and y, we can define the correlation between the vectors as:


$$CORR(x, y) = \frac{\text{COV}(x, y)}{\text{STDEV}(x)\text{ }\text{STDEV}(y)}$$

where 

$$\text{STDEV}(x) = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2}$$

and 

$$\text{COV}(x, y) = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})$$

where $n$ is the length of the vectors (which must be the same for both $x$ and $y$), and $\bar{x}$ and $\bar{y}$ are the means of the observations in $x$ and $y$, respectively.

We can use the correlation coefficient to indicate how alike two vectors are to one another, where the closer to 1 the coefficient, the more alike the vectors are to one another.  There are some potential downsides to using this metric as a measure of similarity.  You will see some of these throughout this workbook.


* **Euclidean distance**

Euclidean distance is a measure of the straight-line distance from one vector to another.  Because this is a measure of distance, larger values are an indication that two vectors are different from one another (unlike Pearson's correlation coefficient, where larger values indicate greater similarity).

Specifically, the euclidean distance between two vectors x and y is measured as:

$$ \text{EUCL}(x, y) = \sqrt{\sum_{i=1}^{n}(x_i - y_i)^2}$$

Unlike the correlation coefficient, which divides by the standard deviations of the two vectors, Euclidean distance performs no scaling.  Therefore, you need to make sure all of your data are on the same scale when using this metric.

**Note:** Because measuring similarity is often based on looking at the distance between vectors, it is important in these cases to scale your data or to have all data be in the same scale.  If some measures are on a 5 point scale, while others are on a 100 point scale, you are likely to have non-optimal results due to the difference in variability of your features.  In this case, we will not need to scale data because they are all on a 10 point scale, but it is always something to keep in mind!

## **User Based Collaborative Filtering**

In [3]:
# keep only the three columns needed to build the user-by-movie ratings
rating_columns = ['user_id', 'movie_id', 'rating']
user_items = reviews.loc[:, rating_columns]
user_items.head()

Unnamed: 0,user_id,movie_id,rating
0,1,68646,10
1,1,113277,10
2,2,422720,8
3,2,454876,8
4,2,790636,7


In [6]:
# create user-by-item matrix
# Rows are user_ids and columns are movie_ids. groupby(...).max() keeps the highest
# rating when the same (user_id, movie_id) pair occurs more than once, and unstack()
# pivots movie_id into columns, leaving NaN for pairs with no rating.
# NOTE(review): the recorded output of this cell is a MemoryError (a boolean array of
# 1,686,230,160 elements could not be allocated), so on that machine user_by_movie was
# never created and every later cell that reads it cannot run.
user_by_movie = user_items.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

MemoryError: Unable to allocate array with shape (1686230160,) and data type bool

In [8]:
# create a dictionary with users and corresponding movies seen
def movies_watched(user_id):
    ''' Looks up which movies a user has rated in the user-by-movie matrix

        Args:
            user_id - the user_id of an individual as int

        Returns:
            movies - an array of movie_ids the user has watched (i.e. the columns
                     of user_by_movie that hold a rating for this user)
    '''

    # The matrix is named user_by_movie; the original referenced the undefined
    # name user_by_movies and raised a NameError.
    user_ratings = user_by_movie.loc[user_id]

    # a movie counts as watched when the user's rating for it is not missing
    movies = user_ratings[user_ratings.notnull()].index.values

    return movies


def create_user_movie_dict():
    ''' Creates the movies_seen dictionary

        Args:
            None

        Returns:
            movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    '''

    # user_by_movie is built with groupby(['user_id', 'movie_id']) + unstack(), so the
    # user ids live in its index; there is no 'user_id' column to select (the original
    # user_by_movie['user_id'] lookup raises a KeyError).
    user_ids = user_by_movie.index.unique()

    movies_seen = {user_id: movies_watched(user_id) for user_id in user_ids}

    return movies_seen


# remove individuals who have watched 2 or fewer movies - not enough data to make recos
def create_movies_to_analyze(movies_seen, lower_bound=2):
    ''' Filters movies_seen down to the users with enough watched movies

        Args:
            movies_seen - a dictionary where each key is a user_id and
                          the value is an array of movie_ids
            lower_bound - (int) a user is kept only when the number of movies
                          they have seen is strictly greater than this value

        Returns:
            movies_to_analyze - a dictionary with the same user_id -> movie_ids
                          entries as movies_seen, minus the users at or below
                          the lower bound
    '''

    return {
        user_id: watched
        for user_id, watched in movies_seen.items()
        if len(watched) > lower_bound
    }

In [10]:
def compute_correlation(user1, user2):
    ''' Pearson correlation of two users' ratings over the movies both have rated

        Args:
            user1 - (int) user_id
            user2 - (int) user_id

        Returns:
            corr - correlation between matching rating between two users
                   (NaN when pandas cannot compute it, e.g. with fewer than two
                   shared movies or when one user's shared ratings are constant)
    '''

    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]

    # movies rated by both users
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)

    # Fixes: the matrix is user_by_movie (not user_by_movies); the rows are selected
    # with a list so the two ids are always read as two separate row labels.
    df = user_by_movie.loc[[user1, user2], sim_movs]

    # transpose so each user is a column, then read the off-diagonal entry of the
    # 2x2 correlation matrix (.iloc, not the misspelled .ilos)
    corr = df.transpose().corr().iloc[0, 1]

    return corr


def compute_euclidean_dist(user1, user2):
    ''' Euclidean distance between two users' ratings over the movies both have rated

        Args:
            user1: (int) user_id
            user2: (int) user_id

        Returns:
            dist - euclidean distance between user1 and user2
    '''

    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]

    # movies rated by both users
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)

    # Fixes: the matrix is user_by_movie (not user_by_movies); the rows are selected
    # with a list so the two ids are always read as two separate row labels.
    df = user_by_movie.loc[[user1, user2], sim_movs]

    # length of the difference vector between the two users' shared ratings
    dist = np.linalg.norm(df.loc[user1] - df.loc[user2])

    return dist

In [11]:
def find_closest_neighbors(user):
    ''' Ranks the other users by their euclidean distance to the given user

        Args:
            user - (int) the user_id of the individual you want to find
                   the closest users for

        Returns:
            closest_neighbors - (array) id's of the users sorted from closest to farthest away
    '''

    # DataFrame.sort(by=...) was removed from pandas; sort_values is the replacement.
    user_dists = df_dists[df_dists['user1'] == user].sort_values(by='eucl_dist', ascending=True)

    # Drop the user's own row by id instead of by position (the original used iloc[1:]).
    # The positional drop discards a real neighbor whenever the self-pair is absent or
    # is not sorted first, e.g. when another user is also at distance 0.
    closest_users = user_dists[user_dists['user2'] != user]['user2']
    closest_neighbors = np.array(closest_users)

    return closest_neighbors


def movies_liked(user_id, min_rating=7):
    ''' Finds the movies a user rated at or above a "like" threshold

        Args:
            user_id - (int) user_id of an individual
            min_rating - (number) the lowest rating that still counts as a
                "like" rather than a "dislike"

        Returns:
            movies_liked - (array) movie_ids the user has watched and liked
    '''

    # rows of user_items for this user whose rating reaches the threshold
    liked_rows = user_items.query("user_id == @user_id and rating >= @min_rating")
    liked_ids = liked_rows['movie_id']

    return np.array(liked_ids)
    

def movie_names(movie_ids):
    ''' Translates movie ids into movie titles

        Args:
            movie_ids - (list) movie_ids

        Returns:
            movies - (array) titles from the movies table whose movie_id is in
                movie_ids, in the order the rows appear in that table
    '''

    # flag the rows of the movies table that match one of the requested ids
    id_matches = movies['movie_id'].isin(movie_ids)
    matching_titles = movies[id_matches]['movie']

    return np.array(matching_titles)


def make_recommendations(user, num_recs=10):
    ''' Recommends movies liked by the user's closest neighbors

        Args:
            user - (int) user_id of the individual you want to make recommendations for
            num_recs - (int) number of movies to return

        Returns:
            recommendations - (array) movie titles - at most "num_recs" of them; fewer
                (possibly none) when the neighbors do not supply enough unseen movies
    '''

    movies_seen = movies_watched(user)
    closest_neighbors = find_closest_neighbors(user)

    # Ids are collected in a plain list, neighbor by neighbor. The original accumulated
    # into a float array (np.array([])), which turned ids into floats, and re-sorted with
    # np.unique on every pass, which discarded the closest-neighbor-first order.
    rec_ids = []
    excluded = set(movies_seen)  # already watched, or already recommended
    for neighbor in closest_neighbors:
        # explicit membership test instead of setdiff1d(assume_unique=True), which is
        # only valid when neither input contains repeated ids
        for movie_id in movies_liked(neighbor):
            if movie_id not in excluded:
                excluded.add(movie_id)
                rec_ids.append(movie_id)

        if len(rec_ids) >= num_recs:
            break

    # cap at num_recs as documented; the original could return more than requested
    recommendations = movie_names(rec_ids[:num_recs])

    return recommendations


def all_recommendations(num_recs=10):
    ''' Builds recommendations for every user that appears in the distance table

        Args:
            num_recs - (int) the (max) number of recommendations for each user

        Returns:
            all_recs - a dictionary where each key is a user_id and the value is an
                array of recommended movie titles
    '''

    # Fixes: "np,unique" (comma instead of dot) and "df_dist"; the distance table read
    # by find_closest_neighbors is named df_dists.
    users = np.unique(df_dists['user1'])

    all_recs = {user: make_recommendations(user, num_recs) for user in users}

    return all_recs

## **Content Based Recommendations**

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from IPython.display import HTML
# import progressbar
import pickle

%matplotlib inline

In [15]:
# read in the datasets
movies = pd.read_csv('./data/movies_clean.csv')
reviews = pd.read_csv('./data/reviews_clean.csv')

# drop the 'Unnamed: 0' column from both tables
del movies['Unnamed: 0']
del reviews['Unnamed: 0']

# load the pickled all_recs object (used below as a dict of user -> recommendations);
# the context manager closes the file handle, which the original left open
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source
with open('./data/all_recs.p', 'rb') as recs_file:
    all_recs = pickle.load(recs_file)

In [16]:
# user ids whose entry in all_recs already holds a full set of recommendations (>=10)
users_with_all_recs = np.array(
    [user for user, movies_lst in all_recs.items() if len(movies_lst) >= 10]
)

# user ids whose entry in all_recs is still short and who need more recommendations
users_who_need_recs = np.array(
    [user for user, movies_lst in all_recs.items() if not len(movies_lst) >= 10]
)

In [17]:
# A quick test: the number of users who already have a full set of recommendations
# must equal the expected count, otherwise this cell stops with an AssertionError.
# NOTE(review): the 31781 figure in the message below is hard-coded text; this cell
# does not compare it against len(users_who_need_recs).
assert len(users_with_all_recs) == 22187
print("That's right there were still another 31781 users who needed recommendations when we only used collaborative filtering!")

That's right there were still another 31781 users who needed recommendations when we only used collaborative filtering!


In [18]:
# same rows as reviews, grouped by user with each user's highest ratings first
ranked_reviews = reviews.sort_values(
    ['user_id', 'rating'],
    ascending=[True, False],
)
ranked_reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,date,month_1,month_2,month_3,month_4,month_5,...,month_9,month_10,month_11,month_12,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
0,1,68646,10,1381620027,2013-10-12 23:20:27,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,1,113277,10,1379466669,2013-09-18 01:11:09,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
16,2,1798709,10,1389948338,2014-01-17 08:45:38,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
18,2,2024544,10,1389355949,2014-01-10 12:12:29,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
22,2,2726560,9,1438293988,2015-07-30 22:06:28,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
# subset so movie_content is only using the dummy variables for each genre
# and the 3 century based year dummy columns
# (every column of movies from position 4 onward is taken; this assumes the first
# four columns are the non-feature columns - TODO confirm against the CSV)
movie_content = np.array(movies.iloc[:, 4:])

# take the dot product to obtain a movie x movie matrix of similarities
# NOTE(review): the recorded output of this cell is a ValueError ("array is too big"),
# so on that machine dot_prod_movies was never created and find_similar_movies,
# which reads it, cannot run.
dot_prod_movies = movie_content.dot(np.transpose(movie_content))

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [None]:
def find_similar_movies(movie_id):
    ''' Finds the movies whose content similarity to the given movie is highest

        Args:
            movie_id - (int) a movie_id

        Returns:
            similar_movies - (array) most similar movies by title; empty when
                movie_id does not appear in the movies table
    '''

    # find the row position(s) of the movie id
    matches = np.where(movies['movie_id'] == movie_id)[0]

    # Unknown id: return an empty array of titles. The original indexed [0][0]
    # unconditionally and raised an IndexError here.
    if len(matches) == 0:
        return np.array(movies['movie'].iloc[:0])

    movie_idx = matches[0]

    # every movie that ties for the highest similarity score in this movie's row
    similarities = dot_prod_movies[movie_idx]
    similar_movie_ids = np.where(similarities == np.max(similarities))[0]

    # pull the movie titles based on the row positions
    similar_movies = np.array(movies.iloc[similar_movie_ids, :]['movie'])

    return similar_movies


def make_recs(users=None):
    ''' Builds content based recommendations from each user's own rated movies

        Args:
            users - (iterable, optional) user_ids to build recommendations for;
                defaults to users_who_need_recs

        Returns:
            recs - (dict) keys of the user and values of the recommendations
                (a set of movie titles per user)
    '''

    # The original looped over an undefined name "users"; default to the users
    # that still need recommendations.
    if users is None:
        users = users_who_need_recs

    # create dictionary to return with users and their recommended titles
    recs = defaultdict(set)

    # for each user
    for user in users:
        # pull only the reviews written by this user
        temp = ranked_reviews[ranked_reviews['user_id'] == user]

        # the user's movies, in ranked_reviews order
        movie_ids = np.array(temp['movie_id'])

        # Titles the user has already seen. The original called the undefined
        # get_movie_names and stored the result in a local named movie_names,
        # which would also have shadowed the movie_names function.
        seen_titles = np.array(movie_names(movie_ids))

        # these will be the recommendations - continue until 10 recs
        # or you have depleted the movie list for the user
        for movie in movie_ids:
            rec_movies = find_similar_movies(movie)
            # the original assigned to "remp_recs" but read "temp_recs"
            temp_recs = np.setdiff1d(rec_movies, seen_titles)
            recs[user].update(temp_recs)

            if len(recs[user]) >= 10:
                break

    return recs