The first recommendation engine is based on user-based collaborative filtering: to make recommendations for a given user, we find the users who are most similar to that user and recommend the books those similar users liked.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the book metadata and the user ratings from CSV files
# (paths are relative to the notebook's working directory).
books = pd.read_csv("./data/books.csv")
ratings = pd.read_csv("./data/ratings.csv")

In [3]:
# Preview the first five rating records (columns: user_id, book_id, rating).
ratings[:5]

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [4]:
# Create the user-item matrix: one row per user_id, one column per book_id,
# holding the rating (NaN where the user has not rated the book). If a
# (user_id, book_id) pair occurs more than once, max() keeps the highest rating.
user_book_matrix = ratings.groupby(['user_id', 'book_id'])["rating"].max().unstack()
# TODO: remove users with too few ratings

In [5]:
# Display the user-item matrix as a sanity check (most cells are NaN).
user_book_matrix

book_id,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,5.0,,,,,,4.0,...,,,,,,,,,,
2,,5.0,,,5.0,,,4.0,,5.0,...,,,,,,,,,,
3,,,,3.0,,,,,,,...,,,,,,,,,,
4,,5.0,,4.0,4.0,,4.0,4.0,,5.0,...,,,,,,,,,,
5,,,,,,4.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53420,4.0,5.0,3.0,,2.0,,,,4.0,3.0,...,,,,,,,,,,
53421,4.0,5.0,,5.0,4.0,,4.0,,5.0,,...,,,,,,,,,,
53422,4.0,5.0,,,,,5.0,,,5.0,...,,,,,,,,,,
53423,4.0,5.0,,5.0,,,5.0,4.0,,,...,,,,,,,,,,


In [6]:
def get_read_books(user_book_matrix, user_id):
    '''
    INPUT:
        user_book_matrix - (DataFrame) ratings with user ids as index and book ids as columns;
                           NaN marks a book the user has not rated
        user_id - the index label of the user to look up
    OUTPUT:
        read_books - a list of the column labels (book ids) for which the user has a rating
    '''
    # Fetch the user's row once, then keep only the columns holding a rating.
    user_ratings = user_book_matrix.loc[user_id]
    rated_mask = user_ratings.notna()
    return user_ratings[rated_mask].index.tolist()
    
def create_user_book_dict(user_book_matrix):
    '''
    INPUT:
        user_book_matrix - (DataFrame) ratings with user ids as index and book ids as columns
    OUTPUT:
        books_read - a dict mapping every user id in the matrix index to the list of
                     book ids that user has rated (as returned by get_read_books)
    '''
    return {
        user: get_read_books(user_book_matrix, user)
        for user in user_book_matrix.index.to_numpy()
    }

In [24]:
# Map every user id to the list of books they have rated. This dict is read
# as a global by the distance and recommendation functions.
books_read = create_user_book_dict(user_book_matrix)

In [26]:
# Inspect the first ten user ids in the matrix index.
user_book_matrix.index.to_numpy()[0:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [14]:
def compute_euclidean_dist(user1, user2):
    '''
    INPUT:
        user1 - id of the first user (a key of the global books_read dict)
        user2 - id of the second user (a key of the global books_read dict)
    OUTPUT:
        dist - the euclidean distance between the two users' ratings on the books
               both have rated, or the sentinel 10000*5 when they share fewer than
               two rated books
    '''
    shared_books = list(set(books_read[user1]).intersection(books_read[user2]))

    # With fewer than two shared books the users are treated as maximally far
    # apart; the sentinel stands in for the "highest possible difference".
    if len(shared_books) <= 1:
        return 10000*5

    ratings_user1 = user_book_matrix.loc[user1][shared_books]
    ratings_user2 = user_book_matrix.loc[user2][shared_books]
    return np.linalg.norm(ratings_user1 - ratings_user2)

In [27]:
def calculate_dist_matrix(max_user=None):
    '''
    INPUT:
        max_user - (int or None) if given, only the first max_user users of the
                   user_book_matrix index are compared; None (default) uses all users
    OUTPUT:
        dist_df - a DataFrame with the columns "user1", "user2" and "dist" holding one
                  row per ordered pair of users (each user is also paired with itself)
    '''
    all_users = user_book_matrix.index.to_numpy()
    # Compare against None instead of relying on truthiness, so that max_user=0
    # yields an empty result rather than silently falling through to the full
    # (and very expensive) all-users computation.
    if max_user is not None:
        all_users = all_users[:max_user]

    # Every ordered pair, in the same row order as a nested loop over all_users.
    pairs = [(user_id1, user_id2) for user_id1 in all_users for user_id2 in all_users]

    dist_df = pd.DataFrame({
        "user1": [user_id1 for user_id1, _ in pairs],
        "user2": [user_id2 for _, user_id2 in pairs],
        "dist": [compute_euclidean_dist(user_id1, user_id2) for user_id1, user_id2 in pairs],
    })
    return dist_df
        

In [31]:
# Compute pairwise distances for the first 10 users only, to keep the
# nested-loop computation small. dist_df is read as a global by
# find_closest_neighbors.
dist_df = calculate_dist_matrix(10)

Open question: how should the distance take into account the number of books two users have in common?

In [45]:
# Inspect the user_id index of the user-item matrix.
user_book_matrix.index

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            53415, 53416, 53417, 53418, 53419, 53420, 53421, 53422, 53423,
            53424],
           dtype='int64', name='user_id', length=53424)

In [29]:
# Display the pairwise distance table (one row per ordered user pair).
dist_df

Unnamed: 0,user1,user2,dist
0,1,1,0.000000
1,1,2,3.316625
2,1,3,5.567764
3,1,4,6.633250
4,1,5,1.414214
...,...,...,...
95,10,6,5.196152
96,10,7,3.162278
97,10,8,5.477226
98,10,9,8.366600


In [73]:
def find_closest_neighbors(user_id):
    '''
    INPUT:
        user_id - (int) the user whose closest users should be found
    OUTPUT:
        closest_neighbors - a list of the ids of the other users listed for user_id in
                            the global dist_df, sorted from closest to farthest away
    '''
    # Rows of the global distance table that start at the requested user.
    rows_for_user = dist_df[dist_df["user1"] == user_id]

    # Ties in distance are not broken in any deliberate way; ordering tied
    # neighbors e.g. according to books read might be better.
    ordered_rows = rows_for_user.sort_values(["dist"], ascending=True)
    closest_neighbors = list(ordered_rows["user2"])

    # The user is paired with itself in dist_df; drop that entry
    # (list.remove raises ValueError if user_id is not present).
    closest_neighbors.remove(user_id)
    return closest_neighbors

def books_liked(user_id, min_rating=3):
    '''
    INPUT:
        user_id - the user_id of an individual as int
        min_rating - the minimum rating at which a book still counts as a "like"
                     rather than a "dislike"
    OUTPUT:
        books_liked - a list of the ids of the books the user has rated with at least
                      min_rating
    '''
    # Restrict the user's row to the books they actually rated, then keep
    # those whose rating reaches the threshold.
    ratings_of_read = user_book_matrix.loc[user_id][books_read[user_id]]
    is_liked = ratings_of_read >= min_rating
    return list(ratings_of_read[is_liked].keys())

def get_book_info(book_id):
    '''
    INPUT:
        book_id - the value to look up in the "book_id" column of the global books frame
    OUTPUT:
        authors, title - the "authors" and "title" entries of the matching row
                         (assumes exactly one row matches book_id)
    '''
    matching_rows = books.loc[books["book_id"] == book_id]
    # squeeze() turns a one-row frame into a Series so fields can be read as attributes.
    record = matching_rows.squeeze()
    return record.authors, record.title

def make_user_based_recommendation(user_id, num_rec=10, min_rating=3):
    '''
    INPUT:
        user_id - the user to make recommendations for
        num_rec - the maximum number of recommendations to return
        min_rating - the minimum rating a neighbor must have given a book for it to
                     count as liked
    OUTPUT:
        recommendations - a list of at most num_rec book ids the user has not rated yet,
                          gathered neighbor by neighbor from closest to farthest; fewer
                          than num_rec are returned if the neighbors run out
    '''
    already_read = books_read[user_id]
    recommendations = []

    for neighbor in find_closest_neighbors(user_id):
        neighbor_likes = books_liked(neighbor, min_rating)
        # Keep only books that are new to the user ...
        unread = list(np.setdiff1d(neighbor_likes, already_read, assume_unique=False))
        # ... and that have not been picked up from a closer neighbor already.
        fresh = list(np.setdiff1d(unread, recommendations, assume_unique=False))
        recommendations = recommendations + fresh

        # Stop as soon as enough books are collected, trimming any surplus.
        if len(recommendations) >= num_rec:
            return recommendations[:num_rec]

    return recommendations

def print_recommendations(recommendations):
    '''
    INPUT:
        recommendations - an iterable of book ids, e.g. the result of
                          make_user_based_recommendation
    OUTPUT:
        None - prints one numbered line (starting at 0) per book with its authors
               and title
    '''
    for position, book_id in enumerate(recommendations):
        authors, title = get_book_info(book_id)
        line = "{}: {} with {}".format(position, authors, title)
        print(line)

In [33]:
# Example: the neighbors of user 1, ordered from closest to farthest.
find_closest_neighbors(1)

[5, 2, 7, 3, 6, 8, 4, 10, 9]

In [47]:
# Example: up to 10 recommended book ids for user 1, counting only neighbor
# ratings of at least 5 as likes.
make_user_based_recommendation(1, num_rec=10, min_rating=5)

[164, 451, 458, 623, 794, 852, 1182, 1195, 1345, 1347]

In [74]:
# Same recommendation as before, printed with authors and titles instead of raw ids.
recommendations = make_user_based_recommendation(1, num_rec=10, min_rating=5)
print_recommendations(recommendations)

0: Rainbow Rowell with Eleanor & Park
1: Jane Austen, Alfred MacAdam with Northanger Abbey
2: Emily St. John Mandel with Station Eleven
3: Stephen King with Mr. Mercedes (Bill Hodges Trilogy, #1)
4: Stephen King with Doctor Sleep (The Shining, #2)
5: Paul Kalanithi, Abraham Verghese with When Breath Becomes Air
6: Stephen King with Joyland
7: Louise Erdrich with The Round House
8: Karin Slaughter with Pretty Girls
9: Stephen King with Revival


In [67]:
# Look up the metadata row for book_id 1 and squeeze the one-row frame into a
# Series to see which fields are available.
book_info = books.loc[books["book_id"]==1]
book_info.squeeze()

book_id                                                                      1
goodreads_book_id                                                      2767052
best_book_id                                                           2767052
work_id                                                                2792775
books_count                                                                272
isbn                                                                 439023483
isbn13                                                             9.78044e+12
authors                                                        Suzanne Collins
original_publication_year                                                 2008
original_title                                                The Hunger Games
title                                  The Hunger Games (The Hunger Games, #1)
language_code                                                              eng
average_rating                                      