In [33]:
import pandas as pd
import numpy as np

In [6]:
# data reference https://github.com/zygmuntz/goodbooks-10k/blob/master/tags.csv
ratings = pd.read_csv("./data/ratings.csv")


In [7]:
ratings[:2]

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4


In [14]:
# create user-item matrix as numpy array
n_users = ratings["user_id"].nunique()
n_books = ratings["book_id"].nunique()
user_book_matrix_np = np.zeros((n_users, n_books))
for row in ratings.itertuples():
    user_book_matrix_np[row[1]-1, row[2]-1] = row[3]
user_book_matrix_np

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [4., 5., 0., ..., 0., 0., 0.],
       [4., 5., 0., ..., 0., 0., 0.],
       [4., 5., 4., ..., 0., 0., 0.]])

In [16]:
# calculate the fraction of user-item values filled
sparsity = float(len(user_book_matrix_np.nonzero()[0]))
sparsity /= (user_book_matrix_np.shape[0] * user_book_matrix_np.shape[1])
sparsity *= 100
print("Sparsity: {:4.2f}%".format(sparsity))

Sparsity: 1.12%


In [23]:
def fast_similarity(user_book_matrix, kind='user', epsilon=1e-9):
    """Calculate the similarity matrix for user or items.
    
    :param ratings: User-item matrix with ratings as values
    :type: np.array[n_users, n_items]
    :param kind: Type of similarity
    :type: string
    :return: Similarity matrix
    :rtype: np.array[n_users, n_users] or [n_items, n_items]
    """
    # add small number for handling dived-by-zero errors
    if kind == 'user':
        sim = user_book_matrix.dot(user_book_matrix.T) + epsilon
    elif kind == 'item':
        sim = user_book_matrix.T.dot(user_book_matrix) + epsilon
    else:
        raise ValueError("Only user/item are possible kinds for the similarity.")
    
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [24]:
# calculate the similarity matrix
similarity_books = fast_similarity(user_book_matrix_np, "item")

In [25]:
def find_k_most_similar_books(similarity_books, book_id, k=5):
    """Find the k most similar books.
    
    :param similarity_books: Similarity matrix of the books
    :type: np.array[n_books, n_books]
    :param book_id: Book id for which to find similar ones
    :type: int
    :param k: Amount of similar books to find
    :type: int
    :return: List of ids of the k most similar books
    :rtype: array
    """
    all_similarities = similarity_books[:, book_id-1]
    # prevent that highest similarity is with itself
    all_similarities[book_id - 1] = 0
    # k indexes with highes similarity to book_id
    most_similar_books = np.flip(np.argpartition(all_similarities, -k)[-k:]) + 1
    return most_similar_books


In [26]:
find_k_most_similar_books(similarity_books, 1, 5)

array([17, 20,  2,  3, 12])

In [31]:
def make_item_based_recommendation(books_liked, n_rec, similarity):
    """Make item based recommendations for books.
    
    :param books_liked: Ids of books liked
    :type: array
    :param n_rec: Number of recommendations to make
    :type: int
    :param similarity_books: Similarity matrix of the books
    :type: np.array[n_books, n_books]
    :return: Book ids of the recommendations
    :rtype: array
    """
    # for each book find most similar ones
    book_similarity = {}
    for book_id in books_liked:
        most_similar = find_k_most_similar_books(similarity, book_id)
        # add similar book ids and similarities to dict
        for similar_book_id in most_similar:
            book_similarity[similar_book_id] = similarity[book_id-1][similar_book_id-1]

    # sort the dict by the similarity
    sorted_book_similarity = dict(sorted(book_similarity.items(), key=lambda item: item[1], reverse=True))
    print(sorted_book_similarity)
    possible_recommendations = list(sorted_book_similarity.keys())
    # count actual recommendations
    cnt_rec = 0
    recommendations = []
    for rec_book_id in possible_recommendations:
        # only not yet recommended and liked books are actual recommendations
        if rec_book_id not in recommendations and rec_book_id not in books_liked:
            recommendations.append(rec_book_id)
            cnt_rec += 1
            # leave loop once enough recommendations are obtained
            if cnt_rec >= n_rec:
                break
    
    return recommendations

In [32]:
# obtain 10 recommendations based on movies one and two
make_item_based_recommendation([1, 2], 10, similarity_books)

{17: 0.7230112211972394, 20: 0.6873469586897267, 18: 0.6803923969196116, 23: 0.6728205638365854, 24: 0.6692005033391328, 21: 0.6574468707948864, 27: 0.6520871016336011, 2: 0.5898402968141494, 3: 0.5611418959119355, 12: 0.5233165065265468}


[17, 20, 18, 23, 24, 21, 27, 3, 12]

In [22]:
similarity_books

Unnamed: 0,user_id,book_id,rating
user_id,1.0,0.549771,0.84549
book_id,0.549771,1.0,0.608573
rating,0.84549,0.608573,1.0
