In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import implicit

import sys
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

## Ratings Data

In [3]:
book_ratings = pd.read_csv('goodbooks-10k-master/ratings.csv')

In [4]:
book_ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
books_data = pd.read_csv('goodbooks-10k-master/books.csv') # Book details data

In [6]:
books_data.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [7]:
book_id_mapping = books_data[['book_id', 'title']]

In [8]:
book_id_mapping.index = book_id_mapping.book_id

In [9]:
book_id_mapping = book_id_mapping.drop('book_id', axis=1)

In [10]:
book_id_dict = book_id_mapping.to_dict(orient='index')

In [14]:
book_id_dict[10]

{'title': 'Pride and Prejudice'}

In [15]:
books_data.loc[books_data['original_title']=='Twilight'].T

Unnamed: 0,2,4233,7046,8353
book_id,3,4234,7047,8354
goodreads_book_id,41865,93724,32453,252938
best_book_id,41865,93724,32453,252938
work_id,3212258,2349900,1049922,245106
books_count,226,38,62,41
isbn,316015849,60724692,747236380,60827645
isbn13,9.78032e+12,9.78006e+12,9.78075e+12,9.78006e+12
authors,Stephenie Meyer,Meg Cabot,"Leigh Nichols, Dean Koontz",Erin Hunter
original_publication_year,2005,2005,1984,2006
original_title,Twilight,Twilight,Twilight,Twilight


In [16]:
book_to_id = dict()
id_to_book = dict()

In [17]:
for item in book_id_dict.items():
    book_to_id[item[1]['title']] = item[0]
    id_to_book[item[0]] = item[1]['title']

In [20]:
book_to_id['To Kill a Mockingbird']

4

In [21]:
id_to_book[4]

'To Kill a Mockingbird'

In [22]:
user_ids = book_ratings.user_id.unique()
book_ids = book_ratings.book_id.unique()

## Randomised SVD

In [18]:
lean_rating_pivot = book_ratings.pivot(index='user_id', columns='book_id', values='rating')

In [19]:
lean_rating_pivot = lean_rating_pivot.reset_index()

In [20]:
lean_rating_pivot = lean_rating_pivot.drop('user_id', axis=1)

In [21]:
lean_rating_pivot = lean_rating_pivot.fillna(0)

In [22]:
mat_ratings = lean_rating_pivot.as_matrix()

In [23]:
from sklearn.utils.extmath import randomized_svd

In [24]:
U, Sigma, VT = randomized_svd(mat_ratings, 
                              n_components=100,
                              n_iter=5,
                              random_state=None)

In [25]:
U.shape, Sigma.shape, VT.shape

((53424, 100), (100,), (100, 10000))

In [26]:
X_transformed = U * Sigma

In [28]:
np.linalg.norm(np.matmul(X_transformed,VT)-mat_ratings) # Reconstruction Loss

8115.031994868231

In [29]:
book_features = VT.T
user_features = U

In [30]:
book_features.shape, user_features.shape

((10000, 100), (53424, 100))

## Item Item Similarity

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
svd_similarities = cosine_similarity(book_features)

In [33]:
svd_similarities.shape

(10000, 10000)

In [34]:
book_name = 'A Beautiful Mind'

In [35]:
book_id = book_to_id[book_name]

In [36]:
similar_books = pd.Series(svd_similarities[book_id-1])

In [37]:
book_id = book_to_id[book_name]

In [38]:
similar_books = pd.Series(svd_similarities[book_id-1])

In [39]:
top_n = 10

In [40]:
similar_books.reset_index().sort_values(by=[0], ascending=False)[1:top_n]

Unnamed: 0,index,0
4967,4967,0.822571
6111,6111,0.799211
2715,2715,0.794899
5210,5210,0.780967
2022,2022,0.77221
2481,2481,0.742806
2909,2909,0.736177
9628,9628,0.733422
3241,3241,0.733039


In [41]:
similar_n_books = similar_books.sort_values(ascending=False)[1:top_n].index.tolist()

similar_n_books_ids = [i+1 for i in similar_n_books]

similar_books_names = [id_to_book[i] for i in similar_n_books_ids]

In [42]:
def get_top_n_similar_books(book_name,top_n):
    book_id = book_to_id[book_name]
    similar_books = pd.Series(svd_similarities[book_id-1])

    similar_n_books = similar_books.reset_index().sort_values(by=[0], ascending=False)[1:top_n]
    similar_n_books.columns = ['book_id', 'score']
    similar_n_books['book_name'] = similar_n_books['book_id'].apply(lambda x: id_to_book[x+1])
    return similar_n_books[['book_name', 'score']]

In [43]:
get_top_n_similar_books("The Shining (The Shining #1)", 6)

Unnamed: 0,book_name,score
175,It,0.84289
167,The Stand,0.781102
208,"The Silence of the Lambs (Hannibal Lecter, #2)",0.758744
236,Carrie,0.72021
730,The Exorcist,0.717553


In [44]:
get_top_n_similar_books("Twilight (Twilight, #1)", 6)

Unnamed: 0,book_name,score
48,"New Moon (Twilight, #2)",0.725853
51,"Eclipse (Twilight, #3)",0.718457
55,"Breaking Dawn (Twilight, #4)",0.711154
219,Twilight: The Complete Illustrated Movie Compa...,0.662604
72,"The Host (The Host, #1)",0.584678


## Item User Similarity

In [45]:
def get_user_history(user_id):
    user_history = book_ratings.loc[book_ratings['user_id']==user_id, 'book_id'].unique()
    return [id_to_book[i] for i in user_history]

In [46]:
def get_top_n_recom_for_user(user_id, top_n):
    user_feature = user_features[user_id -1]
    all_books_score_for_user = pd.Series(np.dot(user_feature, book_features.T))
    norm = np.linalg.norm(np.dot(user_feature, book_features.T))
    recom_n_books = all_books_score_for_user.reset_index().sort_values(by=[0], ascending=False)[1:top_n]
    recom_n_books.columns = ['book_id', 'score']
    recom_n_books['score'] = recom_n_books['score']/norm
    recom_n_books['book_name'] = recom_n_books['book_id'].apply(lambda x: id_to_book[x+1])
    
    print("User History: ", get_user_history(user_id))
    return recom_n_books[['book_name', 'score']]

In [47]:
get_top_n_recom_for_user(14256,10)

User History:  ['Desecration (Left Behind, #9)', 'Always Looking Up: The Adventures of an Incurable Optimist', 'The Night Circus', 'The Surgeon (Rizzoli & Isles, #1)', "Food Rules: An Eater's Manual", 'A Discovery of Witches (All Souls Trilogy, #1)', "Seriously... I'm Kidding", 'Fifty Shades of Grey (Fifty Shades, #1)', 'Heat Wave (Nikki Heat, #1)', 'The Five People You Meet in Heaven', "He's Just Not That Into You: The No-Excuses Truth to Understanding Guys", 'Where the Heart Is', 'The Last Lecture', 'Survival in Auschwitz', 'Marked (House of Night, #1)', 'Back Roads', 'Midwives', 'The Story of My Life', 'The Bad Beginning (A Series of Unfortunate Events, #1)', 'Freak the Mighty (Freak The Mighty, #1)', "My Sister's Keeper", 'I Know This Much Is True', 'The Red Tent', 'Flowers in the Attic (Dollanganger, #1)', 'Petals on the Wind (Dollanganger, #2)', 'Perfect Match', 'For One More Day', 'The Hour I First Believed', 'The Remnant (Left Behind, #10)', 'One for the Money (Stephanie Plum, 

Unnamed: 0,book_name,score
11,"Divergent (Divergent, #1)",0.251063
68,"Insurgent (Divergent, #2)",0.199003
44,Life of Pi,0.188456
46,The Book Thief,0.144128
34,The Alchemist,0.133192
104,"Allegiant (Divergent, #3)",0.131038
225,Dark Places,0.119156
83,"Jurassic Park (Jurassic Park, #1)",0.113995
243,Sharp Objects,0.108298


## Alternating Least Squares

In [48]:
import implicit

In [49]:
sparse_item_user = sparse.csr_matrix((book_ratings['rating'].astype(float), (book_ratings['book_id'], book_ratings['user_id'])))
sparse_user_item = sparse.csr_matrix((book_ratings['rating'].astype(float), (book_ratings['user_id'], book_ratings['book_id'])))

In [50]:
sparse_item_user.shape, sparse_user_item.shape

((10001, 53425), (53425, 10001))

In [51]:
# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20)



In [52]:
# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

In [53]:
# Fit the model
model.fit(data_conf)

100%|██████████| 20.0/20 [01:40<00:00,  5.06s/it]


In [54]:
# Get the user and item vectors from our trained model
user_vecs = model.user_factors
item_vecs = model.item_factors

In [55]:
user_vecs.shape, item_vecs.shape

((53425, 100), (10001, 100))

In [56]:
user_ids.shape, book_ids.shape

((53424,), (10000,))

In [57]:
# Calculate the vector norms
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

  


In [58]:
item_vecs.shape, user_vecs.shape

((10001, 100), (53425, 100))

## Item Item Similarity

In [59]:
als_similarities = cosine_similarity(item_vecs)

  np.sqrt(norms, norms)


In [60]:
als_similarities.shape

(10001, 10001)

In [61]:
def get_top_n_similar_books_als(book_name,top_n):
    book_id = book_to_id[book_name]
    similar_books = pd.Series(als_similarities[book_id])

    similar_n_books = similar_books.reset_index().sort_values(by=[0], ascending=False)[1:top_n]
    similar_n_books.columns = ['book_id', 'score']
    similar_n_books['book_name'] = similar_n_books['book_id'].apply(lambda x: id_to_book[x])
    return similar_n_books[['book_name', 'score']]

In [62]:
get_top_n_similar_books_als("Twilight (Twilight, #1)", 6)

Unnamed: 0,book_name,score
1,"The Hunger Games (The Hunger Games, #1)",0.938205
2,Harry Potter and the Sorcerer's Stone (Harry P...,0.924129
17,"Catching Fire (The Hunger Games, #2)",0.900589
16,"The Girl with the Dragon Tattoo (Millennium, #1)",0.894708
22,The Lovely Bones,0.89454


## Item User Similarity

In [63]:
def get_top_n_recom_for_user_als(user_id, top_n):
    user_feature = user_vecs[user_id]
    all_books_score_for_user = pd.Series(np.dot(user_feature, item_vecs.T))
    recom_n_books = all_books_score_for_user.reset_index().sort_values(by=[0], ascending=False)[1:top_n]
    recom_n_books.columns = ['book_id', 'score']
    recom_n_books['score'] = recom_n_books['score']
    recom_n_books['book_name'] = recom_n_books['book_id'].apply(lambda x: id_to_book[x])
    
    print("User History: ", get_user_history(user_id))
    return recom_n_books[['book_name', 'score']]

In [64]:
get_top_n_recom_for_user_als(256,10)

User History:  ['The Jane Austen Book Club', 'The Kite Runner', '1776', 'Mayflower: A Story of Courage, Community, and War', 'Survival in Auschwitz', "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)", 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)', 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)', 'Harry Potter and the Goblet of Fire (Harry Potter, #4)', 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)', 'Harry Potter and the Half-Blood Prince (Harry Potter, #6)', 'Rebecca', 'The Hot Zone: The Terrifying True Story of the Origins of the Ebola Virus', 'The Cobra Event', 'The Da Vinci Code (Robert Langdon, #2)', 'Angels & Demons  (Robert Langdon, #1)', "The Pilot's Wife", 'The Historian', 'The Rape of Nanking', 'The Lovely Bones', 'Lucky', 'We Wish to Inform You That Tomorrow We Will Be Killed with Our Families', 'The Red Pony', "Bridget Jones's Diary (Bridget Jones, #1)", 'Bridget Jones: The Edge of Reason (Bridget Jones, #2)', 'Reading L

Unnamed: 0,book_name,score
737,A Long Way Gone: Memoirs of a Boy Soldier,1.277437
301,Heart of Darkness,1.256303
1552,"The Killer Angels (The Civil War Trilogy, #2)",1.212703
357,All Quiet on the Western Front,1.200518
688,Under the Banner of Heaven: A Story of Violent...,1.193962
387,The Crucible,1.186809
217,"The Devil in the White City: Murder, Magic, an...",1.173642
87,Night (The Night Trilogy #1),1.172897
264,The Sun Also Rises,1.164089


## Adding weights for recency

In [65]:
book_ratings['row_number'] = book_ratings.index

In [66]:
book_ratings['book_recency_rank'] = book_ratings.groupby(['user_id']).row_number.rank(pct=False)

In [None]:
total_books_per_user = book_ratings.groupby(['user_id']).book_recency_rank.max().reset_index()

In [None]:
total_books_per_user = total_books_per_user.rename(columns={'book_recency_rank':'total_books'})

In [None]:
book_ratings_final = book_ratings.merge(total_books_per_user, on=['user_id'], how='inner')

In [None]:
book_ratings_final = book_ratings_final.drop('row_number', axis=1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(total_books_per_user['total_books'], bins=50)