In [2]:
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n

In [3]:
bdf = pd.read_csv('books.csv')
bdf.head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


In [4]:
rdf = pd.read_csv('ratings.csv')
rdf.head(2)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4


In [5]:
df = bdf.merge(rdf, on='book_id')
df.head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,user_id,rating
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2886,5
1,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,6158,5


In [6]:
df = df[['book_id', 'user_id', 'rating', 'original_publication_year', 'title', 'ratings_count']]
df.head(2)

Unnamed: 0,book_id,user_id,rating,original_publication_year,title,ratings_count
0,1,2886,5,2008.0,"The Hunger Games (The Hunger Games, #1)",4780653
1,1,6158,5,2008.0,"The Hunger Games (The Hunger Games, #1)",4780653


In [7]:
df.rating.describe()

count    5.976479e+06
mean     3.919866e+00
std      9.910868e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [8]:
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.25)
nmf = NMF()
nmf.fit(trainset)
predictions = nmf.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8650


0.8649850566379184

In [9]:
df.sort_values(['ratings_count']).iloc[[-1,0]]

Unnamed: 0,book_id,user_id,rating,original_publication_year,title,ratings_count
0,1,2886,5,2008.0,"The Hunger Games (The Hunger Games, #1)",4780653
5660963,7639,7015,4,1968.0,درخت زیبای من,2716


In [10]:
users = df.user_id.unique().shape[0]
users

53424

In [11]:
books_reviewed = df.shape[0]
books_reviewed

5976479

In [12]:
# avg books read per user
books_reviewed / users

111.86880428271938

In [13]:
df[(df.original_publication_year >= 2000) & (df.original_publication_year <= 2010)].shape[0]

2016441

In [14]:
preds_37 = [nmf.predict(37, bid) for bid in range(10_000)]

In [15]:
preds_37[:5]

[Prediction(uid=37, iid=0, r_ui=None, est=3.9200697668348297, details={'was_impossible': True, 'reason': 'User and item are unkown.'}),
 Prediction(uid=37, iid=1, r_ui=None, est=4.523977480952467, details={'was_impossible': False}),
 Prediction(uid=37, iid=2, r_ui=None, est=4.492804912910157, details={'was_impossible': False}),
 Prediction(uid=37, iid=3, r_ui=None, est=3.4193534321601553, details={'was_impossible': False}),
 Prediction(uid=37, iid=4, r_ui=None, est=4.481608206337953, details={'was_impossible': False})]

In [16]:
top_n = get_top_n(preds_37, n=5)
top_n

defaultdict(list,
            {37: [(267, 5.0),
              (422, 5.0),
              (769, 5.0),
              (862, 5.0),
              (1010, 5.0)]})

In [17]:
bids = [bid for bid, _ in top_n[37]]
bids

[267, 422, 769, 862, 1010]

In [18]:
df[df.book_id.isin(bids)].title.unique().tolist()

['The Nightingale',
 'Harry Potter Boxset (Harry Potter, #1-7)',
 'The Complete Sherlock Holmes',
 'Words of Radiance (The Stormlight Archive, #2)',
 'The Essential Calvin and Hobbes: A Calvin and Hobbes Treasury']

In [19]:
knn = KNNBaseline(sim_options={'name': 'cosine', 'user_based':False})
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1cd841e80>

In [20]:
gatsby = df[df.title == 'The Great Gatsby'].book_id.unique()[0]
gatsby

5

In [21]:
bookids = knn.get_neighbors(gatsby, 10)
bookids

[125, 134, 136, 336, 419, 423, 531, 664, 700, 744]

In [22]:
df[df.book_id.isin(bookids)].title.unique().tolist()

['Hamlet',
 'City of Glass (The Mortal Instruments, #3)',
 'Divine Secrets of the Ya-Ya Sisterhood',
 "1st to Die (Women's Murder Club, #1)",
 'Blood Promise (Vampire Academy, #4)',
 'The Elite (The Selection, #2)',
 'Beowulf',
 "Breakfast at Tiffany's",
 'The Constant Princess (The Plantagenet and Tudor Novels, #6)',
 'The Power of Now: A Guide to Spiritual Enlightenment']