In [70]:
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n

### Load the data. Split into Train and Test.

In [5]:
r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')

t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')

### Create a User/Book ratings matrix using NMF.

In [6]:
br = pd.merge(b, r, on = 'book_id')
br.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,user_id,rating
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2886,5
1,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,6158,5
2,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,3991,4
3,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,5281,5
4,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,5721,5


In [7]:
br.rating.describe()

count    5.976479e+06
mean     3.919866e+00
std      9.910868e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [10]:
br.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5976479 entries, 0 to 5976478
Data columns (total 25 columns):
book_id                      int64
goodreads_book_id            int64
best_book_id                 int64
work_id                      int64
books_count                  int64
isbn                         object
isbn13                       float64
authors                      object
original_publication_year    float64
original_title               object
title                        object
language_code                object
average_rating               float64
ratings_count                int64
work_ratings_count           int64
work_text_reviews_count      int64
ratings_1                    int64
ratings_2                    int64
ratings_3                    int64
ratings_4                    int64
ratings_5                    int64
image_url                    object
small_image_url              object
user_id                      int64
rating                       int64

### Fit on the Training data, and predict based on the Test data.

In [9]:
reader = Reader(rating_scale=(1.0, 5.0))

In [12]:
data = Dataset.load_from_df(br[['user_id', 'book_id', 'rating']], reader)

In [13]:
trainset, testset = train_test_split(data, test_size=0.25)

In [17]:
nmf = NMF()

In [18]:
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x11f741470>

In [19]:
predictions = nmf.test(testset)

### What is the RMSE for the Training set?

In [20]:
accuracy.rmse(predictions)

RMSE: 0.8642


0.8641734687683914

### Which book had the most ratings? The fewest ratings? (The actual name not the ID)

In [31]:
rating_counts = br.groupby('original_title').count().rating

In [39]:
rating_counts.sort_values()
# least: kindle paperwhite user's guide 2nd edition  
# most: The Hunger Games

original_title
kindle paperwhite user's guide 2nd edition                                                               33
進撃の巨人 悔いなき選択 1                                                                                           41
Batman Chronicles: Volume 1                                                                              45
Manga Classics: Les Misérables                                                                           48
The Magic (The Secret #3)                                                                                48
How I Raised Myself from Failure to Success in Selling                                                   52
Trouble in Mudbug                                                                                        52
Travel Team                                                                                              53
Hard As It Gets                                                                                          53
The summer I 

### What is the average number of books read across all users?

In [55]:
br.groupby('user_id').count().book_id.mean()

111.86880428271938

### How many books were published between 2000 and 2010?

In [49]:
b[(b.original_publication_year >= 2000) & (b.original_publication_year <= 2010)].count().book_id

3594

### What are the top 10 most similar books to "The Great Gatsby"? You will have to use a KNN-based model to answer this? Print out the actual book names, not their IDs.

In [56]:
knn = KNNBaseline(sim_options = {'name': 'cosine', 'user_based':False})

In [57]:
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1564c7e48>

In [64]:
GG_ids = knn.get_neighbors(5, 10)
type(GG_ids)

list

In [93]:
b.loc[b.book_id.isin(GG_ids)].title

15      The Girl with the Dragon Tattoo (Millennium, #1)
26     Harry Potter and the Half-Blood Prince (Harry ...
125                            Dune (Dune Chronicles #1)
266                                      The Nightingale
309                     Good in Bed (Cannie Shapiro, #1)
332                                          Still Alice
336        The Ultimate Hitchhiker's Guide to the Galaxy
395              Hotel on the Corner of Bitter and Sweet
565                        After You (Me Before You, #2)
567    Batman: The Dark Knight Returns (The Dark Knig...
Name: title, dtype: object

### What are the top 5 books you would recommend to User #37? (The actual book names, not IDs)

In [71]:
top_n = get_top_n(predictions, n = 5)

In [88]:
user_dict = top_n[37]
user_dict

[(110, 4.77235929698858),
 (143, 4.624305382281158),
 (215, 4.588993061930992),
 (17, 4.508182023061381),
 (39, 4.451183137384545)]

In [90]:
recs = [elem[0] for elem in user_dict]
recs

[110, 143, 215, 17, 39]

In [94]:
b.loc[b.book_id.isin(recs)].title

16               Catching Fire (The Hunger Games, #2)
38     A Game of Thrones (A Song of Ice and Fire, #1)
109    A Clash of Kings  (A Song of Ice and Fire, #2)
142                       All the Light We Cannot See
214                                  Ready Player One
Name: title, dtype: object