In [24]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import os
import warnings
warnings.filterwarnings('ignore')


In [25]:
# Folder with the raw CSV files, resolved relative to the current working directory.
data_path = os.path.join(os.pardir, 'data', 'raw')

In [26]:
# Read the three raw tables from `data_path` in one pass; the unpacking order
# matches the order of the file names.
books, ratings, users = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('Books.csv', 'Ratings.csv', 'Users.csv')
)

In [27]:
# Inner joins (the pandas default): attach each rating to its book (by ISBN)
# and to its user (by User-ID).
book_ratings = pd.merge(books, ratings, on='ISBN')
user_rating = pd.merge(users, ratings, on='User-ID')

In [28]:
# Per-title rating statistics, computed from a single shared groupby.
ratings_by_title = book_ratings.groupby('Book-Title')['Book-Rating']
book_num_ratings = ratings_by_title.count().rename('Num-Ratings').reset_index()
book_avg_ratings = ratings_by_title.mean().rename('Avg-Ratings').reset_index()

# One author / cover URL per title: the first non-null value found for that title.
book_details = books[['Book-Title', 'Book-Author', 'Image-URL-M']].groupby('Book-Title').first()

# Title-level table: number of ratings, mean rating, author and cover image.
final_rating = (
    book_num_ratings
    .merge(book_avg_ratings, on='Book-Title')
    .merge(book_details, on='Book-Title')
)

In [29]:
# Write the raw books table to the intermediate data folder as a pickle.
books_pickle_path = os.path.join('..', 'data', 'intermediate', 'books.pkl')
books.to_pickle(books_pickle_path)

## Recomendação Baseada em Popularidade

Recomendação dos 50 livros com maior nota média, considerando apenas os livros com mais de 250 avaliações

In [7]:
# Keep titles with more than 250 ratings, rank them by mean rating (highest
# first) and keep the top 50 with a fresh 0..49 index.
has_enough_ratings = final_rating['Num-Ratings'] > 250
popular_books = (
    final_rating.loc[has_enough_ratings]
    .sort_values(by='Avg-Ratings', ascending=False)
    .head(50)
    .reset_index(drop=True)
)

In [8]:
# Preview the first 15 rows of the popularity ranking.
popular_books.iloc[:15]

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings,Book-Author,Image-URL-M
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...
1,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...
2,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741,J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...
3,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...
4,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453,J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...
5,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...
6,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...
7,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...
8,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...
9,To Kill a Mockingbird,510,4.7,Harper Lee,http://images.amazon.com/images/P/0446310786.0...


In [10]:
# Write the popularity ranking to the intermediate data folder as a pickle.
popular_books_path = os.path.join('..', 'data', 'intermediate', 'popular_books.pkl')
popular_books.to_pickle(popular_books_path)

## Recomendação por Filtragem Colaborativa
Recomendação baseada na similaridade entre livros (item-item), calculada a partir das notas dadas pelos usuários:
* Apenas os usuários que deram mais de 200 notas serão levados em conta
* Apenas livros com pelo menos 50 notas (contadas somente entre esses usuários)

In [13]:
# Thresholds for the collaborative-filtering subset.
MIN_RATINGS_PER_USER = 200  # a user is kept only with MORE than this many ratings
MIN_RATINGS_PER_BOOK = 50   # a title is kept only with AT LEAST this many ratings

# Boolean Series indexed by User-ID: True for users above the rating threshold.
# Counting only the 'Book-Rating' column avoids counting every column of the frame.
x = book_ratings.groupby('User-ID')['Book-Rating'].count() > MIN_RATINGS_PER_USER
educated_users = x[x].index

# Filter into a NEW frame instead of overwriting `book_ratings`: the earlier
# cells that build the popularity statistics read `book_ratings`, and
# re-running them after an in-place overwrite would silently use the
# filtered subset.
educated_ratings = book_ratings[book_ratings['User-ID'].isin(educated_users)]

# Boolean Series indexed by Book-Title; the count only includes ratings from
# the users kept above.
y = educated_ratings.groupby('Book-Title')['Book-Rating'].count() >= MIN_RATINGS_PER_BOOK
famous_books = y[y].index

# Ratings restricted to both the kept users and the kept titles.
final = educated_ratings[educated_ratings['Book-Title'].isin(famous_books)]

In [14]:
# Show the User-IDs whose mask value in `x` is True.
x.loc[x].index

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=811)

In [15]:
# Title x user rating matrix. pivot_table averages duplicate (title, user)
# pairs by default; pairs with no rating become 0.
pt = pd.pivot_table(
    final,
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)

In [16]:
# Preview the first five titles of the rating matrix.
pt.iloc[:5]

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Write the title x user rating matrix to the intermediate data folder as a pickle.
pt_pickle_path = os.path.join('..', 'data', 'intermediate', 'pt.pkl')
pt.to_pickle(pt_pickle_path)

In [18]:
# Pairwise cosine similarity between the rows of `pt` (one row per title).
similarity_scores = cosine_similarity(X=pt)

def recommend(book_name, top_n=5):
    """Print the titles most similar to ``book_name``.

    Reads the module-level ``pt`` (whose index holds the titles) and
    ``similarity_scores`` (row ``i`` holds the similarity of title ``i`` to
    every title).

    Parameters
    ----------
    book_name : str
        Title to look up; must match an entry of ``pt.index`` exactly.
    top_n : int, default 5
        Maximum number of similar titles to print.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    """
    matches = np.flatnonzero(pt.index == book_name)
    if matches.size == 0:
        raise IndexError(f"book {book_name!r} not found in pt.index")
    book_pos = matches[0]

    scores = np.asarray(similarity_scores[book_pos])

    # Stable descending sort: tied scores keep their original index order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query book by position rather than assuming it sorts first;
    # with tied scores it may not, and slicing [1:] would then list the query
    # book itself while discarding a genuine neighbour.
    neighbours = ranked[ranked != book_pos][:top_n]

    for pos in neighbours:
        print(pt.index[pos])

In [19]:
# Example lookup for one title from the rating matrix.
recommend(book_name="4 Blondes")

The House of the Spirits
Pride and Prejudice
Pleading Guilty
Seabiscuit
Notes from a Small Island


In [21]:
# Wrap the similarity matrix in a DataFrame (no labels supplied, so pandas
# assigns default ones) and write it to the intermediate data folder.
similarity_path = os.path.join('..', 'data', 'intermediate', 'similarity_scores.pkl')
similarity_frame = pd.DataFrame(similarity_scores)
similarity_frame.to_pickle(similarity_path)