In [841]:
import os
import sys
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [563]:
data = pd.read_csv('books.csv')
tags = pd.read_csv('tags.csv')

In [564]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [565]:
book_tags = pd.read_csv('book_tags.csv')

In [566]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [567]:
tags = pd.merge( book_tags,tags, left_on = 'tag_id', right_on = 'tag_id', how = 'inner')

In [568]:
tags.shape

(999912, 4)

In [569]:
merged_books = pd.merge(data, tags, left_on ='book_id', right_on = 'goodreads_book_id',how = 'inner')


In [570]:
for i in merged_books.columns:
    merged_books[i] = merged_books[i].fillna(' ')
for i in data.columns:
    data[i] = data[i].fillna(' ')

In [571]:
final_books = merged_books.groupby('book_id')['tag_name'].apply(' '.join).reset_index()

In [572]:
final_books.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [574]:
data.head(10)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780440000000.0,Suzanne Collins,2008,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780320000000.0,Stephenie Meyer,2005,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780060000000.0,Harper Lee,1960,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780740000000.0,F. Scott Fitzgerald,1925,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780530000000.0,John Green,2012,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
6,7,5907,5907,1540236,969,618260307,9780620000000.0,J.R.R. Tolkien,1937,The Hobbit or There and Back Again,...,2071616,2196809,37653,46023,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...
7,8,5107,5107,3036731,360,316769177,9780320000000.0,J.D. Salinger,1951,The Catcher in the Rye,...,2044241,2120637,44920,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...
8,9,960,960,3338963,311,1416524797,9781420000000.0,Dan Brown,2000,Angels & Demons,...,2001311,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813,Pride and Prejudice,...,2035490,2191465,49152,54700,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...


In [575]:
data= pd.merge(data,final_books,left_on = 'book_id', right_on = 'book_id', how= 'left')

In [577]:
features = ['authors','title','language_code','tag_name']

In [578]:
for i in features:
    data[i] = data[i].fillna(' ')

In [884]:
def combine_features(row):
    return row['authors']+' '+row['tag_name']+' '+row['title']

In [885]:
data['combined_features'] = data.apply(combine_features,axis = 1)

In [907]:
cv = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

In [908]:
count_matrix = cv.fit_transform(data['combined_features'])

In [909]:
similarity_score = cosine_similarity(count_matrix)

In [928]:
book_user_likes = "To Kill a Mockingbird"

In [929]:
def get_index_from_title(title):
    return data[data.title == title].index.values[0]

In [930]:
get_index_from_title(book_user_likes)

3

In [931]:
book_index = get_index_from_title(book_user_likes)

In [932]:
similar_books = list(enumerate(similarity_score[book_index]))

In [933]:
sorted_similar_books =  sorted(similar_books, key = lambda x:x[1], reverse = True)

In [934]:
def get_title_from_index(index):
    return data[data.index == index].title.values[0]

In [935]:
i = 0
for book in sorted_similar_books:
    print(get_title_from_index(book[0]))
    i=i+1
    if i>10:
        break

To Kill a Mockingbird
Go Set a Watchman
The Last Boleyn
A Time to Kill
The Republic
The Case for Christ
The Piano Teacher
One Shot (Jack Reacher, #9)
Tripwire  (Jack Reacher, #3)
Faeries
The Enemy (Jack Reacher, #8)
