In [3]:
import pandas as pd
from matplotlib import pyplot as plt

In [192]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.phrases import Phrases, Phraser
import re

In [318]:
books = pd.read_csv('goodbooks-10k-master/books.csv')
books.head(30)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
6,7,5907,5907,1540236,969,618260307,9780618000000.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,...,2071616,2196809,37653,46023,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...
7,8,5107,5107,3036731,360,316769177,9780317000000.0,J.D. Salinger,1951.0,The Catcher in the Rye,...,2044241,2120637,44920,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...
8,9,960,960,3338963,311,1416524797,9781417000000.0,Dan Brown,2000.0,Angels & Demons,...,2001311,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813.0,Pride and Prejudice,...,2035490,2191465,49152,54700,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...


In [6]:
books['first_author'] = books.authors.apply(lambda x: x.split(',')[0])

In [7]:
books.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url', 'first_author'],
      dtype='object')

### Assign the authors an ID

In [14]:
authors = books['first_author'].drop_duplicates().reset_index()
authors.columns = [['author_nr','first_author']]
authors['author_nr'] = 'author_' + authors['author_nr'].astype(str)
authors.head()

Unnamed: 0,author_nr,first_author
0,author_0,Suzanne Collins
1,author_1,J.K. Rowling
2,author_2,Stephenie Meyer
3,author_3,Harper Lee
4,author_4,F. Scott Fitzgerald


In [15]:
books1 = books.merge(authors,on='first_author')

### clean up book title formatting

In [149]:
stopword_list = ['the','of','and','at','to','with','a','an','in','for','you','on','']

In [167]:
titles_clean = books1['title'].str.lower().values.tolist()
titles_clean = [re.sub(pattern='[^a-z ]',repl='',string=x) for x in titles_clean]
titles_clean = [x.split(' ') for x in titles_clean]
titles_clean = [[x for x in x if x not in stopword_list] for x in titles_clean]
titles_clean[0:5]

[['hunger', 'games', 'hunger', 'games'],
 ['catching', 'fire', 'hunger', 'games'],
 ['mockingjay', 'hunger', 'games'],
 ['hunger', 'games', 'trilogy', 'boxset', 'hunger', 'games'],
 ['gregor', 'overlander', 'underland', 'chronicles']]

In [160]:
title_phrases = Phrases(titles_clean, min_count=1, threshold=1) 

In [171]:
titles_clean1 = [' '.join(x) for x in titles_clean]
titles_clean1[0:5]

['hunger games hunger games',
 'catching fire hunger games',
 'mockingjay hunger games',
 'hunger games trilogy boxset hunger games',
 'gregor overlander underland chronicles']

In [191]:
books1['title_clean'] = titles_clean1
title_and_author = books1['title_clean'] + " " + books1['author_nr']
title_and_author = title_and_author.values.tolist()
title_and_author[0:5]

['hunger games hunger games author_0',
 'catching fire hunger games author_0',
 'mockingjay hunger games author_0',
 'hunger games trilogy boxset hunger games author_0',
 'gregor overlander underland chronicles author_0']

In [195]:
vectorizer = TfidfVectorizer()
tfidf_titleauthor = vectorizer.fit_transform(title_and_author)
tfidf_titleauthor.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(10000, 13085)

In [197]:
tfidf_titleauthor[:,0]

<10000x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [202]:
nmf = NMF(n_components=300, random_state=1,init='nndsvd',
          alpha=.01, l1_ratio=.5).fit(tfidf_titleauthor)

In [206]:
nmf.components_.shape

(300, 13085)

In [207]:
# From https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [208]:
tfidf_feature_names = vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 5)

Topic #0: death author_831 note author_584 gate
Topic #1: zoya author_996 author_9987 author_9985 author_9983
Topic #2: bosch harry author_520 universe haller
Topic #3: dark author_3264 tower author_3114 immortals
Topic #4: man author_1028 author_3149 invisible author_1469
Topic #5: night author_245 author_1256 huntress watch
Topic #6: cross alex author_241 mary hope
Topic #7: my name friend mother approves
Topic #8: is kinsey author_822 millhone silence
Topic #9: little author_543 golden author_41 critter
Topic #10: chronicles author_204 amber dragons author_251
Topic #11: dead walking author_501 sookie stackhouse
Topic #12: author_71 mile tower salems lot
Topic #13: author_1870 prey lucas davenport virgil
Topic #14: me shatter seattle author_3421 author_631
Topic #15: sea author_811 breeze half beach
Topic #16: love author_268 author_3705 author_2182 author_742
Topic #17: zoya author_996 author_9987 author_9985 author_9983
Topic #18: black butler company author_3770 magician
Topic #1

In [223]:
authors[authors['author_nr']=='author_1417']

Unnamed: 0,author_nr,first_author
764,author_1417,David Eddings


In [224]:
titleauth_vectors = nmf.transform(tfidf_titleauthor)

In [228]:
from numpy.linalg import norm
from numpy import dot

def cosine_sim(a,b):
    return dot(a,b)/(norm(a)*norm(b))

In [231]:
list(enumerate(title_and_author[0:25]))

[(0, 'hunger games hunger games author_0'),
 (1, 'catching fire hunger games author_0'),
 (2, 'mockingjay hunger games author_0'),
 (3, 'hunger games trilogy boxset hunger games author_0'),
 (4, 'gregor overlander underland chronicles author_0'),
 (5, 'gregor code claw underland chronicles author_0'),
 (6, 'gregor curse warmbloods underland chronicles author_0'),
 (7, 'gregor prophecy bane underland chronicles author_0'),
 (8, 'gregor marks secret underland chronicles author_0'),
 (9, 'harry potter sorcerers stone harry potter author_1'),
 (10, 'harry potter prisoner azkaban harry potter author_1'),
 (11, 'harry potter order phoenix harry potter author_1'),
 (12, 'harry potter chamber secrets harry potter author_1'),
 (13, 'harry potter goblet fire harry potter author_1'),
 (14, 'harry potter deathly hallows harry potter author_1'),
 (15, 'harry potter halfblood prince harry potter author_1'),
 (16, 'casual vacancy author_1'),
 (17, 'tales beedle bard author_1'),
 (18, 'harry potter bo

In [229]:
cosine_sim(titleauth_vectors[0],titleauth_vectors[1])

0.9723053610077146

In [232]:
cosine_sim(titleauth_vectors[9],titleauth_vectors[10])

0.9974494500429543

In [233]:
cosine_sim(titleauth_vectors[9],titleauth_vectors[14])

0.997362155832182

In [235]:
list(enumerate(title_and_author))[50:75]

[(50, 'fault our stars author_5'),
 (51, 'looking alaska author_5'),
 (52, 'paper towns author_5'),
 (53, 'abundance katherines author_5'),
 (54, 'will grayson will grayson author_5'),
 (55, 'let it snow three holiday romances author_5'),
 (56, 'hobbit author_6'),
 (57, 'fellowship ring lord rings author_6'),
 (58, 'two towers lord rings author_6'),
 (59, 'return king lord rings author_6'),
 (60, 'lord rings lord rings author_6'),
 (61, 'silmarillion middleearth universe author_6'),
 (62, 'jrr tolkien book boxed set hobbit lord rings author_6'),
 (63, 'children hrin author_6'),
 (64, 'unfinished tales nmenor middleearth author_6'),
 (65, 'catcher rye author_7'),
 (66, 'franny zooey author_7'),
 (67, 'nine stories author_7'),
 (68, 'raise high roof beam carpenters seymour introduction author_7'),
 (69, 'angels demons robert langdon author_8'),
 (70, 'da vinci code robert langdon author_8'),
 (71, 'deception point author_8'),
 (72, 'digital fortress author_8'),
 (73, 'lost symbol robert 

In [236]:
cosine_sim(titleauth_vectors[56],titleauth_vectors[57])

0.9908473545295369

In [240]:
cosine_sim(titleauth_vectors[63],titleauth_vectors[61])

0.15913711581858037

In [242]:
list(enumerate(title_and_author))[75:100]

[(75, 'angels demons da vinci code robert langdon author_8'),
 (76, 'pride prejudice author_9'),
 (77, 'sense sensibility author_9'),
 (78, 'emma author_9'),
 (79, 'persuasion author_9'),
 (80, 'northanger abbey author_9'),
 (81, 'mansfield park author_9'),
 (82, 'complete novels author_9'),
 (83, 'lady susan author_9'),
 (84, 'kite runner author_10'),
 (85, 'thousand splendid suns author_10'),
 (86, 'mountains echoed author_10'),
 (87, 'divergent divergent author_11'),
 (88, 'insurgent divergent author_11'),
 (89, 'allegiant divergent author_11'),
 (90, 'four divergent story collection divergent author_11'),
 (91,
  'free four tobias tells divergent knifethrowing scene divergent author_11'),
 (92, 'transfer divergent author_11'),
 (93, 'divergent series complete box set divergent author_11'),
 (94, 'world divergent path allegiant divergent author_11'),
 (95, 'initiate divergent author_11'),
 (96, 'divergent series book collection divergent author_11'),
 (97, 'traitor divergent author_

In [244]:
cosine_sim(titleauth_vectors[87],titleauth_vectors[89])

0.9999997498742668

In [246]:
cosine_sim(titleauth_vectors[87],titleauth_vectors[1])

0.0

This does a decent job of placing titles by the same author close together, but doesn't seem to do well with comparing across authors within similar genres

### Try using the tags instead

In [249]:
tags = pd.read_csv('goodbooks-10k-master/tags.csv')
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [250]:
book_tags = pd.read_csv('goodbooks-10k-master/book_tags.csv')
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [297]:
# there are negative counts??
book_tags[book_tags['count']<=0].head()

Unnamed: 0,goodreads_book_id,tag_id,count
922055,18607805,2272,-1
922054,18607805,6552,-1
922051,18607805,10197,-1
922053,18607805,17246,-1
922052,18607805,21619,-1


In [298]:
book_tags = book_tags[book_tags['count']>0]

#### Manually calculate idf

In [299]:
tag_total = book_tags['count'].sum()
tag_doc_cnt = book_tags.groupby('tag_id')['count'].sum().reset_index()
tag_doc_cnt.columns = ['tag_id','term_frequency']
tag_doc_cnt['idf'] = np.log((1+tag_total)/(1+tag_doc_cnt['term_frequency']))+1
tag_doc_cnt.tail()

Unnamed: 0,tag_id,term_frequency,idf
34245,34247,353,14.287836
34246,34248,7456,11.240224
34247,34249,222,14.749961
34248,34250,24,16.938257
34249,34251,384,14.20389


In [300]:
tag_doc_cnt[tag_doc_cnt['idf'] == np.Inf].head()

Unnamed: 0,tag_id,term_frequency,idf


#### relative term frequency

In [301]:
book_tags_total = book_tags.groupby('goodreads_book_id')['count'].sum().reset_index()
book_tags_total.columns = ['goodreads_book_id','total_book_tags']
book_tags_total.head()

Unnamed: 0,goodreads_book_id,total_book_tags
0,1,359447
1,2,73667
2,3,786374
3,5,227215
4,6,141246


In [302]:
book_tags_tf = book_tags.merge(book_tags_total,on='goodreads_book_id')
book_tags_tf['relative_tf'] = book_tags_tf['count']/book_tags_tf['total_book_tags']
book_tags_tf.head()

Unnamed: 0,goodreads_book_id,tag_id,count,total_book_tags,relative_tf
0,1,615,399,359447,0.00111
1,1,1120,283,359447,0.000787
2,1,1128,828,359447,0.002304
3,1,1691,1742,359447,0.004846
4,1,2104,1022,359447,0.002843


#### tf-idf calculation

In [303]:
book_tags_tfidf = book_tags_tf.merge(books[['goodreads_book_id','book_id']],on='goodreads_book_id')
book_tags_tfidf = book_tags_tfidf.merge(tag_doc_cnt,on='tag_id')
book_tags_tfidf['tfidf'] = book_tags_tfidf['count']*book_tags_tfidf['idf']
book_tags_tfidf['rel_tfidf'] = book_tags_tfidf['relative_tf']*book_tags_tfidf['idf']
book_tags_tfidf.head()

Unnamed: 0,goodreads_book_id,tag_id,count,total_book_tags,relative_tf,book_id,term_frequency,idf,tfidf,rel_tfidf
0,1,615,399,359447,0.00111,27,4198,11.814531,4713.997947,0.013115
1,67,615,13,13937,0.000933,3504,4198,11.814531,153.588906,0.01102
2,231,615,14,7388,0.001895,4081,4198,11.814531,165.403437,0.022388
3,597,615,12,16669,0.00072,3598,4198,11.814531,141.774374,0.008505
4,599,615,10,19147,0.000522,1392,4198,11.814531,118.145312,0.00617


In [304]:
book_tags_tfidf.describe()

Unnamed: 0,goodreads_book_id,tag_id,count,total_book_tags,relative_tf,book_id,term_frequency,idf,tfidf,rel_tfidf
count,999906.0,999906.0,999906.0,999906.0,999906.0,999906.0,999906.0,999906.0,999906.0,999906.0
mean,5263358.0,16324.557916,208.870892,20886.815714,0.010001,5000.483499,1713535.0,10.065752,698.634349,0.04413
std,7574000.0,9647.853899,3501.27564,39433.005048,0.060963,2886.743015,13992430.0,3.126542,5315.269325,0.117938
min,1.0,0.0,1.0,312.0,1.3e-05,1.0,1.0,1.394859,4.325659,0.000118
25%,46227.0,8067.0,7.0,5157.0,0.000701,2501.0,3987.0,8.03045,72.424867,0.007226
50%,394841.0,15808.0,15.0,9803.0,0.001522,5000.0,39077.0,9.583818,150.440502,0.015323
75%,9378297.0,24997.0,40.0,20308.0,0.003457,7500.0,184735.0,11.866088,366.052005,0.033071
max,33288640.0,34251.0,596234.0,786374.0,0.99071,10000.0,140718800.0,19.463986,831662.370782,3.857923


In [305]:
print(len(book_tags_tfidf.drop_duplicates('tag_id')))
print(np.max(book_tags_tfidf['book_id']))

34250
10000


In [306]:
book_tags = book_tags.sort_values(['goodreads_book_id','tag_id'])

### Fit NMF to tags

In [307]:
from scipy.sparse import csr_matrix

In [308]:
tfidf_tag_sparse = csr_matrix((book_tags_tfidf['tfidf'].values,
                               (book_tags_tfidf['book_id'].values-1
                               ,book_tags_tfidf['tag_id'].values
                               )))
tfidf_tag_sparse.shape

(10000, 34252)

In [309]:
np.max(tfidf_tag_sparse)

831662.3707822289

In [310]:
from sklearn.preprocessing import normalize

In [311]:
tfidf_tag_sparse_norm = normalize(tfidf_tag_sparse)

In [314]:
np.max(tfidf_tag_sparse_norm)

0.9999165469790066

In [392]:
%%time
tags_nmf = NMF(n_components=300, random_state=1,
          alpha=.01, l1_ratio=.5).fit(tfidf_tag_sparse_norm)
tags_nmf.components_.shape

CPU times: user 18min 23s, sys: 8.2 s, total: 18min 31s
Wall time: 17min 20s


In [393]:
print_top_words(tags_nmf, tags['tag_name'], 5)

Topic #0: to-read currently-reading favorites owned books-i-own
Topic #1: fiction currently-reading favorites owned books-i-own
Topic #2: fantasy fiction owned favorites sci-fi-fantasy
Topic #3: currently-reading to-read kindle kindle-unlimited owned
Topic #4: romance favorites kindle ebook owned
Topic #5: mystery mysteries fiction crime mystery-thriller
Topic #6: childrens children-s children-s-books kids children
Topic #7: science-fiction sci-fi fiction sf owned
Topic #8: non-fiction nonfiction favorites owned books-i-own
Topic #9: young-adult ya teen books-i-own owned
Topic #10: historical-fiction historical fiction history fiction-historical
Topic #11: comics graphic-novels graphic-novel comic comic-books
Topic #12: paranormal supernatural fantasy urban-fantasy paranormal-fantasy
Topic #13: classics classic literature books-i-own favorites
Topic #14: horror supernatural thriller horror-thriller books-i-own
Topic #15: favorites favourites all-time-favorites favorite books-i-own
Topi

Topic #140: ｆａｖｏｕｒｉｔｅｓ farsi-books farland farley-mowat farm
Topic #141: brad-thor scot-harvath thor thor-brad spy-thriller
Topic #142: series part-of-a-series ebook favorite-series to-buy
Topic #143: v-c-andrews vc-andrews gothic drama guilty-pleasures
Topic #144: feminism feminist women our-shared-shelf gender
Topic #145: pretty-little-liars pll sara-shepard pretty-little-liars-series contemporary
Topic #146: pulitzer pulitzer-prize pulitzer-prize-winners pulitzer-winners pulitzer-fiction
Topic #147: espionage spy spies spy-thriller thrillers
Topic #148: christine-feehan vampire carpathians dark-series vampires
Topic #149: romance dnf friends-to-lovers julia-quinn regency
Topic #150: daniel-silva gabriel-allon espionage spy silva
Topic #151: danielle-steel danielle-steele romance steel steel-danielle
Topic #152: historical bernard-cornwell medieval england historical-mystery
Topic #153: preston-child pendergast pendergast-series douglas-preston preston-and-child
Topic #154: magic wit

Topic #286: laurell-k-hamilton merry-gentry fae paranormal-romance meredith-gentry
Topic #287: hunger-games the-hunger-games read-in-2012 my-books dystopia
Topic #288: chuck-palahniuk palahniuk satire chuck american
Topic #289: demons gena-showalter lords-of-the-underworld demon larissa-ione
Topic #290: german german-literature deutsch germany literature
Topic #291: matthew-reilly action action-adventure maximum-ride adventure
Topic #292: read-in-2012 read-in-2011 read-2012 nook 2012-reads
Topic #293: shifters shapeshifters paranormal nalini-singh shifter
Topic #294: elin-hilderbrand beach-reads summer-reads women-s-fiction chick-lit
Topic #295: time-travel karen-marie-moning highlander highlanders scotland
Topic #296: werewolves shifters werewolf shapeshifters kelley-armstrong
Topic #297: مصطفى-محمود arabic فكر دين islamic
Topic #298: college college-romance na friends-to-lovers new-adult-romance
Topic #299: favorite favorite-books my-favorites all-time-favorites love



In [394]:
tag_vectors = tags_nmf.transform(tfidf_tag_sparse_norm)

In [437]:
np.save('models/tags_nmf.npy',tag_vectors)

### Examine the tag based vectors

In [395]:
book_dict = {}
for i,row in books.iterrows():
    book_dict[row['book_id']] = {
        "tag_vector":tag_vectors[row['book_id']-1]
        ,"book_title":row['title']
        ,"authors":row['authors']
    }

In [379]:
# little function to look up books with similar tag-based NMF vectors
from operator import itemgetter

def similar_book_tags(book_id,book_dict,nr_recs=10):
    vec_i = book_dict[book_id]['tag_vector']
    recs_list = []
    for rec_id,rec_dict in book_dict.items():
        if rec_id != book_id:
            rec_vec = book_dict[rec_id]['tag_vector']
            sim_ij = cosine_sim(vec_i,rec_vec)
            recs_list.append((rec_id
                              ,book_dict[rec_id]['book_title']
                              ,book_dict[rec_id]['authors']
                              ,sim_ij))
    recs_list.sort(key=itemgetter(3),reverse=True)
    return recs_list[:nr_recs]

In [396]:
similar_book_tags(7,book_dict)

[(19,
  'The Fellowship of the Ring (The Lord of the Rings, #1)',
  'J.R.R. Tolkien',
  0.9340245679420155),
 (165,
  'A Feast for Crows (A Song of Ice and Fire, #4)',
  'George R.R. Martin',
  0.8973982751395206),
 (37,
  'The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)',
  'C.S. Lewis',
  0.8865299885418731),
 (542,
  'The Horse and His Boy (Chronicles of Narnia, #5)',
  'C.S. Lewis',
  0.8863709824904458),
 (746,
  'The Lies of Locke Lamora (Gentleman Bastard, #1)',
  'Scott Lynch',
  0.8828784571266783),
 (1629,
  'The Tombs of Atuan (Earthsea Cycle, #2)',
  'Ursula K. Le Guin',
  0.8821309052186329),
 (3278,
  'The Mad Ship (Liveship Traders, #2)',
  'Robin Hobb',
  0.862595791534862),
 (9157,
  'The Redemption of Althalus',
  'David Eddings, Leigh Eddings',
  0.8622506752736607),
 (9408,
  'The Darkest Road (The Fionavar Tapestry, #3)',
  'Guy Gavriel Kay',
  0.86054792031002),
 (859,
  'The Way of Shadows (Night Angel, #1)',
  'Brent Weeks',
  0.8601131487017506

In [371]:
books[books['authors'].str.contains('Tolkien')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
6,7,The Hobbit,J.R.R. Tolkien
18,19,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien
154,155,"The Two Towers (The Lord of the Rings, #2)",J.R.R. Tolkien
160,161,"The Return of the King (The Lord of the Rings,...",J.R.R. Tolkien
188,189,"The Lord of the Rings (The Lord of the Rings, ...",J.R.R. Tolkien
465,466,The Hobbit: Graphic Novel,"Chuck Dixon, J.R.R. Tolkien, David Wenzel, Sea..."
610,611,The Silmarillion (Middle-Earth Universe),"J.R.R. Tolkien, Christopher Tolkien, Ted Nasmith"
963,964,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien
1128,1129,"The History of the Hobbit, Part One: Mr. Baggins","John D. Rateliff, J.R.R. Tolkien"
2308,2309,The Children of Húrin,"J.R.R. Tolkien, Christopher Tolkien, Alan Lee"


In [412]:
# fellowship of the ring
t=list(enumerate(tag_vectors[18]))
t.sort(key=itemgetter(1),reverse=True)
t[0:10]

[(2, 0.15057858357160644),
 (13, 0.044277696561006774),
 (203, 0.03024132777016839),
 (107, 0.020467167122230963),
 (196, 0.01664946361931723),
 (275, 0.01325301950221456),
 (60, 0.013111432757314656),
 (38, 0.012245447024945684),
 (0, 0.010580242672800208),
 (299, 0.010422315236785698)]

In [415]:
similar_book_tags(611,book_dict)

[(1686,
  'Belgarath the Sorcerer',
  'David Eddings, Leigh Eddings',
  0.7595518300053391),
 (39,
  'A Game of Thrones (A Song of Ice and Fire, #1)',
  'George R.R. Martin',
  0.7487529892713196),
 (9975,
  'The King Beyond the Gate (The Drenai Saga, #2)',
  'David Gemmell',
  0.7136884956085894),
 (2119,
  "The Magicians' Guild (Black Magician Trilogy, #1)",
  'Trudi Canavan',
  0.7015211117666034),
 (5554,
  'To Green Angel Tower, Part 2 (Memory, Sorrow, and Thorn, #3; Part 2)',
  'Tad Williams',
  0.6954310363418165),
 (1525,
  'Crossroads of Twilight (Wheel of Time, #10)',
  'Robert Jordan',
  0.6939289530926359),
 (7253,
  'The One Tree (The Second Chronicles of Thomas Covenant, #2)',
  'Stephen R. Donaldson',
  0.6908160988390297),
 (964,
  'J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings',
  'J.R.R. Tolkien',
  0.677188207576125),
 (593,
  'A Wizard of Earthsea (Earthsea Cycle, #1)',
  'Ursula K. Le Guin',
  0.674110828317726),
 (8035,
  'Black Sun Rising 

In [414]:
# fellowship of the ring vs. the hobbit
cosine_sim(tag_vectors[18],tag_vectors[6])

0.9340245679420155

In [352]:
books[books['authors'].str.contains('Le Guin')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
592,593,"A Wizard of Earthsea (Earthsea Cycle, #1)",Ursula K. Le Guin
1573,1574,The Left Hand of Darkness,"Ursula K. Le Guin, Lech Jęczmyk"
1590,1591,"The Farthest Shore (Earthsea Cycle, #3)",Ursula K. Le Guin
1628,1629,"The Tombs of Atuan (Earthsea Cycle, #2)",Ursula K. Le Guin
2209,2210,The Dispossessed,Ursula K. Le Guin
3402,3403,The Lathe of Heaven,Ursula K. Le Guin
5660,5661,"Tehanu (Earthsea Cycle, #4)",Ursula K. Le Guin
7727,7728,The Earthsea Trilogy,Ursula K. Le Guin
8892,8893,"The Other Wind (Earthsea Cycle, #6)",Ursula K. Le Guin
9423,9424,"Tales from Earthsea (Earthsea Cycle, #5)",Ursula K. Le Guin


In [413]:
similar_book_tags(1629,book_dict)

[(8103,
  "Arrow's Flight (Heralds of Valdemar, #2)",
  'Mercedes Lackey',
  0.9827506079227314),
 (8893,
  'The Other Wind (Earthsea Cycle, #6)',
  'Ursula K. Le Guin',
  0.9805384541319503),
 (7893,
  "Arrow's Fall (Heralds of Valdemar, #3)",
  'Mercedes Lackey',
  0.9787978363886207),
 (6602,
  'The Courts of Chaos (The Chronicles of Amber #5)',
  'Roger Zelazny',
  0.9777899843252479),
 (8799,
  'Sign of the Unicorn (The Chronicles of Amber #3)',
  'Roger Zelazny',
  0.9765968376445637),
 (9157,
  'The Redemption of Althalus',
  'David Eddings, Leigh Eddings',
  0.9727192794373689),
 (1591,
  'The Farthest Shore (Earthsea Cycle, #3)',
  'Ursula K. Le Guin',
  0.9714442096741177),
 (9408,
  'The Darkest Road (The Fionavar Tapestry, #3)',
  'Guy Gavriel Kay',
  0.9706916145592804),
 (9711,
  'The Wandering Fire (The Fionavar Tapestry, #2)',
  'Guy Gavriel Kay',
  0.9698709978648941),
 (6005,
  'The Source of Magic (Xanth, #2)',
  'Piers Anthony',
  0.9678202353899648)]

In [417]:
t=list(enumerate(book_dict[1574]['tag_vector']))
t.sort(key=itemgetter(1),reverse=True)
t[0:10]

[(7, 0.08162869195331013),
 (134, 0.06832119471257266),
 (58, 0.06727662334721593),
 (0, 0.062148378644072994),
 (103, 0.04555281390617159),
 (197, 0.03776571190871602),
 (144, 0.03538460539449409),
 (240, 0.025372641637205238),
 (256, 0.01755869236886675),
 (1, 0.01633540416230895)]

In [355]:
cosine_sim(tag_vectors[592],tag_vectors[1573])

0.36445578939412204

In [356]:
cosine_sim(tag_vectors[1628],tag_vectors[1590])

0.9721817443499796

In [418]:
books[books['authors'].str.contains('Le Fanu')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
5580,5581,Carmilla,J. Sheridan Le Fanu


In [419]:
t=list(enumerate(book_dict[5581]['tag_vector']))
t.sort(key=itemgetter(1),reverse=True)
t[0:10]

[(0, 0.09295312680311225),
 (242, 0.07338182301939958),
 (14, 0.04210736386594193),
 (31, 0.025877178511500128),
 (13, 0.02526255551693941),
 (103, 0.023042892590491654),
 (3, 0.009611277943027331),
 (196, 0.008337527618203274),
 (265, 0.007777409289690962),
 (2, 0.006808320481167683)]

In [420]:
similar_book_tags(5581,book_dict)

[(2637,
  'We Have Always Lived in the Castle',
  'Shirley Jackson, Jonathan Lethem',
  0.914663051581773),
 (3773, 'The Woman in Black', 'Susan Hill, John Lawrence', 0.9006665014750327),
 (6278, 'Jamaica Inn', 'Daphne du Maurier', 0.8868306706652832),
 (97,
  'Dracula',
  'Bram Stoker, Nina Auerbach, David J. Skal',
  0.8565524345513097),
 (390,
  'The Strange Case of Dr. Jekyll and Mr. Hyde',
  'Robert Louis Stevenson, Vladimir Nabokov, Mervyn Peake, Dan Chaon',
  0.8526449198595405),
 (7667, 'Lost Souls', 'Poppy Z. Brite', 0.8502002435809503),
 (1954,
  'The Haunting of Hill House',
  'Shirley Jackson, Laura   Miller',
  0.8498683841246023),
 (7650, "Lady Audley's Secret", 'Mary Elizabeth Braddon', 0.838096592630693),
 (1354,
  'The Strange Case of Dr. Jekyll and Mr. Hyde and Other Tales of Terror',
  'Robert Louis Stevenson, Robert Mighall',
  0.8347042040665524),
 (4592,
  'The House of the Seven Gables',
  'Nathaniel Hawthorne, Robert S. Levine',
  0.8114217853040327)]

In [421]:
similar_book_tags(97,book_dict)

[(95,
  'The Picture of Dorian Gray',
  'Oscar Wilde, Jeffrey Eugenides',
  0.9661327608218139),
 (83,
  'A Tale of Two Cities',
  'Charles Dickens, Richard Maxwell, Hablot Knight Browne',
  0.9385147311853879),
 (390,
  'The Strange Case of Dr. Jekyll and Mr. Hyde',
  'Robert Louis Stevenson, Vladimir Nabokov, Mervyn Peake, Dan Chaon',
  0.9383548873626602),
 (42,
  'Little Women (Little Women, #1)',
  'Louisa May Alcott',
  0.9375754029022191),
 (194,
  'Moby-Dick or, The Whale',
  'Herman Melville, Andrew Delbanco, Tom Quirk',
  0.9358970867226835),
 (1354,
  'The Strange Case of Dr. Jekyll and Mr. Hyde and Other Tales of Terror',
  'Robert Louis Stevenson, Robert Mighall',
  0.9351013897345731),
 (701, 'Mrs. Dalloway', 'Virginia Woolf, Maureen Howard', 0.9313318397818376),
 (160, 'Great Expectations', 'Charles Dickens', 0.9312162636740992),
 (9780,
  'Necronomicon: The Best Weird Tales',
  'H.P. Lovecraft, Les Edwards, Stephen Jones',
  0.9295131977041142),
 (361,
  'Oliver Twist',

In [423]:
books[books['authors'].str.contains('Jane Austen')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
9,10,Pride and Prejudice,Jane Austen
75,76,Sense and Sensibility,"Jane Austen, Tony Tanner, Ros Ballaster"
170,171,Emma,"Jane Austen, Fiona Stafford"
229,230,Persuasion,"Jane Austen, James Kinsley, Deidre Shauna Lynch"
450,451,Northanger Abbey,"Jane Austen, Alfred MacAdam"
470,471,Mansfield Park,"Jane Austen, Kathryn Sutherland, Tony Tanner, ..."
1038,1039,Pride and Prejudice and Zombies (Pride and Pre...,"Seth Grahame-Smith, Jane Austen"
4652,4653,The Complete Novels,Jane Austen
4844,4845,Dawn of the Dreadfuls (Pride and Prejudice and...,"Steve Hockensmith, Jane Austen, Patrick Arrasmith"
6301,6302,Lady Susan,"Jane Austen, Robert William Chapman"


In [430]:
similar_book_tags(10,book_dict)

[(5, 'The Great Gatsby', 'F. Scott Fitzgerald', 0.923428936493511),
 (63,
  'Wuthering Heights',
  'Emily Brontë, Richard J. Dunn',
  0.9215082638372557),
 (103,
  'The Count of Monte Cristo',
  'Alexandre Dumas, Robin Buss',
  0.9141754721554975),
 (559,
  "Tess of the D'Urbervilles",
  'Thomas Hardy, Tim Dolin, Margaret R. Higonnet',
  0.910227790644946),
 (8, 'The Catcher in the Rye', 'J.D. Salinger', 0.9054905979908731),
 (4, 'To Kill a Mockingbird', 'Harper Lee', 0.8947442043973408),
 (2197, 'Jude the Obscure', 'Thomas Hardy', 0.8676931030019772),
 (590,
  'David Copperfield',
  'Charles Dickens, Jeremy Tambling',
  0.8650606212848461),
 (2875,
  'Of Human Bondage',
  'W. Somerset Maugham, Benjamin DeMott, Maeve Binchy',
  0.861648662377581),
 (1783,
  'The Portrait of a Lady',
  'Henry James, Patricia Crick',
  0.8602329665007536)]

In [434]:
books[books['authors'].str.contains('L.M. Montgomery')][['book_id','title','authors']]

Unnamed: 0,book_id,title,authors
132,133,"Anne of Green Gables (Anne of Green Gables, #1)",L.M. Montgomery
957,958,The Complete Anne of Green Gables Boxed Set (A...,L.M. Montgomery
1111,1112,"Anne of the Island (Anne of Green Gables, #3)",L.M. Montgomery
1209,1210,"Anne of Avonlea (Anne of Green Gables, #2)",L.M. Montgomery
2820,2821,"Anne's House of Dreams (Anne of Green Gables, #5)",L.M. Montgomery
3137,3138,"Anne of Windy Poplars (Anne of Green Gables, #4)",L.M. Montgomery
3903,3904,"Anne of Ingleside (Anne of Green Gables, #6)",L.M. Montgomery
4013,4014,"Emily of New Moon (Emily, #1)",L.M. Montgomery
4697,4698,"Rilla of Ingleside (Anne of Green Gables, #8)",L.M. Montgomery
5566,5567,"Rainbow Valley (Anne of Green Gables, #7)",L.M. Montgomery


In [435]:
similar_book_tags(133,book_dict)

[(5567,
  'Rainbow Valley (Anne of Green Gables, #7)',
  'L.M. Montgomery',
  0.7927136852819691),
 (7430,
  'Rose in Bloom (Eight Cousins, #2)',
  'Louisa May Alcott',
  0.7438221620121458),
 (1112,
  'Anne of the Island (Anne of Green Gables, #3)',
  'L.M. Montgomery',
  0.711326392547849),
 (3138,
  'Anne of Windy Poplars (Anne of Green Gables, #4)',
  'L.M. Montgomery',
  0.7089663660054109),
 (8591, 'Emily Climbs (Emily, #2)', 'L.M. Montgomery', 0.7002495388359219),
 (4698,
  'Rilla of Ingleside (Anne of Green Gables, #8)',
  'L.M. Montgomery',
  0.6999145383527379),
 (3904,
  'Anne of Ingleside (Anne of Green Gables, #6)',
  'L.M. Montgomery',
  0.6788495414554518),
 (1210,
  'Anne of Avonlea (Anne of Green Gables, #2)',
  'L.M. Montgomery',
  0.65702216492053),
 (937,
  'His Dark Materials (His Dark Materials #1-3)',
  'Philip Pullman',
  0.648474093162727),
 (1489,
  'On the Banks of Plum Creek  (Little House, #4)',
  'Laura Ingalls Wilder, Garth Williams',
  0.63803146264725)]