# Goodreads Topic Extraction and Recommendations

In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd

# Alternative input locations, currently disabled:
# raw_data_path = '../input/goodreads10kcsv/'
# model_output_path = '../input/goodreads-models/'

# Locations used by this run, relative to the working directory.
raw_data_path = './data/raw/goodbooks-10k/'
model_output_path = './data/models/'

# Ratings are read with narrow unsigned dtypes to keep the frame small.
# NOTE(review): uint16 tops out at 65,535 -- confirm every user_id/book_id
# in ratings.csv stays below that.
ratings_df = pd.read_csv(
    raw_data_path + 'ratings.csv',
    dtype=dict(rating=np.uint8, user_id=np.uint16, book_id=np.uint16))

# The remaining tables are read with pandas' inferred dtypes.
to_read_df = pd.read_csv(raw_data_path + 'to_read.csv')
books_df = pd.read_csv(raw_data_path + 'books.csv')

tags_df = pd.read_csv(raw_data_path + 'tags.csv')
book_tags_df = pd.read_csv(raw_data_path + 'book_tags.csv')

In [2]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [3]:
# Preview the first rows of the to-read table.
to_read_df.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


In [4]:
# Preview the first rows of the book metadata table.
books_df.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
# Preview the first rows of the tag_id -> tag_name lookup table.
tags_df.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [6]:
# Preview the first rows of the per-book tag counts
# (keyed by goodreads_book_id and tag_id).
book_tags_df.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [7]:
# Author/title pair.
# NOTE(review): no cell visible in this notebook reads bks_cols -- confirm
# whether it is still needed.
bks_cols = ['authors', 'title']

# Book metadata columns that get joined onto the topic vectors and shown in
# the previews below.
books_cols = [
    'goodreads_book_id',
    'book_id',
    'authors',
    'title',
    'average_rating',
    'ratings_count',
]

In [8]:
# Show the selected metadata columns for the first 25 rows of books_df.
books_df[books_cols].head(25)

Unnamed: 0,goodreads_book_id,book_id,authors,title,average_rating,ratings_count
0,2767052,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,4780653
1,3,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479
2,41865,3,Stephenie Meyer,"Twilight (Twilight, #1)",3.57,3866839
3,2657,4,Harper Lee,To Kill a Mockingbird,4.25,3198671
4,4671,5,F. Scott Fitzgerald,The Great Gatsby,3.89,2683664
5,11870085,6,John Green,The Fault in Our Stars,4.26,2346404
6,5907,7,J.R.R. Tolkien,The Hobbit,4.25,2071616
7,5107,8,J.D. Salinger,The Catcher in the Rye,3.79,2044241
8,960,9,Dan Brown,"Angels & Demons (Robert Langdon, #1)",3.85,2001311
9,1885,10,Jane Austen,Pride and Prejudice,4.24,2035490


In [9]:
# Load the W_10 / H_10 factor matrices as float16 and report their combined
# in-memory size.
W10, H10 = (
    np.loadtxt(model_output_path + stem + '.csv', dtype=np.float16)
    for stem in ('W_10', 'H_10'))
print('Model size (bytes): {:,}'.format(W10.nbytes + H10.nbytes))

Model size (bytes): 1,268,480


In [10]:
# Load the W_25 / H_25 factor matrices as float16 and report their combined
# in-memory size.
W25, H25 = (
    np.loadtxt(model_output_path + stem + '.csv', dtype=np.float16)
    for stem in ('W_25', 'H_25'))
print('Model size (bytes): {:,}'.format(W25.nbytes + H25.nbytes))

Model size (bytes): 3,171,200


In [11]:
# Load the W_50 / H_50 factor matrices as float16 and report their combined
# in-memory size.
W50, H50 = (
    np.loadtxt(model_output_path + stem + '.csv', dtype=np.float16)
    for stem in ('W_50', 'H_50'))
print('Model size (bytes): {:,}'.format(W50.nbytes + H50.nbytes))

Model size (bytes): 6,342,400


## Topic Models

In [12]:
# Common tags that are not clearly relevant to a book's topic, picked by
# manual inspection; they are dropped from the per-topic tag rankings.
book_topic_remove_tags_list = [
    30574,  # to-read
    8717,   # currently-reading
    11557,  # favorites
    5207,   # books-i-own
    22743,  # owned
    11590,  # favourites
    22753,  # owned-books
    30521,  # to-buy
    11743,  # fiction
]

# .loc looks rows up by index label, so this display relies on the row label
# of tags_df matching tag_id (as the rendered output shows).
tags_df.loc[book_topic_remove_tags_list]

Unnamed: 0,tag_id,tag_name
30574,30574,to-read
8717,8717,currently-reading
11557,11557,favorites
5207,5207,books-i-own
22743,22743,owned
11590,11590,favourites
22753,22753,owned-books
30521,30521,to-buy
11743,11743,fiction


### $k=10$

In [13]:
# Wrap the k=10 factors as DataFrames. model_dfs[1] (H10 transposed) is the
# frame later joined row-by-row with books_df, one column per topic.
# NOTE(review): W10 rows presumably correspond to users -- confirm.
model_dfs = (pd.DataFrame(W10), pd.DataFrame(H10.T))

# k is the number of topic columns in the transposed H factor.
n_components = model_dfs[1].shape[1]

#### Inspect Top Books in Topic

In [14]:
# Attach book metadata to the topic vectors once; the join does not depend on
# the topic, so it is hoisted out of the per-topic loop.
# NOTE(review): the join is on the row index, so it assumes row i of
# model_dfs[1] describes the same book as row i of books_df -- confirm.
books_with_topics_df = model_dfs[1].join(books_df[books_cols])

# Dictionary: topic -> books sorted by that topic's score, highest first.
# sort_values returns a new frame, so the entries do not share state.
books_and_topics_dict = {
    topic: books_with_topics_df.sort_values(by=topic, ascending=False)
    for topic in range(n_components)
}

In [15]:
show_rows = 12

# For every topic, take authors/title of its show_rows highest-scoring books
# and place the topics side by side. reset_index(drop=True) aligns the rows by
# rank (0..show_rows-1) instead of by the original book row label.
top_books_in_topic_df = pd.concat(
    [
        books_and_topics_dict[topic][['authors', 'title']]
        .head(show_rows)
        .reset_index(drop=True)
        for topic in range(n_components)
    ],
    axis=1)

# Label the columns as (topic, authors/title) pairs.
top_books_in_topic_df.columns = pd.MultiIndex.from_product(
    [list(model_dfs[1].columns), ['authors', 'title']],
    names=['topic', ''])

top_books_in_topic_df

topic,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9
Unnamed: 0_level_1,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title
0,"George Orwell, Erich Fromm, Celâl Üster",1984,"J.K. Rowling, Mary GrandPré, Rufus Beck",Harry Potter and the Prisoner of Azkaban (Harr...,Kathryn Stockett,The Help,George R.R. Martin,"A Game of Thrones (A Song of Ice and Fire, #1)",John Grisham,"The Firm (Penguin Readers, Level 5)",Veronica Roth,"Divergent (Divergent, #1)","E.B. White, Garth Williams, Rosemary Wells",Charlotte's Web,Stephen King,It,Stephenie Meyer,"Twilight (Twilight, #1)",Jane Austen,Pride and Prejudice
1,George Orwell,Animal Farm,"J.K. Rowling, Mary GrandPré",Harry Potter and the Goblet of Fire (Harry Pot...,Gillian Flynn,Gone Girl,George R.R. Martin,"A Clash of Kings (A Song of Ice and Fire, #2)",John Grisham,A Time to Kill,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",Maurice Sendak,Where the Wild Things Are,"Stephen King, Bernie Wrightson",The Stand,Stephenie Meyer,"Breaking Dawn (Twilight, #4)","Charlotte Brontë, Michael Mason",Jane Eyre
2,Kurt Vonnegut Jr.,Slaughterhouse-Five,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...,Khaled Hosseini,The Kite Runner,George R.R. Martin,"A Storm of Swords (A Song of Ice and Fire, #3)",John Grisham,The Client,Veronica Roth,"Insurgent (Divergent, #2)",Shel Silverstein,The Giving Tree,Stephen King,The Shining (The Shining #1),Stephenie Meyer,"Eclipse (Twilight, #3)",Louisa May Alcott,"Little Women (Little Women, #1)"
3,J.D. Salinger,The Catcher in the Rye,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,Sara Gruen,Water for Elephants,J.R.R. Tolkien,The Fellowship of the Ring (The Lord of the Ri...,John Grisham,The Pelican Brief,John Green,The Fault in Our Stars,Shel Silverstein,Where the Sidewalk Ends,Stephen King,Misery,Stephenie Meyer,"New Moon (Twilight, #2)","Jane Austen, Tony Tanner, Ros Ballaster",Sense and Sensibility
4,Aldous Huxley,Brave New World,"J.K. Rowling, Mary GrandPré",Harry Potter and the Order of the Phoenix (Har...,Markus Zusak,The Book Thief,J.R.R. Tolkien,The Hobbit,Dan Brown,"Angels & Demons (Robert Langdon, #1)",Cassandra Clare,"City of Bones (The Mortal Instruments, #1)","Dr. Seuss, לאה נאור",Green Eggs and Ham,Stephen King,Carrie,E.L. James,"Fifty Shades of Grey (Fifty Shades, #1)","Emily Brontë, Richard J. Dunn",Wuthering Heights
5,F. Scott Fitzgerald,The Great Gatsby,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,"Stieg Larsson, Reg Keeland","The Girl with the Dragon Tattoo (Millennium, #1)",Orson Scott Card,"Ender's Game (Ender's Saga, #1)",John Grisham,The Runaway Jury,Rick Riordan,The Lightning Thief (Percy Jackson and the Oly...,Dr. Seuss,The Cat in the Hat,Stephen King,Pet Sematary,E.L. James,"Fifty Shades Darker (Fifty Shades, #2)",Harper Lee,To Kill a Mockingbird
6,William Golding,Lord of the Flies,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,Paula Hawkins,The Girl on the Train,George R.R. Martin,"A Feast for Crows (A Song of Ice and Fire, #4)","Stieg Larsson, Reg Keeland","The Girl with the Dragon Tattoo (Millennium, #1)",Suzanne Collins,"Catching Fire (The Hunger Games, #2)","Margaret Wise Brown, Clement Hurd",Goodnight Moon,Stephen King,'Salem's Lot,E.L. James,"Fifty Shades Freed (Fifty Shades, #3)","Jane Austen, Fiona Stafford",Emma
7,Ray Bradbury,Fahrenheit 451,Dan Brown,"The Da Vinci Code (Robert Langdon, #2)",Sue Monk Kidd,The Secret Life of Bees,Patrick Rothfuss,The Name of the Wind (The Kingkiller Chronicle...,James Patterson,"Along Came a Spider (Alex Cross, #1)",Cassandra Clare,"City of Glass (The Mortal Instruments, #3)",Shel Silverstein,A Light in the Attic,Stephen King,Needful Things,Charlaine Harris,"Dead Until Dark (Sookie Stackhouse, #1)",Margaret Mitchell,Gone with the Wind
8,Joseph Heller,Catch-22,Suzanne Collins,"Catching Fire (The Hunger Games, #2)",Khaled Hosseini,A Thousand Splendid Suns,George R.R. Martin,"A Dance with Dragons (A Song of Ice and Fire, #5)",John Grisham,The Rainmaker,Cassandra Clare,"City of Ashes (The Mortal Instruments, #2)",Dr. Seuss,How the Grinch Stole Christmas!,Stephen King,"The Gunslinger (The Dark Tower, #1)",Suzanne Collins,"The Hunger Games (The Hunger Games, #1)","William Shakespeare, Robert Jackson",Romeo and Juliet
9,John Steinbeck,Of Mice and Men,Suzanne Collins,"Mockingjay (The Hunger Games, #3)",Anthony Doerr,All the Light We Cannot See,Douglas Adams,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Robert Ludlum,"The Bourne Identity (Jason Bourne, #1)",Suzanne Collins,"Mockingjay (The Hunger Games, #3)",Dr. Seuss,"Oh, The Places You'll Go!",Stephen King,The Green Mile,Suzanne Collins,"Catching Fire (The Hunger Games, #2)","Anne Frank, Eleanor Roosevelt, B.M. Mooyaart-D...",The Diary of a Young Girl


#### Inspect Top Tags in Topic

For each book whose score in a `topic` is at least `cutoff`, take the counts of that book's tags (the top 100 tags for the book) and sum them per tag, weighting each count by the book's score in that `topic`.

In [16]:
def column_dot(x, topic):
    """Return the dot product of ``x[topic]`` and ``x['count']``.

    The scalar is wrapped in a one-element Series (index label 0) so that a
    groupby-apply over this function yields a frame with a column named 0.
    """
    scores = x[topic]
    counts = x['count']
    weighted_total = (scores * counts).sum()
    return pd.Series(data=weighted_total)

def show_top_book_topic_tags(dict_of_df, topic, cutoff=2):
    """Rank tags for one topic by score-weighted tag counts.

    Parameters
    ----------
    dict_of_df : dict
        Maps a topic label to a DataFrame (as in ``books_and_topics_dict``)
        that has a ``goodreads_book_id`` column and a score column named
        after the topic.
    topic : hashable
        Key into ``dict_of_df`` and name of the score column to weight by.
    cutoff : number, default 2
        Only books whose ``topic`` score is at least this value contribute.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``tag_id`` and sorted by descending weight. Column ``0``
        holds the weighted sum; the remaining columns come from ``tags_df``.
        Tags listed in ``book_topic_remove_tags_list`` are left out. The
        frame is empty when no book reaches ``cutoff``.

    Notes
    -----
    Reads the notebook-level ``book_tags_df``, ``tags_df`` and
    ``book_topic_remove_tags_list``.
    """
    topic_df = dict_of_df[topic]

    # Keep only the books that load on this topic at or above the cutoff.
    strong_books_df = topic_df.loc[topic_df[topic] >= cutoff,
                                   [topic, 'goodreads_book_id']]

    # One row per (book, tag) pair, carrying the book's topic score.
    book_tag_counts_df = pd.merge(
        strong_books_df, book_tags_df, on='goodreads_book_id')

    # Weight every tag count by the book's topic score. The score is widened
    # to float64 first so the products are never formed at reduced precision.
    weighted_counts = (book_tag_counts_df[topic].astype('float64') *
                       book_tag_counts_df['count'])

    top_topic_tags_df = (
        # Vectorised per-tag total; replaces a Python-level apply per
        # (book, tag) group. The weighted-sum column is named 0.
        weighted_counts.groupby(book_tag_counts_df['tag_id']).sum()
        .to_frame(0)
        .sort_values(by=0, ascending=False)
        # Joined on the row index of tags_df.
        # NOTE(review): assumes the row label of tags_df equals tag_id.
        .join(tags_df)
        # errors='ignore': a removed tag may simply not occur for this topic.
        .drop(labels=book_topic_remove_tags_list, errors='ignore'))

    return top_topic_tags_df

In [17]:
# Top show_rows tags for topic 7, re-indexed 0..n-1 for display.
show_top_book_topic_tags(books_and_topics_dict, 7).head(show_rows).reset_index(drop=True)

Unnamed: 0,0,tag_id,tag_name
0,540307.478516,14821,horror
1,279919.134766,28663,stephen-king
2,193002.996094,11305,fantasy
3,64384.876953,30358,thriller
4,60575.052734,17273,king
5,44781.710938,26837,science-fiction
6,44575.900391,9221,default
7,42561.953125,7457,classics
8,35729.720703,26771,sci-fi
9,27013.197266,20939,mystery


In [18]:
# One column per topic holding the names of its show_rows top-ranked tags.
# Each column is re-indexed 0..n-1 so the topics line up by rank.
top_tag_names_df = pd.concat(
    [
        show_top_book_topic_tags(books_and_topics_dict, topic)['tag_name']
        .head(show_rows)
        .reset_index(drop=True)
        .rename(topic)
        for topic in range(n_components)
    ],
    axis=1)

top_tag_names_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,classics,fantasy,historical-fiction,fantasy,mystery,young-adult,childrens,horror,young-adult,classics
1,classic,young-adult,young-adult,science-fiction,thriller,fantasy,classics,stephen-king,fantasy,fantasy
2,science-fiction,harry-potter,book-club,sci-fi,fantasy,ya,children,fantasy,romance,classic
3,sci-fi,ya,classics,classics,young-adult,dystopian,children-s-books,thriller,vampires,young-adult
4,literature,series,mystery,young-adult,crime,romance,fantasy,king,ya,romance
5,fantasy,magic,fantasy,series,suspense,series,children-s,science-fiction,paranormal,historical-fiction
6,school,childrens,contemporary,sci-fi-fantasy,series,dystopia,young-adult,default,series,literature
7,novels,re-read,ya,adventure,science-fiction,science-fiction,picture-books,classics,dystopian,school
8,dystopian,adventure,romance,epic-fantasy,john-grisham,sci-fi,childhood,sci-fi,dystopia,historical
9,historical-fiction,children,non-fiction,ya,default,paranormal,kids,mystery,vampire,novels


In [19]:
# Human-readable labels for the k=10 topics, assigned by hand after
# inspecting each topic's top books and top tags.
manual_book_topics = {
    0: "Modern Classics",
    1: "Harry Potter",
    2: "Fiction",
    3: "Fantasy & Sci-Fi",
    4: "Thrillers",
    5: "Young Adult",
    6: "Children's",
    7: "Stephen King",
    8: "Twilight & Fifty Shades",
    9: "Austen & Brontës",
}

In [20]:
# Same top-books table, with the numeric topic labels replaced by the manual
# names (rename returns a new frame; top_books_in_topic_df is unchanged).
top_books_in_topic_df.rename(manual_book_topics, axis=1)

topic,Modern Classics,Modern Classics,Harry Potter,Harry Potter,Fiction,Fiction,Fantasy & Sci-Fi,Fantasy & Sci-Fi,Thrillers,Thrillers,Young Adult,Young Adult,Children's,Children's,Stephen King,Stephen King,Twilight & Fifty Shades,Twilight & Fifty Shades,Austen & Brontës,Austen & Brontës
Unnamed: 0_level_1,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title,authors,title
0,"George Orwell, Erich Fromm, Celâl Üster",1984,"J.K. Rowling, Mary GrandPré, Rufus Beck",Harry Potter and the Prisoner of Azkaban (Harr...,Kathryn Stockett,The Help,George R.R. Martin,"A Game of Thrones (A Song of Ice and Fire, #1)",John Grisham,"The Firm (Penguin Readers, Level 5)",Veronica Roth,"Divergent (Divergent, #1)","E.B. White, Garth Williams, Rosemary Wells",Charlotte's Web,Stephen King,It,Stephenie Meyer,"Twilight (Twilight, #1)",Jane Austen,Pride and Prejudice
1,George Orwell,Animal Farm,"J.K. Rowling, Mary GrandPré",Harry Potter and the Goblet of Fire (Harry Pot...,Gillian Flynn,Gone Girl,George R.R. Martin,"A Clash of Kings (A Song of Ice and Fire, #2)",John Grisham,A Time to Kill,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",Maurice Sendak,Where the Wild Things Are,"Stephen King, Bernie Wrightson",The Stand,Stephenie Meyer,"Breaking Dawn (Twilight, #4)","Charlotte Brontë, Michael Mason",Jane Eyre
2,Kurt Vonnegut Jr.,Slaughterhouse-Five,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...,Khaled Hosseini,The Kite Runner,George R.R. Martin,"A Storm of Swords (A Song of Ice and Fire, #3)",John Grisham,The Client,Veronica Roth,"Insurgent (Divergent, #2)",Shel Silverstein,The Giving Tree,Stephen King,The Shining (The Shining #1),Stephenie Meyer,"Eclipse (Twilight, #3)",Louisa May Alcott,"Little Women (Little Women, #1)"
3,J.D. Salinger,The Catcher in the Rye,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,Sara Gruen,Water for Elephants,J.R.R. Tolkien,The Fellowship of the Ring (The Lord of the Ri...,John Grisham,The Pelican Brief,John Green,The Fault in Our Stars,Shel Silverstein,Where the Sidewalk Ends,Stephen King,Misery,Stephenie Meyer,"New Moon (Twilight, #2)","Jane Austen, Tony Tanner, Ros Ballaster",Sense and Sensibility
4,Aldous Huxley,Brave New World,"J.K. Rowling, Mary GrandPré",Harry Potter and the Order of the Phoenix (Har...,Markus Zusak,The Book Thief,J.R.R. Tolkien,The Hobbit,Dan Brown,"Angels & Demons (Robert Langdon, #1)",Cassandra Clare,"City of Bones (The Mortal Instruments, #1)","Dr. Seuss, לאה נאור",Green Eggs and Ham,Stephen King,Carrie,E.L. James,"Fifty Shades of Grey (Fifty Shades, #1)","Emily Brontë, Richard J. Dunn",Wuthering Heights
5,F. Scott Fitzgerald,The Great Gatsby,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,"Stieg Larsson, Reg Keeland","The Girl with the Dragon Tattoo (Millennium, #1)",Orson Scott Card,"Ender's Game (Ender's Saga, #1)",John Grisham,The Runaway Jury,Rick Riordan,The Lightning Thief (Percy Jackson and the Oly...,Dr. Seuss,The Cat in the Hat,Stephen King,Pet Sematary,E.L. James,"Fifty Shades Darker (Fifty Shades, #2)",Harper Lee,To Kill a Mockingbird
6,William Golding,Lord of the Flies,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,Paula Hawkins,The Girl on the Train,George R.R. Martin,"A Feast for Crows (A Song of Ice and Fire, #4)","Stieg Larsson, Reg Keeland","The Girl with the Dragon Tattoo (Millennium, #1)",Suzanne Collins,"Catching Fire (The Hunger Games, #2)","Margaret Wise Brown, Clement Hurd",Goodnight Moon,Stephen King,'Salem's Lot,E.L. James,"Fifty Shades Freed (Fifty Shades, #3)","Jane Austen, Fiona Stafford",Emma
7,Ray Bradbury,Fahrenheit 451,Dan Brown,"The Da Vinci Code (Robert Langdon, #2)",Sue Monk Kidd,The Secret Life of Bees,Patrick Rothfuss,The Name of the Wind (The Kingkiller Chronicle...,James Patterson,"Along Came a Spider (Alex Cross, #1)",Cassandra Clare,"City of Glass (The Mortal Instruments, #3)",Shel Silverstein,A Light in the Attic,Stephen King,Needful Things,Charlaine Harris,"Dead Until Dark (Sookie Stackhouse, #1)",Margaret Mitchell,Gone with the Wind
8,Joseph Heller,Catch-22,Suzanne Collins,"Catching Fire (The Hunger Games, #2)",Khaled Hosseini,A Thousand Splendid Suns,George R.R. Martin,"A Dance with Dragons (A Song of Ice and Fire, #5)",John Grisham,The Rainmaker,Cassandra Clare,"City of Ashes (The Mortal Instruments, #2)",Dr. Seuss,How the Grinch Stole Christmas!,Stephen King,"The Gunslinger (The Dark Tower, #1)",Suzanne Collins,"The Hunger Games (The Hunger Games, #1)","William Shakespeare, Robert Jackson",Romeo and Juliet
9,John Steinbeck,Of Mice and Men,Suzanne Collins,"Mockingjay (The Hunger Games, #3)",Anthony Doerr,All the Light We Cannot See,Douglas Adams,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Robert Ludlum,"The Bourne Identity (Jason Bourne, #1)",Suzanne Collins,"Mockingjay (The Hunger Games, #3)",Dr. Seuss,"Oh, The Places You'll Go!",Stephen King,The Green Mile,Suzanne Collins,"Catching Fire (The Hunger Games, #2)","Anne Frank, Eleanor Roosevelt, B.M. Mooyaart-D...",The Diary of a Young Girl


In [21]:
# Same top-tags table, with the numeric topic labels replaced by the manual
# names (rename returns a new frame; top_tag_names_df is unchanged).
top_tag_names_df.rename(manual_book_topics, axis=1)

Unnamed: 0,Modern Classics,Harry Potter,Fiction,Fantasy & Sci-Fi,Thrillers,Young Adult,Children's,Stephen King,Twilight & Fifty Shades,Austen & Brontës
0,classics,fantasy,historical-fiction,fantasy,mystery,young-adult,childrens,horror,young-adult,classics
1,classic,young-adult,young-adult,science-fiction,thriller,fantasy,classics,stephen-king,fantasy,fantasy
2,science-fiction,harry-potter,book-club,sci-fi,fantasy,ya,children,fantasy,romance,classic
3,sci-fi,ya,classics,classics,young-adult,dystopian,children-s-books,thriller,vampires,young-adult
4,literature,series,mystery,young-adult,crime,romance,fantasy,king,ya,romance
5,fantasy,magic,fantasy,series,suspense,series,children-s,science-fiction,paranormal,historical-fiction
6,school,childrens,contemporary,sci-fi-fantasy,series,dystopia,young-adult,default,series,literature
7,novels,re-read,ya,adventure,science-fiction,science-fiction,picture-books,classics,dystopian,school
8,dystopian,adventure,romance,epic-fantasy,john-grisham,sci-fi,childhood,sci-fi,dystopia,historical
9,historical-fiction,children,non-fiction,ya,default,paranormal,kids,mystery,vampire,novels


In [22]:
# Output: write the k=10 tables as LaTeX fragments.

table_output_path = './table/goodreads-topics-recommendations/'

# to_latex does not create missing directories; make sure the target exists
# so the exports also work on a fresh checkout.
os.makedirs(table_output_path, exist_ok=True)

# Apply the manual topic names once and reuse the result for every export
# below instead of renaming again for each file.
named_top_books_df = top_books_in_topic_df.rename(manual_book_topics, axis=1)
named_top_tags_df = top_tag_names_df.rename(manual_book_topics, axis=1)

tags_df.head().to_latex(table_output_path + 'tags.tex', index=False)

book_tags_df.head().to_latex(table_output_path + 'book-tags.tex', index=False)

tags_df.loc[book_topic_remove_tags_list].to_latex(
    table_output_path + 'book-topic-remove-tags.tex',
    columns=['tag_name'],
    index=False)

# One fixed-width paragraph column per (topic, authors/title) column.
named_top_books_df.to_latex(
    table_output_path + 'top-books-in-topic.tex',
    index=False,
    column_format='p{3cm}' * len(top_books_in_topic_df.columns))

# The tag table is split into two files: topics 0-4 and topics 5-9.
named_top_tags_df.to_latex(
    table_output_path + 'top-tag-names-left.tex',
    index=False,
    columns=[manual_book_topics[topic] for topic in range(5)])

named_top_tags_df.to_latex(
    table_output_path + 'top-tag-names-right.tex',
    index=False,
    columns=[manual_book_topics[topic] for topic in range(5, 10)])

# One small authors/title table per topic.
for topic in range(n_components):
    named_top_books_df.loc[:, manual_book_topics[topic]].to_latex(
        table_output_path + 'top-books-in-topic-' + str(topic) + '.tex',
        index=False)

### $k=25$

In [23]:
# Wrap the k=25 factors as DataFrames. model_dfs[1] (H25 transposed) is the
# frame later joined row-by-row with books_df, one column per topic.
# This rebinds model_dfs and n_components from the k=10 section.
model_dfs = (pd.DataFrame(W25), pd.DataFrame(H25.T))

# k is the number of topic columns in the transposed H factor.
n_components = model_dfs[1].shape[1]

In [24]:
# Attach book metadata to the topic vectors once; the join does not depend on
# the topic, so it is hoisted out of the per-topic loop.
# NOTE(review): the join is on the row index, so it assumes row i of
# model_dfs[1] describes the same book as row i of books_df -- confirm.
books_with_topics_df = model_dfs[1].join(books_df[books_cols])

# Dictionary: topic -> books sorted by that topic's score, highest first.
# sort_values returns a new frame, so the entries do not share state.
books_and_topics_dict = {
    topic: books_with_topics_df.sort_values(by=topic, ascending=False)
    for topic in range(len(model_dfs[1].columns))
}

In [25]:
show_rows = 12

# For every topic, take authors/title of its show_rows highest-scoring books
# and place the topics side by side. reset_index(drop=True) aligns the rows by
# rank (0..show_rows-1) instead of by the original book row label.
top_books_in_topic_df = pd.concat(
    [
        books_and_topics_dict[topic][['authors', 'title']]
        .head(show_rows)
        .reset_index(drop=True)
        for topic in range(len(model_dfs[1].columns))
    ],
    axis=1)

# Label the columns as (topic, authors/title) pairs.
top_books_in_topic_df.columns = pd.MultiIndex.from_product(
    [list(model_dfs[1].columns), ['authors', 'title']],
    names=['topic', ''])

In [26]:
# Inspect a subset of topics
topics_list = list(range(5))

In [27]:
# Top books for the topics in topics_list only.
top_books_in_topic_df[topics_list]

topic,0,0,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,authors,title,authors,title,authors,title,authors,title,authors,title
0,Harper Lee,To Kill a Mockingbird,"J.K. Rowling, Mary GrandPré, Rufus Beck",Harry Potter and the Prisoner of Azkaban (Harr...,Paula Hawkins,The Girl on the Train,Douglas Adams,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Stephen King,The Shining (The Shining #1)
1,George Orwell,Animal Farm,"J.K. Rowling, Mary GrandPré",Harry Potter and the Goblet of Fire (Harry Pot...,Anthony Doerr,All the Light We Cannot See,Orson Scott Card,"Ender's Game (Ender's Saga, #1)",Stephen King,It
2,F. Scott Fitzgerald,The Great Gatsby,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,Gillian Flynn,Gone Girl,Frank Herbert,Dune (Dune Chronicles #1),Stephen King,Misery
3,J.D. Salinger,The Catcher in the Rye,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...,Jojo Moyes,"Me Before You (Me Before You, #1)",Isaac Asimov,Foundation (Foundation #1),"Stephen King, Bernie Wrightson",The Stand
4,"George Orwell, Erich Fromm, Celâl Üster",1984,"J.K. Rowling, Mary GrandPré",Harry Potter and the Order of the Phoenix (Har...,Kristin Hannah,The Nightingale,Ray Bradbury,Fahrenheit 451,Stephen King,Carrie
5,William Golding,Lord of the Flies,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,Liane Moriarty,Big Little Lies,"George Orwell, Erich Fromm, Celâl Üster",1984,Stephen King,Pet Sematary
6,John Steinbeck,Of Mice and Men,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,John Green,The Fault in Our Stars,Neil Gaiman,"American Gods (American Gods, #1)",Stephen King,'Salem's Lot
7,Ray Bradbury,Fahrenheit 451,Dan Brown,"The Da Vinci Code (Robert Langdon, #2)",Liane Moriarty,The Husband's Secret,Aldous Huxley,Brave New World,Stephen King,Needful Things
8,Aldous Huxley,Brave New World,J.K. Rowling,The Tales of Beedle the Bard,Donna Tartt,The Goldfinch,"Alan Moore, Dave Gibbons, John Higgins",Watchmen,Stephen King,Cujo
9,"Mark Twain, John Seelye, Guy Cardwell",The Adventures of Huckleberry Finn,"C.S. Lewis, Pauline Baynes",The Chronicles of Narnia (Chronicles of Narnia...,Christina Baker Kline,Orphan Train,Isaac Asimov,"I, Robot (Robot #0.1)",Stephen King,Firestarter


In [28]:
# Names of the show_rows top-ranked tags for each topic in topics_list, one
# column per topic, re-indexed 0..n-1 so the topics line up by rank.
pd.concat(
    [
        show_top_book_topic_tags(books_and_topics_dict, topic)['tag_name']
        .head(show_rows)
        .reset_index(drop=True)
        .rename(topic)
        for topic in topics_list
    ],
    axis=1)

Unnamed: 0,0,1,2,3,4
0,classics,fantasy,book-club,science-fiction,horror
1,classic,young-adult,historical-fiction,sci-fi,stephen-king
2,fantasy,harry-potter,mystery,fantasy,fantasy
3,literature,ya,young-adult,classics,thriller
4,school,series,contemporary,young-adult,king
5,science-fiction,magic,ya,scifi,classics
6,young-adult,childrens,kindle,dystopian,science-fiction
7,historical-fiction,re-read,audiobook,classic,default
8,sci-fi,children,romance,dystopia,mystery
9,novels,adventure,thriller,sci-fi-fantasy,sci-fi


Further clustering leads to further refinements in topics. For example, there is now a single sci-fi topic ($k=10$ had a combined Fantasy & Sci-Fi topic), and there are many fantasy topics, each centered on a more specific fantasy series. In addition, there are now two Stephen King topics, with a clear distinction between the sort of books in each.

In [29]:
# Sci-Fi Topic
# Topic 3 of the k=25 model; the double brackets keep the result a DataFrame
# with both of that topic's columns (authors and title).
sci_fi_topic_df = top_books_in_topic_df[[3]]
sci_fi_topic_df

topic,3,3
Unnamed: 0_level_1,authors,title
0,Douglas Adams,The Hitchhiker's Guide to the Galaxy (Hitchhik...
1,Orson Scott Card,"Ender's Game (Ender's Saga, #1)"
2,Frank Herbert,Dune (Dune Chronicles #1)
3,Isaac Asimov,Foundation (Foundation #1)
4,Ray Bradbury,Fahrenheit 451
5,"George Orwell, Erich Fromm, Celâl Üster",1984
6,Neil Gaiman,"American Gods (American Gods, #1)"
7,Aldous Huxley,Brave New World
8,"Alan Moore, Dave Gibbons, John Higgins",Watchmen
9,Isaac Asimov,"I, Robot (Robot #0.1)"


In [30]:
# Stephen King Topics
# Titles only, for topics 4 and 7 of the k=25 model, selected by their
# (topic, column) labels in the column MultiIndex.
stephen_king_topics_df = top_books_in_topic_df.loc[:, [(4,'title'), (7,'title')]]

stephen_king_topics_df

# Disabled LaTeX export of the table above:
# stephen_king_topics_df.to_latex(
#     table_output_path + 'stephen-king-topics.tex',
#     index=False) 

topic,4,7
Unnamed: 0_level_1,title,title
0,The Shining (The Shining #1),"The Drawing of the Three (The Dark Tower, #2)"
1,It,"The Waste Lands (The Dark Tower, #3)"
2,Misery,"Wizard and Glass (The Dark Tower, #4)"
3,The Stand,"Wolves of the Calla (The Dark Tower, #5)"
4,Carrie,"The Dark Tower (The Dark Tower, #7)"
5,Pet Sematary,"The Gunslinger (The Dark Tower, #1)"
6,'Salem's Lot,"Song of Susannah (The Dark Tower, #6)"
7,Needful Things,"The Wind Through the Keyhole (The Dark Tower, ..."
8,Cujo,The Eyes of the Dragon
9,Firestarter,"The Talisman (The Talisman, #1)"


In [31]:
# Top 10 tags and their weights for the two Stephen King topics, side by
# side. Each topic's weight column (named 0 by show_top_book_topic_tags) is
# renamed to the topic number before concatenation, giving the columns
# ['tag_name', 4, 'tag_name', 7].
stephen_king_tags_df = pd.concat(
    [
        show_top_book_topic_tags(books_and_topics_dict, topic)
        .head(10)[['tag_name', 0]]
        .rename(columns={0: topic})
        .reset_index(drop=True)
        for topic in (4, 7)
    ],
    axis=1)

# Kept as a plain list of the final column labels.
king_cols = list(stephen_king_tags_df.columns)

stephen_king_tags_df

Unnamed: 0,tag_name,4,tag_name.1,7
0,horror,764504.410156,fantasy,247250.445312
1,stephen-king,330394.240234,stephen-king,102357.71875
2,fantasy,138616.476562,horror,98633.960938
3,thriller,100242.935547,science-fiction,26623.748047
4,king,72494.458984,king,23965.853516
5,classics,67286.230469,series,23063.929688
6,science-fiction,55149.257812,sci-fi,22892.6875
7,default,54161.392578,default,18704.058594
8,mystery,43744.664062,western,14983.824219
9,sci-fi,43039.652344,dark-tower,14601.847656


Topic 4 is Stephen King (Horror) while Topic 7 is Stephen King (Fantasy).

In [32]:
# Output

In [33]:
# sci_fi_topic_df.to_latex(
#     table_output_path + 'sci-fi-topic.tex',
#     index=False,
#     column_format='p{0.45\linewidth}' * 2) 

In [34]:
# stephen_king_topics_df.to_latex(
#     table_output_path + 'stephen-king-topics.tex',
#     index=False)
# #    column_format='p{0.35\linewidth}p{0.45\linewidth}')

In [35]:
# stephen_king_tags_df.to_latex(
#     table_output_path + 'stephen-king-tags.tex',
#     index=False,
#     float_format="{:0.2f}".format) 