# Plot Relative Popularity of Work By Year — Interactive

Import plotly express

In [8]:
import plotly.express as px

Install, import, and initialize Chart Studio for exporting the interactive plot

In [9]:
#!pip install chart-studio

In [10]:
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls
# import api key
import config_secrets

# Read the Chart Studio login from the config_secrets module rather than
# writing the values into the notebook itself.
username = config_secrets.chartstudio_username
api_key = config_secrets.chartstudio_api_key

# Hand the login to Chart Studio (tls is the chart_studio.tools alias imported above).
tls.set_credentials_file(username=username, api_key=api_key)



Function for converting an author name from "Last, First" order to "First Last" order, without the comma

In [11]:
def convert_string_to_author(author):
    """Convert an author name from "Last, First" order to "First Last".

    Parameters
    ----------
    author : str, list, tuple, or null-like
        The name to convert. When a list or tuple is given, only its first
        element is used. Any other non-null value is converted with ``str``.

    Returns
    -------
    str or null-like
        The reordered name when the input contains a comma, the name
        unchanged when it does not, ``None`` for an empty list/tuple, and the
        original null value (``None``, ``NaN``, ...) when the input is null.

    Notes
    -----
    Only the text before the first comma and the text between the first and
    second comma are used; anything after a second comma is dropped.
    Relies on the notebook-level ``pd`` (pandas) import for the null check.
    """
    if isinstance(author, (list, tuple)):
        # An empty sequence carries no author; indexing it would raise.
        if not author:
            return None
        author = author[0]

    # Leave missing values untouched so callers can still detect them.
    if pd.isnull(author):
        return author

    author = str(author)
    if ',' in author:
        parts = author.split(',')
        last, first = parts[0].strip(), parts[1].strip()
        # Join only non-empty pieces so "Homer," does not gain a stray space.
        author = ' '.join(piece for piece in (first, last) if piece)
    return author

Function for creating plot

In [57]:
from graph import get_goodreads_graph, get_sc_graph
import json
import numpy as np
import math
import os

# don't let matplotlib use xwindows
import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pylab import savefig
import seaborn as sns
sns.set_style("ticks")
import pandas as pd

# Make sure the figures directory exists. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating.
output_directory_path = './figures'
os.makedirs(output_directory_path, exist_ok=True)

# Category labels for the points; they double as keys of the colour map below.
_GR_LABEL = 'Much more popular on<br>Goodreads'
_SC_LABEL = 'Much more popular in<br> Shakespeare & Company'
_OTHER_LABEL = 'Other work'


def _label_extreme_points(log_ratios, years, sc_texts, n_highlight):
    """Return one category label per point, flagging the n_highlight lowest and highest log ratios."""
    # The running index is the final tuple element, so every tuple is distinct
    # and the reversed ascending order equals a descending sort.
    ranked = sorted(zip(log_ratios, years, sc_texts, range(len(years))))
    most_gr_idxs = {idx for _, _, _, idx in ranked[:n_highlight]}
    most_sc_idxs = {idx for _, _, _, idx in ranked[::-1][:n_highlight]}

    labels = []
    for idx in range(len(log_ratios)):
        # Goodreads wins when a point falls in both sets (fewer than 2 * n_highlight points).
        if idx in most_gr_idxs:
            labels.append(_GR_LABEL)
        elif idx in most_sc_idxs:
            labels.append(_SC_LABEL)
        else:
            labels.append(_OTHER_LABEL)
    return labels


def _print_ranked(header, rows):
    """Print a header followed by one tab-separated, 1-based ranked line per row."""
    print(header)
    for rank, (_, year, title, author, _) in enumerate(rows, start=1):
        print('\t{}\t{}\t{}\t{}'.format(rank, year, title, author))


def plot_relative_popularity_by_year(min_year=1800, max_year=1940, n_highlight=30, n_print=20):
    """Plot Shakespeare and Company vs. Goodreads relative popularity by publication year.

    For every matched book, the share of all Shakespeare and Company events
    is divided by the share of all scraped Goodreads reviews, and the natural
    log of that ratio is plotted against the scraped first-publication year.

    Parameters
    ----------
    min_year, max_year : int
        Inclusive publication-year window; books outside it are skipped.
    n_highlight : int
        Number of lowest and of highest log ratios given their own category.
    n_print : int
        Number of titles printed for each extreme.

    Returns
    -------
    None

    Side effects
    ------------
    Reads 'data/goodreads-book-id-to-sc-uri_full-matching.json' and
    'data/matched-goodreads-metadata.json', writes
    'relative-popularity-interactive.html' to the working directory, shows the
    figure, uploads it through ``py.plot`` and prints the extreme titles.

    Relies on notebook-level names: ``px``, ``py``, ``pd``, ``np``, ``json``,
    ``math``, ``get_sc_graph``, ``get_goodreads_graph`` and
    ``convert_string_to_author``.
    """
    # get the shakespeare and company graph; only three of its mappings are used here
    (_, _, _, _, _, sc_book_uri_to_num_events, _, _,
     sc_book_uri_to_title, sc_book_uri_to_author) = get_sc_graph()

    # NOTE(review): nothing returned by get_goodreads_graph() is used below. The call
    # is kept in case the loader has side effects -- confirm, then consider removing it.
    get_goodreads_graph()

    with open('data/goodreads-book-id-to-sc-uri_full-matching.json', 'r') as f:
        goodreads_book_id_to_sc_uri = json.load(f)

    # load newly scraped data, keyed by the Goodreads book id as a string
    df = pd.read_json('data/matched-goodreads-metadata.json')
    book_ids = [str(gr_id) for gr_id in df['bookID']]
    gr_book_id_to_scraped_num_reviews = dict(zip(book_ids, df['numReviews']))
    gr_book_id_to_scraped_year = dict(zip(book_ids, df['yearFirstPublished']))
    gr_book_id_to_scraped_title = dict(zip(book_ids, df['title']))
    gr_book_id_to_scraped_author = dict(zip(book_ids, df['author']))

    gr_total_reviews = sum(gr_book_id_to_scraped_num_reviews.values())
    sc_total_borrows = sum(sc_book_uri_to_num_events.values())

    years = []
    titles = []
    authors = []
    gr_popularity_ratios = []
    sc_popularity_ratios = []
    gr_reviews = []
    sc_borrows = []
    sc_texts = []
    for gr_book_id, sc_uri in goodreads_book_id_to_sc_uri.items():

        # some matched books don't have years in the dataset
        if gr_book_id not in gr_book_id_to_scraped_year:
            continue
        scraped_year = gr_book_id_to_scraped_year[gr_book_id]
        if math.isnan(scraped_year):
            continue

        year = int(scraped_year)
        title = sc_book_uri_to_title[sc_uri]
        author = sc_book_uri_to_author[sc_uri]

        # keep only books first published inside the requested window
        if year < min_year or year > max_year:
            continue

        # a zero count on either side makes the log ratio undefined -- skip
        num_reviews = gr_book_id_to_scraped_num_reviews[gr_book_id]
        if num_reviews == 0:
            continue
        num_borrows = sc_book_uri_to_num_events[sc_uri]
        if num_borrows == 0:
            continue

        # used only as a sort tie-breaker when ranking the points
        sc_text = '{}\t{}'.format(gr_book_id_to_scraped_title[gr_book_id],
                                  gr_book_id_to_scraped_author[gr_book_id])

        # relative popularity: this book's share of each platform's total
        gr_popularity_ratios.append(num_reviews / gr_total_reviews)
        sc_popularity_ratios.append(num_borrows / sc_total_borrows)

        gr_reviews.append(num_reviews)
        sc_borrows.append(num_borrows)
        years.append(year)
        titles.append(title)
        authors.append(convert_string_to_author(author))
        sc_texts.append(sc_text)

    # positive values: relatively more popular in SC; negative: on Goodreads
    log_ratios = [np.log(s / g) for s, g in zip(sc_popularity_ratios, gr_popularity_ratios)]

    point_types = _label_extreme_points(log_ratios, years, sc_texts, n_highlight)

    color_dict = {_OTHER_LABEL: '#b3cde3', _SC_LABEL: '#fc6b32', _GR_LABEL: '#13c28d'}

    results = pd.DataFrame({'Year': years,
                            'log(SC/GR)': log_ratios,
                            'point_types': point_types,
                            'titles': titles,
                            'authors': authors,
                            'Goodreads reviews': gr_reviews,
                            'SC borrows': sc_borrows
                            })

    # mode-bar / interaction settings shared by the HTML export, the inline
    # display and the Chart Studio upload
    config = {'scrollZoom': True,
              'displaylogo': False,
              'displayModeBar': True,
              'modeBarButtonsToRemove': ['lasso2d',
                                         'zoom2d',
                                         'hoverCompareCartesian',
                                         'hoverClosestCartesian',
                                         'toggleSpikelines',
                                         'autoScale2d',
                                         'select2d']
              }

    fig = px.scatter(data_frame=results,
                     hover_data=['Year', 'log(SC/GR)', 'point_types', 'titles', 'authors', 'Goodreads reviews', 'SC borrows'],
                     x='Year',
                     y='log(SC/GR)',
                     color='point_types', color_discrete_map=color_dict,
                     symbol='point_types', symbol_sequence=['circle', 'square', 'diamond'],
                     hover_name='titles',
                     labels={'log(SC/GR)': '<br><--- More Popular GR   &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;   More Popular SC ---><br><br> SC Borrows / GR Reviews* <br> (log scale)',
                             'Year': 'Publication Year',
                             },
                     title='<br>Relative Popularity of Work by Publication Year</br>'
                     )

    # customize hover text and enlarge the points
    # NOTE(review): the template addresses customdata by position; presumably that
    # order follows hover_data above -- confirm before reordering those columns.
    fig.update_traces(
        hovertemplate='<b>%{hovertext}</b><br><br>Author: %{customdata[2]}<br>Publication Year: %{x}<br><br>S&C Borrows: %{customdata[4]}<br>Goodreads Reviews: %{customdata[3]:,.0f}<br><br>log(SC/GR): %{y:,.2f}<extra></extra>',
        marker=dict(size=7.5),
    )

    fig.update_layout(
        # x ticks every 20 years, aligned to the start of the year window
        xaxis=dict(tickmode='linear', tick0=min_year, dtick=20),
        # font colour and size
        font_color="black",
        font_size=15,
        title_font_color="black",
        title_font_size=20,
        # hover text size
        hoverlabel=dict(font_size=16),
        # drag to pan
        dragmode='pan',
        # legend without a title, entries in reverse trace order
        legend_title_text='',
        legend_traceorder="reversed",
        # roughly centre the title
        title_x=0.45,
    )

    # Write to HTML
    fig.write_html("relative-popularity-interactive.html", config=config)

    fig.show(config=config)

    # send plot to Chart Studio
    py.plot(fig, filename="relative_popularity-interactive-v2", auto_open=True, config=config)

    # print extreme books: one ascending sort serves both ends of the ranking
    ranked = sorted(zip(log_ratios, years, titles, authors, sc_texts))
    _print_ranked('Most relatively popular in Goodreads:', ranked[:n_print])
    _print_ranked('Most relatively popular in Shakespeare and Company:', ranked[::-1][:n_print])


# Build, export, display and upload the plot, then print the most extreme titles.
plot_relative_popularity_by_year()


Most relatively popular in Goodreads:
	1	1880	Little Women	Louisa May Alcott
	2	1897	Dracula	Bram Stoker
	3	1877	Anna Karenina	Leo Tolstoy
	4	1900	The Wonderful Wizard of Oz	L. Frank Baum
	5	1857	Madame Bovary	Gustave Flaubert
	6	1929	All Quiet on the Western Front	Erich Maria Remarque
	7	1814	Mansfield Park	Jane Austen
	8	1925	The Trial	Franz Kafka
	9	1851	Uncle Tom's Cabin	Harriet Beecher Stowe
	10	1925	The Great Gatsby	F. Scott Fitzgerald
	11	1892	The Adventures of Sherlock Holmes	Arthur Conan Doyle
	12	1818	Northanger Abbey	Jane Austen
	13	1813	Pride and Prejudice	Jane Austen
	14	1817	Persuasion	Jane Austen
	15	1897	The Invisible Man	H. G. Wells
	16	1908	The Wind in the Willows	Kenneth Grahame
	17	1890	The Picture of Dorian Grey	Oscar Wilde
	18	1903	The Call of the Wild	Jack London
	19	1911	Ethan Frome	Edith Wharton
	20	1890	The Sign of Four	Arthur Conan Doyle
Most relatively popular in Shakespeare and Company:
	1	1938	The Midas Touch	Margaret Kennedy
	2	1935	Ripeness Is All	Eric L