# NYRB Classics Color Analysis

### Collect All NYRB Classics into a Dataset

In [1]:
import re
import requests
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Create list of relevant URLs
base_url = 'https://www.nyrb.com/collections/classics'
url_list = ["{}?page={}".format(base_url, str(page)) for page in range(1,10)]

In [3]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    for div in soup_new.find_all("div", class_="product"):
        stack.append(div)

Processing https://www.nyrb.com/collections/classics?page=1
Processing https://www.nyrb.com/collections/classics?page=2
Processing https://www.nyrb.com/collections/classics?page=3
Processing https://www.nyrb.com/collections/classics?page=4
Processing https://www.nyrb.com/collections/classics?page=5
Processing https://www.nyrb.com/collections/classics?page=6
Processing https://www.nyrb.com/collections/classics?page=7
Processing https://www.nyrb.com/collections/classics?page=8
Processing https://www.nyrb.com/collections/classics?page=9


In [4]:
# Parse bs4 ResultSet for information of interest - returns dictionary of values
def parse_product(product):
    
    # print(product)
    title = product.find("h4").text
    img = product.find("img")['src'][2:]
    detail = product.find("a")['href']
    nyrb_pub_date = pd.to_datetime(product['data-pubdate'])  # use this to filter published Classics from forthcoming Classics 
    
    return {
        "title": title,
        "img": img, 
        "detail": detail,
        "nyrb_pub_date": nyrb_pub_date
    }

In [5]:
# Parse information and add it to a pandas dataframe
all_books = pd.DataFrame()
for i in range(0, len(stack)):
    prod = parse_product(stack[i])
    prod = pd.DataFrame(prod, index=[0])
    all_books = all_books.append(prod)

In [6]:
# Filter to exclude forthcoming publications
books = all_books[all_books['nyrb_pub_date'] <= datetime.today()]
books = books.reset_index()

In [7]:
# Include Out of Print books
oop = {
    "title": ['Letty Fox: Her Luck',
               'To the Finland Station',
               'The Diary of a Rapist',
               'The Man Who Watched Trains Go By',
               'The Sorrow Beyond Dreams',
               'Selected Stories of Robert Walser',
               'The Towers of Trebizond'],
    "img": ['https://images.gr-assets.com/books/1320400476l/132508.jpg',
             'https://images.gr-assets.com/books/1320440378l/694282.jpg',
             'https://i2.wp.com/i4.photobucket.com/albums/y126/paradorlounge/159017094601LZZZZZZZ.jpg', 
             'https://images-na.ssl-images-amazon.com/images/I/41LLjzL%2B%2BML._SX311_BO1,204,203,200_.jpg',
             'https://images-na.ssl-images-amazon.com/images/I/41B238tikhL._SX294_BO1,204,203,200_.jpg',
             'https://images.gr-assets.com/books/1320472249l/160313.jpg',
             'https://images.gr-assets.com/books/1386748970l/192954.jpg'],
    "detail": ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'],
    "nyrb_pub_date": ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']
}

In [8]:
# Peep the dataset
books[0:6]

Unnamed: 0,index,title,img,detail,nyrb_pub_date
0,0,Journey into the Mind's Eye,cdn.shopify.com/s/files/1/0726/9203/products/J...,/collections/classics/products/journey-into-th...,2018-07-10
1,0,Sand,cdn.shopify.com/s/files/1/0726/9203/products/S...,/collections/classics/products/sand,2018-06-12
2,0,Havoc,cdn.shopify.com/s/files/1/0726/9203/products/H...,/collections/classics/products/havoc,2018-06-12
3,0,The Seventh Cross,cdn.shopify.com/s/files/1/0726/9203/products/s...,/collections/classics/products/the-seventh-cross,2018-05-22
4,0,Compulsory Games,cdn.shopify.com/s/files/1/0726/9203/products/9...,/collections/classics/products/compulsory-games,2018-05-08
5,0,Basic Black with Pearls,cdn.shopify.com/s/files/1/0726/9203/products/B...,/collections/classics/products/basic-black-wit...,2018-04-17


#### Quick Info about NYRB Classics Dataset

In [9]:
print("Number of books: {}".format(len(books)))
print("Publication dates range from {} to {}".format(min(books['nyrb_pub_date']), max(books['nyrb_pub_date'])))

Number of books: 486
Publication dates range from 1999-09-30 00:00:00 to 2018-07-10 00:00:00


### Scrape information from individual book pages

In [10]:
# Create list of relevant URLs from earlier collection
base_url = 'https://www.nyrb.com'
book_urls = [books['detail'][i] for i in range(0, len(books))]
url_list = ["{}{}".format(base_url, url) for url in book_urls]

In [11]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [12]:
# Further parse BeautifulSoup for information of interest - returns dictionary of values
def parse_details(book):
    
    # "book" is a BeautifulSoup object - right after calling BeautifulSoup()
    title_str = str(book.find_all("div", class_='span8')[0].find("h1", class_='title'))
    title = re.search(">(.*?)<", title_str).group(1)
    people = book.find_all("div", class_='span8')[0].find("h2", class_="combined-authors").text.strip()
    isbn = book.find_all("div", class_='description additional')[0].find(class_='variant-sku').text
    more = book.find_all("div", class_='description additional')[0].find("p").text
    tags_mess = book.find_all("div",class_="span8")[0].find("div", class_="tags clearfix").find_all("a")
    
    # parse authors and language string
    author = re.split(",", people)[0][3:]
    try:
        phrase = re.search('translated from the [a-zA-z]{1,10}', people).group(0)
        language = re.split("\s+", phrase)[-1]
    except AttributeError:
        try:
            language = re.search('translated', people).group(0)
        except AttributeError:
            language = 'English'
 
    # parse additional information string
    pages = re.split("\s+", re.split("\s+\s+", more)[2])[1]
    
    # parse tags
    l = []
    for i in range(0, len(tags_mess)):
        l.append(tags_mess[i].text)
        
    tags = ",".join(l)
    
    return {
        "title": title,
        "author": author,
        "isbn": isbn,
        "pages": pages,
        "original_language": language,
        "tags": tags
    }

In [13]:
# Example
parse_details(stack[1])

{'author': 'Wolfgang Herrndorf',
 'isbn': '9781681372013',
 'original_language': 'German',
 'pages': '464',
 'tags': 'Available as E-Book,German Literature,Historical Fiction,Literary Fiction,Suspense & Crime',
 'title': 'Sand'}

In [14]:
# Parse information and add it to a pandas dataframe
all_details = pd.DataFrame()
for i in range(0, len(stack)):
    details = parse_details(stack[i])
    details = pd.DataFrame(details, index=[0])
    all_details = all_details.append(details)

In [15]:
# Peep the dataset
all_details = all_details.reset_index()
all_details[0:6]

Unnamed: 0,index,title,author,isbn,pages,original_language,tags
0,0,Journey into the Mind's Eye,Lesley Blanch,9781681371931,400,English,"Available as E-Book,Biography & Memoir,Literat..."
1,0,Sand,Wolfgang Herrndorf,9781681372013,464,German,"Available as E-Book,German Literature,Historic..."
2,0,Havoc,Tom Kristensen,9781681372075,528,Danish,"Available as E-Book,International Literature,L..."
3,0,The Seventh Cross,Anna Seghers,9781681372129,416,German,"Available as E-Book,German Literature,Historic..."
4,0,Compulsory Games,Robert Aickman,9781681371894,368,English,"Available as E-Book,British & Irish Literature..."
5,0,Basic Black with Pearls,Helen Weinzweig,9781681372167,160,English,"Available as E-Book,Literary Fiction,Literatur..."


In [16]:
# For downloading images later
books_urls = pd.concat([all_details, pd.DataFrame(book_urls)], axis=1)
books_urls.rename(columns={books_urls.columns[-1]:'detail'}, inplace=True)
imgs = pd.merge(books_urls, books, on='detail', how='inner')
slugs = [re.split("/", imgs['detail'][i])[-1] for i in range(0, len(imgs))]

In [17]:
# Parse NYRB tags into list with corresponding titles
tags_list = [re.split(",",all_details.tags[i]) for i in range(0, len(all_details))]
titles = []
nyrb_tags = []
for tag_list in range(0, len(tags_list)):
    for tag in range(0, len(tags_list[tag_list])):
        nyrb_tags.append([tags_list[tag_list][tag]])
        titles.append(all_details['title'][tag_list])

In [18]:
# Store books and tags information in dataframe
books_n_tags = pd.concat([pd.DataFrame(titles), pd.DataFrame(nyrb_tags)], axis=1)
books_n_tags.columns = ['title', 'nyrb_tag']
books_n_tags[0:6]

Unnamed: 0,title,nyrb_tag
0,Journey into the Mind's Eye,Available as E-Book
1,Journey into the Mind's Eye,Biography & Memoir
2,Journey into the Mind's Eye,Literature in English
3,Sand,Available as E-Book
4,Sand,German Literature
5,Sand,Historical Fiction


In [19]:
pd.DataFrame(books_n_tags).nyrb_tag.value_counts()[0:6]

Available as E-Book           322
International Literature      226
Literature in English         203
Literary Fiction              190
British & Irish Literature    116
American Literature           100
Name: nyrb_tag, dtype: int64

#### Quick Statistics about NYRB Classics - Details Dataset

In [20]:
print("There are {} books in this dataset that were translated into English".format(str(len(all_details[all_details['original_language'] != 'English']))))

There are 194 books in this dataset that were translated into English


In [21]:
all_details['original_language'].value_counts() # need to fix unknown languages

English       292
French         50
German         31
Russian        24
Italian        21
translated     19
Spanish         9
Hungarian       7
Chinese         4
Swedish         4
Serbo           3
Japanese        3
Polish          3
Danish          3
Greek           2
Czech           2
Dutch           2
Arabic          2
Occitan         1
Romanian        1
Catalan         1
Hindi           1
Turkish         1
Name: original_language, dtype: int64

In [22]:
# fix languages, including those not listed (like Maupassant)
# fix title - no subtitles
# fix tags - one tag per column?
# include author gender, country 

#### Find more information from [Publishers Weekly Translation Database](https://www.publishersweekly.com/pw/translation/search/index.html)

In [23]:
# Find page for NYRB specifically
url = 'https://www.publishersweekly.com/pw/translation/search/index.html?country=&language=&translator_gender=&submit=Search&author=&genre=&author_gender=&submitting=1&translator=&isbn=&title=&publisher=226'
r = requests.get(url)

In [24]:
# Parse page for book titles and authors
soup = BeautifulSoup(r.text, "html5lib")
list = soup.find("table", class_="table table-striped").find_all("tr")

In [25]:
# Collect list of relevant PW urls from the website itself
pw_urls = [list[i].find("a")['href'] for i in range(0, len(list))]

In [26]:
# Create URL list
base_url = 'https://www.publishersweekly.com'
url_list = ["{}{}".format(base_url, url) for url in pw_urls]

In [27]:
# Make get request and parse with BeautifulSoup for all URLs
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [28]:
# Further parse BeautifulSoup for information of interest - returns dictionary of values
def pw_parser(book):
    
    # retrieve table with information
    info = [book.find("table").find_all("tr")[i].text for i in range(1,len(book.find("table").find_all("tr")))]
    author = re.split(":", info[0])[1]
    isbn = re.split(":", info[3])[1]
    pw_genre = re.split(":", info[5])[1]
    country = re.split(":", info[7])[1]
    gender = re.split(":", info[8])[1]
    
    return {
        "author": author,
        "isbn": isbn,
        "pw_genre": pw_genre,
        "country": country,
        "gender": gender
    }

In [29]:
# Example
pw_parser(stack[10])

{'author': 'Alfred Doblin',
 'country': 'Germany',
 'gender': 'Male',
 'isbn': '9781590179734',
 'pw_genre': 'Fiction'}

In [30]:
# Parse information and add it to a pandas dataframe
pw_details = pd.DataFrame()
for i in range(0, len(stack)):
    details = pw_parser(stack[i])
    details = pd.DataFrame(details, index=[0])
    pw_details = pw_details.append(details)

In [31]:
# Peep the dataset
pw_details[0:6]

Unnamed: 0,author,isbn,pw_genre,country,gender
0,Szilard Borbely,9781681370545,Poetry,Hungary,Male
0,Emmanuel Bove,9781590178324,Fiction,France,Male
0,Matei Calinescu,9781681371955,Fiction,Romania,Male
0,Eileen Chang,9781681371276,Fiction,Chile,Female
0,Anton Chekhov,9781590178362,Fiction,Russia,Male
0,Gabriel Chevallier,9781590177167,Fiction,France,Male


In [32]:
# Join information from PW and NYRB
more_details = pd.merge(pw_details, all_details, on='isbn', how='inner')
more_details[0:6]

Unnamed: 0,author_x,isbn,pw_genre,country,gender,index,title,author_y,pages,original_language,tags
0,Emmanuel Bove,9781590178324,Fiction,France,Male,0,Henri Duchemin and His Shadows,Emmanuel Bove,160,French,"Available as E-Book,International Literature,L..."
1,Matei Calinescu,9781681371955,Fiction,Romania,Male,0,The Life and Opinions of Zacharias Lichter,Matei Calinescu,160,Romanian,"Available as E-Book,International Literature,L..."
2,Eileen Chang,9781681371276,Fiction,Chile,Female,0,Little Reunions,Eileen Chang,352,translated,"Asian Literature,Available as E-Book,Historica..."
3,Anton Chekhov,9781590178362,Fiction,Russia,Male,0,The Prank,Anton Chekhov,168,English,"Available as E-Book,International Literature,L..."
4,Gabriel Chevallier,9781590177167,Fiction,France,Male,0,Fear,Gabriel Chevallier,328,French,"Available as E-Book,French Literature,Internat..."
5,Jean-Paul Clebert,9781590179574,Fiction,France,Male,0,Paris Vagabond,Jean-Paul Clébert,352,English,"Available as E-Book,Biography & Memoir,French ..."


In [54]:
# more_details.to_csv("nyrb_info.csv")

### Download Book Covers

In [None]:
# Download covers into your repo
#for i in range(0, len(books_urls)):
#    url = "https://{}".format(imgs['img'][i])
#    filename = "classics_covers/{}.png".format(slugs[i])   # change to jpg if url links to jpg
    
#    urllib.request.urlretrieve(url, filename)

### Post-Shell Scripting: Check Palette Results

In [34]:
palettes = pd.read_table("palettes.txt", header=None)

In [35]:
palettes.head()

Unnamed: 0,0
0,a-balcony-in-the-forest
1,#A4A19A
2,#514D4B
3,#A49C63
4,#423C3C


In [36]:
hex_cols = []
titles = []
for i in range(0, len(palettes)):
    check = re.search('#[0-9A-F]{6}', palettes[0][i])
    if check:
        hex_cols.append(check[0])
    else:
        titles.append(palettes[0][i])

In [37]:
# check that all titles are read
len(titles) 

486

In [38]:
# check that there are five colors for every title
len(hex_cols)/5 

486.0

In [39]:
# check first and last titles
script_titles = palettes[0][0::6]
script_titles.iloc[[0,len(script_titles) - 1]]

0       a-balcony-in-the-forest
2910                       zama
Name: 0, dtype: object

In [40]:
titles_list = [[i] * 5 for i in titles]
titles_rep = []
for i in range(0, len(titles_list)):
    for j in range(0,5):
        titles_rep.append(titles_list[i][j])

In [41]:
titles_swatches = pd.concat([pd.DataFrame(titles_rep), pd.DataFrame(hex_cols)], axis=1)
titles_swatches.columns = ['title', 'color']

In [42]:
a = ['color1', 'color2', 'color3', 'color4', 'color5']
labels = a * 486
titles_swatches = pd.concat([pd.DataFrame(titles_rep), pd.DataFrame(hex_cols), pd.DataFrame(labels)], axis=1)
titles_swatches.columns = ['title', 'color', 'color_index']

In [43]:
titles_palettes = titles_swatches.pivot(index='title', columns='color_index', values='color')
titles_palettes = titles_palettes.reset_index()

In [44]:
# titles_palettes.to_csv("titles_palettes.csv")

In [45]:
titles_palettes.head()  # need to attach link (ordered) to this dataset 

color_index,title,color1,color2,color3,color4,color5
0,a-balcony-in-the-forest,#A4A19A,#514D4B,#A49C63,#423C3C,#E1DED5
1,a-book-of-mediterranean-food,#BB613E,#E6D39D,#645D5A,#B7C4BE,#9D655A
2,a-fairly-good-time,#665E56,#A7A09C,#E4DDD8,#C1443B,#0A0A0A
3,a-favourite-of-the-gods-and-a-compass-error,#A4A5A5,#68658B,#4F272B,#5C5C2D,#A5C7BC
4,a-game-of-hide-and-seek,#6D6E6E,#B2B2B2,#B8C2C9,#4F4F50,#C0BFBF


In [46]:
# take image links from first dataset (books)
# make slugs to order the dataset 
slugs = [re.split("/", books.detail[i])[-1] for i in range(0, len(books))]
slugs_df = pd.concat([pd.Series(books.img), pd.Series(slugs)], axis=1)
slugs_df.columns = ['img', 'slug']
slugs_df = slugs_df.sort_values(by=['slug']).reset_index()
img_urls = pd.Series(slugs_df.img)

In [47]:
titles_palettes = pd.concat([titles_palettes, img_urls], axis=1)
titles_palettes.head()

Unnamed: 0,title,color1,color2,color3,color4,color5,img
0,a-balcony-in-the-forest,#A4A19A,#514D4B,#A49C63,#423C3C,#E1DED5,cdn.shopify.com/s/files/1/0726/9203/products/B...
1,a-book-of-mediterranean-food,#BB613E,#E6D39D,#645D5A,#B7C4BE,#9D655A,cdn.shopify.com/s/files/1/0726/9203/products/m...
2,a-fairly-good-time,#665E56,#A7A09C,#E4DDD8,#C1443B,#0A0A0A,cdn.shopify.com/s/files/1/0726/9203/products/A...
3,a-favourite-of-the-gods-and-a-compass-error,#A4A5A5,#68658B,#4F272B,#5C5C2D,#A5C7BC,cdn.shopify.com/s/files/1/0726/9203/products/A...
4,a-game-of-hide-and-seek,#6D6E6E,#B2B2B2,#B8C2C9,#4F4F50,#C0BFBF,cdn.shopify.com/s/files/1/0726/9203/products/p...


In [48]:
palette_dict = [dict(slug=titles_palettes.iloc[i][0],
                     color1=titles_palettes.iloc[i][1],
                     color2=titles_palettes.iloc[i][2],
                     color3=titles_palettes.iloc[i][3],
                     color4=titles_palettes.iloc[i][4],
                     color5=titles_palettes.iloc[i][5],
                     url = "https://{}".format(titles_palettes.iloc[i][6])) for i in range(0, len(titles_palettes))]

In [49]:
CSS_TEMPLATE = """
.%(slug)s .color1 { background-color: %(color1)s }
.%(slug)s .color2 { background-color: %(color2)s }
.%(slug)s .color3 { background-color:  %(color3)s }
.%(slug)s .color4 { background-color: %(color4)s }
.%(slug)s .color5 { background-color: %(color5)s }
"""

In [50]:
HTML_TEMPLATE = """
<section>
    <h3>{slug}</h3>
    <div class="book {slug}">
        <img class="book" src="{url}"/>
        <div class="swatch color1"></div>
        <div class="swatch color2"></div>
        <div class="swatch color3"></div>
        <div class="swatch color4"></div>
        <div class="swatch color5"></div>
    </div>
</section>
"""

In [51]:
css = ""
html = ""

In [52]:
for book in palette_dict:
    css += CSS_TEMPLATE % book
    html += HTML_TEMPLATE.format(**book)

In [53]:
with open("swatch.html", 'w') as f:
    f.write("<head>")
    f.write("<link href='style.css' rel='stylesheet' type='text/css'>")
    f.write("<style type='text/css'>")
    f.write(css)
    f.write("</style>")
    f.write("</head>")
    f.write("<body>")
    f.write(html)
    f.write("</body>")


## NEED TO FIX:

- Eileen Chang's page - country should be China?, not Chile
- Proensa volume should be by various authors
- Research and fix "translated" mark in original_language
- include OOP books in actual dataset
- why is 500 books not actually 500?
- We Think the World of You has a different cover??? investigate -- which books have been republished with new covers? slug is "we-think-the-world-of-you-3" ?? are there 1 and 2?
- differentiate between covers with the traditional square covers and those without
- creating palettes not always working -- sometimes histogram made for the wrong/previous book
- also not all palettes are being made and recorded **
- belchamber not quantized because of too few colors; equal danger hand processed for colors (why didn't it work in the automated script?)
- need a way to check palettes
- bad swatches (quantized 8): everything-flows, great-granny-webster, hons-and-rebels, houses, in-the-cafe-of-lost-youth, jejuri, living, naked_earth, names-on-the-land, novels-in-three-lines, peking-story, pitch-dark, poem-strip, red-shift, reveille-in-washington, schoolboys-diary-and-other-stories, the-collected-essays-of-elizabeth-hardwick, the-door, the-notebooks-of-joseph-joubert, the-radiance-of-the-king-1, the-skin, the-stories-of-jf-powers, the-wooden-shepherdess, tolstoy-rasputin-others-me-best-of-teffi

